From 02f4e1890cf8135b609b404c58ac7e8b27136ad6 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 24 Jul 2018 10:45:02 -0400 Subject: charwidth=1 for soft hyphen and unassigned codepoints (#135) * use width=1 for soft hyphen and for unassigned/PUA codepoints * don't count unassigned codepoints when comparing with system wcwidth * more tests * indentation fixes * NEWS for 135 * remove special-casing for arabic control characters affecting a span of numbers, which are sometimes zero-width and sometimes not * regenerate --- test/charwidth.c | 120 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 63 insertions(+), 57 deletions(-) (limited to 'test') diff --git a/test/charwidth.c b/test/charwidth.c index 330f18e..c5cbbd7 100644 --- a/test/charwidth.c +++ b/test/charwidth.c @@ -2,70 +2,76 @@ #include #include +static int my_unassigned(int c) { + int cat = utf8proc_get_property(c)->category; + return (cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO); +} + static int my_isprint(int c) { - int cat = utf8proc_get_property(c)->category; - return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) || - (c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd); + int cat = utf8proc_get_property(c)->category; + return (UTF8PROC_CATEGORY_LU <= cat && cat <= UTF8PROC_CATEGORY_ZS) || + (c == 0x0601 || c == 0x0602 || c == 0x0603 || c == 0x06dd || c == 0x00ad) || + (cat == UTF8PROC_CATEGORY_CN) || (cat == UTF8PROC_CATEGORY_CO); } int main(int argc, char **argv) { - int c, error = 0, updates = 0; + int c, error = 0, updates = 0; + + (void) argc; /* unused */ + (void) argv; /* unused */ - (void) argc; /* unused */ - (void) argv; /* unused */ + /* some simple sanity tests of the character widths */ + for (c = 0; c <= 0x110000; ++c) { + int cat = utf8proc_get_property(c)->category; + int w = utf8proc_charwidth(c); + if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && w > 0) { + fprintf(stderr, "nonzero width %d for combining char %x\n", w, c); + error += 1; + } + if (w == 0 && + ((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) || + (cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) || + (cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) { + fprintf(stderr, "zero width for symbol-like char %x\n", c); + error += 1; + } + if (c <= 127 && ((!isprint(c) && w > 0) || (isprint(c) && wcwidth(c) != w))) { + fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n", + wcwidth(c), w, + isprint(c) ? "printable" : "non-printable", c); + error += 1; + } + if (!my_isprint(c) && w > 0) { + fprintf(stderr, "non-printing %x had width %d\n", c, w); + error += 1; + } + if (my_unassigned(c) && w != 1) { + fprintf(stderr, "unexpected width %d for unassigned char %x\n", w, c); + error += 1; + } + } + check(!error, "utf8proc_charwidth FAILED %d tests.", error); - /* some simple sanity tests of the character widths */ - for (c = 0; c <= 0x110000; ++c) { - int cat = utf8proc_get_property(c)->category; - int w = utf8proc_charwidth(c); - if ((cat == UTF8PROC_CATEGORY_MN || cat == UTF8PROC_CATEGORY_ME) && - w > 0) { - fprintf(stderr, "nonzero width %d for combining char %x\n", w, c); - error = 1; - } - if (w == 0 && - ((cat >= UTF8PROC_CATEGORY_LU && cat <= UTF8PROC_CATEGORY_LO) || - (cat >= UTF8PROC_CATEGORY_ND && cat <= UTF8PROC_CATEGORY_SC) || - (cat >= UTF8PROC_CATEGORY_SO && cat <= UTF8PROC_CATEGORY_ZS))) { - fprintf(stderr, "zero width for symbol-like char %x\n", c); - error = 1; - } - if (c <= 127 && ((!isprint(c) && w > 0) || - (isprint(c) && wcwidth(c) != w))) { - fprintf(stderr, "wcwidth %d mismatch %d for %s ASCII %x\n", - wcwidth(c), w, - isprint(c) ? "printable" : "non-printable", c); - error = 1; - } - if (!my_isprint(c) && w > 0) { - fprintf(stderr, "non-printing %x had width %d\n", c, w); - error = 1; - } - } - check(!error, "utf8proc_charwidth FAILED tests."); + check(utf8proc_charwidth(0x00ad) == 1, "incorrect width for U+00AD (soft hyphen)"); + check(utf8proc_charwidth(0xe000) == 1, "incorrect width for U+e000 (PUA)"); - /* print some other information by compariing with system wcwidth */ - printf("Mismatches with system wcwidth (not necessarily errors):\n"); - for (c = 0; c <= 0x110000; ++c) { - int w = utf8proc_charwidth(c); - int wc = wcwidth(c); - if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue; - /* lots of these errors for out-of-date system unicode tables */ - if (wc == -1 && my_isprint(c) && w > 0) { - updates += 1; -#if 0 - printf(" wcwidth(%x) = -1 for printable char\n", c); -#endif - } - if (wc == -1 && !my_isprint(c) && w > 0) - printf(" wcwidth(%x) = -1 for non-printable width-%d char\n", c, w); - if (wc >= 0 && wc != w) - printf(" wcwidth(%x) = %d != charwidth %d\n", c, wc, w); - } - printf(" ... (positive widths for %d chars unknown to wcwidth) ...\n", - updates); - printf("Character-width tests SUCCEEDED.\n"); + /* print some other information by compariing with system wcwidth */ + printf("Mismatches with system wcwidth (not necessarily errors):\n"); + for (c = 0; c <= 0x110000; ++c) { + int w = utf8proc_charwidth(c); + int wc = wcwidth(c); + if (sizeof(wchar_t) == 2 && c >= (1<<16)) continue; + /* lots of these errors for out-of-date system unicode tables */ + if (wc == -1 && my_isprint(c) && !my_unassigned(c) && w > 0) + updates += 1; + if (wc == -1 && !my_isprint(c) && w > 0) + printf(" wcwidth(%x) = -1 for non-printable width-%d char\n", c, w); + if (wc >= 0 && wc != w) + printf(" wcwidth(%x) = %d != charwidth %d\n", c, wc, w); + } + printf(" ... (positive widths for %d chars unknown to wcwidth) ...\n", updates); + printf("Character-width tests SUCCEEDED.\n"); - return 0; + return 0; } -- cgit v1.2.3