From a8fb4b17727651beadfa2c7c4d899fd0f4947c5a Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Fri, 29 May 2015 13:52:48 -0400 Subject: add toupper/tolower functions (for JuliaLang/julia#11471) --- .gitignore | 3 ++- Makefile | 6 +++++- test/case.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ utf8proc.c | 12 ++++++++++++ utf8proc.h | 15 +++++++++++++++ 5 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 test/case.c diff --git a/.gitignore b/.gitignore index 0961a6b..4c9b2df 100644 --- a/.gitignore +++ b/.gitignore @@ -22,4 +22,5 @@ utf8proc_data.c.new printproperty charwidth valid -iterate \ No newline at end of file +iterate +case diff --git a/Makefile b/Makefile index e29c349..45b1ed7 100644 --- a/Makefile +++ b/Makefile @@ -111,10 +111,14 @@ test/valid: test/valid.c utf8proc.o utf8proc.h test/tests.h test/iterate: test/iterate.c utf8proc.o utf8proc.h test/tests.h $(cc) test/iterate.c utf8proc.o -o $@ -check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o +test/case: test/case.c utf8proc.o utf8proc.h test/tests.h + $(cc) test/case.c utf8proc.o -o $@ + +check: test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/printproperty test/case test/charwidth test/valid test/iterate bench/bench.c bench/util.c bench/util.h utf8proc.o $(MAKE) -C bench test/normtest data/NormalizationTest.txt test/graphemetest data/GraphemeBreakTest.txt test/charwidth test/valid test/iterate + test/case diff --git a/test/case.c b/test/case.c new file mode 100644 index 0000000..39958e3 --- /dev/null +++ b/test/case.c @@ -0,0 +1,50 @@ +#include "tests.h" +#include <wctype.h> + +int main(int argc, char **argv) +{ + int error = 0, better = 0; + utf8proc_int32_t c; + + (void) argc; /* unused */ + (void) argv; /* unused */ + + /* some simple sanity tests of the case conversions */ + for (c = 
0; c <= 0x110000; ++c) { + utf8proc_int32_t l = utf8proc_tolower(c); + utf8proc_int32_t u = utf8proc_toupper(c); + + check(l == c || utf8proc_codepoint_valid(l), "invalid tolower"); + check(u == c || utf8proc_codepoint_valid(u), "invalid toupper"); + + if (sizeof(wint_t) > 2 || c < (1<<16)) { + wint_t l0 = towlower(c), u0 = towupper(c); + + /* OS unicode tables may be out of date. But if they + do have a lower/uppercase mapping, hopefully it + is correct? */ + if (l0 != c && l0 != l) { + fprintf(stderr, "MISMATCH %x != towlower(%x) == %x\n", + l, c, l0); + ++error; + } + else if (l0 != l) { /* often true for out-of-date OS unicode */ + ++better; + /* printf("%x != towlower(%x) == %x\n", l, c, l0); */ + } + if (u0 != c && u0 != u) { + fprintf(stderr, "MISMATCH %x != towupper(%x) == %x\n", + u, c, u0); + ++error; + } + else if (u0 != u) { /* often true for out-of-date OS unicode */ + ++better; + /* printf("%x != towupper(%x) == %x\n", u, c, u0); */ + } + } + } + check(!error, "utf8proc case conversion FAILED %d tests.", error); + printf("More up-to-date than OS unicode tables for %d tests.\n", better); + printf("utf8proc case conversion tests SUCCEEDED.\n"); + return 0; +} diff --git a/utf8proc.c b/utf8proc.c index 971b87a..80f5ba8 100644 --- a/utf8proc.c +++ b/utf8proc.c @@ -264,6 +264,18 @@ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t c1, ut utf8proc_get_property(c2)->boundclass); } +UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c) +{ + utf8proc_int32_t cl = utf8proc_get_property(c)->lowercase_mapping; + return cl >= 0 ? cl : c; +} + +UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c) +{ + utf8proc_int32_t cu = utf8proc_get_property(c)->uppercase_mapping; + return cu >= 0 ? cu : c; +} + /* return a character width analogous to wcwidth (except portable and hopefully less buggy than most system wcwidth functions). 
*/ UTF8PROC_DLLEXPORT int utf8proc_charwidth(utf8proc_int32_t c) { diff --git a/utf8proc.h b/utf8proc.h index 7108215..59f2425 100644 --- a/utf8proc.h +++ b/utf8proc.h @@ -511,6 +511,21 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_reencode(utf8proc_int32_t *buffer, */ UTF8PROC_DLLEXPORT utf8proc_bool utf8proc_grapheme_break(utf8proc_int32_t codepoint1, utf8proc_int32_t codepoint2); + +/** + * Given a codepoint `c`, return the codepoint of the corresponding + * lower-case character, if any; otherwise (if there is no lower-case + * variant, or if `c` is not a valid codepoint) return `c`. + */ +UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_tolower(utf8proc_int32_t c); + +/** + * Given a codepoint `c`, return the codepoint of the corresponding + * upper-case character, if any; otherwise (if there is no upper-case + * variant, or if `c` is not a valid codepoint) return `c`. + */ +UTF8PROC_DLLEXPORT utf8proc_int32_t utf8proc_toupper(utf8proc_int32_t c); + /** * Given a codepoint, return a character width analogous to `wcwidth(codepoint)`, * except that a width of 0 is returned for non-printable codepoints -- cgit v1.2.3