diff options
author | Steven G. Johnson <stevenj@alum.mit.edu> | 2014-12-07 22:25:31 -0500 |
---|---|---|
committer | Steven G. Johnson <stevenj@mit.edu> | 2014-12-12 16:29:29 -0500 |
commit | 539d2cc2024f494b1e3292d4730bdc96390e1361 (patch) | |
tree | 84876217e3476899f6ea2d2e5646c2e845c6843a | |
parent | 1b3992ebe5c587446aaa962a314ef9244d86fb0d (diff) | |
download | libutf8proc-539d2cc2024f494b1e3292d4730bdc96390e1361.tar.gz libutf8proc-539d2cc2024f494b1e3292d4730bdc96390e1361.tar.bz2 |
grapheme test for UAX#29
-rw-r--r-- | .gitignore | 1 | ||||
-rw-r--r-- | Makefile | 14 | ||||
-rw-r--r-- | graphemetest.c | 62 | ||||
-rw-r--r-- | normtest.c | 45 | ||||
-rw-r--r-- | tests.h | 53 |
5 files changed, 128 insertions, 47 deletions
@@ -14,3 +14,4 @@ bench/bench
 bench/icu
 bench/unistring
 normtest
+graphemetest
@@ -2,6 +2,7 @@
 
 CURL=curl
 RUBY=ruby
+PERL=perl
 MAKE=make
 
 # settings
@@ -24,7 +25,7 @@ all: c-library
 c-library: libmojibake.a libmojibake.$(SHLIB_EXT)
 
 clean:
-	rm -f utf8proc.o libmojibake.a libmojibake.$(SHLIB_EXT) normtest UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt
+	rm -f utf8proc.o libmojibake.a libmojibake.$(SHLIB_EXT) normtest graphemetest UnicodeData.txt DerivedCoreProperties.txt CompositionExclusions.txt CaseFolding.txt NormalizationTest.txt GraphemeBreakTest.txt
 	$(MAKE) -C bench clean
 
 update: utf8proc_data.c.new
@@ -67,8 +68,15 @@ libmojibake.dylib: utf8proc.o
 NormalizationTest.txt:
 	$(CURL) -O http://www.unicode.org/Public/UNIDATA/NormalizationTest.txt
 
-normtest: normtest.c utf8proc.o mojibake.h
+GraphemeBreakTest.txt:
+	$(CURL) http://www.unicode.org/Public/UCD/latest/ucd/auxiliary/GraphemeBreakTest.txt | $(PERL) -pe 's,÷,/,g;s,×,+,g' > $@
+
+normtest: normtest.c utf8proc.o mojibake.h tests.h
 	$(cc) normtest.c utf8proc.o -o normtest
 
-check: normtest NormalizationTest.txt
+graphemetest: graphemetest.c utf8proc.o mojibake.h tests.h
+	$(cc) graphemetest.c utf8proc.o -o graphemetest
+
+check: normtest NormalizationTest.txt graphemetest GraphemeBreakTest.txt
 	./normtest
+	./graphemetest
diff --git a/graphemetest.c b/graphemetest.c
new file mode 100644
index 0000000..07fe28c
--- /dev/null
+++ b/graphemetest.c
@@ -0,0 +1,62 @@
+#include "tests.h"
+
+int main(void)
+{
+     char *buf = NULL;
+     size_t bufsize = 0;
+     FILE *f = fopen("GraphemeBreakTest.txt", "r");
+     uint8_t src[1024];
+
+     check(f != NULL, "error opening NormalizationTest.txt");
+     while (getline(&buf, &bufsize, f) > 0) {
+          size_t bi = 0, si = 0;
+          lineno += 1;
+
+          if (lineno % 100 == 0)
+               printf("checking line %zd...\n", lineno);
+
+          if (buf[0] == '#') continue;
+
+          while (buf[bi]) {
+               bi = skipspaces(buf, bi);
+               if (buf[bi] == '/') { /* grapheme break */
+                    src[si++] = 0xff;
+                    bi++;
+               }
+               else if (buf[bi] == '+') { /* no break */
+                    bi++;
+               }
+               else if (buf[bi] == '#') { /* start of comments */
+                    break;
+               }
+               else { /* hex-encoded codepoint */
+                    bi += encode((char*) (src + si), buf + bi) - 1;
+                    while (src[si]) ++si; /* advance to NUL termination */
+               }
+          }
+          if (si && src[si-1] == 0xff)
+               --si; /* no 0xff after final grapheme */
+          src[si] = 0; /* NUL-terminate */
+
+          if (si) {
+               uint8_t utf8[1024]; /* copy src without 0xff grapheme separators */
+               size_t i = 0, j = 0;
+               ssize_t glen;
+               uint8_t *g; /* utf8proc_map grapheme results */
+               while (i < si) {
+                    if (src[i] != 0xff)
+                         utf8[j++] = src[i++];
+                    else
+                         i++;
+               }
+               glen = utf8proc_map(utf8, j, &g, UTF8PROC_CHARBOUND);
+               check(glen >= 0, "utf8proc_map error = %s",
+                     utf8proc_errmsg(glen));
+               check(!strcmp((char*)g, (char*)src),
+                     "grapheme mismatch: %s vs. %s", (char*)g, (char*)src);
+          }
+     }
+     fclose(f);
+     printf("Passed tests after %zd lines!\n", lineno);
+     return 0;
+}
@@ -1,47 +1,4 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <ctype.h>
-#include <string.h>
-#include <stdarg.h>
-
-#include "mojibake.h"
-
-size_t lineno = 0;
-
-void check(int cond, const char *format, ...)
-{
-     if (!cond) {
-          va_list args;
-          fprintf(stderr, "line %zd: ", lineno);
-          va_start(args, format);
-          vfprintf(stderr, format, args);
-          va_end(args);
-          fprintf(stderr, "\n");
-          exit(1);
-     }
-}
-
-/* if buf points to a sequence of codepoints encoded as hexadecimal strings,
-   separated by whitespace, and terminated by any character not in
-   [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
-   in dest, returning the number of bytes read from buf */
-size_t encode(char *dest, const char *buf)
-{
-     size_t i = 0, j, d = 0;
-     do {
-          int c;
-          while (isspace(buf[i])) ++i; /* skip whitespace */
-          for (j=i; buf[j] && strchr("0123456789abcdef", tolower(buf[j])); ++j)
-               ; /* find end of hex input */
-          if (j == i) { /* no codepoint found */
-               dest[d] = 0; /* NUL-terminate destination string */
-               return i + 1;
-          }
-          check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
-          i = j; /* skip to char after hex input */
-          d += utf8proc_encode_char(c, (uint8_t *) (dest + d));
-     } while (1);
-}
+#include "tests.h"
 
 #define CHECK_NORM(NRM, norm, src) { \
     char *src_norm = (char*) utf8proc_ ## NRM((uint8_t*) src); \
@@ -0,0 +1,53 @@
+/* Common functions and includes for our test programs. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <ctype.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include "mojibake.h"
+
+size_t lineno = 0;
+
+void check(int cond, const char *format, ...)
+{
+     if (!cond) {
+          va_list args;
+          fprintf(stderr, "line %zd: ", lineno);
+          va_start(args, format);
+          vfprintf(stderr, format, args);
+          va_end(args);
+          fprintf(stderr, "\n");
+          exit(1);
+     }
+}
+
+size_t skipspaces(const char *buf, size_t i)
+{
+     while (isspace(buf[i])) ++i;
+     return i;
+}
+
+/* if buf points to a sequence of codepoints encoded as hexadecimal strings,
+   separated by whitespace, and terminated by any character not in
+   [0-9a-fA-F] or whitespace, then stores the corresponding utf8 string
+   in dest, returning the number of bytes read from buf */
+size_t encode(char *dest, const char *buf)
+{
+     size_t i = 0, j, d = 0;
+     do {
+          int c;
+          i = skipspaces(buf, i);
+          for (j=i; buf[j] && strchr("0123456789abcdef", tolower(buf[j])); ++j)
+               ; /* find end of hex input */
+          if (j == i) { /* no codepoint found */
+               dest[d] = 0; /* NUL-terminate destination string */
+               return i + 1;
+          }
+          check(sscanf(buf + i, "%x", &c) == 1, "invalid hex input %s", buf+i);
+          i = j; /* skip to char after hex input */
+          d += utf8proc_encode_char(c, (uint8_t *) (dest + d));
+     } while (1);
+}
+
|