From 5aa01bdb330f82e7bc3756ac18fd47d137059ce4 Mon Sep 17 00:00:00 2001 From: John Tytgat Date: Sat, 28 Jun 2008 20:57:47 +0000 Subject: - parserutils_charset_utf16_to_ucs4(): fixed surrogate handling. - cscodec-utf8.c(run_test): Added more asserts. - Added UTF-16 tester (based on the UTF-8 one). svn path=/trunk/libparserutils/; revision=4472 --- src/charset/encodings/utf16.c | 15 +- test/INDEX | 3 +- test/Makefile | 4 +- test/cscodec-utf16.c | 316 ++++++++++++++++++++++++++++++++++ test/cscodec-utf8.c | 251 +++++++++++++++++++++++++++ test/cscodec.c | 232 ------------------------- test/data/cscodec-utf16/INDEX | 6 + test/data/cscodec-utf16/simple.dat | 33 ++++ test/data/cscodec-utf8/INDEX | 6 + test/data/cscodec-utf8/UTF-8-test.txt | Bin 0 -> 41013 bytes test/data/cscodec-utf8/simple.dat | Bin 0 -> 1109 bytes test/data/cscodec/INDEX | 6 - test/data/cscodec/UTF-8-test.txt | Bin 41013 -> 0 bytes test/data/cscodec/simple.dat | Bin 1109 -> 0 bytes 14 files changed, 626 insertions(+), 246 deletions(-) create mode 100644 test/cscodec-utf16.c create mode 100644 test/cscodec-utf8.c delete mode 100644 test/cscodec.c create mode 100644 test/data/cscodec-utf16/INDEX create mode 100644 test/data/cscodec-utf16/simple.dat create mode 100644 test/data/cscodec-utf8/INDEX create mode 100644 test/data/cscodec-utf8/UTF-8-test.txt create mode 100644 test/data/cscodec-utf8/simple.dat delete mode 100644 test/data/cscodec/INDEX delete mode 100644 test/data/cscodec/UTF-8-test.txt delete mode 100644 test/data/cscodec/simple.dat diff --git a/src/charset/encodings/utf16.c b/src/charset/encodings/utf16.c index 59cb146..3611646 100644 --- a/src/charset/encodings/utf16.c +++ b/src/charset/encodings/utf16.c @@ -19,7 +19,7 @@ * Convert a UTF-16 sequence into a single UCS-4 character * * \param s The sequence to process - * \param len Length of sequence + * \param len Length of sequence in bytes * \param ucs4 Pointer to location to receive UCS-4 character (host endian) * \param clen Pointer to 
location to receive byte length of UTF-16 sequence * \return PARSERUTILS_OK on success, appropriate error otherwise @@ -38,17 +38,22 @@ parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, if (*ss < 0xD800 || *ss > 0xDFFF) { *ucs4 = *ss; *clen = 2; - } else if (0xD800 <= *ss && *ss <= 0xBFFF) { + } else if (0xD800 <= *ss && *ss <= 0xDBFF) { + /* High-surrogate code unit. */ if (len < 4) return PARSERUTILS_NEEDDATA; - if (0xDC00 <= ss[1] && ss[1] <= 0xE000) { - *ucs4 = (((s[0] >> 6) & 0x1f) + 1) | - ((s[0] & 0x3f) | (s[1] & 0x3ff)); + if (0xDC00 <= ss[1] && ss[1] <= 0xDFFF) { + /* We have a valid surrogate pair. */ + *ucs4 = (((ss[0] & 0x3FF) << 10) | (ss[1] & 0x3FF)) + + (1<<16); *clen = 4; } else { return PARSERUTILS_INVALID; } + } else { + /* Low-surrogate code unit. */ + return PARSERUTILS_INVALID; } return PARSERUTILS_OK; diff --git a/test/INDEX b/test/INDEX index 0042c36..927a240 100644 --- a/test/INDEX +++ b/test/INDEX @@ -5,7 +5,8 @@ charset Charset initialisation/finalisation parserutils Library initialisation/finalisation aliases Encoding alias handling -cscodec Charset codec implementation cscodec +cscodec-utf8 UTF-8 charset codec implementation cscodec-utf8 +cscodec-utf16 UTF-16 charset codec implementation cscodec-utf16 dict Dictionary handling rbtree Red-black tree implementation filter Input stream filtering diff --git a/test/Makefile b/test/Makefile index 4c5caac..a6a1161 100644 --- a/test/Makefile +++ b/test/Makefile @@ -35,8 +35,8 @@ d := $(DIR) override CFLAGS := $(CFLAGS) -I$(TOP)/src/ -I$(d) # Tests -TESTS_$(d) := aliases cscodec charset dict filter inputstream parserutils \ - rbtree +TESTS_$(d) := aliases cscodec-utf8 cscodec-utf16 charset dict filter \ + inputstream parserutils rbtree TESTS_$(d) := $(TESTS_$(d)) regression/cscodec-segv regression/filter-segv \ regression/stream-nomem diff --git a/test/cscodec-utf16.c b/test/cscodec-utf16.c new file mode 100644 index 0000000..ee74662 --- /dev/null +++ b/test/cscodec-utf16.c @@ 
-0,0 +1,316 @@ +#include +#include +#include + +/* These two are for htonl / ntohl */ +#include +#include + +#include "charset/charset.h" +#include + +#include "utils/utils.h" + +#include "testutils.h" + +typedef struct line_ctx { + parserutils_charset_codec *codec; + + size_t buflen; + size_t bufused; + uint8_t *buf; + size_t explen; + size_t expused; + uint8_t *exp; + + bool indata; + bool inexp; + + parserutils_error exp_ret; + + enum { ENCODE, DECODE, BOTH } dir; +} line_ctx; + +static bool handle_line(const char *data, size_t datalen, void *pw); +static void run_test(line_ctx *ctx); + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + line_ctx ctx; + + if (argc != 3) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + assert(parserutils_charset_initialise(argv[1], myrealloc, NULL) == + PARSERUTILS_OK); + + assert(parserutils_charset_codec_create("NATS-SEFI-ADD", + myrealloc, NULL) == NULL); + + ctx.codec = parserutils_charset_codec_create("UTF-16", myrealloc, NULL); + assert(ctx.codec != NULL); + + ctx.buflen = parse_filesize(argv[2]); + if (ctx.buflen == 0) + return 1; + + ctx.buf = malloc(2 * ctx.buflen); + if (ctx.buf == NULL) { + printf("Failed allocating %u bytes\n", + (unsigned int) ctx.buflen); + return 1; + } + + ctx.exp = ctx.buf + ctx.buflen; + ctx.explen = ctx.buflen; + + ctx.buf[0] = '\0'; + ctx.exp[0] = '\0'; + ctx.bufused = 0; + ctx.expused = 0; + ctx.indata = false; + ctx.inexp = false; + ctx.exp_ret = PARSERUTILS_OK; + + assert(parse_testfile(argv[2], handle_line, &ctx) == true); + + /* and run final test */ + if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n') + ctx.bufused -= 1; + + if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n') + ctx.expused -= 1; + + run_test(&ctx); + + free(ctx.buf); + + parserutils_charset_codec_destroy(ctx.codec); + + assert(parserutils_charset_finalise(myrealloc, NULL) == + PARSERUTILS_OK); + + 
printf("PASS\n"); + + return 0; +} + +/** + * Converts hex character ('0' ... '9' or 'a' ... 'f' or 'A' ... 'F') to + * digit value. + * \param hex Valid hex character + * \return Corresponding digit value. + */ +static inline int hex2digit(char hex) +{ + return (hex <= '9') ? hex - '0' : (hex | 0x20) - 'a' + 10; +} + +bool handle_line(const char *data, size_t datalen, void *pw) +{ + line_ctx *ctx = (line_ctx *) pw; + + if (data[0] == '#') { + if (ctx->inexp) { + /* This marks end of testcase, so run it */ + + if (ctx->buf[ctx->bufused - 1] == '\n') + ctx->bufused -= 1; + + if (ctx->exp[ctx->expused - 1] == '\n') + ctx->expused -= 1; + + run_test(ctx); + + ctx->buf[0] = '\0'; + ctx->exp[0] = '\0'; + ctx->bufused = 0; + ctx->expused = 0; + ctx->exp_ret = PARSERUTILS_OK; + } + + if (strncasecmp(data+1, "data", 4) == 0) { + parserutils_charset_codec_optparams params; + const char *ptr = data + 6; + + ctx->indata = true; + ctx->inexp = false; + + if (strncasecmp(ptr, "decode", 6) == 0) + ctx->dir = DECODE; + else if (strncasecmp(ptr, "encode", 6) == 0) + ctx->dir = ENCODE; + else + ctx->dir = BOTH; + + ptr += 7; + + if (strncasecmp(ptr, "LOOSE", 5) == 0) { + params.error_mode.mode = + PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE; + ptr += 6; + } else if (strncasecmp(ptr, "STRICT", 6) == 0) { + params.error_mode.mode = + PARSERUTILS_CHARSET_CODEC_ERROR_STRICT; + ptr += 7; + } else { + params.error_mode.mode = + PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT; + ptr += 9; + } + + assert(parserutils_charset_codec_setopt(ctx->codec, + PARSERUTILS_CHARSET_CODEC_ERROR_MODE, + (parserutils_charset_codec_optparams *) &params) + == PARSERUTILS_OK); + } else if (strncasecmp(data+1, "expected", 8) == 0) { + ctx->indata = false; + ctx->inexp = true; + + ctx->exp_ret = parserutils_error_from_string(data + 10, + datalen - 10 - 1 /* \n */); + } else if (strncasecmp(data+1, "reset", 5) == 0) { + ctx->indata = false; + ctx->inexp = false; + + parserutils_charset_codec_reset(ctx->codec); + } + } else { + 
if (ctx->indata) { + /* Process "&#xNNNN" as 16-bit code units. */ + while (datalen) { + if (data[0] == '\n') { + ctx->buf[ctx->bufused++] = *data++; + --datalen; + continue; + } + assert(datalen >= sizeof ("&#xNNNN")-1 \ + && data[0] == '&' && data[1] == '#' \ + && data[2] == 'x' && isxdigit(data[3]) \ + && isxdigit(data[4]) && isxdigit(data[5]) \ + && isxdigit(data[6])); + /* UTF-16 code is always host endian (different + than UCS-32 !). */ + ctx->buf[ctx->bufused++] + = (hex2digit(data[5]) << 4) | hex2digit(data[6]); + ctx->buf[ctx->bufused++] + = (hex2digit(data[3]) << 4) | hex2digit(data[4]); + data += sizeof ("&#xNNNN")-1; + datalen -= sizeof ("&#xNNNN")-1; + } + } + if (ctx->inexp) { + /* Process "&#xXXXXYYYY as 32-bit code units. */ + while (datalen) { + if (data[0] == '\n') { + ctx->exp[ctx->expused++] = *data++; + --datalen; + continue; + } + assert(datalen >= sizeof ("&#xXXXXYYYY")-1 \ + && data[0] == '&' && data[1] == '#' \ + && data[2] == 'x' && isxdigit(data[3]) \ + && isxdigit(data[4]) && isxdigit(data[5]) \ + && isxdigit(data[6]) && isxdigit(data[7]) \ + && isxdigit(data[8]) && isxdigit(data[9]) \ + && isxdigit(data[10])); + /* UCS-4 code is always big endian, so convert + host endian to big endian. 
*/ + const uint32_t nCodePoint = + htonl((hex2digit(data[3]) << 28) + | (hex2digit(data[4]) << 24) + | (hex2digit(data[5]) << 20) + | (hex2digit(data[6]) << 16) + | (hex2digit(data[7]) << 12) + | (hex2digit(data[8]) << 8) + | (hex2digit(data[9]) << 4) + | hex2digit(data[10])); + ctx->exp[ctx->expused++] = (nCodePoint >> 0) & 0xFF; + ctx->exp[ctx->expused++] = (nCodePoint >> 8) & 0xFF; + ctx->exp[ctx->expused++] = (nCodePoint >> 16) & 0xFF; + ctx->exp[ctx->expused++] = (nCodePoint >> 24) & 0xFF; + data += sizeof ("&#xXXXXYYYY")-1; + datalen -= sizeof ("&#xXXXXYYYY")-1; + } + } + } + + return true; +} + +void run_test(line_ctx *ctx) +{ + static int testnum; + size_t destlen = ctx->bufused * 4; + uint8_t dest[destlen]; + uint8_t *pdest = dest; + const uint8_t *psrc = ctx->buf; + size_t srclen = ctx->bufused; + size_t i; + + if (ctx->dir == DECODE) { + assert(parserutils_charset_codec_decode(ctx->codec, + &psrc, &srclen, + &pdest, &destlen) == ctx->exp_ret); + } else if (ctx->dir == ENCODE) { + assert(parserutils_charset_codec_encode(ctx->codec, + &psrc, &srclen, + &pdest, &destlen) == ctx->exp_ret); + } else { + size_t templen = ctx->bufused * 4; + uint8_t temp[templen]; + uint8_t *ptemp = temp; + const uint8_t *ptemp2; + size_t templen2; + + assert(parserutils_charset_codec_decode(ctx->codec, + &psrc, &srclen, + &ptemp, &templen) == ctx->exp_ret); + /* \todo currently there is no way to specify the number of + consumed & produced data in case of a deliberate bad input + data set. 
*/ + if (ctx->exp_ret == PARSERUTILS_OK) { + assert(temp + (ctx->bufused * 4 - templen) == ptemp); + } + + ptemp2 = temp; + templen2 = ctx->bufused * 4 - templen; + assert(parserutils_charset_codec_encode(ctx->codec, + &ptemp2, &templen2, + &pdest, &destlen) == ctx->exp_ret); + if (ctx->exp_ret == PARSERUTILS_OK) { + assert(templen2 == 0); + assert(temp + (ctx->bufused * 4 - templen) == ptemp2); + } + } + if (ctx->exp_ret == PARSERUTILS_OK) { + assert(srclen == 0); + assert(ctx->buf + ctx->bufused == psrc); + assert(dest + (ctx->bufused * 4 - destlen) == pdest); + } + + printf("%d: Read '", ++testnum); + for (i = 0; i < ctx->expused; i++) { + printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf], + "0123456789abcdef"[dest[i] & 0xf]); + } + printf("' Expected '"); + for (i = 0; i < ctx->expused; i++) { + printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf], + "0123456789abcdef"[ctx->exp[i] & 0xf]); + } + printf("'\n"); + + assert(pdest == dest + ctx->expused); + assert(memcmp(dest, ctx->exp, ctx->expused) == 0); +} + diff --git a/test/cscodec-utf8.c b/test/cscodec-utf8.c new file mode 100644 index 0000000..f3cabcc --- /dev/null +++ b/test/cscodec-utf8.c @@ -0,0 +1,251 @@ +#include +#include + +#include "charset/charset.h" +#include + +#include "utils/utils.h" + +#include "testutils.h" + +typedef struct line_ctx { + parserutils_charset_codec *codec; + + size_t buflen; + size_t bufused; + uint8_t *buf; + size_t explen; + size_t expused; + uint8_t *exp; + + bool indata; + bool inexp; + + parserutils_error exp_ret; + + enum { ENCODE, DECODE, BOTH } dir; +} line_ctx; + +static bool handle_line(const char *data, size_t datalen, void *pw); +static void run_test(line_ctx *ctx); + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + line_ctx ctx; + + if (argc != 3) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + assert(parserutils_charset_initialise(argv[1], 
myrealloc, NULL) == + PARSERUTILS_OK); + + assert(parserutils_charset_codec_create("NATS-SEFI-ADD", + myrealloc, NULL) == NULL); + + ctx.codec = parserutils_charset_codec_create("UTF-8", myrealloc, NULL); + assert(ctx.codec != NULL); + + ctx.buflen = parse_filesize(argv[2]); + if (ctx.buflen == 0) + return 1; + + ctx.buf = malloc(2 * ctx.buflen); + if (ctx.buf == NULL) { + printf("Failed allocating %u bytes\n", + (unsigned int) ctx.buflen); + return 1; + } + + ctx.exp = ctx.buf + ctx.buflen; + ctx.explen = ctx.buflen; + + ctx.buf[0] = '\0'; + ctx.exp[0] = '\0'; + ctx.bufused = 0; + ctx.expused = 0; + ctx.indata = false; + ctx.inexp = false; + ctx.exp_ret = PARSERUTILS_OK; + + assert(parse_testfile(argv[2], handle_line, &ctx) == true); + + /* and run final test */ + if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n') + ctx.bufused -= 1; + + if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n') + ctx.expused -= 1; + + run_test(&ctx); + + free(ctx.buf); + + parserutils_charset_codec_destroy(ctx.codec); + + assert(parserutils_charset_finalise(myrealloc, NULL) == + PARSERUTILS_OK); + + printf("PASS\n"); + + return 0; +} + +bool handle_line(const char *data, size_t datalen, void *pw) +{ + line_ctx *ctx = (line_ctx *) pw; + + if (data[0] == '#') { + if (ctx->inexp) { + /* This marks end of testcase, so run it */ + + if (ctx->buf[ctx->bufused - 1] == '\n') + ctx->bufused -= 1; + + if (ctx->exp[ctx->expused - 1] == '\n') + ctx->expused -= 1; + + run_test(ctx); + + ctx->buf[0] = '\0'; + ctx->exp[0] = '\0'; + ctx->bufused = 0; + ctx->expused = 0; + ctx->exp_ret = PARSERUTILS_OK; + } + + if (strncasecmp(data+1, "data", 4) == 0) { + parserutils_charset_codec_optparams params; + const char *ptr = data + 6; + + ctx->indata = true; + ctx->inexp = false; + + if (strncasecmp(ptr, "decode", 6) == 0) + ctx->dir = DECODE; + else if (strncasecmp(ptr, "encode", 6) == 0) + ctx->dir = ENCODE; + else + ctx->dir = BOTH; + + ptr += 7; + + if (strncasecmp(ptr, "LOOSE", 5) == 0) { + 
params.error_mode.mode = + PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE; + ptr += 6; + } else if (strncasecmp(ptr, "STRICT", 6) == 0) { + params.error_mode.mode = + PARSERUTILS_CHARSET_CODEC_ERROR_STRICT; + ptr += 7; + } else { + params.error_mode.mode = + PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT; + ptr += 9; + } + + assert(parserutils_charset_codec_setopt(ctx->codec, + PARSERUTILS_CHARSET_CODEC_ERROR_MODE, + (parserutils_charset_codec_optparams *) &params) + == PARSERUTILS_OK); + } else if (strncasecmp(data+1, "expected", 8) == 0) { + ctx->indata = false; + ctx->inexp = true; + + ctx->exp_ret = parserutils_error_from_string(data + 10, + datalen - 10 - 1 /* \n */); + } else if (strncasecmp(data+1, "reset", 5) == 0) { + ctx->indata = false; + ctx->inexp = false; + + parserutils_charset_codec_reset(ctx->codec); + } + } else { + if (ctx->indata) { + memcpy(ctx->buf + ctx->bufused, data, datalen); + ctx->bufused += datalen; + } + if (ctx->inexp) { + memcpy(ctx->exp + ctx->expused, data, datalen); + ctx->expused += datalen; + } + } + + return true; +} + +void run_test(line_ctx *ctx) +{ + static int testnum; + size_t destlen = ctx->bufused * 4; + uint8_t dest[destlen]; + uint8_t *pdest = dest; + const uint8_t *psrc = ctx->buf; + size_t srclen = ctx->bufused; + size_t i; + + if (ctx->dir == DECODE) { + assert(parserutils_charset_codec_decode(ctx->codec, + &psrc, &srclen, + &pdest, &destlen) == ctx->exp_ret); + } else if (ctx->dir == ENCODE) { + assert(parserutils_charset_codec_encode(ctx->codec, + &psrc, &srclen, + &pdest, &destlen) == ctx->exp_ret); + } else { + size_t templen = ctx->bufused * 4; + uint8_t temp[templen]; + uint8_t *ptemp = temp; + const uint8_t *ptemp2; + size_t templen2; + + assert(parserutils_charset_codec_decode(ctx->codec, + &psrc, &srclen, + &ptemp, &templen) == ctx->exp_ret); + /* \todo currently there is no way to specify the number of + consumed & produced data in case of a deliberate bad input + data set. 
*/ + if (ctx->exp_ret == PARSERUTILS_OK) { + assert(temp + (ctx->bufused * 4 - templen) == ptemp); + } + + ptemp2 = temp; + templen2 = ctx->bufused * 4 - templen; + assert(parserutils_charset_codec_encode(ctx->codec, + &ptemp2, &templen2, + &pdest, &destlen) == ctx->exp_ret); + if (ctx->exp_ret == PARSERUTILS_OK) { + assert(templen2 == 0); + assert(temp + (ctx->bufused * 4 - templen) == ptemp2); + } + } + if (ctx->exp_ret == PARSERUTILS_OK) { + assert(srclen == 0); + assert(ctx->buf + ctx->bufused == psrc); + assert(dest + (ctx->bufused * 4 - destlen) == pdest); + } + + printf("%d: Read '", ++testnum); + for (i = 0; i < ctx->expused; i++) { + printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf], + "0123456789abcdef"[dest[i] & 0xf]); + } + printf("' Expected '"); + for (i = 0; i < ctx->expused; i++) { + printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf], + "0123456789abcdef"[ctx->exp[i] & 0xf]); + } + printf("'\n"); + + assert(pdest == dest + ctx->expused); + assert(memcmp(dest, ctx->exp, ctx->expused) == 0); +} + diff --git a/test/cscodec.c b/test/cscodec.c deleted file mode 100644 index d3b1b76..0000000 --- a/test/cscodec.c +++ /dev/null @@ -1,232 +0,0 @@ -#include -#include - -#include "charset/charset.h" -#include - -#include "utils/utils.h" - -#include "testutils.h" - -typedef struct line_ctx { - parserutils_charset_codec *codec; - - size_t buflen; - size_t bufused; - uint8_t *buf; - size_t explen; - size_t expused; - uint8_t *exp; - - bool indata; - bool inexp; - - parserutils_error exp_ret; - - enum { ENCODE, DECODE, BOTH } dir; -} line_ctx; - -static bool handle_line(const char *data, size_t datalen, void *pw); -static void run_test(line_ctx *ctx); - -static void *myrealloc(void *ptr, size_t len, void *pw) -{ - UNUSED(pw); - - return realloc(ptr, len); -} - -int main(int argc, char **argv) -{ - line_ctx ctx; - - if (argc != 3) { - printf("Usage: %s \n", argv[0]); - return 1; - } - - assert(parserutils_charset_initialise(argv[1], myrealloc, 
NULL) == - PARSERUTILS_OK); - - assert(parserutils_charset_codec_create("NATS-SEFI-ADD", - myrealloc, NULL) == NULL); - - ctx.codec = parserutils_charset_codec_create("UTF-8", myrealloc, NULL); - assert(ctx.codec != NULL); - - ctx.buflen = parse_filesize(argv[2]); - if (ctx.buflen == 0) - return 1; - - ctx.buf = malloc(2 * ctx.buflen); - if (ctx.buf == NULL) { - printf("Failed allocating %u bytes\n", - (unsigned int) ctx.buflen); - return 1; - } - - ctx.exp = ctx.buf + ctx.buflen; - ctx.explen = ctx.buflen; - - ctx.buf[0] = '\0'; - ctx.exp[0] = '\0'; - ctx.bufused = 0; - ctx.expused = 0; - ctx.indata = false; - ctx.inexp = false; - ctx.exp_ret = PARSERUTILS_OK; - - assert(parse_testfile(argv[2], handle_line, &ctx) == true); - - /* and run final test */ - if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n') - ctx.bufused -= 1; - - if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n') - ctx.expused -= 1; - - run_test(&ctx); - - free(ctx.buf); - - parserutils_charset_codec_destroy(ctx.codec); - - assert(parserutils_charset_finalise(myrealloc, NULL) == - PARSERUTILS_OK); - - printf("PASS\n"); - - return 0; -} - -bool handle_line(const char *data, size_t datalen, void *pw) -{ - line_ctx *ctx = (line_ctx *) pw; - - if (data[0] == '#') { - if (ctx->inexp) { - /* This marks end of testcase, so run it */ - - if (ctx->buf[ctx->bufused - 1] == '\n') - ctx->bufused -= 1; - - if (ctx->exp[ctx->expused - 1] == '\n') - ctx->expused -= 1; - - run_test(ctx); - - ctx->buf[0] = '\0'; - ctx->exp[0] = '\0'; - ctx->bufused = 0; - ctx->expused = 0; - ctx->exp_ret = PARSERUTILS_OK; - } - - if (strncasecmp(data+1, "data", 4) == 0) { - parserutils_charset_codec_optparams params; - const char *ptr = data + 6; - - ctx->indata = true; - ctx->inexp = false; - - if (strncasecmp(ptr, "decode", 6) == 0) - ctx->dir = DECODE; - else if (strncasecmp(ptr, "encode", 6) == 0) - ctx->dir = ENCODE; - else - ctx->dir = BOTH; - - ptr += 7; - - if (strncasecmp(ptr, "LOOSE", 5) == 0) { - 
params.error_mode.mode = - PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE; - ptr += 6; - } else if (strncasecmp(ptr, "STRICT", 6) == 0) { - params.error_mode.mode = - PARSERUTILS_CHARSET_CODEC_ERROR_STRICT; - ptr += 7; - } else { - params.error_mode.mode = - PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT; - ptr += 9; - } - - assert(parserutils_charset_codec_setopt(ctx->codec, - PARSERUTILS_CHARSET_CODEC_ERROR_MODE, - (parserutils_charset_codec_optparams *) &params) - == PARSERUTILS_OK); - } else if (strncasecmp(data+1, "expected", 8) == 0) { - ctx->indata = false; - ctx->inexp = true; - - ctx->exp_ret = parserutils_error_from_string(data + 10, - datalen - 10 - 1 /* \n */); - } else if (strncasecmp(data+1, "reset", 5) == 0) { - ctx->indata = false; - ctx->inexp = false; - - parserutils_charset_codec_reset(ctx->codec); - } - } else { - if (ctx->indata) { - memcpy(ctx->buf + ctx->bufused, data, datalen); - ctx->bufused += datalen; - } - if (ctx->inexp) { - memcpy(ctx->exp + ctx->expused, data, datalen); - ctx->expused += datalen; - } - } - - return true; -} - -void run_test(line_ctx *ctx) -{ - static int testnum; - size_t destlen = ctx->bufused * 4; - uint8_t dest[destlen]; - uint8_t *pdest = dest; - const uint8_t *psrc = ctx->buf; - size_t srclen = ctx->bufused; - size_t i; - - if (ctx->dir == DECODE) { - assert(parserutils_charset_codec_decode(ctx->codec, - &psrc, &srclen, - &pdest, &destlen) == ctx->exp_ret); - } else if (ctx->dir == ENCODE) { - assert(parserutils_charset_codec_encode(ctx->codec, - &psrc, &srclen, - &pdest, &destlen) == ctx->exp_ret); - } else { - size_t templen = ctx->bufused * 4; - uint8_t temp[templen]; - uint8_t *ptemp = temp; - - assert(parserutils_charset_codec_decode(ctx->codec, - &psrc, &srclen, - &ptemp, &templen) == ctx->exp_ret); - ptemp = temp; - templen = ctx->bufused * 4 - templen; - assert(parserutils_charset_codec_encode(ctx->codec, - (const uint8_t **) &ptemp, &templen, - &pdest, &destlen) == ctx->exp_ret); - } - - printf("%d: Read '", ++testnum); - 
for (i = 0; i < ctx->expused; i++) { - printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf], - "0123456789abcdef"[dest[i] & 0xf]); - } - printf("' Expected '"); - for (i = 0; i < ctx->expused; i++) { - printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf], - "0123456789abcdef"[ctx->exp[i] & 0xf]); - } - printf("'\n"); - - assert(memcmp(dest, ctx->exp, ctx->expused) == 0); -} - diff --git a/test/data/cscodec-utf16/INDEX b/test/data/cscodec-utf16/INDEX new file mode 100644 index 0000000..99d2524 --- /dev/null +++ b/test/data/cscodec-utf16/INDEX @@ -0,0 +1,6 @@ +# Index file for UTF-16 charset codec tests +# +# Test Description + +simple.dat Simple tests, designed to validate testdriver + diff --git a/test/data/cscodec-utf16/simple.dat b/test/data/cscodec-utf16/simple.dat new file mode 100644 index 0000000..1e7d324 --- /dev/null +++ b/test/data/cscodec-utf16/simple.dat @@ -0,0 +1,33 @@ +# *** Simple test: +#data decode STRICT +&#x0040&#x4142 +#expected PARSERUTILS_OK +&#x00000040&#x00004142 +#reset + +# *** Surrogate test: +#data decode STRICT +&#xD800&#xDF02 +#expected PARSERUTILS_OK +&#x00010302 +#reset + +# *** Lonely high surrogate: +# This is a bit strange that end status is ok. +#data decode STRICT +� +#expected PARSERUTILS_OK +#reset + +# With an extra code point, the status is different. 
+#data decode STRICT +�䅂 +#expected PARSERUTILS_INVALID +#reset + +# *** Wrong low surrogate start: +#data decode STRICT +� +#expected PARSERUTILS_INVALID +#reset + diff --git a/test/data/cscodec-utf8/INDEX b/test/data/cscodec-utf8/INDEX new file mode 100644 index 0000000..d6d338a --- /dev/null +++ b/test/data/cscodec-utf8/INDEX @@ -0,0 +1,6 @@ +# Index file for charset codec tests +# +# Test Description + +simple.dat Simple tests, designed to validate testdriver +UTF-8-test.txt Markus Kuhn's UTF-8 decoding test file diff --git a/test/data/cscodec-utf8/UTF-8-test.txt b/test/data/cscodec-utf8/UTF-8-test.txt new file mode 100644 index 0000000..920e54e Binary files /dev/null and b/test/data/cscodec-utf8/UTF-8-test.txt differ diff --git a/test/data/cscodec-utf8/simple.dat b/test/data/cscodec-utf8/simple.dat new file mode 100644 index 0000000..3e2c7ae Binary files /dev/null and b/test/data/cscodec-utf8/simple.dat differ diff --git a/test/data/cscodec/INDEX b/test/data/cscodec/INDEX deleted file mode 100644 index d6d338a..0000000 --- a/test/data/cscodec/INDEX +++ /dev/null @@ -1,6 +0,0 @@ -# Index file for charset codec tests -# -# Test Description - -simple.dat Simple tests, designed to validate testdriver -UTF-8-test.txt Markus Kuhn's UTF-8 decoding test file diff --git a/test/data/cscodec/UTF-8-test.txt b/test/data/cscodec/UTF-8-test.txt deleted file mode 100644 index 920e54e..0000000 Binary files a/test/data/cscodec/UTF-8-test.txt and /dev/null differ diff --git a/test/data/cscodec/simple.dat b/test/data/cscodec/simple.dat deleted file mode 100644 index 3e2c7ae..0000000 Binary files a/test/data/cscodec/simple.dat and /dev/null differ -- cgit v1.2.3