From 2777a04ed2ba4fd36138b991d66a32a283361f7e Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Thu, 1 May 2008 16:34:46 +0000 Subject: Import parser construction utility library svn path=/trunk/libparserutils/; revision=4111 --- test/INDEX | 15 ++ test/Makefile | 80 +++++++++ test/README | 84 +++++++++ test/aliases.c | 62 +++++++ test/charset.c | 31 ++++ test/cscodec.c | 232 +++++++++++++++++++++++++ test/data/Aliases | 302 +++++++++++++++++++++++++++++++++ test/data/cscodec/INDEX | 6 + test/data/cscodec/UTF-8-test.txt | Bin 0 -> 41013 bytes test/data/cscodec/simple.dat | Bin 0 -> 1109 bytes test/data/input/INDEX | 5 + test/data/input/UTF-8-test.txt | Bin 0 -> 20334 bytes test/filter.c | 357 +++++++++++++++++++++++++++++++++++++++ test/inputstream.c | 97 +++++++++++ test/parserutils.c | 30 ++++ test/regression/cscodec-segv.c | 38 +++++ test/regression/filter-segv.c | 39 +++++ test/regression/stream-nomem.c | 94 +++++++++++ test/testrunner.pl | 167 ++++++++++++++++++ test/testutils.h | 123 ++++++++++++++ 20 files changed, 1762 insertions(+) create mode 100644 test/INDEX create mode 100644 test/Makefile create mode 100644 test/README create mode 100644 test/aliases.c create mode 100644 test/charset.c create mode 100644 test/cscodec.c create mode 100644 test/data/Aliases create mode 100644 test/data/cscodec/INDEX create mode 100644 test/data/cscodec/UTF-8-test.txt create mode 100644 test/data/cscodec/simple.dat create mode 100644 test/data/input/INDEX create mode 100644 test/data/input/UTF-8-test.txt create mode 100644 test/filter.c create mode 100644 test/inputstream.c create mode 100644 test/parserutils.c create mode 100644 test/regression/cscodec-segv.c create mode 100644 test/regression/filter-segv.c create mode 100644 test/regression/stream-nomem.c create mode 100644 test/testrunner.pl create mode 100644 test/testutils.h (limited to 'test') diff --git a/test/INDEX b/test/INDEX new file mode 100644 index 0000000..772c82f --- /dev/null +++ b/test/INDEX @@ -0,0 +1,15 @@ +# Index for testcases +# +# Test Description DataDir + +charset Charset initialisation/finalisation +parserutils Library initialisation/finalisation +aliases Encoding alias handling +cscodec Charset codec implementation cscodec +filter Input stream filtering +inputstream Inputstream handling input + +# Regression tests +regression/cscodec-segv Segfault in charset codecs +regression/filter-segv Segfault in input filtering +regression/stream-nomem Inputstream buffer expansion diff --git a/test/Makefile b/test/Makefile new file mode 100644 index 0000000..2ed0b44 --- /dev/null +++ b/test/Makefile @@ -0,0 +1,80 @@ +# Child makefile fragment +# +# Toolchain is provided by top-level makefile +# +# Variables provided by top-level makefile +# +# COMPONENT The name of the component +# EXPORT The location of the export directory +# TOP The location of the source tree root +# RELEASEDIR The place to put release objects +# DEBUGDIR The place to put debug objects +# +# do_include Canned command sequence to include a child makefile +# +# Variables provided by parent makefile: +# +# DIR The name of the directory we're in, relative to $(TOP) +# +# Variables we can manipulate: +# +# ITEMS_CLEAN The list of items to remove for "make clean" +# ITEMS_DISTCLEAN The list of items to remove for "make distclean" +# TARGET_TESTS The list of target names to run for "make test" +# +# SOURCES The list of sources to build for $(COMPONENT) +# +# Plus anything from the toolchain + +# Push parent directory onto the directory stack +sp := $(sp).x +dirstack_$(sp) := $(d) +d := $(DIR) + +# Extend toolchain settings +override CFLAGS := $(CFLAGS) -I$(TOP)/src/ -I$(d) + +# Tests +TESTS_$(d) := aliases cscodec charset filter inputstream parserutils +TESTS_$(d) := $(TESTS_$(d)) regression/cscodec-segv regression/filter-segv \ + regression/stream-nomem + +# Items for top-level makefile to use +ITEMS_CLEAN := $(ITEMS_CLEAN) \ + $(addprefix $(d), $(addsuffix $(EXEEXT), $(TESTS_$(d)))) \ + $(addprefix $(d), $(addsuffix .gcda, $(TESTS_$(d)))) \ + $(addprefix $(d), $(addsuffix .gcno, $(TESTS_$(d)))) +ITEMS_DISTCLEAN := $(ITEMS_DISTCLEAN) $(d)log + +# Targets for top-level makefile to run +TARGET_TESTS := $(TARGET_TESTS) test_$(d) + +# Now we get to hack around so that we know what directory we're in. +# $(d) no longer exists when running the commands for a target, so we can't +# simply use it verbatim. Assigning to a variable doesn't really help, as +# there's no guarantee that someone else hasn't overridden that variable. +# So, what we do is make the target depend on $(d), then pick it out of the +# dependency list when running commands. This isn't pretty, but is effective. +test_$(d): $(d) $(addprefix $(d), $(TESTS_$(d))) + @$(PERL) $(TOP)/$ $(1)" + @$$(CC) -c -g $$(DEBUGCFLAGS) -o $$@.o $(1) + @$$(LD) -g -o $$@ $$@.o $$(LDFLAGS) -lparserutils-debug + @$$(RM) $$(RMFLAGS) $$@.o + +endef + +$(eval $(foreach TEST,$(addprefix $(d), $(TESTS_$(d))), \ + $(call compile_test,$(addsuffix .c, $(TEST)),$(TEST)))) + +# Now include any children we may have +MAKE_INCLUDES := $(wildcard $(d)*/Makefile) +$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Finally, pop off the directory stack +d := $(dirstack_$(sp)) +sp := $(basename $(sp)) diff --git a/test/README b/test/README new file mode 100644 index 0000000..7e41abf --- /dev/null +++ b/test/README @@ -0,0 +1,84 @@ +Libcharset testcases +==================== + +Testcases for Libcharset are self-contained binaries which test various parts +of the charset library. These may make use of external data files to drive +the testing. + +Testcase command lines +---------------------- + +Testcase command lines are in a unified format, thus: + + [ ] + +The aliases file parameter will always be specified (as it is required for +the library to work at all). + +The data file parameter is optional and may be provided on a test-by-test +basis. + +Testcase output +--------------- + +Testcases may output anything at all to stdout. The final line of the +output must begin with either PASS or FAIL (case sensitive), indicating +the success status of the test. + +Test Index +---------- + +In the test sources directory, is a file, named INDEX, which provides an +index of all available test binaries. Any new test applications should be +added to this index as they are created. + +The test index file format is as follows: + + file = *line + + line = ( entry / comment / blank ) LF + + entry = testname 1*HTAB description [ 1*HTAB datadir ] + comment = "#" *non-newline + blank = 0 + + testname = 1*non-reserved + description = 1*non-reserved + datadir = 1*non-reserved + + non-newline = VCHAR / WSP + non-reserved = VCHAR / SP + +Each entry contains a mandatory binary name and description followed by +an optional data directory specifier. The data directory specifier is +used to state the name of the directory containing data files for the +test name. This directory will be searched for within the "data" +directory in the source tree. + +If a data directory is specified, the test binary will be invoked for +each data file listed within the data directory INDEX, passing the +filename as the second parameter (, above). + +Data Index +---------- + +Each test data directory contains a file, named INDEX, which provides an +index of all available test data files. + +The data index file format is as follows: + + file = *line + + line = ( entry / comment / blank ) LF + + entry = dataname 1*HTAB description + comment = "#" *non-newline + blank = 0 + + dataname = 1*non-reserved + description = 1*non-reserved + + non-newline = VCHAR / WSP + non-reserved = VCHAR / SP + +Each entry contains a mandatory data file name and description. diff --git a/test/aliases.c b/test/aliases.c new file mode 100644 index 0000000..dff31c6 --- /dev/null +++ b/test/aliases.c @@ -0,0 +1,62 @@ +#include +#include + +#include "charset/aliases.h" + +#include "testutils.h" + +extern void charset_aliases_dump(void); + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main (int argc, char **argv) +{ + parserutils_charset_aliases_canon *c; + + if (argc != 2) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + parserutils_charset_aliases_create(argv[1], myrealloc, NULL); + + parserutils_charset_aliases_dump(); + + c = parserutils_charset_alias_canonicalise("moose", 5); + if (c) { + printf("FAIL - found invalid encoding 'moose'\n"); + return 1; + } + + c = parserutils_charset_alias_canonicalise("csinvariant", 11); + if (c) { + printf("%s %d\n", c->name, c->mib_enum); + } else { + printf("FAIL - failed finding encoding 'csinvariant'\n"); + return 1; + } + + c = parserutils_charset_alias_canonicalise("nats-sefi-add", 13); + if (c) { + printf("%s %d\n", c->name, c->mib_enum); + } else { + printf("FAIL - failed finding encoding 'nats-sefi-add'\n"); + return 1; + } + + printf("%d\n", parserutils_charset_mibenum_from_name(c->name, + strlen(c->name))); + + printf("%s\n", parserutils_charset_mibenum_to_name(c->mib_enum)); + + parserutils_charset_aliases_destroy(myrealloc, NULL); + + printf("PASS\n"); + + return 0; +} diff --git a/test/charset.c b/test/charset.c new file mode 100644 index 0000000..a793e7e --- /dev/null +++ b/test/charset.c @@ -0,0 +1,31 @@ +#include +#include + +#include "charset/charset.h" + +#include "testutils.h" + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + if (argc != 2) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + assert(parserutils_charset_initialise(argv[1], myrealloc, NULL) == + PARSERUTILS_OK); + + assert (parserutils_charset_finalise(myrealloc, NULL) == + PARSERUTILS_OK); + + printf("PASS\n"); + + return 0; +} diff --git a/test/cscodec.c b/test/cscodec.c new file mode 100644 index 0000000..d3b1b76 --- /dev/null +++ b/test/cscodec.c @@ -0,0 +1,232 @@ +#include +#include + +#include "charset/charset.h" +#include + +#include "utils/utils.h" + +#include "testutils.h" + +typedef struct line_ctx { + parserutils_charset_codec *codec; + + size_t buflen; + size_t bufused; + uint8_t *buf; + size_t explen; + size_t expused; + uint8_t *exp; + + bool indata; + bool inexp; + + parserutils_error exp_ret; + + enum { ENCODE, DECODE, BOTH } dir; +} line_ctx; + +static bool handle_line(const char *data, size_t datalen, void *pw); +static void run_test(line_ctx *ctx); + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + line_ctx ctx; + + if (argc != 3) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + assert(parserutils_charset_initialise(argv[1], myrealloc, NULL) == + PARSERUTILS_OK); + + assert(parserutils_charset_codec_create("NATS-SEFI-ADD", + myrealloc, NULL) == NULL); + + ctx.codec = parserutils_charset_codec_create("UTF-8", myrealloc, NULL); + assert(ctx.codec != NULL); + + ctx.buflen = parse_filesize(argv[2]); + if (ctx.buflen == 0) + return 1; + + ctx.buf = malloc(2 * ctx.buflen); + if (ctx.buf == NULL) { + printf("Failed allocating %u bytes\n", + (unsigned int) ctx.buflen); + return 1; + } + + ctx.exp = ctx.buf + ctx.buflen; + ctx.explen = ctx.buflen; + + ctx.buf[0] = '\0'; + ctx.exp[0] = '\0'; + ctx.bufused = 0; + ctx.expused = 0; + ctx.indata = false; + ctx.inexp = false; + ctx.exp_ret = PARSERUTILS_OK; + + assert(parse_testfile(argv[2], handle_line, &ctx) == true); + + /* and run final test */ + if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n') + ctx.bufused -= 1; + + if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n') + ctx.expused -= 1; + + run_test(&ctx); + + free(ctx.buf); + + parserutils_charset_codec_destroy(ctx.codec); + + assert(parserutils_charset_finalise(myrealloc, NULL) == + PARSERUTILS_OK); + + printf("PASS\n"); + + return 0; +} + +bool handle_line(const char *data, size_t datalen, void *pw) +{ + line_ctx *ctx = (line_ctx *) pw; + + if (data[0] == '#') { + if (ctx->inexp) { + /* This marks end of testcase, so run it */ + + if (ctx->buf[ctx->bufused - 1] == '\n') + ctx->bufused -= 1; + + if (ctx->exp[ctx->expused - 1] == '\n') + ctx->expused -= 1; + + run_test(ctx); + + ctx->buf[0] = '\0'; + ctx->exp[0] = '\0'; + ctx->bufused = 0; + ctx->expused = 0; + ctx->exp_ret = PARSERUTILS_OK; + } + + if (strncasecmp(data+1, "data", 4) == 0) { + parserutils_charset_codec_optparams params; + const char *ptr = data + 6; + + ctx->indata = true; + ctx->inexp = false; + + if (strncasecmp(ptr, "decode", 6) == 0) + ctx->dir = DECODE; + else if (strncasecmp(ptr, "encode", 6) == 0) + ctx->dir = ENCODE; + else + ctx->dir = BOTH; + + ptr += 7; + + if (strncasecmp(ptr, "LOOSE", 5) == 0) { + params.error_mode.mode = + PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE; + ptr += 6; + } else if (strncasecmp(ptr, "STRICT", 6) == 0) { + params.error_mode.mode = + PARSERUTILS_CHARSET_CODEC_ERROR_STRICT; + ptr += 7; + } else { + params.error_mode.mode = + PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT; + ptr += 9; + } + + assert(parserutils_charset_codec_setopt(ctx->codec, + PARSERUTILS_CHARSET_CODEC_ERROR_MODE, + (parserutils_charset_codec_optparams *) ¶ms) + == PARSERUTILS_OK); + } else if (strncasecmp(data+1, "expected", 8) == 0) { + ctx->indata = false; + ctx->inexp = true; + + ctx->exp_ret = parserutils_error_from_string(data + 10, + datalen - 10 - 1 /* \n */); + } else if (strncasecmp(data+1, "reset", 5) == 0) { + ctx->indata = false; + ctx->inexp = false; + + parserutils_charset_codec_reset(ctx->codec); + } + } else { + if (ctx->indata) { + memcpy(ctx->buf + ctx->bufused, data, datalen); + ctx->bufused += datalen; + } + if (ctx->inexp) { + memcpy(ctx->exp + ctx->expused, data, datalen); + ctx->expused += datalen; + } + } + + return true; +} + +void run_test(line_ctx *ctx) +{ + static int testnum; + size_t destlen = ctx->bufused * 4; + uint8_t dest[destlen]; + uint8_t *pdest = dest; + const uint8_t *psrc = ctx->buf; + size_t srclen = ctx->bufused; + size_t i; + + if (ctx->dir == DECODE) { + assert(parserutils_charset_codec_decode(ctx->codec, + &psrc, &srclen, + &pdest, &destlen) == ctx->exp_ret); + } else if (ctx->dir == ENCODE) { + assert(parserutils_charset_codec_encode(ctx->codec, + &psrc, &srclen, + &pdest, &destlen) == ctx->exp_ret); + } else { + size_t templen = ctx->bufused * 4; + uint8_t temp[templen]; + uint8_t *ptemp = temp; + + assert(parserutils_charset_codec_decode(ctx->codec, + &psrc, &srclen, + &ptemp, &templen) == ctx->exp_ret); + ptemp = temp; + templen = ctx->bufused * 4 - templen; + assert(parserutils_charset_codec_encode(ctx->codec, + (const uint8_t **) &ptemp, &templen, + &pdest, &destlen) == ctx->exp_ret); + } + + printf("%d: Read '", ++testnum); + for (i = 0; i < ctx->expused; i++) { + printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf], + "0123456789abcdef"[dest[i] & 0xf]); + } + printf("' Expected '"); + for (i = 0; i < ctx->expused; i++) { + printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf], + "0123456789abcdef"[ctx->exp[i] & 0xf]); + } + printf("'\n"); + + assert(memcmp(dest, ctx->exp, ctx->expused) == 0); +} + diff --git a/test/data/Aliases b/test/data/Aliases new file mode 100644 index 0000000..db61ff1 --- /dev/null +++ b/test/data/Aliases @@ -0,0 +1,302 @@ +# > Unicode:Files.Aliases +# Mapping of character set encoding names to their canonical form +# +# Lines starting with a '#' are comments, blank lines are ignored. +# +# Based on http://www.iana.org/assignments/character-sets and +# http://www.iana.org/assignments/ianacharset-mib +# +# Canonical Form MIBenum Aliases... +# +US-ASCII 3 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ASCII ISO646-US ANSI_X3.4-1968 us IBM367 cp367 csASCII +ISO-10646-UTF-1 27 csISO10646UTF1 +ISO_646.basic:1983 28 ref csISO646basic1983 +INVARIANT 29 csINVARIANT +ISO_646.irv:1983 30 iso-ir-2 irv csISO2IntlRefVersion +BS_4730 20 iso-ir-4 ISO646-GB gb uk csISO4UnitedKingdom +NATS-SEFI 31 iso-ir-8-1 csNATSSEFI +NATS-SEFI-ADD 32 iso-ir-8-2 csNATSSEFIADD +NATS-DANO 33 iso-ir-9-1 csNATSDANO +NATS-DANO-ADD 34 iso-ir-9-2 csNATSDANOADD +SEN_850200_B 35 iso-ir-10 FI ISO646-FI ISO646-SE se csISO10Swedish +SEN_850200_C 21 iso-ir-11 ISO646-SE2 se2 csISO11SwedishForNames +KS_C_5601-1987 36 iso-ir-149 KS_C_5601-1989 KSC_5601 korean csKSC56011987 +ISO-2022-KR 37 csISO2022KR +EUC-KR 38 csEUCKR EUCKR +ISO-2022-JP 39 csISO2022JP +ISO-2022-JP-2 40 csISO2022JP2 +ISO-2022-CN 104 +ISO-2022-CN-EXT 105 +JIS_C6220-1969-jp 41 JIS_C6220-1969 iso-ir-13 katakana x0201-7 csISO13JISC6220jp +JIS_C6220-1969-ro 42 iso-ir-14 jp ISO646-JP csISO14JISC6220ro +IT 22 iso-ir-15 ISO646-IT csISO15Italian +PT 43 iso-ir-16 ISO646-PT csISO16Portuguese +ES 23 iso-ir-17 ISO646-ES csISO17Spanish +greek7-old 44 iso-ir-18 csISO18Greek7Old +latin-greek 45 iso-ir-19 csISO19LatinGreek +DIN_66003 24 iso-ir-21 de ISO646-DE csISO21German +NF_Z_62-010_(1973) 46 iso-ir-25 ISO646-FR1 csISO25French +Latin-greek-1 47 iso-ir-27 csISO27LatinGreek1 +ISO_5427 48 iso-ir-37 csISO5427Cyrillic +JIS_C6226-1978 49 iso-ir-42 csISO42JISC62261978 +BS_viewdata 50 iso-ir-47 csISO47BSViewdata +INIS 51 iso-ir-49 csISO49INIS +INIS-8 52 iso-ir-50 csISO50INIS8 +INIS-cyrillic 53 iso-ir-51 csISO51INISCyrillic +ISO_5427:1981 54 iso-ir-54 ISO5427Cyrillic1981 +ISO_5428:1980 55 iso-ir-55 csISO5428Greek +GB_1988-80 56 iso-ir-57 cn ISO646-CN csISO57GB1988 +GB_2312-80 57 iso-ir-58 chinese csISO58GB231280 +NS_4551-1 25 iso-ir-60 ISO646-NO no csISO60DanishNorwegian csISO60Norwegian1 +NS_4551-2 58 ISO646-NO2 iso-ir-61 no2 csISO61Norwegian2 +NF_Z_62-010 26 iso-ir-69 ISO646-FR fr csISO69French +videotex-suppl 59 iso-ir-70 csISO70VideotexSupp1 +PT2 60 iso-ir-84 ISO646-PT2 csISO84Portuguese2 +ES2 61 iso-ir-85 ISO646-ES2 csISO85Spanish2 +MSZ_7795.3 62 iso-ir-86 ISO646-HU hu csISO86Hungarian +JIS_C6226-1983 63 iso-ir-87 x0208 JIS_X0208-1983 csISO87JISX0208 +greek7 64 iso-ir-88 csISO88Greek7 +ASMO_449 65 ISO_9036 arabic7 iso-ir-89 csISO89ASMO449 +iso-ir-90 66 csISO90 +JIS_C6229-1984-a 67 iso-ir-91 jp-ocr-a csISO91JISC62291984a +JIS_C6229-1984-b 68 iso-ir-92 ISO646-JP-OCR-B jp-ocr-b csISO92JISC62991984b +JIS_C6229-1984-b-add 69 iso-ir-93 jp-ocr-b-add csISO93JIS62291984badd +JIS_C6229-1984-hand 70 iso-ir-94 jp-ocr-hand csISO94JIS62291984hand +JIS_C6229-1984-hand-add 71 iso-ir-95 jp-ocr-hand-add csISO95JIS62291984handadd +JIS_C6229-1984-kana 72 iso-ir-96 csISO96JISC62291984kana +ISO_2033-1983 73 iso-ir-98 e13b csISO2033 +ANSI_X3.110-1983 74 iso-ir-99 CSA_T500-1983 NAPLPS csISO99NAPLPS +ISO-8859-1 4 iso-ir-100 ISO_8859-1 ISO_8859-1:1987 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 ISO8859-1 +ISO-8859-2 5 iso-ir-101 ISO_8859-2 ISO_8859-2:1987 latin2 l2 csISOLatin2 8859_2 ISO8859-2 +T.61-7bit 75 iso-ir-102 csISO102T617bit +T.61-8bit 76 T.61 iso-ir-103 csISO103T618bit +ISO-8859-3 6 iso-ir-109 ISO_8859-3 ISO_8859-3:1988 latin3 l3 csISOLatin3 8859_3 ISO8859-3 +ISO-8859-4 7 iso-ir-110 ISO_8859-4 ISO_8859-4:1988 latin4 l4 csISOLatin4 8859_4 ISO8859-4 +ECMA-cyrillic 77 iso-ir-111 KOI8-E csISO111ECMACyrillic +CSA_Z243.4-1985-1 78 iso-ir-121 ISO646-CA csa7-1 ca csISO121Canadian1 +CSA_Z243.4-1985-2 79 iso-ir-122 ISO646-CA2 csa7-2 csISO122Canadian2 +CSA_Z243.4-1985-gr 80 iso-ir-123 csISO123CSAZ24341985gr +ISO-8859-6 9 iso-ir-127 ISO_8859-6 ISO_8859-6:1987 ECMA-114 ASMO-708 arabic csISOLatinArabic +ISO-8859-6-E 81 csISO88596E ISO_8859-6-E +ISO-8859-6-I 82 csISO88596I ISO_8859-6-I +ISO-8859-7 10 iso-ir-126 ISO_8859-7 ISO_8859-7:1987 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 ISO8859-7 +T.101-G2 83 iso-ir-128 csISO128T101G2 +ISO-8859-8 11 iso-ir-138 ISO_8859-8 ISO_8859-8:1988 hebrew csISOLatinHebrew 8859_8 ISO8859-8 +ISO-8859-8-E 84 csISO88598E ISO_8859-8-E +ISO-8859-8-I 85 csISO88598I ISO_8859-8-I +CSN_369103 86 iso-ir-139 csISO139CSN369103 +JUS_I.B1.002 87 iso-ir-141 ISO646-YU js yu csISO141JUSIB1002 +ISO_6937-2-add 14 iso-ir-142 csISOTextComm +IEC_P27-1 88 iso-ir-143 csISO143IECP271 +ISO-8859-5 8 iso-ir-144 ISO_8859-5 ISO_8859-5:1988 cyrillic csISOLatinCyrillic 8859_5 ISO8859-5 +JUS_I.B1.003-serb 89 iso-ir-146 serbian csISO146Serbian +JUS_I.B1.003-mac 90 macedonian iso-ir-147 csISO147Macedonian +ISO-8859-9 12 iso-ir-148 ISO_8859-9 ISO_8859-9:1989 latin5 l5 csISOLatin5 8859_9 ISO8859-9 +greek-ccitt 91 iso-ir-150 csISO150 csISO150GreekCCITT +NC_NC00-10:81 92 cuba iso-ir-151 ISO646-CU csISO151Cuba +ISO_6937-2-25 93 iso-ir-152 csISO6937Add +GOST_19768-74 94 ST_SEV_358-88 iso-ir-153 csISO153GOST1976874 +ISO_8859-supp 95 iso-ir-154 latin1-2-5 csISO8859Supp +ISO_10367-box 96 iso-ir-155 csISO10367Box +ISO-8859-10 13 iso-ir-157 l6 ISO_8859-10:1992 csISOLatin6 latin6 8859_10 ISO8859-10 +latin-lap 97 lap iso-ir-158 csISO158Lap +JIS_X0212-1990 98 x0212 iso-ir-159 csISO159JISX02121990 +DS_2089 99 DS2089 ISO646-DK dk csISO646Danish +us-dk 100 csUSDK +dk-us 101 csDKUS +JIS_X0201 15 X0201 csHalfWidthKatakana +KSC5636 102 ISO646-KR csKSC5636 +ISO-10646-UCS-2 1000 csUnicode UCS-2 UCS2 +ISO-10646-UCS-4 1001 csUCS4 UCS-4 UCS4 +DEC-MCS 2008 dec csDECMCS +hp-roman8 2004 roman8 r8 csHPRoman8 +macintosh 2027 mac csMacintosh MACROMAN MAC-ROMAN X-MAC-ROMAN +IBM037 2028 cp037 ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 +IBM038 2029 EBCDIC-INT cp038 csIBM038 +IBM273 2030 CP273 csIBM273 +IBM274 2031 EBCDIC-BE CP274 csIBM274 +IBM275 2032 EBCDIC-BR cp275 csIBM275 +IBM277 2033 EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 +IBM278 2034 CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278 +IBM280 2035 CP280 ebcdic-cp-it csIBM280 +IBM281 2036 EBCDIC-JP-E cp281 csIBM281 +IBM284 2037 CP284 ebcdic-cp-es csIBM284 +IBM285 2038 CP285 ebcdic-cp-gb csIBM285 +IBM290 2039 cp290 EBCDIC-JP-kana csIBM290 +IBM297 2040 cp297 ebcdic-cp-fr csIBM297 +IBM420 2041 cp420 ebcdic-cp-ar1 csIBM420 +IBM423 2042 cp423 ebcdic-cp-gr csIBM423 +IBM424 2043 cp424 ebcdic-cp-he csIBM424 +IBM437 2011 cp437 437 csPC8CodePage437 +IBM500 2044 CP500 ebcdic-cp-be ebcdic-cp-ch csIBM500 +IBM775 2087 cp775 csPC775Baltic +IBM850 2009 cp850 850 csPC850Multilingual +IBM851 2045 cp851 851 csIBM851 +IBM852 2010 cp852 852 csPCp852 +IBM855 2046 cp855 855 csIBM855 +IBM857 2047 cp857 857 csIBM857 +IBM860 2048 cp860 860 csIBM860 +IBM861 2049 cp861 861 cp-is csIBM861 +IBM862 2013 cp862 862 csPC862LatinHebrew +IBM863 2050 cp863 863 csIBM863 +IBM864 2051 cp864 csIBM864 +IBM865 2052 cp865 865 csIBM865 +IBM866 2086 cp866 866 csIBM866 +IBM868 2053 CP868 cp-ar csIBM868 +IBM869 2054 cp869 869 cp-gr csIBM869 +IBM870 2055 CP870 ebcdic-cp-roece ebcdic-cp-yu csIBM870 +IBM871 2056 CP871 ebcdic-cp-is csIBM871 +IBM880 2057 cp880 EBCDIC-Cyrillic csIBM880 +IBM891 2058 cp891 csIBM891 +IBM903 2059 cp903 csIBM903 +IBM904 2060 cp904 904 csIBBM904 +IBM905 2061 CP905 ebcdic-cp-tr csIBM905 +IBM918 2062 CP918 ebcdic-cp-ar2 csIBM918 +IBM1026 2063 CP1026 csIBM1026 +EBCDIC-AT-DE 2064 csIBMEBCDICATDE +EBCDIC-AT-DE-A 2065 csEBCDICATDEA +EBCDIC-CA-FR 2066 csEBCDICCAFR +EBCDIC-DK-NO 2067 csEBCDICDKNO +EBCDIC-DK-NO-A 2068 csEBCDICDKNOA +EBCDIC-FI-SE 2069 csEBCDICFISE +EBCDIC-FI-SE-A 2070 csEBCDICFISEA +EBCDIC-FR 2071 csEBCDICFR +EBCDIC-IT 2072 csEBCDICIT +EBCDIC-PT 2073 csEBCDICPT +EBCDIC-ES 2074 csEBCDICES +EBCDIC-ES-A 2075 csEBCDICESA +EBCDIC-ES-S 2076 csEBCDICESS +EBCDIC-UK 2077 csEBCDICUK +EBCDIC-US 2078 csEBCDICUS +UNKNOWN-8BIT 2079 csUnknown8BiT +MNEMONIC 2080 csMnemonic +MNEM 2081 csMnem +VISCII 2082 csVISCII +VIQR 2083 csVIQR +KOI8-R 2084 csKOI8R +KOI8-U 2088 +IBM00858 2089 CCSID00858 CP00858 PC-Multilingual-850+euro +IBM00924 2090 CCSID00924 CP00924 ebcdic-Latin9--euro +IBM01140 2091 CCSID01140 CP01140 ebcdic-us-37+euro +IBM01141 2092 CCSID01141 CP01141 ebcdic-de-273+euro +IBM01142 2093 CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro +IBM01143 2094 CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro +IBM01144 2095 CCSID01144 CP01144 ebcdic-it-280+euro +IBM01145 2096 CCSID01145 CP01145 ebcdic-es-284+euro +IBM01146 2097 CCSID01146 CP01146 ebcdic-gb-285+euro +IBM01147 2098 CCSID01147 CP01147 ebcdic-fr-297+euro +IBM01148 2099 CCSID01148 CP01148 ebcdic-international-500+euro +IBM01149 2100 CCSID01149 CP01149 ebcdic-is-871+euro +Big5-HKSCS 2101 +IBM1047 2102 IBM-1047 +PTCP154 2103 csPTCP154 PT154 CP154 Cyrillic-Asian +Amiga-1251 2104 Ami1251 Amiga1251 Ami-1251 +KOI7-switched 2105 +UNICODE-1-1 1010 csUnicode11 +SCSU 1011 +UTF-7 1012 +UTF-16BE 1013 +UTF-16LE 1014 +UTF-16 1015 +CESU-8 1016 csCESU-8 +UTF-32 1017 +UTF-32BE 1018 +UTF-32LE 1019 +BOCU-1 1020 csBOCU-1 +UNICODE-1-1-UTF-7 103 csUnicode11UTF7 +UTF-8 106 UNICODE-1-1-UTF-8 UNICODE-2-0-UTF-8 utf8 +ISO-8859-13 109 8859_13 ISO8859-13 +ISO-8859-14 110 iso-ir-199 ISO_8859-14:1998 ISO_8859-14 latin8 iso-celtic l8 8859_14 ISO8859-14 +ISO-8859-15 111 ISO_8859-15 Latin-9 8859_15 ISO8859-15 +ISO-8859-16 112 iso-ir-226 ISO_8859-16:2001 ISO_8859-16 latin10 l10 +GBK 113 CP936 MS936 windows-936 +GB18030 114 +OSD_EBCDIC_DF04_15 115 +OSD_EBCDIC_DF03_IRV 116 +OSD_EBCDIC_DF04_1 117 +JIS_Encoding 16 csJISEncoding +Shift_JIS 17 MS_Kanji csShiftJIS X-SJIS Shift-JIS +EUC-JP 18 csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese EUCJP +Extended_UNIX_Code_Fixed_Width_for_Japanese 19 csEUCFixWidJapanese +ISO-10646-UCS-Basic 1002 csUnicodeASCII +ISO-10646-Unicode-Latin1 1003 csUnicodeLatin1 ISO-10646 +ISO-Unicode-IBM-1261 1005 csUnicodeIBM1261 +ISO-Unicode-IBM-1268 1006 csUnicodeIBM1268 +ISO-Unicode-IBM-1276 1007 csUnicodeIBM1276 +ISO-Unicode-IBM-1264 1008 csUnicodeIBM1264 +ISO-Unicode-IBM-1265 1009 csUnicodeIBM1265 +ISO-8859-1-Windows-3.0-Latin-1 2000 csWindows30Latin1 +ISO-8859-1-Windows-3.1-Latin-1 2001 csWindows31Latin1 +ISO-8859-2-Windows-Latin-2 2002 csWindows31Latin2 +ISO-8859-9-Windows-Latin-5 2003 csWindows31Latin5 +Adobe-Standard-Encoding 2005 csAdobeStandardEncoding +Ventura-US 2006 csVenturaUS +Ventura-International 2007 csVenturaInternational +PC8-Danish-Norwegian 2012 csPC8DanishNorwegian +PC8-Turkish 2014 csPC8Turkish +IBM-Symbols 2015 csIBMSymbols +IBM-Thai 2016 csIBMThai +HP-Legal 2017 csHPLegal +HP-Pi-font 2018 csHPPiFont +HP-Math8 2019 csHPMath8 +Adobe-Symbol-Encoding 2020 csHPPSMath +HP-DeskTop 2021 csHPDesktop +Ventura-Math 2022 csVenturaMath +Microsoft-Publishing 2023 csMicrosoftPublishing +Windows-31J 2024 csWindows31J +GB2312 2025 csGB2312 EUC-CN EUCCN CN-GB +Big5 2026 csBig5 BIG-FIVE BIG-5 CN-BIG5 BIG_FIVE +windows-1250 2250 CP1250 MS-EE +windows-1251 2251 CP1251 MS-CYRL +windows-1252 2252 CP1252 MS-ANSI +windows-1253 2253 CP1253 MS-GREEK +windows-1254 2254 CP1254 MS-TURK +windows-1255 2255 +windows-1256 2256 CP1256 MS-ARAB +windows-1257 2257 CP1257 WINBALTRIM +windows-1258 2258 +TIS-620 2259 +HZ-GB-2312 2085 + +# Additional encodings not defined by IANA + +# Arbitrary allocations +#CP737 3001 +#CP853 3002 +#CP856 3003 +CP874 3004 WINDOWS-874 +#CP922 3005 +#CP1046 3006 +#CP1124 3007 +#CP1125 3008 WINDOWS-1125 +#CP1129 3009 +#CP1133 3010 IBM-CP1133 +#CP1161 3011 IBM-1161 IBM1161 CSIBM1161 +#CP1162 3012 IBM-1162 IBM1162 CSIBM1162 +#CP1163 3013 IBM-1163 IBM1163 CSIBM1163 +#GEORGIAN-ACADEMY 3014 +#GEORGIAN-PS 3015 +#KOI8-RU 3016 +#KOI8-T 3017 +#MACARABIC 3018 X-MAC-ARABIC MAC-ARABIC +#MACCROATIAN 3019 X-MAC-CROATIAN MAC-CROATIAN +#MACGREEK 3020 X-MAC-GREEK MAC-GREEK +#MACHEBREW 3021 X-MAC-HEBREW MAC-HEBREW +#MACICELAND 3022 X-MAC-ICELAND MAC-ICELAND +#MACROMANIA 3023 X-MAC-ROMANIA MAC-ROMANIA +#MACTHAI 3024 X-MAC-THAI MAC-THAI +#MACTURKISH 3025 X-MAC-TURKISH MAC-TURKISH +#MULELAO-1 3026 + +# From Unicode Lib +ISO-IR-182 4000 +ISO-IR-197 4002 +ISO-2022-JP-1 4008 +MACCYRILLIC 4009 X-MAC-CYRILLIC MAC-CYRILLIC +MACUKRAINE 4010 X-MAC-UKRAINIAN MAC-UKRAINIAN +MACCENTRALEUROPE 4011 X-MAC-CENTRALEURROMAN MAC-CENTRALEURROMAN +JOHAB 4012 +ISO-8859-11 4014 iso-ir-166 ISO_8859-11 ISO8859-11 8859_11 +X-CURRENT 4999 X-SYSTEM +X-ACORN-LATIN1 5001 +X-ACORN-FUZZY 5002 diff --git a/test/data/cscodec/INDEX b/test/data/cscodec/INDEX new file mode 100644 index 0000000..d6d338a --- /dev/null +++ b/test/data/cscodec/INDEX @@ -0,0 +1,6 @@ +# Index file for charset codec tests +# +# Test Description + +simple.dat Simple tests, designed to validate testdriver +UTF-8-test.txt Markus Kuhn's UTF-8 decoding test file diff --git a/test/data/cscodec/UTF-8-test.txt b/test/data/cscodec/UTF-8-test.txt new file mode 100644 index 0000000..920e54e Binary files /dev/null and b/test/data/cscodec/UTF-8-test.txt differ diff --git a/test/data/cscodec/simple.dat b/test/data/cscodec/simple.dat new file mode 100644 index 0000000..3e2c7ae Binary files /dev/null and b/test/data/cscodec/simple.dat differ diff --git a/test/data/input/INDEX b/test/data/input/INDEX new file mode 100644 index 0000000..c2c97ea --- /dev/null +++ b/test/data/input/INDEX @@ -0,0 +1,5 @@ +# Index file for inputstream tests +# +# Test Description + +UTF-8-test.txt Markus Kuhn's UTF-8 decoding test file diff --git a/test/data/input/UTF-8-test.txt b/test/data/input/UTF-8-test.txt new file mode 100644 index 0000000..abd16f7 Binary files /dev/null and b/test/data/input/UTF-8-test.txt differ diff --git a/test/filter.c b/test/filter.c new file mode 100644 index 0000000..ff4d1e7 --- /dev/null +++ b/test/filter.c @@ -0,0 +1,357 @@ +#include +#include +#include +#include + +#include + +#include "utils/utils.h" + +#include "input/filter.h" + +#include "testutils.h" + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + parserutils_filter_optparams params; + parserutils_filter *input; + uint8_t inbuf[64], outbuf[64]; + size_t inlen, outlen; + const uint8_t *in = inbuf; + uint8_t *out = outbuf; + + if (argc != 2) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + /* Initialise library */ + assert(parserutils_initialise(argv[1], myrealloc, NULL) == + PARSERUTILS_OK); + + /* Create input filter */ + input = parserutils_filter_create("UTF-8", myrealloc, NULL); + assert(input); + + /* Convert filter to UTF-8 encoding */ + params.encoding.name = "UTF-8"; + assert(parserutils_filter_setopt(input, PARSERUTILS_FILTER_SET_ENCODING, + (parserutils_filter_optparams *) ¶ms) == + PARSERUTILS_OK); + + + /* Simple case - valid input & output buffer large enough */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\xc2\xa0o!"); + inlen = strlen((const char *) inbuf); + outbuf[0] = '\0'; + outlen = 64; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xc2\xa0o!", + SLEN("hell\xc2\xa0o!")) == 0); + + + /* Too small an output buffer; no encoding edge cases */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hello!"); + inlen = strlen((const char *) inbuf); + outbuf[0] = '\0'; + outlen = 5; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_NOMEM); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + outlen = 64 - 5 + outlen; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hello!", + SLEN("hello!")) == 0); + + + /* Illegal input sequence; output buffer large enough */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\x96o!"); + inlen = strlen((const char *) inbuf); + outbuf[0] = '\0'; + outlen = 64; + + /* Input does loose decoding, converting to U+FFFD if illegal + * input is encountered */ + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xef\xbf\xbdo!", + SLEN("hell\xef\xbf\xbdo!")) == 0); + + + /* Input ends mid-sequence */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\xc2\xa0o!"); + inlen = strlen((const char *) inbuf) - 3; + outbuf[0] = '\0'; + outlen = 64; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 3; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xc2\xa0o!", + SLEN("hell\xc2\xa0o!")) == 0); + + + /* Input ends mid-sequence, but second attempt has too small a + * buffer, but large enough to write out the incomplete character. */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\xc2\xa0o!"); + inlen = strlen((const char *) inbuf) - 3; + outbuf[0] = '\0'; + outlen = 64; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 3; + outlen = 3; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_NOMEM); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + outlen = 64 - 7; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xc2\xa0o!", + SLEN("hell\xc2\xa0o!")) == 0); + + + /* Input ends mid-sequence, but second attempt has too small a + * buffer, not large enough to write out the incomplete character. */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\xc2\xa0o!"); + inlen = strlen((const char *) inbuf) - 3; + outbuf[0] = '\0'; + outlen = 64; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 3; + outlen = 1; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_NOMEM); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + outlen = 60; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xc2\xa0o!", + SLEN("hell\xc2\xa0o!")) == 0); + + + /* Input ends mid-sequence, but second attempt contains + * invalid character */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\xc2\xc2o!"); + inlen = strlen((const char *) inbuf) - 3; + outbuf[0] = '\0'; + outlen = 64; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 3; + + /* Input does loose decoding, converting to U+FFFD if illegal + * input is encountered */ + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xef\xbf\xbd\xef\xbf\xbdo!", + SLEN("hell\xef\xbf\xbd\xef\xbf\xbdo!")) == 0); + + + /* Input ends mid-sequence, but second attempt contains another + * incomplete character */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\xc2\xa0\xc2\xa1o!"); + inlen = strlen((const char *) inbuf) - 5; + outbuf[0] = '\0'; + outlen = 64; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 2; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 3; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xc2\xa0\xc2\xa1o!", + SLEN("hell\xc2\xa0\xc2\xa1o!")) == 0); + + + /* Input ends mid-sequence, but second attempt contains insufficient + * data to complete the incomplete character */ + in = inbuf; + out = outbuf; + strcpy((char *) inbuf, "hell\xe2\x80\xa2o!"); + inlen = strlen((const char *) inbuf) - 4; + outbuf[0] = '\0'; + outlen = 64; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 1; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + inlen += 3; + + assert(parserutils_filter_process_chunk(input, &in, &inlen, + &out, &outlen) == PARSERUTILS_OK); + + printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen, + (int) (out - ((uint8_t *) outbuf)), + outbuf, (int) outlen); + + assert(parserutils_filter_reset(input) == PARSERUTILS_OK); + + assert(memcmp(outbuf, "hell\xe2\x80\xa2o!", + SLEN("hell\xe2\x80\xa2o!")) == 0); + + + /* Clean up */ + parserutils_filter_destroy(input); + + assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK); + + printf("PASS\n"); + + return 0; +} diff --git a/test/inputstream.c b/test/inputstream.c new file mode 100644 index 0000000..bad3127 --- /dev/null +++ b/test/inputstream.c @@ -0,0 +1,97 @@ +#include +#include + +#include +#include +#include + +#include "utils/utils.h" + +#include "testutils.h" + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + parserutils_inputstream *stream; + FILE *fp; + size_t len, origlen; +#define CHUNK_SIZE (4096) + uint8_t buf[CHUNK_SIZE]; + uintptr_t c; + size_t clen; + + if (argc != 3) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + /* Initialise library */ + assert(parserutils_initialise(argv[1], myrealloc, NULL) == + PARSERUTILS_OK); + + stream = parserutils_inputstream_create("UTF-8", 1, NULL, + myrealloc, NULL); + assert(stream != NULL); + + fp = fopen(argv[2], "rb"); + if (fp == NULL) { + printf("Failed opening %s\n", argv[2]); + return 1; + } + + fseek(fp, 0, SEEK_END); + origlen = len = ftell(fp); + fseek(fp, 0, SEEK_SET); + + while (len >= CHUNK_SIZE) { + fread(buf, 1, CHUNK_SIZE, fp); + + assert(parserutils_inputstream_append(stream, + buf, CHUNK_SIZE) == PARSERUTILS_OK); + + len -= CHUNK_SIZE; + + while ((c = parserutils_inputstream_peek(stream, 0, &clen)) != + PARSERUTILS_INPUTSTREAM_OOD) { + parserutils_inputstream_advance(stream, clen); + } + } + + if (len > 0) { + fread(buf, 1, len, fp); + + assert(parserutils_inputstream_append(stream, + buf, len) == PARSERUTILS_OK); + + len = 0; + } + + fclose(fp); + + assert(parserutils_inputstream_insert(stream, + (const uint8_t *) "hello!!!", + SLEN("hello!!!")) == PARSERUTILS_OK); + + assert(parserutils_inputstream_append(stream, NULL, 0) == + PARSERUTILS_OK); + + while ((c = parserutils_inputstream_peek(stream, 0, &clen)) != + PARSERUTILS_INPUTSTREAM_EOF) { + parserutils_inputstream_advance(stream, clen); + } + + parserutils_inputstream_destroy(stream); + + assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK); + + printf("PASS\n"); + + return 0; +} + diff --git a/test/parserutils.c b/test/parserutils.c new file mode 100644 index 0000000..c6d671a --- /dev/null +++ b/test/parserutils.c @@ -0,0 +1,30 @@ +#include +#include + +#include + +#include "testutils.h" + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + if (argc != 2) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + assert(parserutils_initialise(argv[1], myrealloc, NULL) == + PARSERUTILS_OK); + + assert (parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK); + + printf("PASS\n"); + + return 0; +} diff --git a/test/regression/cscodec-segv.c b/test/regression/cscodec-segv.c new file mode 100644 index 0000000..5802fdf --- /dev/null +++ b/test/regression/cscodec-segv.c @@ -0,0 +1,38 @@ +#include + +#include "charset/charset.h" +#include + +#include "testutils.h" + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + parserutils_charset_codec *codec; + + if (argc != 2) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + assert(parserutils_charset_initialise(argv[1], myrealloc, NULL) == + PARSERUTILS_OK); + + codec = parserutils_charset_codec_create("UTF-8", myrealloc, NULL); + assert(codec != NULL); + + parserutils_charset_codec_destroy(codec); + + assert(parserutils_charset_finalise(myrealloc, NULL) == + PARSERUTILS_OK); + + printf("PASS\n"); + + return 0; +} diff --git a/test/regression/filter-segv.c b/test/regression/filter-segv.c new file mode 100644 index 0000000..761caab --- /dev/null +++ b/test/regression/filter-segv.c @@ -0,0 +1,39 @@ +#include +#include + +#include + +#include "input/filter.h" + +#include "testutils.h" + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + parserutils_filter *input; + + if (argc != 2) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + assert(parserutils_initialise(argv[1], myrealloc, NULL) == + PARSERUTILS_OK); + + input = parserutils_filter_create("UTF-8", myrealloc, NULL); + assert(input); + + parserutils_filter_destroy(input); + + assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK); + + printf("PASS\n"); + + return 0; +} diff --git a/test/regression/stream-nomem.c b/test/regression/stream-nomem.c new file mode 100644 index 0000000..f62b392 --- /dev/null +++ b/test/regression/stream-nomem.c @@ -0,0 +1,94 @@ +#include +#include + +#include +#include + +#include "utils/utils.h" + +#include "testutils.h" + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + parserutils_inputstream *stream; + + /* This is specially calculated so that the inputstream is forced to + * reallocate (it assumes that the inputstream's buffer chunk size + * is 4k) */ +#define BUFFER_SIZE (4096 + 4) + uint8_t input_buffer[BUFFER_SIZE]; +// uint8_t *buffer; +// size_t buflen; + uintptr_t c; + size_t clen; + + if (argc != 2) { + printf("Usage: %s \n", argv[0]); + return 1; + } + + /* Populate the buffer with something sane */ + memset(input_buffer, 'a', BUFFER_SIZE); + /* Now, set up our test data */ + input_buffer[BUFFER_SIZE - 1] = '5'; + input_buffer[BUFFER_SIZE - 2] = '4'; + input_buffer[BUFFER_SIZE - 3] = '\xbd'; + input_buffer[BUFFER_SIZE - 4] = '\xbf'; + /* This byte will occupy the 4095th byte in the buffer and + * thus cause the entirety of U+FFFD to be buffered until after + * the buffer has been enlarged */ + input_buffer[BUFFER_SIZE - 5] = '\xef'; + input_buffer[BUFFER_SIZE - 6] = '3'; + input_buffer[BUFFER_SIZE - 7] = '2'; + input_buffer[BUFFER_SIZE - 8] = '1'; + + assert(parserutils_initialise(argv[1], myrealloc, NULL) == + PARSERUTILS_OK); + + stream = parserutils_inputstream_create("UTF-8", 0, + NULL, myrealloc, NULL); + assert(stream != NULL); + + assert(parserutils_inputstream_append(stream, + input_buffer, BUFFER_SIZE) == PARSERUTILS_OK); + + assert(parserutils_inputstream_append(stream, NULL, 0) == + PARSERUTILS_OK); + + while ((c = parserutils_inputstream_peek(stream, 0, &clen)) != + PARSERUTILS_INPUTSTREAM_EOF) + parserutils_inputstream_advance(stream, clen); + +/* + assert(css_inputstream_claim_buffer(stream, &buffer, &buflen) == + CSS_OK); + + assert(buflen == BUFFER_SIZE); + + printf("Buffer: '%.*s'\n", 8, buffer + (BUFFER_SIZE - 8)); + + assert( buffer[BUFFER_SIZE - 6] == '3' && + buffer[BUFFER_SIZE - 5] == (uint8_t) '\xef' && + buffer[BUFFER_SIZE - 4] == (uint8_t) '\xbf' && + buffer[BUFFER_SIZE - 3] == (uint8_t) '\xbd' && + buffer[BUFFER_SIZE - 2] == '4'); + + free(buffer); +*/ + + parserutils_inputstream_destroy(stream); + + assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK); + + printf("PASS\n"); + + return 0; +} + diff --git a/test/testrunner.pl b/test/testrunner.pl new file mode 100644 index 0000000..1c6c66d --- /dev/null +++ b/test/testrunner.pl @@ -0,0 +1,167 @@ +#!/bin/perl +# +# Testcase runner +# +# Usage: testrunner [] +# +# Operates upon INDEX files described in the README. +# Locates and executes testcases, feeding data files to programs +# as appropriate. +# Logs testcase output to file. +# Aborts test sequence on detection of error. +# + +use warnings; +use strict; +use File::Spec; +use IPC::Open3; + +if (@ARGV < 1) { + print "Usage: testrunner.pl []\n"; + exit; +} + +# Get directory +my $directory = shift @ARGV; + +# Get EXE extension (if any) +my $exeext = ""; +$exeext = shift @ARGV if (@ARGV > 0); + +# Open log file and /dev/null +open(LOG, ">$directory/log") or die "Failed opening test log"; +open(NULL, "+<", File::Spec->devnull) or die "Failed opening /dev/null"; + +# Open testcase index +open(TINDEX, "<$directory/INDEX") or die "Failed opening test INDEX"; + +# Parse testcase index, looking for testcases +while (my $line = ) { + next if ($line =~ /^(#.*)?$/); + + # Found one; decompose + (my $test, my $desc, my $data) = split /\t+/, $line; + + # Strip whitespace + $test =~ s/^\s+|\s+$//g; + $desc =~ s/^\s+|\s+$//g; + $data =~ s/^\s+|\s+$//g if ($data); + + # Append EXE extension to binary name + $test = $test . $exeext; + + print "Test: $desc\n"; + + my $pid; + + if ($data) { + # Testcase has external data files + + # Open datafile index + open(DINDEX, "<$directory/data/$data/INDEX") or + die "Failed opening $directory/data/$data/INDEX"; + + # Parse datafile index, looking for datafiles + while (my $dentry = ) { + next if ($dentry =~ /^(#.*)?$/); + + # Found one; decompose + (my $dtest, my $ddesc) = split /\t+/, $dentry; + + # Strip whitespace + $dtest =~ s/^\s+|\s+$//g; + $ddesc =~ s/^\s+|\s+$//g; + + print LOG "Running $directory/$test " . + "$directory/data/Aliases " . + "$directory/data/$data/$dtest\n"; + + # Make message fit on an 80 column terminal + my $msg = " ==> $test [$data/$dtest]"; + $msg = $msg . "." x (80 - length($msg) - 8); + + print $msg; + + # Run testcase + $pid = open3("&) { + print LOG " $output"; + $last = $output; + } + + # Wait for child to finish + waitpid($pid, 0); + + print substr($last, 0, 4) . "\n"; + + # Bail, noisily, on failure + if (substr($last, 0, 4) eq "FAIL") { + # Write any stderr output to the log + while (my $errors = ) { + print LOG " $errors"; + } + + print "\n\nFailure detected: " . + "consult log file\n\n\n"; + + exit(1); + } + } + + close(DINDEX); + } else { + # Testcase has no external data files + print LOG "Running $directory/$test $directory/data/Aliases\n"; + + # Make message fit on an 80 column terminal + my $msg = " ==> $test"; + $msg = $msg . "." x (80 - length($msg) - 8); + + print $msg; + + # Run testcase + $pid = open3("&) { + print LOG " $output"; + $last = $output; + } + + # Wait for child to finish + waitpid($pid, 0); + + print substr($last, 0, 4) . "\n"; + + # Bail, noisily, on failure + if (substr($last, 0, 4) eq "FAIL") { + # Write any stderr output to the log + while (my $errors = ) { + print LOG " $errors"; + } + + print "\n\nFailure detected: " . + "consult log file\n\n\n"; + + exit(1); + } + } + + print "\n"; +} + +# Clean up +close(TINDEX); + +close(NULL); +close(LOG); diff --git a/test/testutils.h b/test/testutils.h new file mode 100644 index 0000000..c91c5b8 --- /dev/null +++ b/test/testutils.h @@ -0,0 +1,123 @@ +#ifndef test_testutils_h_ +#define test_testutils_h_ + +#include +#include +#include + +#ifndef UNUSED +#define UNUSED(x) ((x) = (x)) +#endif + +/* Redefine assert, so we can simply use the standard assert mechanism + * within testcases and exit with the right output for the testrunner + * to do the right thing. */ +void __assert2(const char *expr, const char *function, + const char *file, int line); + +void __assert2(const char *expr, const char *function, + const char *file, int line) +{ + UNUSED(function); + UNUSED(file); + + printf("FAIL - %s at line %d\n", expr, line); + + exit(EXIT_FAILURE); +} + +#define assert(expr) \ + ((void) ((expr) || (__assert2 (#expr, __func__, __FILE__, __LINE__), 0))) + + +typedef bool (*line_func)(const char *data, size_t datalen, void *pw); + +static size_t parse_strlen(const char *str, size_t limit); +bool parse_testfile(const char *filename, line_func callback, void *pw); +size_t parse_filesize(const char *filename); + +/** + * Testcase datafile parser driver + * + * \param filename Name of file to parse + * \param callback Pointer to function to handle each line of input data + * \param pw Pointer to client-specific private data + * \return true on success, false otherwise. + */ +bool parse_testfile(const char *filename, line_func callback, void *pw) +{ + FILE *fp; + char buf[300]; + + fp = fopen(filename, "rb"); + if (fp == NULL) { + printf("Failed opening %s\n", filename); + return false; + } + + while (fgets(buf, sizeof buf, fp)) { + if (buf[0] == '\n') + continue; + + if (!callback(buf, parse_strlen(buf, sizeof buf), pw)) { + fclose(fp); + return false; + } + } + + fclose(fp); + + return true; +} + +/** + * Utility string length measurer; assumes strings are '\n' terminated + * + * \param str String to measure length of + * \param limit Upper bound on string length + * \return String length + */ +size_t parse_strlen(const char *str, size_t limit) +{ + size_t len = 0; + + if (str == NULL) + return 0; + + while (len < limit - 1 && *str != '\n') { + len++; + str++; + } + + len++; + + return len; +} + +/** + * Read the size of a file + * + * \param filename Name of file to read size of + * \return File size (in bytes), or 0 on error + */ +size_t parse_filesize(const char *filename) +{ + FILE *fp; + size_t len = 0; + + fp = fopen(filename, "rb"); + if (fp == NULL) { + printf("Failed opening %s\n", filename); + return 0; + } + + fseek(fp, 0, SEEK_END); + len = ftell(fp); + + fclose(fp); + + return len; +} + + +#endif -- cgit v1.2.3