summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--COPYING19
-rw-r--r--Makefile43
-rw-r--r--Makefile-riscos46
-rw-r--r--README44
-rw-r--r--build/Makefile.common129
-rw-r--r--build/Makefile.config8
-rw-r--r--include/parserutils/charset/codec.h114
-rw-r--r--include/parserutils/charset/mibenum.h24
-rw-r--r--include/parserutils/charset/utf16.h38
-rw-r--r--include/parserutils/charset/utf8.h38
-rw-r--r--include/parserutils/errors.h29
-rw-r--r--include/parserutils/functypes.h21
-rw-r--r--include/parserutils/input/inputstream.h143
-rw-r--r--include/parserutils/parserutils.h23
-rw-r--r--include/parserutils/types.h15
-rw-r--r--include/parserutils/utils/buffer.h39
-rw-r--r--libparserutils.pc.in10
-rw-r--r--src/Makefile49
-rw-r--r--src/charset/Makefile49
-rw-r--r--src/charset/aliases.c410
-rw-r--r--src/charset/aliases.h36
-rw-r--r--src/charset/charset.c54
-rw-r--r--src/charset/charset.h24
-rw-r--r--src/charset/codec.c185
-rw-r--r--src/charset/codecs/Makefile46
-rw-r--r--src/charset/codecs/codec_iconv.c683
-rw-r--r--src/charset/codecs/codec_impl.h48
-rw-r--r--src/charset/codecs/codec_utf16.c544
-rw-r--r--src/charset/codecs/codec_utf8.c546
-rw-r--r--src/charset/encodings/Makefile46
-rw-r--r--src/charset/encodings/utf16.c239
-rw-r--r--src/charset/encodings/utf8.c175
-rw-r--r--src/charset/encodings/utf8impl.h339
-rw-r--r--src/input/Makefile46
-rw-r--r--src/input/filter.c384
-rw-r--r--src/input/filter.h57
-rw-r--r--src/input/inputstream.c477
-rw-r--r--src/parserutils.c54
-rw-r--r--src/utils/Makefile49
-rw-r--r--src/utils/buffer.c156
-rw-r--r--src/utils/errors.c70
-rw-r--r--src/utils/utils.h28
-rw-r--r--test/INDEX15
-rw-r--r--test/Makefile80
-rw-r--r--test/README84
-rw-r--r--test/aliases.c62
-rw-r--r--test/charset.c31
-rw-r--r--test/cscodec.c232
-rw-r--r--test/data/Aliases302
-rw-r--r--test/data/cscodec/INDEX6
-rw-r--r--test/data/cscodec/UTF-8-test.txtbin0 -> 41013 bytes
-rw-r--r--test/data/cscodec/simple.datbin0 -> 1109 bytes
-rw-r--r--test/data/input/INDEX5
-rw-r--r--test/data/input/UTF-8-test.txtbin0 -> 20334 bytes
-rw-r--r--test/filter.c357
-rw-r--r--test/inputstream.c97
-rw-r--r--test/parserutils.c30
-rw-r--r--test/regression/cscodec-segv.c38
-rw-r--r--test/regression/filter-segv.c39
-rw-r--r--test/regression/stream-nomem.c94
-rw-r--r--test/testrunner.pl167
-rw-r--r--test/testutils.h123
62 files changed, 7339 insertions, 0 deletions
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..0f8d92b
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,19 @@
+Copyright (C) 2007-8 J-M Bell
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..e4de9b9
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,43 @@
+# Toolchain definitions for building on the destination platform
+CC := gcc
+AR := ar
+LD := gcc
+
+CP := cp
+RM := rm
+MKDIR := mkdir
+MV := mv
+ECHO := echo
+MAKE := make
+PERL := perl
+PKGCONFIG := pkg-config
+INSTALL := install
+SED := sed
+LCOV := lcov
+GENHTML := genhtml
+
+# Toolchain flags
+WARNFLAGS := -Wall -Wextra -Wundef -Wpointer-arith -Wcast-align \
+ -Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes \
+ -Wmissing-declarations -Wnested-externs -Werror -pedantic
+override CFLAGS += -std=c99 -D_BSD_SOURCE -I$(TOP)/include/ $(WARNFLAGS)
+RELEASECFLAGS = $(CFLAGS) -DNDEBUG -O2
+DEBUGCFLAGS = $(CFLAGS) -O0 -g
+ARFLAGS := -cru
+override LDFLAGS += -L$(TOP)/
+
+CPFLAGS :=
+RMFLAGS := -f
+MKDIRFLAGS := -p
+MVFLAGS :=
+ECHOFLAGS :=
+MAKEFLAGS :=
+PKGCONFIGFLAGS :=
+
+EXEEXT :=
+
+# Default installation prefix
+PREFIX ?= /usr/local
+
+
+include build/Makefile.common
diff --git a/Makefile-riscos b/Makefile-riscos
new file mode 100644
index 0000000..c9fef3c
--- /dev/null
+++ b/Makefile-riscos
@@ -0,0 +1,46 @@
+# Toolchain definitions for building for RISC OS using the GCCSDK cross-compiler
+GCCSDK_INSTALL_CROSSBIN ?= /home/riscos/cross/bin
+GCCSDK_INSTALL_ENV ?= /home/riscos/env
+
+CC := $(GCCSDK_INSTALL_CROSSBIN)/gcc
+AR := $(GCCSDK_INSTALL_CROSSBIN)/ar
+LD := $(GCCSDK_INSTALL_CROSSBIN)/gcc
+
+CP := cp
+RM := rm
+MKDIR := mkdir
+MV := mv
+ECHO := echo
+MAKE := make
+PERL := perl
+PKGCONFIG := pkg-config
+INSTALL := install
+SED := sed
+LCOV := echo
+GENHTML := echo
+
+# Toolchain flags
+WARNFLAGS := -Wall -Wextra -Wundef -Wpointer-arith -Wcast-align \
+ -Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes \
+ -Wmissing-declarations -Wnested-externs -Werror -pedantic
+CFLAGS += -std=c99 -D_BSD_SOURCE -I$(TOP)/include/ $(WARNFLAGS) \
+ -mpoke-function-name
+RELEASECFLAGS = $(CFLAGS) -DNDEBUG -O2
+DEBUGCFLAGS = $(CFLAGS) -O0 -g
+ARFLAGS := -cru
+LDFLAGS = -L$(TOP)/
+
+CPFLAGS :=
+RMFLAGS := -f
+MKDIRFLAGS := -p
+MVFLAGS :=
+ECHOFLAGS :=
+MAKEFLAGS :=
+PKGCONFIGFLAGS :=
+
+EXEEXT := ,ff8
+
+# Default installation prefix
+PREFIX ?= $(GCCSDK_INSTALL_ENV)
+
+include build/Makefile.common
diff --git a/README b/README
new file mode 100644
index 0000000..72041c0
--- /dev/null
+++ b/README
@@ -0,0 +1,44 @@
+LibParserUtils -- a utility library for parser building
+=======================================================
+
+Overview
+--------
+
+ LibParserUtils provides various pieces of functionality that are useful
+ when writing parsers.
+
+Requirements
+------------
+
+ LibParserUtils requires the following tools:
+
+ + A C99 capable C compiler
+ + GNU make or compatible
+ + Perl (for the testcases)
+ + Pkg-config (for the testcases)
+
+ For enhanced charset support, LibParserUtils may also be configured to use
+ an iconv() implementation.
+
+Compilation
+-----------
+
+ If necessary, modify the toolchain settings in the Makefile.
+ Invoke make:
+ $ make
+
+Verification
+------------
+
+ To verify that the library is working, it is necessary to specify a
+ different makefile target than that used for normal compilation, thus:
+
+ $ make test
+
+API documentation
+-----------------
+
+ Currently, there is none. However, the code is well commented and the
+ public API may be found in the "include" directory. The testcase sources
+ may also be of use in working out how to use it.
+
diff --git a/build/Makefile.common b/build/Makefile.common
new file mode 100644
index 0000000..418a5a8
--- /dev/null
+++ b/build/Makefile.common
@@ -0,0 +1,129 @@
+# Top-level Makefile fragment
+
+# Default target
+all: release
+
+# Name of component
+COMPONENT := libparserutils
+
+# Environment
+EXPORT := $(CURDIR)/dist
+TOP := $(CURDIR)
+RELEASEDIR := build/Release
+DEBUGDIR := build/Debug
+COVERAGEDIR := build/coverage
+
+# List of items to delete on clean
+ITEMS_CLEAN :=
+# List of items to delete on distclean
+ITEMS_DISTCLEAN :=
+
+# List of targets to run for testing
+TARGET_TESTS :=
+
+# Source files
+SOURCES :=
+
+# Include configuration Makefile fragment
+include build/Makefile.config
+
+# Include Makefile fragments in subdirectories
+
+define do_include
+DIR := $$(dir $(1))
+include $(1)
+
+endef
+
+MAKE_INCLUDES := $(wildcard */Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Calculate objects to build
+OBJECTS := $(subst /,_,$(subst .c,.o,$(SOURCES)))
+
+.PHONY: release debug test coverage profile \
+ clean distclean setup export install uninstall
+
+# Rules
+release: setup $(addprefix $(RELEASEDIR)/,$(OBJECTS))
+ @$(AR) $(ARFLAGS) $(COMPONENT).a $(RELEASEDIR)/*
+
+debug: setup $(addprefix $(DEBUGDIR)/,$(OBJECTS))
+ @$(AR) $(ARFLAGS) $(COMPONENT)-debug.a $(DEBUGDIR)/*
+
+test: debug $(TARGET_TESTS)
+
+coverage: clean
+ @$(LCOV) --directory . --zerocounters
+ @$(MAKE) test CFLAGS="$(CFLAGS) -fprofile-arcs -ftest-coverage" \
+ LDFLAGS="$(LDFLAGS) -lgcov"
+ @$(LCOV) --directory $(DEBUGDIR) --base-directory $(TOP) \
+ --capture --output-file $(COVERAGEDIR)/$(COMPONENT)_tmp.info
+ @$(LCOV) --extract $(COVERAGEDIR)/$(COMPONENT)_tmp.info "$(TOP)/src*" \
+ -o $(COVERAGEDIR)/$(COMPONENT).info
+ @$(RM) $(RMFLAGS) $(COVERAGEDIR)/$(COMPONENT)_tmp.info
+ @$(GENHTML) -o $(COVERAGEDIR) --num-spaces 2 \
+ $(COVERAGEDIR)/$(COMPONENT).info
+
+profile: clean
+ @$(MAKE) test CFLAGS="$(CFLAGS) -pg" LDFLAGS="-pg $(LDFLAGS)"
+
+clean:
+ -@$(RM) $(RMFLAGS) $(ITEMS_CLEAN)
+ -@$(RM) $(RMFLAGS) gmon.out
+ -@$(RM) $(RMFLAGS) -r $(COVERAGEDIR)
+ -@$(RM) $(RMFLAGS) -r $(RELEASEDIR)
+ -@$(RM) $(RMFLAGS) -r $(DEBUGDIR)
+ -@$(RM) $(RMFLAGS) $(COMPONENT).a
+ -@$(RM) $(RMFLAGS) $(COMPONENT)-debug.a
+ -@$(RM) $(RMFLAGS) $(COMPONENT).pc
+
+distclean: clean
+ -@$(RM) $(RMFLAGS) $(ITEMS_DISTCLEAN)
+ -@$(RM) $(RMFLAGS) -r $(TOP)/dist
+
+setup:
+ @$(MKDIR) $(MKDIRFLAGS) $(RELEASEDIR)
+ @$(MKDIR) $(MKDIRFLAGS) $(DEBUGDIR)
+ @$(MKDIR) $(MKDIRFLAGS) $(COVERAGEDIR)
+
+export: release
+ @$(MKDIR) $(MKDIRFLAGS) $(TOP)/dist/lib
+ @$(CP) $(CPFLAGS) -r include $(EXPORT)/
+ @${CP} ${CPFLAGS} $(COMPONENT).a ${EXPORT}/lib/
+
+install: release
+ @$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/lib/pkgconfig
+ @$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/include/parserutils
+ @$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/include/parserutils/charset
+ @$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/include/parserutils/input
+ @$(MKDIR) $(MKDIRFLAGS) -p $(PREFIX)/include/parserutils/utils
+ @$(SED) -e 's#PREFIX#$(PREFIX)#' $(COMPONENT).pc.in >$(COMPONENT).pc
+ @$(INSTALL) --mode=644 -t $(PREFIX)/lib $(COMPONENT).a
+ @$(INSTALL) --mode=644 -t $(PREFIX)/lib/pkgconfig $(COMPONENT).pc
+ @$(INSTALL) --mode=644 -t $(PREFIX)/include/parserutils $(filter %.h, $(wildcard include/parserutils/*))
+ @$(INSTALL) --mode=644 -t $(PREFIX)/include/parserutils/charset $(filter %.h, $(wildcard include/parserutils/charset/*))
+ @$(INSTALL) --mode=644 -t $(PREFIX)/include/parserutils/input $(filter %.h, $(wildcard include/parserutils/input/*))
+ @$(INSTALL) --mode=644 -t $(PREFIX)/include/parserutils/utils $(filter %.h, $(wildcard include/parserutils/utils/*))
+
+
+uninstall:
+ @$(RM) $(RMFLAGS) $(PREFIX)/lib/$(COMPONENT).a
+ @$(RM) $(RMFLAGS) $(PREFIX)/lib/pkgconfig/$(COMPONENT).pc
+ @$(RM) $(RMFLAGS) -r $(PREFIX)/include/parserutils
+
+# Finally, build rules for compilation
+define do_compile
+$$(RELEASEDIR)/$(2): $(1)
+ @$$(ECHO) $$(ECHOFLAGS) "==> $(1)"
+ @$$(CC) -c $$(RELEASECFLAGS) -o $$@ $(1)
+
+$$(DEBUGDIR)/$(2): $(1)
+ @$$(ECHO) $$(ECHOFLAGS) "==> $(1)"
+ @$$(CC) -c $$(DEBUGCFLAGS) -o $$@ $(1)
+
+endef
+
+$(eval $(foreach SOURCE,$(filter %.c,$(SOURCES)), \
+ $(call do_compile,$(SOURCE),$(subst /,_,$(SOURCE:.c=.o)))))
+
diff --git a/build/Makefile.config b/build/Makefile.config
new file mode 100644
index 0000000..b6560c1
--- /dev/null
+++ b/build/Makefile.config
@@ -0,0 +1,8 @@
+# Configuration Makefile fragment
+
+# Build the iconv codec
+# override CFLAGS += -DWITH_ICONV_CODEC
+
+# Use iconv directly in the input filter
+# override CFLAGS += -DWITH_ICONV_FILTER
+
diff --git a/include/parserutils/charset/codec.h b/include/parserutils/charset/codec.h
new file mode 100644
index 0000000..ca98db5
--- /dev/null
+++ b/include/parserutils/charset/codec.h
@@ -0,0 +1,114 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_codec_h_
+#define parserutils_charset_codec_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+
+typedef struct parserutils_charset_codec parserutils_charset_codec;
+
+#define PARSERUTILS_CHARSET_CODEC_NULL (0xffffffffU)
+
+/**
+ * Charset codec error mode
+ *
+ * A codec's error mode determines its behaviour in the face of:
+ *
+ * + characters which are unrepresentable in the destination charset (if
+ * encoding data) or which cannot be converted to UCS4 (if decoding data).
+ * + invalid byte sequences (both encoding and decoding)
+ *
+ * The options provide a choice between the following approaches:
+ *
+ * + draconian, "stop processing" ("strict")
+ * + "replace the unrepresentable character with something else" ("loose")
+ * + "attempt to transliterate, or replace if unable" ("translit")
+ *
+ * The default error mode is "loose".
+ *
+ *
+ * In the "loose" case, the replacement character will depend upon:
+ *
+ * + Whether the operation was encoding or decoding
+ * + If encoding, what the destination charset is.
+ *
+ * If decoding, the replacement character will be:
+ *
+ * U+FFFD (REPLACEMENT CHARACTER)
+ *
+ * If encoding, the replacement character will be:
+ *
+ * U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32)
+ * U+FFFD (REPLACEMENT CHARACTER) otherwise.
+ *
+ *
+ * In the "translit" case, the codec will attempt to transliterate into
+ * the destination charset, if encoding. If decoding, or if transliteration
+ * fails, this option is identical to "loose".
+ */
+typedef enum parserutils_charset_codec_errormode {
+ /** Abort processing if unrepresentable character encountered */
+ PARSERUTILS_CHARSET_CODEC_ERROR_STRICT = 0,
+ /** Replace unrepresentable characters with single alternate */
+ PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE = 1,
+ /** Transliterate unrepresentable characters, if possible */
+ PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT = 2,
+} parserutils_charset_codec_errormode;
+
+/**
+ * Charset codec option types
+ */
+typedef enum parserutils_charset_codec_opttype {
+ /** Set codec error mode */
+ PARSERUTILS_CHARSET_CODEC_ERROR_MODE = 1,
+} parserutils_charset_codec_opttype;
+
+/**
+ * Charset codec option parameters
+ */
+typedef union parserutils_charset_codec_optparams {
+ /** Parameters for error mode setting */
+ struct {
+ /** The desired error handling mode */
+ parserutils_charset_codec_errormode mode;
+ } error_mode;
+} parserutils_charset_codec_optparams;
+
+
+/* Create a charset codec */
+parserutils_charset_codec *parserutils_charset_codec_create(const char *charset,
+ parserutils_alloc alloc, void *pw);
+/* Destroy a charset codec */
+void parserutils_charset_codec_destroy(parserutils_charset_codec *codec);
+
+/* Configure a charset codec */
+parserutils_error parserutils_charset_codec_setopt(
+ parserutils_charset_codec *codec,
+ parserutils_charset_codec_opttype type,
+ parserutils_charset_codec_optparams *params);
+
+/* Encode a chunk of UCS4 data into a codec's charset */
+parserutils_error parserutils_charset_codec_encode(
+ parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+
+/* Decode a chunk of data in a codec's charset into UCS4 */
+parserutils_error parserutils_charset_codec_decode(
+ parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+
+/* Reset a charset codec */
+parserutils_error parserutils_charset_codec_reset(
+ parserutils_charset_codec *codec);
+
+#endif
diff --git a/include/parserutils/charset/mibenum.h b/include/parserutils/charset/mibenum.h
new file mode 100644
index 0000000..8b3ac9d
--- /dev/null
+++ b/include/parserutils/charset/mibenum.h
@@ -0,0 +1,24 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_mibenum_h_
+#define parserutils_charset_mibenum_h_
+
+#include <inttypes.h>
+#include <stdbool.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+
+/* Convert an encoding alias to a MIB enum value */
+uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len);
+/* Convert a MIB enum value into an encoding alias */
+const char *parserutils_charset_mibenum_to_name(uint16_t mibenum);
+/* Determine if a MIB enum value represents a Unicode variant */
+bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum);
+
+#endif
diff --git a/include/parserutils/charset/utf16.h b/include/parserutils/charset/utf16.h
new file mode 100644
index 0000000..6569d6e
--- /dev/null
+++ b/include/parserutils/charset/utf16.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-16 manipulation functions (interface).
+ */
+
+#ifndef parserutils_charset_utf16_h_
+#define parserutils_charset_utf16_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+
+parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s,
+ size_t len, uint32_t *ucs4, size_t *clen);
+parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4,
+ uint8_t *s, size_t *len);
+
+parserutils_error parserutils_charset_utf16_length(const uint8_t *s,
+ size_t max, size_t *len);
+parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
+ size_t *len);
+
+parserutils_error parserutils_charset_utf16_prev(const uint8_t *s,
+ uint32_t off, uint32_t *prevoff);
+parserutils_error parserutils_charset_utf16_next(const uint8_t *s,
+ uint32_t len, uint32_t off, uint32_t *nextoff);
+
+parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
+ uint32_t len, uint32_t off, uint32_t *nextoff);
+
+#endif
+
diff --git a/include/parserutils/charset/utf8.h b/include/parserutils/charset/utf8.h
new file mode 100644
index 0000000..16e012e
--- /dev/null
+++ b/include/parserutils/charset/utf8.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-8 manipulation functions (interface).
+ */
+
+#ifndef parserutils_charset_utf8_h_
+#define parserutils_charset_utf8_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+
+parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
+ uint32_t *ucs4, size_t *clen);
+parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4, uint8_t **s,
+ size_t *len);
+
+parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
+ size_t *len);
+parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
+ size_t *len);
+
+parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
+ uint32_t *prevoff);
+parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
+ uint32_t off, uint32_t *nextoff);
+
+parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s,
+ uint32_t len, uint32_t off, uint32_t *nextoff);
+
+#endif
+
diff --git a/include/parserutils/errors.h b/include/parserutils/errors.h
new file mode 100644
index 0000000..09c715c
--- /dev/null
+++ b/include/parserutils/errors.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_errors_h_
+#define parserutils_errors_h_
+
+#include <stddef.h>
+
+typedef enum parserutils_error {
+ PARSERUTILS_OK = 0, /**< Success */
+
+ PARSERUTILS_NOMEM = 1, /**< Presumably: memory allocation failed -- TODO confirm */
+ PARSERUTILS_BADPARM = 2, /**< Bad parameter (e.g. a required pointer was NULL) */
+ PARSERUTILS_INVALID = 3, /**< Presumably: invalid input data -- TODO confirm */
+ PARSERUTILS_FILENOTFOUND = 4, /**< A file could not be opened */
+ PARSERUTILS_NEEDDATA = 5, /**< More input is needed to complete the operation */
+} parserutils_error;
+
+/* Convert a parserutils error value to a string */
+const char *parserutils_error_to_string(parserutils_error error);
+/* Convert a string to a parserutils error value */
+parserutils_error parserutils_error_from_string(const char *str, size_t len);
+
+#endif
+
diff --git a/include/parserutils/functypes.h b/include/parserutils/functypes.h
new file mode 100644
index 0000000..703a329
--- /dev/null
+++ b/include/parserutils/functypes.h
@@ -0,0 +1,21 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007-8 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_functypes_h_
+#define parserutils_functypes_h_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <parserutils/types.h>
+
+/* Type of allocation function for parserutils */
+typedef void *(*parserutils_alloc)(void *ptr, size_t size, void *pw);
+
+#endif
+
diff --git a/include/parserutils/input/inputstream.h b/include/parserutils/input/inputstream.h
new file mode 100644
index 0000000..2b0c407
--- /dev/null
+++ b/include/parserutils/input/inputstream.h
@@ -0,0 +1,143 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_input_inputstream_h_
+#define parserutils_input_inputstream_h_
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+#include <parserutils/types.h>
+#include <parserutils/charset/utf8.h>
+#include <parserutils/utils/buffer.h>
+
+/**
+ * Type of charset detection function
+ */
+typedef parserutils_error (*parserutils_charset_detect_func)(
+ const uint8_t *data, size_t len,
+ uint16_t *mibenum, uint32_t *source);
+
+/**
+ * Input stream object
+ */
+typedef struct parserutils_inputstream
+{
+ parserutils_buffer *utf8; /**< Buffer containing utf8 data */
+
+ uint32_t cursor; /**< Byte offset of current position */
+
+ bool had_eof; /**< Whether EOF has been reached */
+} parserutils_inputstream;
+
+/* EOF pseudo-character */
+#define PARSERUTILS_INPUTSTREAM_EOF (0xFFFFFFFFU)
+/* Out-of-data indicator */
+#define PARSERUTILS_INPUTSTREAM_OOD (0xFFFFFFFEU)
+
+/* Create an input stream */
+parserutils_inputstream *parserutils_inputstream_create(const char *enc,
+ uint32_t encsrc, parserutils_charset_detect_func csdetect,
+ parserutils_alloc alloc, void *pw);
+/* Destroy an input stream */
+void parserutils_inputstream_destroy(parserutils_inputstream *stream);
+
+/* Append data to an input stream */
+parserutils_error parserutils_inputstream_append(
+ parserutils_inputstream *stream,
+ const uint8_t *data, size_t len);
+/* Insert data into stream at current location */
+parserutils_error parserutils_inputstream_insert(
+ parserutils_inputstream *stream,
+ const uint8_t *data, size_t len);
+
+/* Slow form of parserutils_inputstream_peek. */
+uintptr_t parserutils_inputstream_peek_slow(parserutils_inputstream *stream,
+ size_t offset, size_t *length);
+
+/* Look at the character in the stream that starts at
+ * offset bytes from the cursor
+ *
+ * \param stream Stream to look in (a NULL stream yields OOD)
+ * \param offset Byte offset of start of character
+ * \param length Pointer to location to receive character length (in bytes)
+ * \return Pointer to character data, or EOF or OOD.
+ *
+ * If the requested position is at or beyond the end of the buffered UTF-8
+ * data, or the character found there is incomplete, the request is handed
+ * to parserutils_inputstream_peek_slow. This function itself only writes
+ * *length when it returns a pointer to character data.
+ *
+ * Once the character pointed to by the result of this call has been advanced
+ * past (i.e. parserutils_inputstream_advance has caused the stream cursor to
+ * pass over the character), then no guarantee is made as to the validity of
+ * the data pointed to. Thus, any attempt to dereference the pointer after
+ * advancing past the data it points to is a bug.
+ */
+static inline uintptr_t parserutils_inputstream_peek(
+ parserutils_inputstream *stream, size_t offset, size_t *length)
+{
+ parserutils_error error = PARSERUTILS_OK;
+ size_t len;
+
+ if (stream == NULL)
+ return PARSERUTILS_INPUTSTREAM_OOD;
+
+#define IS_ASCII(x) (((x) & 0x80) == 0)
+
+ /* Fast path: the first byte of the character is already buffered */
+ if (stream->cursor + offset < stream->utf8->length) {
+ if (IS_ASCII(stream->utf8->data[stream->cursor + offset])) {
+ len = 1;
+ } else {
+ error = parserutils_charset_utf8_char_byte_length(
+ stream->utf8->data + stream->cursor + offset,
+ &len);
+
+ if (error != PARSERUTILS_OK &&
+ error != PARSERUTILS_NEEDDATA)
+ return PARSERUTILS_INPUTSTREAM_OOD;
+ }
+ }
+
+#undef IS_ASCII
+
+ /* Any position at or past the end of the buffered data (not only the
+ * position exactly at the end) must leave the fast path: len has not
+ * been set for it and the address computed below would lie outside
+ * the buffer.
+ * NOTE(review): assumes peek_slow copes with offsets beyond the
+ * buffered data -- confirm against its implementation. */
+ if (stream->cursor + offset >= stream->utf8->length ||
+ error == PARSERUTILS_NEEDDATA) {
+ return parserutils_inputstream_peek_slow(stream,
+ offset, length);
+ }
+
+ *length = len;
+
+ return (uintptr_t) (stream->utf8->data + stream->cursor + offset);
+}
+
+/**
+ * Move the stream cursor forwards by a number of bytes
+ *
+ * \param stream Stream to advance (a NULL stream is ignored)
+ * \param bytes Byte count to move by; abort() is called if this exceeds
+ * the amount of data between the cursor and the end of the buffer
+ */
+static inline void parserutils_inputstream_advance(
+ parserutils_inputstream *stream, size_t bytes)
+{
+ if (stream != NULL) {
+ const size_t remaining = stream->utf8->length - stream->cursor;
+
+ if (bytes > remaining)
+ abort();
+
+ if (remaining != 0)
+ stream->cursor += bytes;
+ }
+}
+
+/* Read the document charset */
+const char *parserutils_inputstream_read_charset(
+ parserutils_inputstream *stream, uint32_t *source);
+
+#endif
+
diff --git a/include/parserutils/parserutils.h b/include/parserutils/parserutils.h
new file mode 100644
index 0000000..460e80c
--- /dev/null
+++ b/include/parserutils/parserutils.h
@@ -0,0 +1,23 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_parserutils_h_
+#define parserutils_parserutils_h_
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+#include <parserutils/types.h>
+
+/* Initialise the ParserUtils library for use */
+parserutils_error parserutils_initialise(const char *aliases_file,
+ parserutils_alloc alloc, void *pw);
+
+/* Clean up after ParserUtils */
+parserutils_error parserutils_finalise(parserutils_alloc alloc, void *pw);
+
+#endif
+
diff --git a/include/parserutils/types.h b/include/parserutils/types.h
new file mode 100644
index 0000000..b36e4aa
--- /dev/null
+++ b/include/parserutils/types.h
@@ -0,0 +1,15 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_types_h_
+#define parserutils_types_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#endif
+
diff --git a/include/parserutils/utils/buffer.h b/include/parserutils/utils/buffer.h
new file mode 100644
index 0000000..f3a1883
--- /dev/null
+++ b/include/parserutils/utils/buffer.h
@@ -0,0 +1,39 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_utils_buffer_h_
+#define parserutils_utils_buffer_h_
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+
+/** Byte buffer that carries its own client-supplied allocator */
+struct parserutils_buffer
+{
+ uint8_t *data; /**< Pointer to the buffered bytes */
+ size_t length; /**< Number of bytes of data currently held */
+ size_t allocated; /**< Presumably the capacity of data, in bytes -- TODO confirm */
+
+ parserutils_alloc alloc; /**< Memory (de)allocation function */
+ void *pw; /**< Client private data, presumably passed to alloc -- TODO confirm */
+};
+typedef struct parserutils_buffer parserutils_buffer;
+
+parserutils_buffer *parserutils_buffer_create(parserutils_alloc alloc,
+ void *pw);
+void parserutils_buffer_destroy(parserutils_buffer *buffer);
+
+parserutils_error parserutils_buffer_append(parserutils_buffer *buffer,
+ const uint8_t *data, size_t len);
+parserutils_error parserutils_buffer_insert(parserutils_buffer *buffer,
+ size_t offset, const uint8_t *data, size_t len);
+parserutils_error parserutils_buffer_discard(parserutils_buffer *buffer,
+ size_t offset, size_t len);
+
+parserutils_error parserutils_buffer_grow(parserutils_buffer *buffer);
+
+#endif
+
diff --git a/libparserutils.pc.in b/libparserutils.pc.in
new file mode 100644
index 0000000..400ce78
--- /dev/null
+++ b/libparserutils.pc.in
@@ -0,0 +1,10 @@
+prefix=PREFIX
+exec_prefix=${prefix}
+libdir=${exec_prefix}/lib
+includedir=${prefix}/include
+
+Name: libparserutils
+Description: Utility library for facilitating parser development
+Version: 0.0.1
+Libs: -L${libdir} -lparserutils
+Cflags: -I${includedir}
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..bb6c585
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,49 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Manipulate include paths
+override CFLAGS := $(CFLAGS) -I$(d)
+
+# Sources
+SRCS_$(d) := parserutils.c
+
+# Append to sources for component
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/Makefile b/src/charset/Makefile
new file mode 100644
index 0000000..fc34d7c
--- /dev/null
+++ b/src/charset/Makefile
@@ -0,0 +1,49 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Manipulate include paths
+override CFLAGS := $(CFLAGS) -I$(d)
+
+# Sources
+SRCS_$(d) := aliases.c charset.c codec.c
+
+# Append to sources for component
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/aliases.c b/src/charset/aliases.c
new file mode 100644
index 0000000..1e7e6ea
--- /dev/null
+++ b/src/charset/aliases.c
@@ -0,0 +1,410 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "utils/utils.h"
+
+struct alias { /* Alias hash table entry: maps one alias name to its canonical form */
+ struct alias *next; /**< Next entry in the same hash bucket */
+ parserutils_charset_aliases_canon *canon; /**< Canonical form this alias refers to */
+ uint16_t name_len; /**< Length of name, excluding the terminator */
+ char name[1]; /**< Alias name; entries are allocated with extra room so the whole string fits */
+};
+
+#define HASH_SIZE (43) /* Number of buckets in each of the two hash tables */
+static parserutils_charset_aliases_canon *canon_tab[HASH_SIZE]; /* Canonical names, chained per bucket */
+static struct alias *alias_tab[HASH_SIZE]; /* Alias names, chained per bucket */
+
+static parserutils_error parserutils_charset_create_alias(const char *alias,
+ parserutils_charset_aliases_canon *c,
+ parserutils_alloc alloc, void *pw); /* Insert an alias for canonical form c into alias_tab */
+static parserutils_charset_aliases_canon *parserutils_charset_create_canon(
+ const char *canon, uint16_t mibenum,
+ parserutils_alloc alloc, void *pw); /* Insert a canonical name into canon_tab */
+static uint32_t parserutils_charset_hash_val(const char *alias, size_t len); /* Case-insensitive bucket index for a name */
+
+/**
+ * Create alias data from Aliases file
+ *
+ * Each non-comment line is parsed as whitespace-separated fields: the
+ * canonical charset name, its MIB enum value, then zero or more aliases.
+ * Lines beginning with '#' are ignored, as are lines that lack a MIB enum
+ * field. Lines are read into a 300 byte buffer, so a longer line is seen
+ * as several shorter ones.
+ *
+ * Parsing is best-effort: if a canonical form cannot be created the line
+ * is skipped, and if an alias cannot be created the remainder of that
+ * line is skipped. Neither case changes the return value.
+ *
+ * \param filename  The path to the Aliases file
+ * \param alloc     Memory (de)allocation function
+ * \param pw        Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if filename or alloc is NULL,
+ *         PARSERUTILS_FILENOTFOUND if the file cannot be opened.
+ */
+parserutils_error parserutils_charset_aliases_create(const char *filename,
+		parserutils_alloc alloc, void *pw)
+{
+	char buf[300];
+	FILE *fp;
+
+	if (filename == NULL || alloc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	fp = fopen(filename, "r");
+	if (fp == NULL)
+		return PARSERUTILS_FILENOTFOUND;
+
+	while (fgets(buf, sizeof buf, fp)) {
+		char *p, *aliases, *mib, *end;
+		parserutils_charset_aliases_canon *cf;
+		size_t len;
+
+		if (buf[0] == 0 || buf[0] == '#')
+			/* skip blank lines or comments */
+			continue;
+
+		/* Lose the terminating newline, but only if there is one:
+		 * the final line of a file (or an over-long line) may not
+		 * end in '\n', and must not have a real character removed.
+		 * len is at least 1 here, as buf[0] is not the terminator. */
+		len = strlen(buf);
+		if (buf[len - 1] == '\n')
+			buf[--len] = '\0';
+		end = buf + len;
+
+		/* find end of canonical form; the casts keep the <ctype.h>
+		 * calls well-defined for bytes outside the ASCII range */
+		for (p = buf; *p && !isspace((unsigned char) *p) &&
+				!iscntrl((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		*p++ = '\0'; /* terminate canonical form */
+
+		/* skip whitespace */
+		for (; *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		mib = p;
+
+		/* find end of mibenum */
+		for (; *p && !isspace((unsigned char) *p) &&
+				!iscntrl((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p < end)
+			*p++ = '\0'; /* terminate mibenum */
+
+		cf = parserutils_charset_create_canon(buf, atoi(mib),
+				alloc, pw);
+		if (cf == NULL)
+			continue;
+
+		/* skip whitespace before the first alias */
+		for (; p < end && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+
+		/* p never moves past end in this loop, so every access
+		 * stays within the string that fgets produced */
+		while (p < end) {
+			aliases = p;
+
+			/* find end of alias */
+			for (; *p && !isspace((unsigned char) *p) &&
+					!iscntrl((unsigned char) *p); p++)
+				; /* do nothing */
+
+			/* terminate current alias; the last alias on the
+			 * line is already terminated by the end of string */
+			if (p < end)
+				*p++ = '\0';
+
+			if (parserutils_charset_create_alias(aliases, cf,
+					alloc, pw) != PARSERUTILS_OK)
+				break;
+
+			/* skip whitespace before the next alias */
+			for (; p < end && isspace((unsigned char) *p); p++)
+				; /* do nothing */
+		}
+	}
+
+	fclose(fp);
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Release every canonical name and alias held in the hash tables
+ *
+ * Each entry is handed back to the allocator with a size of zero, and
+ * every bucket is left empty afterwards.
+ *
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data
+ */
+void parserutils_charset_aliases_destroy(parserutils_alloc alloc, void *pw)
+{
+	int bucket;
+
+	for (bucket = 0; bucket < HASH_SIZE; bucket++) {
+		parserutils_charset_aliases_canon *canon;
+		struct alias *entry;
+
+		/* Canonical names first; grab the successor before freeing */
+		canon = canon_tab[bucket];
+		while (canon != NULL) {
+			parserutils_charset_aliases_canon *following =
+					canon->next;
+
+			alloc(canon, 0, pw);
+			canon = following;
+		}
+		canon_tab[bucket] = NULL;
+
+		/* Then the aliases hashed to the same bucket */
+		entry = alias_tab[bucket];
+		while (entry != NULL) {
+			struct alias *following = entry->next;
+
+			alloc(entry, 0, pw);
+			entry = following;
+		}
+		alias_tab[bucket] = NULL;
+	}
+}
+
+/**
+ * Look up the MIB enum value for an encoding name
+ *
+ * The name may be either a canonical name or one of its aliases.
+ *
+ * \param alias  The name to look up
+ * \param len    The length of the name, in bytes
+ * \return The MIB enum value, or 0 if alias is NULL or unknown
+ */
+uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
+{
+	parserutils_charset_aliases_canon *canon = NULL;
+
+	if (alias != NULL)
+		canon = parserutils_charset_alias_canonicalise(alias, len);
+
+	return (canon != NULL) ? canon->mib_enum : 0;
+}
+
+/**
+ * Find the canonical encoding name that carries a given MIB enum
+ *
+ * Every bucket of the canonical name table is scanned in turn and the
+ * first matching record wins.
+ *
+ * \param mibenum  The MIB enum value
+ * \return Pointer to canonical name, or NULL if not found
+ */
+const char *parserutils_charset_mibenum_to_name(uint16_t mibenum)
+{
+	int bucket;
+
+	for (bucket = 0; bucket < HASH_SIZE; bucket++) {
+		parserutils_charset_aliases_canon *entry = canon_tab[bucket];
+
+		while (entry != NULL) {
+			if (entry->mib_enum == mibenum)
+				return entry->name;
+
+			entry = entry->next;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * Detect if a parserutils_charset is Unicode
+ *
+ * The MIB enums of the known Unicode encodings are looked up by name on
+ * first use and cached in function-local statics. The cache is considered
+ * unfilled while the "UCS-4" lookup yields 0, so the lookups are repeated
+ * on every call until that name resolves.
+ *
+ * \param mibenum  The MIB enum to consider
+ * \return true if a Unicode variant, false otherwise. 0 is never
+ *         considered Unicode, as it is the "not found" value.
+ */
+bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum)
+{
+	static uint16_t ucs4;
+	static uint16_t ucs2;
+	static uint16_t utf8;
+	static uint16_t utf16;
+	static uint16_t utf16be;
+	static uint16_t utf16le;
+	static uint16_t utf32;
+	static uint16_t utf32be;
+	static uint16_t utf32le;
+
+	if (ucs4 == 0) {
+		ucs4 = parserutils_charset_mibenum_from_name("UCS-4",
+				SLEN("UCS-4"));
+		ucs2 = parserutils_charset_mibenum_from_name("UCS-2",
+				SLEN("UCS-2"));
+		utf8 = parserutils_charset_mibenum_from_name("UTF-8",
+				SLEN("UTF-8"));
+		utf16 = parserutils_charset_mibenum_from_name("UTF-16",
+				SLEN("UTF-16"));
+		utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
+				SLEN("UTF-16BE"));
+		utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
+				SLEN("UTF-16LE"));
+		utf32 = parserutils_charset_mibenum_from_name("UTF-32",
+				SLEN("UTF-32"));
+		utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
+				SLEN("UTF-32BE"));
+		utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
+				SLEN("UTF-32LE"));
+	}
+
+	/* A failed lookup leaves its cached value at 0, which is also what
+	 * callers hold for an unknown charset. Without this guard an
+	 * unknown charset would compare equal to any unresolved name and
+	 * be reported as Unicode. */
+	if (mibenum == 0)
+		return false;
+
+	return (mibenum == ucs4 || mibenum == ucs2 || mibenum == utf8 ||
+			mibenum == utf16 || mibenum == utf16be ||
+			mibenum == utf16le || mibenum == utf32 ||
+			mibenum == utf32be || mibenum == utf32le);
+}
+
+/**
+ * Resolve an encoding name to its canonical form
+ *
+ * The name is hashed once; the canonical name table is consulted first
+ * and the alias table second. Comparison ignores case and requires the
+ * stored name to be exactly len bytes long.
+ *
+ * \param alias  The name to resolve
+ * \param len    The length of the name, in bytes
+ * \return Pointer to canonical form or NULL if not found
+ */
+parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise(
+		const char *alias, size_t len)
+{
+	parserutils_charset_aliases_canon *canon;
+	struct alias *entry;
+	uint32_t bucket;
+
+	if (alias == NULL)
+		return NULL;
+
+	bucket = parserutils_charset_hash_val(alias, len);
+
+	/* Is it already a canonical name? */
+	for (canon = canon_tab[bucket]; canon != NULL; canon = canon->next) {
+		if (canon->name_len != len)
+			continue;
+
+		if (strncasecmp(canon->name, alias, len) == 0)
+			return canon;
+	}
+
+	/* Otherwise it may be an alias of one */
+	for (entry = alias_tab[bucket]; entry != NULL; entry = entry->next) {
+		if (entry->name_len != len)
+			continue;
+
+		if (strncasecmp(entry->name, alias, len) == 0)
+			return entry->canon;
+	}
+
+	return NULL;
+}
+
+
+/**
+ * Add an alias to the alias hash table
+ *
+ * A new entry holding a copy of the name is allocated and pushed onto
+ * the front of its bucket's chain.
+ *
+ * \param alias  The alias name
+ * \param c      The canonical form the alias refers to
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if alias, c or alloc is NULL,
+ *         PARSERUTILS_NOMEM if the entry cannot be allocated
+ */
+parserutils_error parserutils_charset_create_alias(const char *alias,
+		parserutils_charset_aliases_canon *c,
+		parserutils_alloc alloc, void *pw)
+{
+	struct alias *entry;
+	uint32_t bucket;
+	size_t len;
+
+	if (alias == NULL || c == NULL || alloc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	len = strlen(alias);
+
+	entry = alloc(NULL, sizeof(struct alias) + len + 1, pw);
+	if (entry == NULL)
+		return PARSERUTILS_NOMEM;
+
+	/* Fill in the entry: name first, then its length and owner */
+	strcpy(entry->name, alias);
+	entry->name_len = len;
+	entry->name[entry->name_len] = '\0';
+	entry->canon = c;
+
+	/* Link it in at the head of its bucket */
+	bucket = parserutils_charset_hash_val(alias, entry->name_len);
+	entry->next = alias_tab[bucket];
+	alias_tab[bucket] = entry;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Add a canonical name to the canonical name hash table
+ *
+ * A new record holding a copy of the name is allocated and pushed onto
+ * the front of its bucket's chain.
+ *
+ * \param canon    The canonical name
+ * \param mibenum  The MIB enum value
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to the new record, or NULL if canon or alloc is NULL
+ *         or the allocation fails
+ */
+parserutils_charset_aliases_canon *parserutils_charset_create_canon(
+		const char *canon, uint16_t mibenum,
+		parserutils_alloc alloc, void *pw)
+{
+	parserutils_charset_aliases_canon *record;
+	uint32_t bucket;
+	uint32_t len;
+
+	if (canon == NULL || alloc == NULL)
+		return NULL;
+
+	len = strlen(canon);
+
+	record = alloc(NULL,
+			sizeof(parserutils_charset_aliases_canon) + len + 1,
+			pw);
+	if (record == NULL)
+		return NULL;
+
+	/* Fill in the record: name first, then its metadata */
+	strcpy(record->name, canon);
+	record->name[len] = '\0';
+	record->name_len = len;
+	record->mib_enum = mibenum;
+
+	/* Link it in at the head of its bucket */
+	bucket = parserutils_charset_hash_val(canon, len);
+	record->next = canon_tab[bucket];
+	canon_tab[bucket] = record;
+
+	return record;
+}
+
+/**
+ * Compute the hash table bucket for a name
+ *
+ * Bit 0x20 of every byte is cleared before mixing, so names that differ
+ * only in the case of ASCII letters land in the same bucket.
+ *
+ * \param alias  String to hash
+ * \param len    Number of bytes of alias to hash
+ * \return Bucket index in the range [0, HASH_SIZE), or 0 if alias is NULL
+ */
+uint32_t parserutils_charset_hash_val(const char *alias, size_t len)
+{
+	uint32_t hash = 5381;
+	size_t i;
+
+	if (alias == NULL)
+		return 0;
+
+	for (i = 0; i < len; i++)
+		hash = (hash * 33) ^ (alias[i] & ~0x20); /* case insensitive */
+
+	return hash % HASH_SIZE;
+}
+
+
+#ifndef NDEBUG
+/**
+ * Print the contents of both hash tables to stdout (debug builds only)
+ *
+ * One line is printed per entry, giving its bucket number and name;
+ * canonical names come before aliases within each bucket. The last line
+ * is a running size total.
+ *
+ * NOTE(review): the total adds the element count of each table, not its
+ * size in bytes - confirm whether that is intended.
+ */
+void parserutils_charset_aliases_dump(void)
+{
+	size_t total = 0;
+	int bucket;
+
+	for (bucket = 0; bucket < HASH_SIZE; bucket++) {
+		parserutils_charset_aliases_canon *canon = canon_tab[bucket];
+		struct alias *entry;
+
+		while (canon != NULL) {
+			printf("%d %s\n", bucket, canon->name);
+			total += offsetof(parserutils_charset_aliases_canon,
+					name) + canon->name_len;
+			canon = canon->next;
+		}
+
+		entry = alias_tab[bucket];
+		while (entry != NULL) {
+			printf("%d %s\n", bucket, entry->name);
+			total += offsetof(struct alias, name) +
+					entry->name_len;
+			entry = entry->next;
+		}
+	}
+
+	total += (sizeof(canon_tab) / sizeof(canon_tab[0]));
+	total += (sizeof(alias_tab) / sizeof(alias_tab[0]));
+
+	printf("%u\n", (unsigned int) total);
+}
+#endif
diff --git a/src/charset/aliases.h b/src/charset/aliases.h
new file mode 100644
index 0000000..9abd2c8
--- /dev/null
+++ b/src/charset/aliases.h
@@ -0,0 +1,36 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_aliases_h_
+#define parserutils_charset_aliases_h_
+
+#include <inttypes.h>
+
+#include <parserutils/charset/mibenum.h>
+
+typedef struct parserutils_charset_aliases_canon { /* Canonical charset name record */
+ struct parserutils_charset_aliases_canon *next; /**< Next record in the same hash bucket */
+ uint16_t mib_enum; /**< MIB enum value of this charset */
+ uint16_t name_len; /**< Length of name, excluding the terminator */
+ char name[1]; /**< Canonical name; records are allocated with extra room so the whole string fits */
+} parserutils_charset_aliases_canon;
+
+/* Load encoding aliases from file, adding every canonical name and alias
+ * found there to the lookup tables */
+parserutils_error parserutils_charset_aliases_create(const char *filename,
+ parserutils_alloc alloc, void *pw);
+/* Destroy encoding aliases, handing every table entry back to alloc */
+void parserutils_charset_aliases_destroy(parserutils_alloc alloc, void *pw);
+
+/* Canonicalise an alias name: accepts a canonical name or an alias
+ * (case-insensitively); yields NULL if the name is unknown */
+parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise(
+ const char *alias, size_t len);
+
+#ifndef NDEBUG
+void parserutils_charset_aliases_dump(void); /* Print both tables to stdout */
+#endif
+
+#endif
diff --git a/src/charset/charset.c b/src/charset/charset.c
new file mode 100644
index 0000000..3ef1a71
--- /dev/null
+++ b/src/charset/charset.c
@@ -0,0 +1,54 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include "charset/aliases.h"
+#include "charset/charset.h"
+
+/**
+ * Prepare the Charset library for use.
+ *
+ * This _must_ be called before using any libparserutils charset functions.
+ * The only setup performed is loading the encoding alias data.
+ *
+ * \param aliases_file  Pointer to name of file containing encoding alias data
+ * \param alloc         Pointer to (de)allocation function
+ * \param pw            Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if aliases_file or alloc is NULL,
+ *         otherwise the error reported while loading the aliases.
+ */
+parserutils_error parserutils_charset_initialise(const char *aliases_file,
+		parserutils_alloc alloc, void *pw)
+{
+	if (aliases_file == NULL || alloc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Loading the alias table is the sole step, so its result is ours */
+	return parserutils_charset_aliases_create(aliases_file, alloc, pw);
+}
+
+/**
+ * Release the resources held by the Charset library
+ *
+ * The encoding alias data is destroyed using the supplied allocator.
+ *
+ * \param alloc  Pointer to (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if alloc is NULL.
+ */
+parserutils_error parserutils_charset_finalise(parserutils_alloc alloc,
+		void *pw)
+{
+	if (alloc != NULL) {
+		parserutils_charset_aliases_destroy(alloc, pw);
+		return PARSERUTILS_OK;
+	}
+
+	return PARSERUTILS_BADPARM;
+}
+
+
diff --git a/src/charset/charset.h b/src/charset/charset.h
new file mode 100644
index 0000000..4b07577
--- /dev/null
+++ b/src/charset/charset.h
@@ -0,0 +1,24 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_charset_h_
+#define parserutils_charset_charset_h_
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+#include <parserutils/types.h>
+
+/* Initialise the Charset library for use by loading the encoding alias
+ * data from aliases_file; must be called before any other charset function */
+parserutils_error parserutils_charset_initialise(const char *aliases_file,
+ parserutils_alloc alloc, void *pw);
+
+/* Clean up after Charset: destroys the encoding alias data using alloc */
+parserutils_error parserutils_charset_finalise(parserutils_alloc alloc,
+ void *pw);
+
+#endif
+
diff --git a/src/charset/codec.c b/src/charset/codec.c
new file mode 100644
index 0000000..5c3fb3a
--- /dev/null
+++ b/src/charset/codec.c
@@ -0,0 +1,185 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "charset/codecs/codec_impl.h"
+
+#ifdef WITH_ICONV_CODEC
+extern parserutils_charset_handler iconv_codec_handler;
+#endif
+
+extern parserutils_charset_handler charset_utf8_codec_handler;
+extern parserutils_charset_handler charset_utf16_codec_handler;
+
+static parserutils_charset_handler *handler_table[] = { /* Searched in order when creating a codec, so earlier handlers take precedence */
+ &charset_utf8_codec_handler,
+ &charset_utf16_codec_handler,
+#ifdef WITH_ICONV_CODEC
+ &iconv_codec_handler, /* listed last: only tried when the handlers above decline */
+#endif
+ NULL, /* end-of-table sentinel */
+};
+
+/**
+ * Create a codec for a named charset
+ *
+ * The name is canonicalised, then the handler table is searched in order
+ * for the first handler that accepts the canonical name. That handler
+ * builds the codec, which is then given its MIB enum, the LOOSE error
+ * mode and the caller's allocator.
+ *
+ * \param charset  Target charset
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec instance, or NULL on failure
+ */
+parserutils_charset_codec *parserutils_charset_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	const parserutils_charset_aliases_canon *canon;
+	parserutils_charset_handler **candidate = handler_table;
+	parserutils_charset_codec *codec;
+
+	if (charset == NULL || alloc == NULL)
+		return NULL;
+
+	/* Canonicalise parserutils_charset name. */
+	canon = parserutils_charset_alias_canonicalise(charset,
+			strlen(charset));
+	if (canon == NULL)
+		return NULL;
+
+	/* Walk the table until a handler claims the charset */
+	while (*candidate != NULL &&
+			!(*candidate)->handles_charset(canon->name))
+		candidate++;
+
+	/* Reached the sentinel: nobody handles it */
+	if (*candidate == NULL)
+		return NULL;
+
+	/* Instantiate, then fill in the fields common to all codecs */
+	codec = (*candidate)->create(canon->name, alloc, pw);
+	if (codec != NULL) {
+		codec->mibenum = canon->mib_enum;
+
+		codec->errormode = PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
+
+		codec->alloc = alloc;
+		codec->alloc_pw = pw;
+	}
+
+	return codec;
+}
+
+/**
+ * Destroy a charset codec
+ *
+ * The codec's own destroy hook runs first, after which the codec's
+ * memory is handed back to the allocator it was created with.
+ *
+ * \param codec  The codec to destroy (NULL is ignored)
+ */
+void parserutils_charset_codec_destroy(parserutils_charset_codec *codec)
+{
+	if (codec != NULL) {
+		codec->handler.destroy(codec);
+
+		codec->alloc(codec, 0, codec->alloc_pw);
+	}
+}
+
+/**
+ * Configure a charset codec
+ *
+ * The only option acted upon is the error mode; any other option type
+ * is accepted and ignored.
+ *
+ * \param codec   The codec to configure
+ * \param type    The codec option type to configure
+ * \param params  Option-specific parameters
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if codec or params is NULL
+ */
+parserutils_error parserutils_charset_codec_setopt(
+		parserutils_charset_codec *codec,
+		parserutils_charset_codec_opttype type,
+		parserutils_charset_codec_optparams *params)
+{
+	if (codec == NULL || params == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (type == PARSERUTILS_CHARSET_CODEC_ERROR_MODE)
+		codec->errormode = params->error_mode.mode;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Encode a chunk of UCS4 data into a codec's charset
+ *
+ * After validating the arguments, the work is delegated to the codec's
+ * own encode hook.
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if any pointer (or *source / *dest) is NULL,
+ *         otherwise whatever the codec reports.
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ */
+parserutils_error parserutils_charset_codec_encode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	if (codec == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Each buffer pointer is checked before it is dereferenced */
+	if (source == NULL || *source == NULL || sourcelen == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (dest == NULL || *dest == NULL || destlen == NULL)
+		return PARSERUTILS_BADPARM;
+
+	return codec->handler.encode(codec, source, sourcelen, dest, destlen);
+}
+
+/**
+ * Decode a chunk of data in a codec's charset into UCS4
+ *
+ * After validating the arguments, the work is delegated to the codec's
+ * own decode hook.
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if any pointer (or *source / *dest) is NULL,
+ *         otherwise whatever the codec reports.
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ *
+ * Call this with a source length of 0 to flush any buffers.
+ */
+parserutils_error parserutils_charset_codec_decode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	if (codec == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Each buffer pointer is checked before it is dereferenced */
+	if (source == NULL || *source == NULL || sourcelen == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (dest == NULL || *dest == NULL || destlen == NULL)
+		return PARSERUTILS_BADPARM;
+
+	return codec->handler.decode(codec, source, sourcelen, dest, destlen);
+}
+
+/**
+ * Clear a charset codec's encoding state
+ *
+ * The request is passed straight to the codec's own reset hook.
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_BADPARM if codec is NULL,
+ *         otherwise whatever the codec reports
+ */
+parserutils_error parserutils_charset_codec_reset(
+		parserutils_charset_codec *codec)
+{
+	return (codec == NULL) ? PARSERUTILS_BADPARM
+			: codec->handler.reset(codec);
+}
+
diff --git a/src/charset/codecs/Makefile b/src/charset/codecs/Makefile
new file mode 100644
index 0000000..6d3b78e
--- /dev/null
+++ b/src/charset/codecs/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack. $(sp) grows by one ".x"
+# suffix per nesting level and keys the variable holding the saved parent.
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Sources found in this directory (names are relative to $(d))
+SRCS_$(d) := codec_iconv.c codec_utf8.c codec_utf16.c
+
+# Append to sources for component, prefixing each name with this directory
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have: every immediate subdirectory that
+# contains a Makefile is pulled in via do_include
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack: restore the parent's $(d) and strip
+# the ".x" suffix added above
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/codecs/codec_iconv.c b/src/charset/codecs/codec_iconv.c
new file mode 100644
index 0000000..bbe8bc4
--- /dev/null
+++ b/src/charset/codecs/codec_iconv.c
@@ -0,0 +1,683 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/* This codec is hideously slow. Only use it as a last resort */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* We put this here rather than at the top as GCC complains
+ * about the source file being empty otherwise. */
+#ifdef WITH_ICONV_CODEC
+
+#include <iconv.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <parserutils/charset/mibenum.h>
+
+#include "charset/codecs/codec_impl.h"
+#include "utils/utils.h"
+
+/**
+ * Iconv-based charset codec
+ *
+ * Holds one iconv conversion descriptor per direction, plus small
+ * buffers that carry unfinished work from one call to the next.
+ */
+typedef struct iconv_codec {
+ parserutils_charset_codec base; /**< Base class; first member, as codec
+ * pointers are cast to and from
+ * iconv_codec */
+
+ iconv_t read_cd; /**< Iconv handle for reading (charset to UCS-4) */
+#define INVAL_BUFSIZE (32)
+ uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up
+ * incomplete input
+ * sequences */
+ size_t inval_len; /**< Number of bytes in inval_buf */
+
+#define READ_BUFSIZE (8)
+ uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
+ * output sequences (decode)
+ */
+ size_t read_len; /**< Number of characters in
+ * read_buf */
+
+ iconv_t write_cd; /**< Iconv handle for writing (UCS-4 to charset) */
+#define WRITE_BUFSIZE (8)
+ uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
+ * output sequences (encode)
+ */
+ size_t write_len; /**< Number of characters in
+ * write_buf */
+} iconv_codec;
+
+
+static bool iconv_codec_handles_charset(const char *charset); /* Can iconv convert this charset to UCS-4? */
+static parserutils_charset_codec *iconv_codec_create(const char *charset,
+ parserutils_alloc alloc, void *pw); /* Allocate a codec and open both iconv handles */
+static void iconv_codec_destroy (parserutils_charset_codec *codec); /* Close both iconv handles */
+static parserutils_error iconv_codec_encode(parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen); /* UCS-4 to charset */
+static parserutils_error iconv_codec_decode(parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen); /* Charset to UCS-4 */
+static parserutils_error iconv_codec_reset(parserutils_charset_codec *codec); /* Discard conversion state and buffered data */
+static parserutils_error iconv_codec_output_decoded_char(
+ iconv_codec *c, uint32_t ucs4, uint8_t **dest,
+ size_t *destlen); /* Emit one decoded character, or buffer it if dest is full */
+static parserutils_error iconv_codec_read_char(iconv_codec *c,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen); /* Decode a single character from source */
+static parserutils_error iconv_codec_write_char(iconv_codec *c,
+ uint32_t ucs4, uint8_t **dest, size_t *destlen); /* Encode a single character into dest */
+
+/**
+ * Report whether iconv can handle a specific charset
+ *
+ * A conversion from the charset to UCS-4 is opened as a probe and closed
+ * again straight away.
+ *
+ * \param charset  Charset to test
+ * \return true if handleable, false otherwise
+ */
+bool iconv_codec_handles_charset(const char *charset)
+{
+	iconv_t probe = iconv_open("UCS-4", charset);
+
+	if (probe == (iconv_t) -1)
+		return false;
+
+	iconv_close(probe);
+
+	return true;
+}
+
+/**
+ * Create an iconv-based codec
+ *
+ * Two iconv descriptors are opened: one converting from the charset to
+ * UCS-4 (reading) and one converting from UCS-4 to the charset (writing).
+ * If either cannot be opened, everything acquired so far is released.
+ *
+ * \param charset  The charset to read from / write to
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ */
+parserutils_charset_codec *iconv_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	iconv_codec *c = alloc(NULL, sizeof(iconv_codec), pw);
+
+	if (c == NULL)
+		return NULL;
+
+	c->read_cd = iconv_open("UCS-4", charset);
+	if (c->read_cd == (iconv_t) -1)
+		goto fail_free;
+
+	c->write_cd = iconv_open(charset, "UCS-4");
+	if (c->write_cd == (iconv_t) -1)
+		goto fail_close_read;
+
+	/* All three carry-over buffers start out empty */
+	c->inval_len = 0;
+	c->inval_buf[0] = '\0';
+
+	c->read_len = 0;
+	c->read_buf[0] = 0;
+
+	c->write_len = 0;
+	c->write_buf[0] = 0;
+
+	/* Finally, populate vtable */
+	c->base.handler.destroy = iconv_codec_destroy;
+	c->base.handler.encode = iconv_codec_encode;
+	c->base.handler.decode = iconv_codec_decode;
+	c->base.handler.reset = iconv_codec_reset;
+
+	return (parserutils_charset_codec *) c;
+
+fail_close_read:
+	iconv_close(c->read_cd);
+fail_free:
+	alloc(c, 0, pw);
+	return NULL;
+}
+
+/**
+ * Destroy an iconv-based codec
+ *
+ * Only the two iconv descriptors are closed here; the codec's memory is
+ * not released by this function.
+ *
+ * \param codec  The codec to destroy
+ */
+void iconv_codec_destroy (parserutils_charset_codec *codec)
+{
+	iconv_codec *self = (iconv_codec *) codec;
+
+	iconv_close(self->read_cd);
+	iconv_close(self->write_cd);
+}
+
+/**
+ * Encode a chunk of UCS4 data into an iconv-based codec's charset
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if a character cannot be represented and the
+ *         codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read). In that case the characters
+ * still pending stay buffered, in order, for the next call; a character
+ * reported as invalid is dropped from the buffer so that it is not
+ * reprocessed.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error iconv_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	iconv_codec *c = (iconv_codec *) codec;
+	const uint32_t *towrite;
+	size_t towritelen;
+	size_t skip, i;
+	parserutils_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+
+		while (c->write_len > 0) {
+			error = iconv_codec_write_char(c, pwrite[0],
+					dest, destlen);
+			if (error != PARSERUTILS_OK) {
+				/* Move what is still pending down to the
+				 * start of the save area, so the next call
+				 * resumes at the right character. An
+				 * invalid character is skipped (and no
+				 * longer counted), so as to avoid
+				 * reprocessing it. Copying forwards is
+				 * safe: the destination index never
+				 * exceeds the source index. */
+				skip = (error == PARSERUTILS_INVALID) ? 1 : 0;
+
+				for (i = skip; i < c->write_len; i++) {
+					c->write_buf[i - skip] = pwrite[i];
+				}
+				c->write_len -= skip;
+
+				return error;
+			}
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call */
+	while (*sourcelen > 0) {
+		towrite = (const uint32_t *) (const void *) *source;
+		towritelen = 1;
+
+		/* Output current character(s) */
+		while (towritelen > 0) {
+			error = iconv_codec_write_char(c, towrite[0],
+					dest, destlen);
+
+			if (error != PARSERUTILS_OK) {
+				skip = (error == PARSERUTILS_INVALID) ? 1 : 0;
+
+				if (towritelen - skip >= WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen - skip;
+
+				/* Copy pending chars to the start of the
+				 * save area, for processing next call;
+				 * skipping invalid character, if present,
+				 * so it's not reprocessed. */
+				for (i = skip; i < towritelen; i++) {
+					c->write_buf[i - skip] = towrite[i];
+				}
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return error;
+			}
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Decode a chunk of data in an iconv-based codec's charset into UCS4
+ *
+ * Work proceeds in three stages: first any output buffered by a previous
+ * call is written out; then, if bytes of an incomplete sequence were
+ * saved, they are topped up from the start of ::source and decoded;
+ * finally the remainder of ::source is decoded one character at a time.
+ *
+ * \param codec The codec to use
+ * \param source Pointer to pointer to source data
+ * \param sourcelen Pointer to length (in bytes) of source data
+ * \param dest Pointer to pointer to output buffer
+ * \param destlen Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ * PARSERUTILS_NOMEM if output buffer is too small,
+ * PARSERUTILS_INVALID if a character cannot be represented and the
+ * codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error iconv_codec_decode(parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen)
+{
+ iconv_codec *c = (iconv_codec *) codec;
+ parserutils_error error;
+
+ if (c->read_len > 0) {
+ /* Output left over from last decode
+ * Attempt to finish this here */
+ uint32_t *pread = c->read_buf;
+
+ while (c->read_len > 0 && *destlen >= c->read_len * 4) { /* only entered when the whole backlog fits */
+ *((uint32_t *) (void *) *dest) = pread[0];
+
+ *dest += 4;
+ *destlen -= 4;
+
+ pread++;
+ c->read_len--;
+ }
+
+ if (*destlen < c->read_len * 4) {
+ /* Run out of output buffer */
+ size_t i;
+
+ /* Shuffle remaining output down */
+ for (i = 0; i < c->read_len; i++) {
+ c->read_buf[i] = pread[i];
+ }
+
+ return PARSERUTILS_NOMEM;
+ }
+ }
+
+ if (c->inval_len > 0) {
+ /* The last decode ended in an incomplete sequence.
+ * Fill up inval_buf with data from the start of the
+ * new chunk and process it. */
+ uint8_t *in = c->inval_buf;
+ size_t ol = c->inval_len; /* bytes already saved */
+ size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); /* new bytes to append */
+ size_t orig_l = l;
+
+ memcpy(c->inval_buf + ol, *source, l);
+
+ l += c->inval_len; /* l is now the total length handed to the reader */
+
+ error = iconv_codec_read_char(c,
+ (const uint8_t **) &in, &l, dest, destlen);
+ if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+ return error;
+ }
+
+
+ /* And now, fix everything up so the normal processing
+ * does the right thing. */
+ *source += max((signed) (orig_l - l), 0);
+ *sourcelen -= max((signed) (orig_l - l), 0);
+
+ /* Failed to resolve an incomplete character and
+ * ran out of buffer space. No recovery strategy
+ * possible, so explode everywhere. */
+ if ((orig_l + ol) - l == 0)
+ abort();
+
+ /* Handle memory exhaustion case from above */
+ if (error != PARSERUTILS_OK)
+ return error;
+ }
+
+ while (*sourcelen > 0) {
+ error = iconv_codec_read_char(c,
+ source, sourcelen, dest, destlen);
+ if (error != PARSERUTILS_OK) {
+ return error;
+ }
+ }
+
+ return PARSERUTILS_OK;
+}
+
+/**
+ * Clear an iconv-based codec's encoding state
+ *
+ * Both iconv descriptors are returned to their initial state and all
+ * three carry-over buffers are emptied.
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK (this implementation has no failure path)
+ */
+parserutils_error iconv_codec_reset(parserutils_charset_codec *codec)
+{
+	iconv_codec *self = (iconv_codec *) codec;
+
+	/* Passing NULL buffers asks iconv to reset the descriptor */
+	iconv(self->read_cd, NULL, NULL, NULL, NULL);
+	iconv(self->write_cd, NULL, NULL, NULL, NULL);
+
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Output a UCS4 character
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (big endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small.
+ *
+ * When fewer than four bytes of output space remain, nothing is written;
+ * the character is instead stored as the sole entry of the codec's read
+ * buffer, which the decode entry point drains before reading more input.
+ */
+parserutils_error iconv_codec_output_decoded_char(iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	if (*destlen < sizeof(ucs4)) {
+		/* Run out of output buffer; keep the character for later */
+		c->read_len = 1;
+		c->read_buf[0] = ucs4;
+
+		return PARSERUTILS_NOMEM;
+	}
+
+	/* *dest is a byte pointer with no alignment guarantee, so copy the
+	 * bytes rather than storing through a cast uint32_t pointer. */
+	memcpy(*dest, &ucs4, sizeof(ucs4));
+	*dest += sizeof(ucs4);
+	*destlen -= sizeof(ucs4);
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Read a character from the codec's native charset to UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if the input contains an illegal sequence
+ *         and the codec's error handling mode is set to STRICT.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * An incomplete trailing sequence is not an error: the remaining input is
+ * copied into the codec's inval buffer, ::source is advanced past it and
+ * _OK is returned.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error iconv_codec_read_char(iconv_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	size_t iconv_ret;
+	const uint8_t *origsrc = *source;
+	size_t origsrclen = *sourcelen;
+	uint32_t ucs4;
+	uint8_t *pucs4 = (uint8_t *) &ucs4;
+	size_t sucs4 = 4;
+	parserutils_error error;
+
+	/* Use iconv to convert a single character
+	 * Side effect: Updates *source to point at next input
+	 * character and *sourcelen to reflect reduced input length.
+	 * The output area is exactly one UCS4 character wide, so iconv
+	 * stops (with E2BIG) as soon as one character has been produced.
+	 */
+	iconv_ret = iconv(c->read_cd, (char **) source, sourcelen,
+			(char **) (void *) &pucs4, &sucs4);
+
+	if (iconv_ret != (size_t) -1 ||
+			(*source != origsrc && sucs4 == 0)) {
+		if (sucs4 != 0) {
+			/* iconv reported success without filling the output
+			 * area, so ucs4 holds no character. Emit nothing
+			 * rather than writing an uninitialised value. */
+			c->inval_buf[0] = '\0';
+			c->inval_len = 0;
+
+			return PARSERUTILS_OK;
+		}
+
+		/* Read a character */
+		error = iconv_codec_output_decoded_char(c, ucs4, dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			/* output failed; restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (errno == E2BIG) {
+		/* Should never happen */
+		abort();
+	} else if (errno == EINVAL) {
+		/* Incomplete input sequence */
+
+		/* The copy below is followed by a terminator written at
+		 * index *sourcelen, so the data must be strictly shorter
+		 * than the buffer. */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		memmove(c->inval_buf, (const char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return PARSERUTILS_OK;
+	} else if (errno == EILSEQ) {
+		/* Illegal input sequence */
+		bool found = false;
+		const uint8_t *oldsrc = *source;
+		size_t oldsrclen = *sourcelen;
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode ==
+				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+			/* restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+
+			return PARSERUTILS_INVALID;
+		}
+
+		/* Ok, this becomes problematic. The iconv API here
+		 * is particularly unhelpful; *source will point at
+		 * the _start_ of the illegal sequence. This means
+		 * that we must find the end of the sequence */
+
+		/* Search for the start of the next valid input
+		 * sequence (or the end of the input stream) by stepping
+		 * forward one byte at a time and retrying the conversion. */
+		while (*sourcelen > 1) {
+			pucs4 = (uint8_t *) &ucs4;
+			sucs4 = 4;
+
+			(*source)++;
+			(*sourcelen)--;
+
+			/* Remember the probe position: a successful probe
+			 * advances *source, and we want to resume here. */
+			oldsrc = *source;
+			oldsrclen = *sourcelen;
+
+			iconv_ret = iconv(c->read_cd,
+					(char **) source, sourcelen,
+					(char **) (void *) &pucs4, &sucs4);
+			if (iconv_ret != (size_t) -1 || errno != EILSEQ) {
+				found = true;
+				break;
+			}
+		}
+
+		if (found) {
+			/* Found start of next valid sequence */
+			*source = oldsrc;
+			*sourcelen = oldsrclen;
+		} else {
+			/* Not found - skip last byte in buffer */
+			(*source)++;
+			(*sourcelen)--;
+
+			if (*sourcelen != 0)
+				abort();
+		}
+
+		/* output U+FFFD and continue processing. */
+		error = iconv_codec_output_decoded_char(c,
+				htonl(0xFFFD), dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			/* output failed; restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+		}
+
+		return error;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Write a UCS4 character in a codec's native charset
+ *
+ * \param c The codec
+ * \param ucs4 The UCS4 character to write (big endian)
+ * \param dest Pointer to pointer to output buffer (updated on exit)
+ * \param destlen Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ * PARSERUTILS_NOMEM if output buffer is too small,
+ * PARSERUTILS_INVALID if character cannot be represented and the
+ * codec's error handling mode is set to STRICT.
+ *
+ * A character is treated as unrepresentable when iconv reports no error
+ * but ::dest has not moved. In the TRANSLIT and LOOSE modes a replacement
+ * is then written instead: U+FFFD if the codec's MIB enum denotes a
+ * Unicode charset, '?' (0x3F) otherwise.
+ *
+ * NOTE(review): some iconv implementations are believed to report an
+ * unrepresentable character as EILSEQ rather than succeeding with no
+ * output; on such a platform this function would abort() instead of
+ * applying the error mode. Confirm against the target iconv.
+ */
+parserutils_error iconv_codec_write_char(iconv_codec *c,
+ uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+ size_t iconv_ret;
+ uint8_t *pucs4 = (uint8_t *) &ucs4;
+ size_t sucs4 = 4;
+ /* Remembered so that "no output produced" can be detected below */
+ uint8_t *origdest = *dest;
+
+ /* Convert exactly one four-byte UCS4 character */
+ iconv_ret = iconv(c->write_cd, (char **) (void *) &pucs4,
+ &sucs4, (char **) dest, destlen);
+
+ if (iconv_ret == (size_t) -1 && errno == E2BIG) {
+ /* Output buffer is too small */
+ return PARSERUTILS_NOMEM;
+ } else if (iconv_ret == (size_t) -1 && errno == EILSEQ) {
+ /* Illegal multibyte sequence */
+ /* This should never happen */
+ abort();
+ } else if (iconv_ret == (size_t) -1 && errno == EINVAL) {
+ /* Incomplete input character */
+ /* This should never happen */
+ abort();
+ } else if (*dest == origdest) {
+ /* Nothing was output */
+ switch (c->base.errormode) {
+ case PARSERUTILS_CHARSET_CODEC_ERROR_STRICT:
+ return PARSERUTILS_INVALID;
+
+ case PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT:
+ /** \todo transliteration */
+ /* Until then, deliberately falls through to LOOSE */
+ case PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE:
+ {
+ /* Rewind the input cursor; ucs4 is reused to hold
+ * the replacement character */
+ pucs4 = (uint8_t *) &ucs4;
+ sucs4 = 4;
+
+ ucs4 = parserutils_charset_mibenum_is_unicode(
+ c->base.mibenum)
+ ? htonl(0xFFFD) : htonl(0x3F);
+
+ iconv_ret = iconv(c->write_cd,
+ (char **) (void *) &pucs4, &sucs4,
+ (char **) dest, destlen);
+
+ if (iconv_ret == (size_t) -1 && errno == E2BIG) {
+ return PARSERUTILS_NOMEM;
+ } else if (iconv_ret == (size_t) -1 &&
+ errno == EILSEQ) {
+ /* Illegal multibyte sequence */
+ /* This should never happen */
+ abort();
+ } else if (iconv_ret == (size_t) -1 &&
+ errno == EINVAL) {
+ /* Incomplete input character */
+ /* This should never happen */
+ abort();
+ }
+ }
+ break;
+ }
+ }
+
+ /* Reached after a normal write or a successful replacement write */
+ return PARSERUTILS_OK;
+}
+
+/** Factory entry for the iconv-based codec */
+const parserutils_charset_handler iconv_codec_handler = {
+	.handles_charset = iconv_codec_handles_charset,
+	.create = iconv_codec_create
+};
+
+#endif
diff --git a/src/charset/codecs/codec_impl.h b/src/charset/codecs/codec_impl.h
new file mode 100644
index 0000000..9183594
--- /dev/null
+++ b/src/charset/codecs/codec_impl.h
@@ -0,0 +1,48 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_codecs_codecimpl_h_
+#define parserutils_charset_codecs_codecimpl_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include <parserutils/charset/codec.h>
+
+/**
+ * Core charset codec definition; implementations extend this
+ *
+ * Concrete codecs embed this structure as their first member (named
+ * "base") and cast between the two pointer types.
+ */
+struct parserutils_charset_codec {
+ uint16_t mibenum; /**< MIB enum for charset */
+
+ parserutils_charset_codec_errormode errormode; /**< error mode */
+
+ parserutils_alloc alloc; /**< allocation function */
+ void *alloc_pw; /**< private word */
+
+ struct {
+ /** Tear down implementation-specific state */
+ void (*destroy)(parserutils_charset_codec *codec);
+ /** Convert UCS4 input into the codec's charset */
+ parserutils_error (*encode)(parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+ /** Convert input in the codec's charset into UCS4 */
+ parserutils_error (*decode)(parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+ /** Discard any state carried between calls */
+ parserutils_error (*reset)(parserutils_charset_codec *codec);
+ } handler; /**< Vtable for handler code */
+};
+
+/**
+ * Codec factory component definition
+ *
+ * Each codec implementation exports one constant of this type.
+ */
+typedef struct parserutils_charset_handler {
+ /** Report whether this implementation can handle the named charset */
+ bool (*handles_charset)(const char *charset);
+ /** Construct a codec for the named charset; NULL on failure */
+ parserutils_charset_codec *(*create)(const char *charset,
+ parserutils_alloc alloc, void *pw);
+} parserutils_charset_handler;
+
+#endif
diff --git a/src/charset/codecs/codec_utf16.c b/src/charset/codecs/codec_utf16.c
new file mode 100644
index 0000000..0dd7a07
--- /dev/null
+++ b/src/charset/codecs/codec_utf16.c
@@ -0,0 +1,544 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/utf16.h>
+
+#include "charset/codecs/codec_impl.h"
+#include "utils/utils.h"
+
+/**
+ * UTF-16 charset codec
+ *
+ * Besides the common base, the codec carries three small buffers that hold
+ * state between calls: input that ended mid-sequence, decoded characters
+ * that did not fit in the caller's output buffer, and characters awaiting
+ * encoding for the same reason.
+ */
+typedef struct charset_utf16_codec {
+ parserutils_charset_codec base; /**< Base class */
+
+#define INVAL_BUFSIZE (32)
+ uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up
+ * incomplete input
+ * sequences */
+ size_t inval_len; /**< Byte length of inval_buf */
+
+#define READ_BUFSIZE (8)
+ uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
+ * output sequences (decode)
+ * (host-endian) */
+ size_t read_len; /**< Character length of read_buf */
+
+#define WRITE_BUFSIZE (8)
+ uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
+ * output sequences (encode)
+ * (host-endian) */
+ size_t write_len; /**< Character length of write_buf */
+
+} charset_utf16_codec;
+
+/* Forward declarations. The definitions further down omit the storage-class
+ * specifier, so it is these prototypes that give the functions internal
+ * linkage. */
+static bool charset_utf16_codec_handles_charset(const char *charset);
+static parserutils_charset_codec *charset_utf16_codec_create(
+ const char *charset, parserutils_alloc alloc, void *pw);
+static void charset_utf16_codec_destroy (parserutils_charset_codec *codec);
+static parserutils_error charset_utf16_codec_encode(
+ parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static parserutils_error charset_utf16_codec_decode(
+ parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static parserutils_error charset_utf16_codec_reset(
+ parserutils_charset_codec *codec);
+/* Per-character helpers used by the decode entry point */
+static inline parserutils_error charset_utf16_codec_read_char(
+ charset_utf16_codec *c,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static inline parserutils_error charset_utf16_codec_output_decoded_char(
+ charset_utf16_codec *c,
+ uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Report whether the UTF-16 codec can handle a named charset
+ *
+ * The name is accepted when it resolves to the same MIB enum value as the
+ * canonical name "UTF-16".
+ *
+ * \param charset  Charset name to test
+ * \return true if handleable, false otherwise
+ */
+bool charset_utf16_codec_handles_charset(const char *charset)
+{
+	if (parserutils_charset_mibenum_from_name(charset, strlen(charset)) !=
+			parserutils_charset_mibenum_from_name("UTF-16",
+					SLEN("UTF-16"))) {
+		return false;
+	}
+
+	return true;
+}
+
+/**
+ * Construct a UTF-16 codec
+ *
+ * Only the UTF-16 specific state and the handler vtable are initialised
+ * here; the remaining base fields are left untouched by this function.
+ *
+ * \param charset  The charset to read from / write to (ignored)
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ */
+parserutils_charset_codec *charset_utf16_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	charset_utf16_codec *self;
+
+	UNUSED(charset);
+
+	self = alloc(NULL, sizeof(*self), pw);
+	if (self == NULL)
+		return NULL;
+
+	/* Wire up the implementation's entry points */
+	self->base.handler.reset = charset_utf16_codec_reset;
+	self->base.handler.decode = charset_utf16_codec_decode;
+	self->base.handler.encode = charset_utf16_codec_encode;
+	self->base.handler.destroy = charset_utf16_codec_destroy;
+
+	/* Start with every carry-over buffer empty */
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	return (parserutils_charset_codec *) self;
+}
+
+/**
+ * Destroy a utf16 codec
+ *
+ * The UTF-16 codec holds no resources beyond the codec structure itself,
+ * so there is nothing to release here.
+ * NOTE(review): the structure allocated by the create function is not
+ * freed here; presumably the caller releases it -- confirm.
+ *
+ * \param codec The codec to destroy
+ */
+void charset_utf16_codec_destroy (parserutils_charset_codec *codec)
+{
+ UNUSED(codec);
+}
+
+/**
+ * Encode a chunk of UCS4 data into utf16
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_NEEDDATA if the source ends part-way through a
+ *         four-byte UCS4 character.
+ *
+ * The source is a sequence of big-endian UCS4 characters, four bytes each.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * In the _NEEDDATA case ::source is left pointing at the incomplete
+ * character and nothing is buffered for it.
+ *
+ * A character that cannot be converted to UTF-16 causes abort().
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf16_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf16_codec *c = (charset_utf16_codec *) codec;
+	uint32_t ucs4;
+	uint32_t *towrite;
+	size_t towritelen;
+	parserutils_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+		uint8_t buf[4];
+		size_t len;
+
+		while (c->write_len > 0) {
+			error = parserutils_charset_utf16_from_ucs4(
+					pwrite[0], buf, &len);
+			if (error != PARSERUTILS_OK)
+				abort();
+
+			if (*destlen < len) {
+				/* Insufficient output buffer space;
+				 * shuffle what is left to the front of
+				 * the save area */
+				for (len = 0; len < c->write_len; len++)
+					c->write_buf[len] = pwrite[len];
+
+				return PARSERUTILS_NOMEM;
+			}
+
+			memcpy(*dest, buf, len);
+
+			*dest += len;
+			*destlen -= len;
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call */
+	while (*sourcelen > 0) {
+		/* Refuse a truncated character: reading four bytes would
+		 * run past the end of the source, and subtracting four
+		 * from *sourcelen would wrap around. */
+		if (*sourcelen < sizeof(ucs4))
+			return PARSERUTILS_NEEDDATA;
+
+		/* The source is a byte pointer with no alignment
+		 * guarantee, so copy the bytes out rather than reading
+		 * through a cast uint32_t pointer. */
+		memcpy(&ucs4, *source, sizeof(ucs4));
+		ucs4 = ntohl(ucs4);
+
+		towrite = &ucs4;
+		towritelen = 1;
+
+		/* Output current characters */
+		while (towritelen > 0) {
+			uint8_t buf[4];
+			size_t len;
+
+			error = parserutils_charset_utf16_from_ucs4(
+					towrite[0], buf, &len);
+			if (error != PARSERUTILS_OK)
+				abort();
+
+			if (*destlen < len) {
+				/* Insufficient output space */
+				if (towritelen >= WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen;
+
+				/* Copy pending chars to save area, for
+				 * processing next call. */
+				for (len = 0; len < towritelen; len++)
+					c->write_buf[len] = towrite[len];
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return PARSERUTILS_NOMEM;
+			}
+
+			memcpy(*dest, buf, len);
+
+			*dest += len;
+			*destlen -= len;
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Decode a chunk of utf16 data into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if the input contains an illegal sequence
+ *         and the codec's error handling mode is set to STRICT.
+ *
+ * Output is written as big-endian UCS4, four bytes per character.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error charset_utf16_codec_decode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf16_codec *c = (charset_utf16_codec *) codec;
+	parserutils_error error;
+
+	if (c->read_len > 0) {
+		/* Output left over from last decode */
+		uint32_t *pread = c->read_buf;
+
+		while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+			/* *dest is a byte pointer with no alignment
+			 * guarantee, so copy the bytes rather than storing
+			 * through a cast uint32_t pointer. */
+			uint32_t be = htonl(pread[0]);
+
+			memcpy(*dest, &be, sizeof(be));
+
+			*dest += 4;
+			*destlen -= 4;
+
+			pread++;
+			c->read_len--;
+		}
+
+		if (*destlen < c->read_len * 4) {
+			/* Ran out of output buffer */
+			size_t i;
+
+			/* Shuffle remaining output down */
+			for (i = 0; i < c->read_len; i++)
+				c->read_buf[i] = pread[i];
+
+			return PARSERUTILS_NOMEM;
+		}
+	}
+
+	if (c->inval_len > 0) {
+		/* The last decode ended in an incomplete sequence.
+		 * Fill up inval_buf with data from the start of the
+		 * new chunk and process it. */
+		uint8_t *in = c->inval_buf;
+		size_t ol = c->inval_len;
+		size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+		size_t orig_l = l;
+
+		memcpy(c->inval_buf + ol, *source, l);
+
+		l += c->inval_len;
+
+		error = charset_utf16_codec_read_char(c,
+				(const uint8_t **) &in, &l, dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			return error;
+		}
+
+		/* And now, fix up source pointers: orig_l bytes were
+		 * borrowed from the new chunk and l bytes of the combined
+		 * buffer remain unread, so the difference (when positive)
+		 * is how much of the new chunk was actually consumed. */
+		*source += max((signed) (orig_l - l), 0);
+		*sourcelen -= max((signed) (orig_l - l), 0);
+
+		/* Failed to resolve an incomplete character and
+		 * ran out of buffer space. No recovery strategy
+		 * possible, so explode everywhere. */
+		if ((orig_l + ol) - l == 0)
+			abort();
+
+		/* Report memory exhaustion case from above */
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	/* Finally, the "normal" case; process all outstanding characters */
+	while (*sourcelen > 0) {
+		error = charset_utf16_codec_read_char(c,
+				source, sourcelen, dest, destlen);
+		if (error != PARSERUTILS_OK) {
+			return error;
+		}
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Return a utf16 codec to its initial state
+ *
+ * Empties the incomplete-input buffer and both pending-output buffers.
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK (this implementation reports no other result)
+ */
+parserutils_error charset_utf16_codec_reset(parserutils_charset_codec *codec)
+{
+	charset_utf16_codec *self = (charset_utf16_codec *) codec;
+
+	/* Pending encode output */
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	/* Pending decode output */
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	/* Incomplete input sequence */
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	return PARSERUTILS_OK;
+}
+
+
+/**
+ * Read a character from UTF-16 input and output it as UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if the input contains an illegal sequence
+ *         and the codec's error handling mode is set to STRICT.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * An incomplete trailing sequence is not an error: the remaining input is
+ * copied into the codec's inval buffer, ::source is advanced past it and
+ * _OK is returned.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf16_codec_read_char(charset_utf16_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	uint32_t ucs4;
+	size_t sucs4;
+	parserutils_error error;
+
+	/* Convert a single character */
+	error = parserutils_charset_utf16_to_ucs4(*source, *sourcelen,
+			&ucs4, &sucs4);
+	if (error == PARSERUTILS_OK) {
+		/* Read a character */
+		error = charset_utf16_codec_output_decoded_char(c,
+				ucs4, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += sucs4;
+			*sourcelen -= sucs4;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (error == PARSERUTILS_NEEDDATA) {
+		/* Incomplete input sequence */
+
+		/* The copy below is followed by a terminator written at
+		 * index *sourcelen, so the data must be strictly shorter
+		 * than the buffer. */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		memmove(c->inval_buf, (char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return PARSERUTILS_OK;
+	} else if (error == PARSERUTILS_INVALID) {
+		/* Illegal input sequence */
+		uint32_t nextchar;
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode ==
+				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+			return PARSERUTILS_INVALID;
+		}
+
+		/* Find next valid UTF-16 sequence.
+		 * We're processing client-provided data, so let's
+		 * be paranoid about its validity. */
+		error = parserutils_charset_utf16_next_paranoid(
+				*source, *sourcelen, 0, &nextchar);
+		if (error != PARSERUTILS_OK) {
+			if (error == PARSERUTILS_NEEDDATA) {
+				/* Need more data to be sure */
+
+				/* Same bound as above: room is needed for
+				 * the data plus its terminator. */
+				if (*sourcelen >= INVAL_BUFSIZE)
+					abort();
+
+				memmove(c->inval_buf, (char *) *source,
+						*sourcelen);
+				c->inval_buf[*sourcelen] = '\0';
+				c->inval_len = *sourcelen;
+
+				*source += *sourcelen;
+				*sourcelen = 0;
+
+				/* Everything has been claimed already, so
+				 * there is nothing further to skip below */
+				nextchar = 0;
+			} else {
+				return error;
+			}
+		}
+
+		/* output U+FFFD and continue processing. */
+		error = charset_utf16_codec_output_decoded_char(c,
+				0xFFFD, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += nextchar;
+			*sourcelen -= nextchar;
+		}
+
+		return error;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Output a UCS4 character
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (host endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small.
+ *
+ * The character is written in big-endian byte order. When fewer than four
+ * bytes of output space remain, nothing is written; the character is
+ * instead stored (still host endian) as the sole entry of the codec's read
+ * buffer, which the decode entry point drains before reading more input.
+ */
+parserutils_error charset_utf16_codec_output_decoded_char(charset_utf16_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	uint32_t be;
+
+	if (*destlen < sizeof(be)) {
+		/* Run out of output buffer; keep the character for later */
+		c->read_len = 1;
+		c->read_buf[0] = ucs4;
+
+		return PARSERUTILS_NOMEM;
+	}
+
+	/* *dest is a byte pointer with no alignment guarantee, so copy the
+	 * bytes rather than storing through a cast uint32_t pointer. */
+	be = htonl(ucs4);
+	memcpy(*dest, &be, sizeof(be));
+	*dest += sizeof(be);
+	*destlen -= sizeof(be);
+
+	return PARSERUTILS_OK;
+}
+
+
+/** Factory entry for the UTF-16 codec */
+const parserutils_charset_handler charset_utf16_codec_handler = {
+	.handles_charset = charset_utf16_codec_handles_charset,
+	.create = charset_utf16_codec_create
+};
diff --git a/src/charset/codecs/codec_utf8.c b/src/charset/codecs/codec_utf8.c
new file mode 100644
index 0000000..838d051
--- /dev/null
+++ b/src/charset/codecs/codec_utf8.c
@@ -0,0 +1,546 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <parserutils/charset/mibenum.h>
+
+#include "charset/codecs/codec_impl.h"
+#include "charset/encodings/utf8impl.h"
+#include "utils/utils.h"
+
+/**
+ * UTF-8 charset codec
+ */
+typedef struct charset_utf8_codec {
+	parserutils_charset_codec base;	/**< Base class; kept first as
+					 * codec pointers are cast */
+#define INVAL_BUFSIZE (32)
+	uint8_t inval_buf[INVAL_BUFSIZE];	/**< Buffer for fixing up
+						 * incomplete input
+						 * sequences */
+	size_t inval_len;		/**< Byte length of inval_buf */
+
+#define READ_BUFSIZE (8)
+	uint32_t read_buf[READ_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (decode)
+						 * (host-endian) */
+	size_t read_len;		/**< Character length of read_buf */
+
+#define WRITE_BUFSIZE (8)
+	uint32_t write_buf[WRITE_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (encode)
+						 * (host-endian) */
+	size_t write_len;		/**< Character length of write_buf */
+
+} charset_utf8_codec;
+
+static bool charset_utf8_codec_handles_charset(const char *charset);
+static parserutils_charset_codec *charset_utf8_codec_create(const char *charset,
+ parserutils_alloc alloc, void *pw);
+static void charset_utf8_codec_destroy (parserutils_charset_codec *codec);
+static parserutils_error charset_utf8_codec_encode(
+ parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static parserutils_error charset_utf8_codec_decode(
+ parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static parserutils_error charset_utf8_codec_reset(
+ parserutils_charset_codec *codec);
+static inline parserutils_error charset_utf8_codec_read_char(
+ charset_utf8_codec *c,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static inline parserutils_error charset_utf8_codec_output_decoded_char(
+ charset_utf8_codec *c,
+ uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Determine whether this codec handles a specific charset
+ *
+ * \param charset Charset to test
+ * \return true if handleable, false otherwise
+ */
+bool charset_utf8_codec_handles_charset(const char *charset)
+{
+	/* Compare MIB enum values rather than the raw charset names */
+	return parserutils_charset_mibenum_from_name("UTF-8", SLEN("UTF-8")) ==
+			parserutils_charset_mibenum_from_name(charset,
+			strlen(charset));
+}
+
+/**
+ * Create a utf8 codec
+ *
+ * \param charset The charset to read from / write to
+ * \param alloc Memory (de)allocation function
+ * \param pw Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ */
+parserutils_charset_codec *charset_utf8_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	charset_utf8_codec *self;
+
+	/* The charset name plays no part in constructing this codec */
+	UNUSED(charset);
+
+	self = alloc(NULL, sizeof(*self), pw);
+	if (self == NULL)
+		return NULL;
+
+	/* Start with nothing pending in any of the three buffers */
+	self->inval_len = 0;
+	self->read_len = 0;
+	self->write_len = 0;
+	self->inval_buf[0] = '\0';
+	self->read_buf[0] = 0;
+	self->write_buf[0] = 0;
+
+	/* Hook up the codec operations */
+	self->base.handler.reset = charset_utf8_codec_reset;
+	self->base.handler.decode = charset_utf8_codec_decode;
+	self->base.handler.encode = charset_utf8_codec_encode;
+	self->base.handler.destroy = charset_utf8_codec_destroy;
+
+	return (parserutils_charset_codec *) self;
+}
+
+/**
+ * Destroy a utf8 codec
+ *
+ * \param codec The codec to destroy
+ */
+void charset_utf8_codec_destroy (parserutils_charset_codec *codec)
+{
+	UNUSED(codec); /* nothing codec-private is released here */
+}
+
+/**
+ * Encode a chunk of UCS4 data into utf8
+ *
+ * \param codec The codec to use
+ * \param source Pointer to pointer to source data
+ * \param sourcelen Pointer to length (in bytes) of source data
+ * \param dest Pointer to pointer to output buffer
+ * \param destlen Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ * PARSERUTILS_NOMEM if output buffer is too small,
+ * PARSERUTILS_INVALID if a character cannot be represented and the
+ * codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf8_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf8_codec *c = (charset_utf8_codec *) codec;
+	uint32_t ucs4;
+	uint32_t *towrite;
+	size_t towritelen;
+	parserutils_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+
+		while (c->write_len > 0) {
+			UTF8_FROM_UCS4(pwrite[0], dest, destlen, error);
+			if (error != PARSERUTILS_OK) {
+				if (error != PARSERUTILS_NOMEM)
+					abort();
+				/* Out of output space: shuffle the unwritten
+				 * characters down for the next call */
+				for (uint32_t len = 0;
+						len < c->write_len; len++) {
+					c->write_buf[len] = pwrite[len];
+				}
+				return PARSERUTILS_NOMEM;
+			}
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call. A truncated trailing
+	 * unit (< 4 bytes) is left unread instead of overrunning ::source */
+	while (*sourcelen >= 4) {
+		/* Bytewise copy: ::source need not be 4-byte aligned */
+		memcpy(&ucs4, *source, sizeof(ucs4));
+		ucs4 = ntohl(ucs4);
+		towrite = &ucs4;
+		towritelen = 1;
+
+		/* Output current characters */
+		while (towritelen > 0) {
+			UTF8_FROM_UCS4(towrite[0], dest, destlen, error);
+			if (error != PARSERUTILS_OK) {
+				if (error != PARSERUTILS_NOMEM)
+					abort();
+
+				/* Insufficient output space */
+				if (towritelen >= WRITE_BUFSIZE)
+					abort();
+
+				/* Copy pending chars to save area, for
+				 * processing next call. */
+				c->write_len = towritelen;
+				for (uint32_t len = 0; len < towritelen; len++)
+					c->write_buf[len] = towrite[len];
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+				return PARSERUTILS_NOMEM;
+			}
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Decode a chunk of utf8 data into UCS4
+ *
+ * \param codec The codec to use
+ * \param source Pointer to pointer to source data
+ * \param sourcelen Pointer to length (in bytes) of source data
+ * \param dest Pointer to pointer to output buffer
+ * \param destlen Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ * PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if the input contains an illegal sequence and
+ *         the codec's error handling mode is set to STRICT.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error charset_utf8_codec_decode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf8_codec *c = (charset_utf8_codec *) codec;
+	parserutils_error error;
+
+	if (c->read_len > 0) {
+		/* Output left over from last decode */
+		uint32_t *pread = c->read_buf;
+
+		while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+			/* Bytewise store: ::dest need not be 4-byte aligned */
+			uint32_t be = htonl(pread[0]);
+			memcpy(*dest, &be, sizeof(be));
+			*dest += 4;
+			*destlen -= 4;
+			pread++;
+			c->read_len--;
+		}
+
+		if (*destlen < c->read_len * 4) {
+			/* Ran out of output buffer */
+			size_t i;
+
+			/* Shuffle remaining output down */
+			for (i = 0; i < c->read_len; i++)
+				c->read_buf[i] = pread[i];
+
+			return PARSERUTILS_NOMEM;
+		}
+	}
+
+	if (c->inval_len > 0) {
+		/* The last decode ended in an incomplete sequence.
+		 * Fill up inval_buf with data from the start of the
+		 * new chunk and process it. */
+		uint8_t *in = c->inval_buf;
+		size_t ol = c->inval_len;
+		size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+		size_t orig_l = l;
+
+		memcpy(c->inval_buf + ol, *source, l);
+
+		l += c->inval_len;
+
+		error = charset_utf8_codec_read_char(c,
+				(const uint8_t **) &in, &l, dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			return error;
+		}
+
+		/* And now, fix up source pointers */
+		*source += max((signed) (orig_l - l), 0);
+		*sourcelen -= max((signed) (orig_l - l), 0);
+
+		/* Failed to resolve an incomplete character and
+		 * ran out of buffer space. No recovery strategy
+		 * possible, so explode everywhere. */
+		if ((orig_l + ol) - l == 0)
+			abort();
+
+		/* Report memory exhaustion case from above */
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	/* Finally, the "normal" case; process all outstanding characters */
+	while (*sourcelen > 0) {
+		error = charset_utf8_codec_read_char(c,
+				source, sourcelen, dest, destlen);
+		if (error != PARSERUTILS_OK) {
+			return error;
+		}
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Clear a utf8 codec's encoding state
+ *
+ * \param codec The codec to reset
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error charset_utf8_codec_reset(parserutils_charset_codec *codec)
+{
+	charset_utf8_codec *self = (charset_utf8_codec *) codec;
+
+	/* Drop any partially-read input sequence */
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+	/* Drop any decoded characters still awaiting output */
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+	/* Drop any characters still awaiting encoding */
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+	return PARSERUTILS_OK;
+}
+
+
+/**
+ * Read a single character from UTF-8 input and output it as UCS4 (big endian)
+ *
+ * \param c The codec
+ * \param source Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen Pointer to length of source buffer (updated on exit)
+ * \param dest Pointer to pointer to output buffer (updated on exit)
+ * \param destlen Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ * PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if the input contains an illegal sequence and
+ *         the codec's error handling mode is set to STRICT.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf8_codec_read_char(charset_utf8_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	uint32_t ucs4;
+	size_t sucs4;
+	parserutils_error error;
+
+	/* Convert a single character (macro arguments must be plain names) */
+	{
+		const uint8_t *src = *source;
+		size_t srclen = *sourcelen;
+		uint32_t *uptr = &ucs4;
+		size_t *usptr = &sucs4;
+		UTF8_TO_UCS4(src, srclen, uptr, usptr, error);
+	}
+	if (error == PARSERUTILS_OK) {
+		/* Read a character */
+		error = charset_utf8_codec_output_decoded_char(c,
+				ucs4, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += sucs4;
+			*sourcelen -= sucs4;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (error == PARSERUTILS_NEEDDATA) {
+		/* Incomplete input sequence: stash it plus a NUL, so it
+		 * must be strictly shorter than the buffer */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+		memmove(c->inval_buf, (char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return PARSERUTILS_OK;
+	} else if (error == PARSERUTILS_INVALID) {
+		/* Illegal input sequence */
+		uint32_t nextchar;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode ==
+				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+			/* Clear inval buffer */
+			c->inval_buf[0] = '\0';
+			c->inval_len = 0;
+
+			return PARSERUTILS_INVALID;
+		}
+
+		/* Find next valid UTF-8 sequence.
+		 * We're processing client-provided data, so let's
+		 * be paranoid about its validity. */
+		{
+			const uint8_t *src = *source;
+			size_t srclen = *sourcelen;
+			uint32_t off = 0;
+			uint32_t *ncptr = &nextchar;
+
+			UTF8_NEXT_PARANOID(src, srclen, off, ncptr, error);
+		}
+		if (error != PARSERUTILS_OK) {
+			if (error == PARSERUTILS_NEEDDATA) {
+				/* Need more data to be sure; the stash
+				 * needs room for its trailing NUL too */
+				if (*sourcelen >= INVAL_BUFSIZE)
+					abort();
+				memmove(c->inval_buf, (char *) *source,
+						*sourcelen);
+				c->inval_buf[*sourcelen] = '\0';
+				c->inval_len = *sourcelen;
+
+				*source += *sourcelen;
+				*sourcelen = 0;
+
+				nextchar = 0;
+			} else {
+				return error;
+			}
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* output U+FFFD and continue processing. */
+		error = charset_utf8_codec_output_decoded_char(c,
+				0xFFFD, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += nextchar;
+			*sourcelen -= nextchar;
+		}
+
+		return error;
+	}
+	/* Any other failure (e.g. bad parameters) is passed on, not hidden */
+	return error;
+}
+
+/**
+ * Output a UCS4 character
+ *
+ * \param c Codec to use
+ * \param ucs4 UCS4 character (host endian)
+ * \param dest Pointer to pointer to output buffer
+ * \param destlen Pointer to output buffer length
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small.
+ */
+parserutils_error charset_utf8_codec_output_decoded_char(charset_utf8_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	uint32_t be = htonl(ucs4);	/* output is big endian UCS4 */
+
+	if (*destlen < sizeof(be)) {
+		/* Run out of output buffer: park the character in read_buf */
+		c->read_len = 1;
+		c->read_buf[0] = ucs4;
+		return PARSERUTILS_NOMEM;
+	}
+	/* ::dest is a byte pointer and need not be 4-byte aligned, so the
+	 * value is copied bytewise rather than stored through a cast */
+	memcpy(*dest, &be, sizeof(be));
+	*dest += sizeof(be);
+	*destlen -= sizeof(be);
+	return PARSERUTILS_OK;
+}
+
+
+const parserutils_charset_handler charset_utf8_codec_handler = {
+	charset_utf8_codec_handles_charset,	/* charset support test */
+	charset_utf8_codec_create		/* codec constructor */
+};
+
diff --git a/src/charset/encodings/Makefile b/src/charset/encodings/Makefile
new file mode 100644
index 0000000..47d9210
--- /dev/null
+++ b/src/charset/encodings/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack (sp grows by ".x" per level)
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Sources in this directory
+SRCS_$(d) := utf8.c utf16.c
+
+# Append to sources for component, prefixing each with this directory
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/encodings/utf16.c b/src/charset/encodings/utf16.c
new file mode 100644
index 0000000..95dc64f
--- /dev/null
+++ b/src/charset/encodings/utf16.c
@@ -0,0 +1,239 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-16 manipulation functions (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/utf16.h>
+
+/**
+ * Convert a UTF-16 sequence into a single UCS4 character
+ *
+ * \param s The sequence to process
+ * \param len Length of sequence
+ * \param ucs4 Pointer to location to receive UCS4 character (host endian)
+ * \param clen Pointer to location to receive byte length of UTF-16 sequence
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s,
+		size_t len, uint32_t *ucs4, size_t *clen)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+
+	if (s == NULL || ucs4 == NULL || clen == NULL)
+		return PARSERUTILS_BADPARM;
+	if (len < 2)
+		return PARSERUTILS_NEEDDATA;
+
+	if (*ss < 0xD800 || *ss > 0xDFFF) {
+		*ucs4 = *ss;	/* BMP: a single code unit */
+		*clen = 2;
+	} else if (*ss <= 0xDBFF) {
+		/* High surrogate: must be followed by a low surrogate */
+		if (len < 4)
+			return PARSERUTILS_NEEDDATA;
+		if (ss[1] < 0xDC00 || ss[1] > 0xDFFF)
+			return PARSERUTILS_INVALID;
+		/* 10 payload bits from each unit, offset by 0x10000 */
+		*ucs4 = 0x10000 + (((uint32_t) (ss[0] & 0x3ff) << 10) |
+				(ss[1] & 0x3ff));
+		*clen = 4;
+	} else {
+		return PARSERUTILS_INVALID;	/* lone low surrogate */
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Convert a single UCS4 character into a UTF-16 sequence
+ *
+ * \param ucs4  The character to process (0 <= c <= 0x10FFFF) (host endian)
+ * \param s Pointer to 4 byte long output buffer
+ * \param len Pointer to location to receive length of multibyte sequence
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
+		size_t *len)
+{
+	uint16_t *ss = (uint16_t *) (void *) s;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (ucs4 < 0x10000) {
+		ss[0] = (uint16_t) ucs4;	/* BMP: stored as-is */
+		*len = 2;
+	} else if (ucs4 < 0x110000) {
+		/* Split (ucs4 - 0x10000) into two 10-bit surrogate halves */
+		uint32_t v = ucs4 - 0x10000;
+		ss[0] = (uint16_t) (0xD800 | (v >> 10));
+		ss[1] = (uint16_t) (0xDC00 | (v & 0x3ff));
+		*len = 4;
+	} else {
+		return PARSERUTILS_INVALID;	/* beyond U+10FFFF */
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-16 string
+ *
+ * \param s The string
+ * \param max Maximum length
+ * \param len Pointer to location to receive length of string
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max,
+		size_t *len)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+	size_t units = max / 2;	/* whole code units available */
+	size_t i = 0;
+	size_t l = 0;		/* size_t: an int could overflow */
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Index by code unit: a truncated trailing unit is never read */
+	while (i < units) {
+		if (ss[i] < 0xD800 || 0xDFFF < ss[i])
+			i++;
+		else
+			i += 2;	/* surrogate: counted as a pair */
+		l++;
+	}
+
+	*len = l;
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Calculate the length (in bytes) of a UTF-16 character
+ *
+ * \param s Pointer to start of character
+ * \param len Pointer to location to receive length
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
+		size_t *len)
+{
+	uint16_t unit;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Any code unit in the surrogate range is taken to start a
+	 * 4-byte pair; everything else is a single 2-byte unit */
+	unit = *((const uint16_t *) (const void *) s);
+	*len = (0xD800 <= unit && unit <= 0xDFFF) ? 4 : 2;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find previous legal UTF-16 char in string
+ *
+ * \param s The string
+ * \param off Offset in the string to start at
+ * \param prevoff Pointer to location to receive offset of first byte of
+ * previous legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+
+	if (s == NULL || prevoff == NULL)
+		return PARSERUTILS_BADPARM;
+	/* ss[off / 2 - 1] is the code unit immediately preceding off */
+	if (off < 2)
+		*prevoff = 0;
+	else if (ss[off / 2 - 1] < 0xDC00 || ss[off / 2 - 1] > 0xDFFF)
+		*prevoff = off - 2;
+	else
+		*prevoff = (off < 4) ? 0 : off - 4;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find next legal UTF-16 char in string
+ *
+ * \param s The string (assumed valid)
+ * \param len Maximum offset in string
+ * \param off Offset in the string to start at
+ * \param nextoff Pointer to location to receive offset of first byte of
+ * next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+	uint32_t step = 2;	/* byte length of the character at off */
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* A high surrogate at off starts a two-unit (4 byte) character */
+	if (len - off >= 2 && 0xD800 <= ss[off / 2] && ss[off / 2] <= 0xDBFF)
+		step = 4;
+	/* Never report an offset beyond the end of the string */
+	*nextoff = (len - off < step) ? len : off + step;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find next legal UTF-16 char in string
+ *
+ * \param s The string (assumed to be of dubious validity)
+ * \param len Maximum offset in string
+ * \param off Offset in the string to start at
+ * \param nextoff Pointer to location to receive offset of first byte of
+ * next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
+		uint32_t len, uint32_t off, uint32_t *nextoff)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Scan forwards, one code unit at a time, from the unit after off */
+	for (off += 2; ; off += 2) {
+		size_t i = off / 2;	/* index of the candidate unit */
+
+		if (len < off || len - off < 2)
+			return PARSERUTILS_NEEDDATA;
+		if (ss[i] < 0xD800 || ss[i] > 0xDFFF)
+			break;		/* ordinary BMP character */
+		if (ss[i] <= 0xDBFF) {
+			/* High surrogate: legal only if a low one follows */
+			if (len - off < 4)
+				return PARSERUTILS_NEEDDATA;
+			if (0xDC00 <= ss[i + 1] && ss[i + 1] <= 0xDFFF)
+				break;
+		}
+		/* Lone surrogate: skip it and keep looking */
+	}
+
+	*nextoff = off;
+
+	return PARSERUTILS_OK;
+}
+
diff --git a/src/charset/encodings/utf8.c b/src/charset/encodings/utf8.c
new file mode 100644
index 0000000..5b4ba95
--- /dev/null
+++ b/src/charset/encodings/utf8.c
@@ -0,0 +1,175 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-8 manipulation functions (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/utf8.h>
+#include "charset/encodings/utf8impl.h"
+
+/** Number of continuation bytes for a given start byte (indexed by byte) */
+const uint8_t numContinuations[256] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00-0x0F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10-0x1F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x20-0x2F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x30-0x3F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x40-0x4F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x50-0x5F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x60-0x6F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x70-0x7F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x80-0x8F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x90-0x9F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0xA0-0xAF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0xB0-0xBF */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0xC0-0xCF */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0xD0-0xDF */
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	/* 0xE0-0xEF */
+	3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,	/* 0xF0-0xFF */
+};
+
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * \param s The sequence to process
+ * \param len Length of sequence
+ * \param ucs4 Pointer to location to receive UCS4 character (host endian)
+ * \param clen Pointer to location to receive byte length of UTF-8 sequence
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen)
+{
+	parserutils_error error;	/* always set by UTF8_TO_UCS4 */
+
+	UTF8_TO_UCS4(s, len, ucs4, clen, error);
+
+	return error;
+}
+
+/**
+ * Convert a single UCS4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s Pointer to pointer to output buffer, updated on exit
+ * \param len Pointer to length, in bytes, of output buffer, updated on exit
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4,
+		uint8_t **s, size_t *len)
+{
+	parserutils_error error;	/* expected to be set by the macro */
+
+	UTF8_FROM_UCS4(ucs4, s, len, error);
+
+	return error;
+}
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * \param s The string
+ * \param max Maximum length
+ * \param len Pointer to location to receive length of string
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
+		size_t *len)
+{
+	parserutils_error error;	/* expected to be set by the macro */
+
+	UTF8_LENGTH(s, max, len, error);
+
+	return error;
+}
+
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * \param s Pointer to start of character
+ * \param len Pointer to location to receive length
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
+		size_t *len)
+{
+	parserutils_error error;	/* expected to be set by the macro */
+
+	UTF8_CHAR_BYTE_LENGTH(s, len, error);
+
+	return error;
+}
+
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * \param s The string
+ * \param off Offset in the string to start at
+ * \param prevoff Pointer to location to receive offset of first byte of
+ * previous legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff)
+{
+	parserutils_error error;	/* expected to be set by the macro */
+
+	UTF8_PREV(s, off, prevoff, error);
+
+	return error;
+}
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s The string (assumed valid)
+ * \param len Maximum offset in string
+ * \param off Offset in the string to start at
+ * \param nextoff Pointer to location to receive offset of first byte of
+ * next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	parserutils_error error;	/* expected to be set by the macro */
+
+	UTF8_NEXT(s, len, off, nextoff, error);
+
+	return error;
+}
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s The string (assumed to be of dubious validity)
+ * \param len Maximum offset in string
+ * \param off Offset in the string to start at
+ * \param nextoff Pointer to location to receive offset of first byte of
+ * next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s,
+		uint32_t len, uint32_t off, uint32_t *nextoff)
+{
+	parserutils_error error;	/* expected to be set by the macro */
+
+	UTF8_NEXT_PARANOID(s, len, off, nextoff, error);
+
+	return error;
+}
+
diff --git a/src/charset/encodings/utf8impl.h b/src/charset/encodings/utf8impl.h
new file mode 100644
index 0000000..1ca9de7
--- /dev/null
+++ b/src/charset/encodings/utf8impl.h
@@ -0,0 +1,339 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_encodings_utf8impl_h_
+#define parserutils_charset_encodings_utf8impl_h_
+
+/** \file
+ * UTF-8 manipulation macros (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+/** Number of continuation bytes for a given start byte */
+extern const uint8_t numContinuations[256];
+
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This macro conforms to RFC2279, however.
+ *
+ * \param s The sequence to process
+ * \param len Length of sequence
+ * \param ucs4 Pointer to location to receive UCS4 character (host endian)
+ * \param clen Pointer to location to receive byte length of UTF-8 sequence
+ * \param error Location to receive error code
+ *
+ * \a error is set to PARSERUTILS_BADPARM if \a s, \a ucs4 or \a clen is NULL,
+ * PARSERUTILS_NEEDDATA if \a len is zero or shorter than the sequence
+ * announced by the lead byte, and PARSERUTILS_INVALID for a bad lead or
+ * continuation byte, an overlong form, a surrogate, or 0xFFFE/0xFFFF.
+ * \a ucs4 and \a clen are only written when \a error is PARSERUTILS_OK.
+ *
+ * The expansion declares locals named c, min and n and evaluates its
+ * arguments more than once, so pass plain variables with other names.
+ */
+#define UTF8_TO_UCS4(s, len, ucs4, clen, error) \
+do { \
+ uint32_t c, min; \
+ uint8_t n; \
+ \
+ error = PARSERUTILS_OK; \
+ \
+ if (s == NULL || ucs4 == NULL || clen == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ if (len == 0) { \
+ error = PARSERUTILS_NEEDDATA; \
+ break; \
+ } \
+ \
+ c = s[0]; \
+ \
+ /* Lead byte selects the sequence length n, the payload bits kept */ \
+ /* in c, and min: the smallest value that genuinely needs n bytes */ \
+ if (c < 0x80) { \
+ n = 1; \
+ min = 0; \
+ } else if ((c & 0xE0) == 0xC0) { \
+ c &= 0x1F; \
+ n = 2; \
+ min = 0x80; \
+ } else if ((c & 0xF0) == 0xE0) { \
+ c &= 0x0F; \
+ n = 3; \
+ min = 0x800; \
+ } else if ((c & 0xF8) == 0xF0) { \
+ c &= 0x07; \
+ n = 4; \
+ min = 0x10000; \
+ } else if ((c & 0xFC) == 0xF8) { \
+ c &= 0x03; \
+ n = 5; \
+ min = 0x200000; \
+ } else if ((c & 0xFE) == 0xFC) { \
+ c &= 0x01; \
+ n = 6; \
+ min = 0x4000000; \
+ } else { \
+ error = PARSERUTILS_INVALID; \
+ break; \
+ } \
+ \
+ if (len < n) { \
+ error = PARSERUTILS_NEEDDATA; \
+ break; \
+ } \
+ \
+ /* Fold in six payload bits per continuation byte. The break below */ \
+ /* only leaves this for loop; the error test after it skips the rest */ \
+ for (uint8_t i = 1; i < n; i++) { \
+ uint32_t t = s[i]; \
+ \
+ if ((t & 0xC0) != 0x80) { \
+ error = PARSERUTILS_INVALID; \
+ break; \
+ } \
+ \
+ c <<= 6; \
+ c |= t & 0x3F; \
+ } \
+ \
+ if (error == PARSERUTILS_OK) { \
+ /* Detect overlong sequences, surrogates and fffe/ffff */ \
+ if (c < min || (c >= 0xD800 && c <= 0xDFFF) || \
+ c == 0xFFFE || c == 0xFFFF) { \
+ error = PARSERUTILS_INVALID; \
+ break; \
+ } \
+ \
+ *ucs4 = c; \
+ *clen = n; \
+ } \
+} while(0)
+
+/**
+ * Convert a single UCS4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This macro conforms to RFC2279, however.
+ *
+ * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s Pointer to pointer to output buffer, updated on exit
+ * \param len Pointer to length, in bytes, of output buffer, updated on exit
+ * \param error Location to receive error code
+ *
+ * \a error is set to PARSERUTILS_BADPARM if \a s, \a *s or \a len is NULL,
+ * PARSERUTILS_INVALID if \a ucs4 exceeds 0x7FFFFFFF, and PARSERUTILS_NOMEM
+ * if the output buffer is shorter than the encoded sequence. \a *s and
+ * \a *len are only advanced on success.
+ *
+ * For multi-byte output the \a ucs4 argument itself is shifted right while
+ * encoding, so it must be a modifiable variable whose value is no longer
+ * needed afterwards.
+ */
+#define UTF8_FROM_UCS4(ucs4, s, len, error) \
+do { \
+ uint8_t *buf; \
+ uint8_t l = 0; \
+ \
+ error = PARSERUTILS_OK; \
+ \
+ if (s == NULL || *s == NULL || len == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ /* l is the number of bytes needed to encode ucs4 */ \
+ if (ucs4 < 0x80) { \
+ l = 1; \
+ } else if (ucs4 < 0x800) { \
+ l = 2; \
+ } else if (ucs4 < 0x10000) { \
+ l = 3; \
+ } else if (ucs4 < 0x200000) { \
+ l = 4; \
+ } else if (ucs4 < 0x4000000) { \
+ l = 5; \
+ } else if (ucs4 <= 0x7FFFFFFF) { \
+ l = 6; \
+ } else { \
+ error = PARSERUTILS_INVALID; \
+ break; \
+ } \
+ \
+ if (l > *len) { \
+ error = PARSERUTILS_NOMEM; \
+ break; \
+ } \
+ \
+ buf = *s; \
+ \
+ if (l == 1) { \
+ buf[0] = (uint8_t) ucs4; \
+ } else { \
+ /* Write continuation bytes last-to-first, six bits at a time */ \
+ for (uint8_t i = l; i > 1; i--) { \
+ buf[i - 1] = 0x80 | (ucs4 & 0x3F); \
+ ucs4 >>= 6; \
+ } \
+ /* Lead byte: a mask with the top l bits set, OR'd with the */ \
+ /* bits of ucs4 that remain after the shifts above */ \
+ buf[0] = ~((1 << (8 - l)) - 1) | ucs4; \
+ } \
+ \
+ *s += l; \
+ *len -= l; \
+} while(0)
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * \param s      The string
+ * \param max    Maximum length
+ * \param len    Pointer to location to receive length of string
+ * \param error  Location to receive error code
+ *
+ * \a error is set to PARSERUTILS_BADPARM if \a s or \a len is NULL and to
+ * PARSERUTILS_INVALID if a byte that cannot start a sequence is found where
+ * a lead byte is expected. \a len is only written on success.
+ *
+ * Only lead bytes are inspected: continuation bytes are skipped without
+ * being checked. The \a s argument itself is advanced while counting, so it
+ * must be a modifiable variable.
+ */
+#define UTF8_LENGTH(s, max, len, error)					\
+do {									\
+	const uint8_t *end;						\
+	int l = 0;							\
+									\
+	error = PARSERUTILS_OK;						\
+									\
+	if (s == NULL || len == NULL) {					\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	/* Only form the end pointer once s is known to be non-NULL */	\
+	end = s + max;							\
+									\
+	while (s < end) {						\
+		uint32_t c = s[0];					\
+									\
+		if ((c & 0x80) == 0x00)					\
+			s += 1;						\
+		else if ((c & 0xE0) == 0xC0)				\
+			s += 2;						\
+		else if ((c & 0xF0) == 0xE0)				\
+			s += 3;						\
+		else if ((c & 0xF8) == 0xF0)				\
+			s += 4;						\
+		else if ((c & 0xFC) == 0xF8)				\
+			s += 5;						\
+		else if ((c & 0xFE) == 0xFC)				\
+			s += 6;						\
+		else {							\
+			error = PARSERUTILS_INVALID;			\
+			break;						\
+		}							\
+									\
+		l++;							\
+	}								\
+									\
+	if (error == PARSERUTILS_OK)					\
+		*len = l;						\
+} while(0)
+
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * \param s Pointer to start of character
+ * \param len Pointer to location to receive length
+ * \param error Location to receive error code
+ *
+ * The length is derived from the first byte alone, via the
+ * numContinuations table; the following bytes are not read or checked.
+ * \a error is set to PARSERUTILS_BADPARM if \a s or \a len is NULL,
+ * otherwise to PARSERUTILS_OK.
+ */
+#define UTF8_CHAR_BYTE_LENGTH(s, len, error) \
+do { \
+ if (s == NULL || len == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ *len = numContinuations[s[0]] + 1 /* Start byte */; \
+ \
+ error = PARSERUTILS_OK; \
+} while(0)
+
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * \param s The string
+ * \param off Offset in the string to start at
+ * \param prevoff Pointer to location to receive offset of first byte of
+ * previous legal character
+ * \param error Location to receive error code
+ *
+ * Steps back one byte at a time (not at all if \a off is 0) and stops at
+ * the first byte that is not a continuation byte (10xxxxxx), or at offset 0.
+ * The \a off argument itself is decremented, so it must be a modifiable
+ * variable. \a error is set to PARSERUTILS_BADPARM if \a s or \a prevoff is
+ * NULL, otherwise to PARSERUTILS_OK.
+ */
+#define UTF8_PREV(s, off, prevoff, error) \
+do { \
+ if (s == NULL || prevoff == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ while (off != 0 && (s[--off] & 0xC0) == 0x80) \
+ /* do nothing */; \
+ \
+ *prevoff = off; \
+ \
+ error = PARSERUTILS_OK; \
+} while(0)
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s The string (assumed valid)
+ * \param len Maximum offset in string
+ * \param off Offset in the string to start at
+ * \param nextoff Pointer to location to receive offset of first byte of
+ * next legal character
+ * \param error Location to receive error code
+ *
+ * The result may equal \a len when no further character starts before the
+ * end of the string. The \a off argument itself is advanced, so it must be
+ * a modifiable variable. \a error is set to PARSERUTILS_BADPARM if \a s or
+ * \a nextoff is NULL or if \a off is not below \a len, otherwise to
+ * PARSERUTILS_OK.
+ */
+#define UTF8_NEXT(s, len, off, nextoff, error) \
+do { \
+ if (s == NULL || off >= len || nextoff == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ /* Skip current start byte (if present - may be mid-sequence) */\
+ if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) \
+ off++; \
+ \
+ /* Then skip any continuation bytes (10xxxxxx) that follow */ \
+ while (off < len && (s[off] & 0xC0) == 0x80) \
+ off++; \
+ \
+ *nextoff = off; \
+ \
+ error = PARSERUTILS_OK; \
+} while(0)
+
+/**
+ * Skip to start of next sequence in UTF-8 input
+ *
+ * \param s The string (assumed to be of dubious validity)
+ * \param len Maximum offset in string
+ * \param off Offset in the string to start at
+ * \param nextoff Pointer to location to receive offset of first byte of
+ * next legal character
+ * \param error Location to receive error code
+ *
+ * Unlike UTF8_NEXT, the continuation bytes announced by the start byte are
+ * checked, and only the start byte plus the continuation bytes that really
+ * are continuation bytes get skipped. \a error is set to PARSERUTILS_BADPARM
+ * if \a s or \a nextoff is NULL or if \a off is not below \a len, and to
+ * PARSERUTILS_NEEDDATA if the announced sequence does not end before
+ * \a len; \a nextoff is not written in either case.
+ *
+ * The expansion declares a local named c and modifies the \a off argument.
+ */
+#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error) \
+do { \
+ uint8_t c; \
+ \
+ error = PARSERUTILS_OK; \
+ \
+ if (s == NULL || off >= len || nextoff == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ c = s[off]; \
+ \
+ /* If we're mid-sequence, simply advance to next byte */ \
+ if (!(c < 0x80 || (c & 0xC0) == 0xC0)) { \
+ off++; \
+ } else { \
+ uint32_t nCont = numContinuations[c]; \
+ uint32_t nToSkip; \
+ \
+ /* NOTE(review): with >=, a sequence whose last byte is the */ \
+ /* final byte before len also reports NEEDDATA - confirm intent */ \
+ if (off + nCont + 1 >= len) { \
+ error = PARSERUTILS_NEEDDATA; \
+ break; \
+ } \
+ \
+ /* Verify continuation bytes */ \
+ for (nToSkip = 1; nToSkip <= nCont; nToSkip++) { \
+ if ((s[off + nToSkip] & 0xC0) != 0x80) \
+ break; \
+ } \
+ \
+ /* Skip over the valid bytes */ \
+ off += nToSkip; \
+ } \
+ \
+ *nextoff = off; \
+} while(0)
+
+#endif
diff --git a/src/input/Makefile b/src/input/Makefile
new file mode 100644
index 0000000..d62740e
--- /dev/null
+++ b/src/input/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Sources
+SRCS_$(d) := filter.c inputstream.c
+
+# Append to sources for component
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/input/filter.c b/src/input/filter.c
new file mode 100644
index 0000000..f40c98f
--- /dev/null
+++ b/src/input/filter.c
@@ -0,0 +1,384 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef WITH_ICONV_FILTER
+#include <iconv.h>
+#endif
+
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/codec.h>
+
+#include "input/filter.h"
+#include "utils/utils.h"
+
+/** Input filter */
+struct parserutils_filter {
+#ifdef WITH_ICONV_FILTER
+ iconv_t cd; /**< Iconv conversion descriptor */
+ uint16_t int_enc; /**< The internal encoding */
+#else
+ parserutils_charset_codec *read_codec; /**< Read codec */
+ parserutils_charset_codec *write_codec; /**< Write codec */
+
+ uint32_t pivot_buf[64]; /**< Conversion pivot buffer */
+
+ bool leftover; /**< Data remains from last call */
+ uint8_t *pivot_left; /**< Remaining pivot to write */
+ size_t pivot_len; /**< Length of pivot remaining */
+#endif
+
+ struct {
+ uint16_t encoding; /**< Input encoding */
+ } settings; /**< Filter settings */
+
+ parserutils_alloc alloc; /**< Memory (de)allocation function */
+ void *pw; /**< Client private data */
+};
+
+static parserutils_error filter_set_defaults(parserutils_filter *input);
+static parserutils_error filter_set_encoding(parserutils_filter *input,
+ const char *enc);
+
+/**
+ * Create an input filter
+ *
+ * \param int_enc Desired encoding of document
+ * \param alloc Function used to (de)allocate data
+ * \param pw Pointer to client-specific private data (may be NULL)
+ * \return Pointer to filter instance, or NULL on failure
+ */
+parserutils_filter *parserutils_filter_create(const char *int_enc,
+		parserutils_alloc alloc, void *pw)
+{
+	parserutils_filter *f;
+
+	if (alloc == NULL || int_enc == NULL)
+		return NULL;
+
+	f = alloc(NULL, sizeof(*f), pw);
+	if (f == NULL)
+		return NULL;
+
+	/* Record the allocator first; filter_set_defaults() relies on it
+	 * when it creates the read codec */
+	f->alloc = alloc;
+	f->pw = pw;
+
+#ifdef WITH_ICONV_FILTER
+	f->cd = (iconv_t) -1;
+	f->int_enc = parserutils_charset_mibenum_from_name(
+			int_enc, strlen(int_enc));
+	if (f->int_enc == 0)
+		goto fail;
+#else
+	f->pivot_len = 0;
+	f->pivot_left = NULL;
+	f->leftover = false;
+#endif
+
+	if (filter_set_defaults(f) != PARSERUTILS_OK)
+		goto fail;
+
+#ifndef WITH_ICONV_FILTER
+	/* The write codec produces the internal encoding */
+	f->write_codec = parserutils_charset_codec_create(int_enc, alloc, pw);
+	if (f->write_codec == NULL) {
+		if (f->read_codec != NULL)
+			parserutils_charset_codec_destroy(f->read_codec);
+		goto fail;
+	}
+#endif
+
+	return f;
+
+fail:
+	/* Every failure path releases the filter object itself */
+	alloc(f, 0, pw);
+	return NULL;
+}
+
+/**
+ * Destroy an input filter
+ *
+ * \param input Pointer to filter instance
+ */
+void parserutils_filter_destroy(parserutils_filter *input)
+{
+	/* Destroying a NULL filter is a no-op */
+	if (input != NULL) {
+#ifdef WITH_ICONV_FILTER
+		/* Close the conversion descriptor if one is open */
+		if (input->cd != (iconv_t) -1)
+			iconv_close(input->cd);
+#else
+		/* Release whichever codecs exist, read codec first */
+		if (input->read_codec != NULL)
+			parserutils_charset_codec_destroy(input->read_codec);
+
+		if (input->write_codec != NULL)
+			parserutils_charset_codec_destroy(input->write_codec);
+#endif
+
+		/* Finally hand the filter itself back to the allocator */
+		input->alloc(input, 0, input->pw);
+	}
+}
+
+/**
+ * Configure an input filter
+ *
+ * \param input Pointer to filter instance
+ * \param type Input option type to configure
+ * \param params Option-specific parameters
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_filter_setopt(parserutils_filter *input,
+		parserutils_filter_opttype type,
+		parserutils_filter_optparams *params)
+{
+	parserutils_error error;
+
+	if (input == NULL || params == NULL)
+		return PARSERUTILS_BADPARM;
+
+	switch (type) {
+	case PARSERUTILS_FILTER_SET_ENCODING:
+		/* A NULL or unknown name is rejected by filter_set_encoding */
+		error = filter_set_encoding(input, params->encoding.name);
+		break;
+	default:
+		/* Unknown option type: report it rather than claim success */
+		error = PARSERUTILS_BADPARM;
+		break;
+	}
+
+	return error;
+}
+
+/**
+ * Process a chunk of data
+ *
+ * \param input Pointer to filter instance
+ * \param data Pointer to pointer to input buffer
+ * \param len Pointer to length of input buffer
+ * \param output Pointer to pointer to output buffer
+ * \param outlen Pointer to length of output buffer
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ *
+ * Call this with an input buffer length of 0 to flush any buffers.
+ *
+ * \a data / \a len and \a output / \a outlen are passed on to iconv or the
+ * codecs, which are expected to advance them past what was consumed and
+ * produced. PARSERUTILS_BADPARM is returned if any pointer argument (or
+ * \a *data or \a *output) is NULL.
+ */
+parserutils_error parserutils_filter_process_chunk(parserutils_filter *input,
+ const uint8_t **data, size_t *len,
+ uint8_t **output, size_t *outlen)
+{
+ if (input == NULL || data == NULL || *data == NULL || len == NULL ||
+ output == NULL || *output == NULL || outlen == NULL)
+ return PARSERUTILS_BADPARM;
+
+#ifdef WITH_ICONV_FILTER
+ if (iconv(input->cd, (char **) data, len,
+ (char **) output, outlen) == (size_t) -1) {
+ switch (errno) {
+ case E2BIG:
+ return PARSERUTILS_NOMEM;
+ case EILSEQ:
+ if (*outlen < 3)
+ return PARSERUTILS_NOMEM;
+
+ /* Emit EF BF BD (the UTF-8 form of U+FFFD) in place of the
+ * bad input. NOTE(review): presumes the internal encoding
+ * is UTF-8 - confirm */
+ (*output)[0] = 0xef;
+ (*output)[1] = 0xbf;
+ (*output)[2] = 0xbd;
+
+ *output += 3;
+ *outlen -= 3;
+
+ (*data)++;
+ (*len)--;
+
+ /* Keep converting, dropping one input byte for each further
+ * EILSEQ; stop on success or on any other error */
+ while (*len > 0) {
+ size_t ret;
+
+ ret = iconv(input->cd, (char **) data, len,
+ (char **) output, outlen);
+ if (ret != (size_t) -1 || errno != EILSEQ)
+ break;
+
+ (*data)++;
+ (*len)--;
+ }
+
+ return errno == E2BIG ? PARSERUTILS_NOMEM
+ : PARSERUTILS_OK;
+ }
+ }
+
+ /* Any other errno value falls through and is reported as success */
+ return PARSERUTILS_OK;
+#else
+ parserutils_error read_error, write_error;
+
+ if (input->leftover) {
+ /* Some data left to be written from last call */
+
+ /* Attempt to flush the remaining data. */
+ write_error = parserutils_charset_codec_encode(
+ input->write_codec,
+ (const uint8_t **) &input->pivot_left,
+ &input->pivot_len,
+ output, outlen);
+
+ if (write_error != PARSERUTILS_OK)
+ return write_error;
+
+
+ /* And clear leftover */
+ input->pivot_left = NULL;
+ input->pivot_len = 0;
+ input->leftover = false;
+ }
+
+ /* Decode input into the pivot buffer, then encode the pivot contents
+ * into the output, one pivot buffer's worth at a time */
+ while (*len > 0) {
+ size_t pivot_len = sizeof(input->pivot_buf);
+ uint8_t *pivot = (uint8_t *) input->pivot_buf;
+
+ read_error = parserutils_charset_codec_decode(input->read_codec,
+ data, len,
+ (uint8_t **) &pivot, &pivot_len);
+
+ /* Rewind to the start of the pivot buffer and turn the space
+ * left over into the number of bytes actually decoded */
+ pivot = (uint8_t *) input->pivot_buf;
+ pivot_len = sizeof(input->pivot_buf) - pivot_len;
+
+ if (pivot_len > 0) {
+ write_error = parserutils_charset_codec_encode(
+ input->write_codec,
+ (const uint8_t **) &pivot,
+ &pivot_len,
+ output, outlen);
+
+ if (write_error != PARSERUTILS_OK) {
+ /* Remember the unwritten pivot data so that the
+ * next call can flush it first */
+ input->leftover = true;
+ input->pivot_left = pivot;
+ input->pivot_len = pivot_len;
+
+ return write_error;
+ }
+ }
+
+ /* NOMEM from the decoder is not fatal here: the loop simply
+ * goes round again with the pivot buffer reused */
+ if (read_error != PARSERUTILS_OK &&
+ read_error != PARSERUTILS_NOMEM)
+ return read_error;
+ }
+
+ return PARSERUTILS_OK;
+#endif
+}
+
+/**
+ * Reset an input filter's state
+ *
+ * \param input The input filter to reset
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_filter_reset(parserutils_filter *input)
+{
+	if (input == NULL)
+		return PARSERUTILS_BADPARM;
+
+#ifdef WITH_ICONV_FILTER
+	/* Calling iconv with no input and no output buffer resets the
+	 * conversion state; its result is not examined */
+	iconv(input->cd, NULL, 0, NULL, 0);
+#else
+	parserutils_error status;
+
+	/* Forget any pivot data still waiting to be written */
+	input->leftover = false;
+	input->pivot_len = 0;
+	input->pivot_left = NULL;
+
+	/* Reset the read codec, then the write codec; the write codec is
+	 * left untouched if the read codec fails */
+	status = parserutils_charset_codec_reset(input->read_codec);
+	if (status == PARSERUTILS_OK)
+		status = parserutils_charset_codec_reset(input->write_codec);
+
+	if (status != PARSERUTILS_OK)
+		return status;
+#endif
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Set an input filter's default settings
+ *
+ * \param input Input filter to configure
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error filter_set_defaults(parserutils_filter *input)
+{
+	if (input == NULL)
+		return PARSERUTILS_BADPARM;
+
+#ifndef WITH_ICONV_FILTER
+	/* No codecs exist yet */
+	input->read_codec = NULL;
+	input->write_codec = NULL;
+#endif
+
+	/* 0 is never a valid MIB enum, so the call below cannot take its
+	 * "already using this encoding" early exit */
+	input->settings.encoding = 0;
+
+	/* The default input encoding is UTF-8 */
+	return filter_set_encoding(input, "UTF-8");
+}
+
+/**
+ * Set an input filter's encoding
+ *
+ * \param input Input filter to configure
+ * \param enc Encoding name
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error filter_set_encoding(parserutils_filter *input,
+		const char *enc)
+{
+	uint16_t mibenum;
+
+	if (input == NULL || enc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	mibenum = parserutils_charset_mibenum_from_name(enc, strlen(enc));
+	if (mibenum == 0)
+		return PARSERUTILS_INVALID;
+
+	/* Exit early if we're already using this encoding */
+	if (input->settings.encoding == mibenum)
+		return PARSERUTILS_OK;
+
+	/* Build the replacement converter first and only then release the
+	 * old one, so that a failure leaves the filter exactly as it was */
+#ifdef WITH_ICONV_FILTER
+	iconv_t cd = iconv_open(
+			parserutils_charset_mibenum_to_name(input->int_enc),
+			enc);
+	if (cd == (iconv_t) -1) {
+		/* EINVAL: iconv does not support this conversion */
+		return (errno == EINVAL) ? PARSERUTILS_INVALID
+					 : PARSERUTILS_NOMEM;
+	}
+
+	if (input->cd != (iconv_t) -1)
+		iconv_close(input->cd);
+
+	input->cd = cd;
+#else
+	parserutils_charset_codec *codec = parserutils_charset_codec_create(
+			enc, input->alloc, input->pw);
+	if (codec == NULL)
+		return PARSERUTILS_NOMEM;
+
+	if (input->read_codec != NULL)
+		parserutils_charset_codec_destroy(input->read_codec);
+
+	input->read_codec = codec;
+#endif
+
+	input->settings.encoding = mibenum;
+
+	return PARSERUTILS_OK;
+}
diff --git a/src/input/filter.h b/src/input/filter.h
new file mode 100644
index 0000000..96941a6
--- /dev/null
+++ b/src/input/filter.h
@@ -0,0 +1,57 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_input_filter_h_
+#define parserutils_input_filter_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+
+typedef struct parserutils_filter parserutils_filter;
+
+/**
+ * Input filter option types
+ */
+typedef enum parserutils_filter_opttype {
+ PARSERUTILS_FILTER_SET_ENCODING = 0,
+} parserutils_filter_opttype;
+
+/**
+ * Input filter option parameters
+ */
+typedef union parserutils_filter_optparams {
+ /** Parameters for encoding setting */
+ struct {
+ /** Encoding name */
+ const char *name;
+ } encoding;
+} parserutils_filter_optparams;
+
+
+/* Create an input filter */
+parserutils_filter *parserutils_filter_create(const char *int_enc,
+ parserutils_alloc alloc, void *pw);
+/* Destroy an input filter */
+void parserutils_filter_destroy(parserutils_filter *input);
+
+/* Configure an input filter */
+parserutils_error parserutils_filter_setopt(parserutils_filter *input,
+ parserutils_filter_opttype type,
+ parserutils_filter_optparams *params);
+
+/* Process a chunk of data */
+parserutils_error parserutils_filter_process_chunk(parserutils_filter *input,
+ const uint8_t **data, size_t *len,
+ uint8_t **output, size_t *outlen);
+
+/* Reset an input filter's state */
+parserutils_error parserutils_filter_reset(parserutils_filter *input);
+
+#endif
+
diff --git a/src/input/inputstream.c b/src/input/inputstream.c
new file mode 100644
index 0000000..fd44995
--- /dev/null
+++ b/src/input/inputstream.c
@@ -0,0 +1,477 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/utf8.h>
+#include <parserutils/input/inputstream.h>
+
+#include "input/filter.h"
+#include "utils/utils.h"
+
+/**
+ * Private input stream definition
+ */
+typedef struct parserutils_inputstream_private {
+ parserutils_inputstream public; /**< Public part. Must be first */
+
+ parserutils_buffer *raw; /**< Buffer containing raw data */
+
+ bool done_first_chunk; /**< Whether the first chunk has
+ * been processed */
+
+ uint16_t mibenum; /**< MIB enum for charset, or 0 */
+ uint32_t encsrc; /**< Charset source */
+
+ parserutils_filter *input; /**< Charset conversion filter */
+
+ parserutils_charset_detect_func csdetect; /**< Charset detection func.*/
+
+ parserutils_alloc alloc; /**< Memory (de)allocation function */
+ void *pw; /**< Client private data */
+} parserutils_inputstream_private;
+
+static inline parserutils_error parserutils_inputstream_refill_buffer(
+ parserutils_inputstream_private *stream);
+static inline parserutils_error parserutils_inputstream_strip_bom(
+ uint16_t mibenum, parserutils_buffer *buffer);
+
+/**
+ * Create an input stream
+ *
+ * \param enc Document charset, or NULL to autodetect
+ * \param encsrc Value for encoding source, if specified, or 0
+ * \param csdetect Charset detection function, or NULL
+ * \param alloc Memory (de)allocation function
+ * \param pw Pointer to client-specific private data (may be NULL)
+ * \return Pointer to stream instance, or NULL on failure
+ *
+ * The value 0 is defined as being the lowest priority encoding source
+ * (i.e. the default fallback encoding). Beyond this, no further
+ * interpretation is made upon the encoding source.
+ */
+parserutils_inputstream *parserutils_inputstream_create(const char *enc,
+		uint32_t encsrc, parserutils_charset_detect_func csdetect,
+		parserutils_alloc alloc, void *pw)
+{
+	parserutils_inputstream_private *stream;
+
+	if (alloc == NULL)
+		return NULL;
+
+	stream = alloc(NULL, sizeof(parserutils_inputstream_private), pw);
+	if (stream == NULL)
+		return NULL;
+
+	stream->raw = parserutils_buffer_create(alloc, pw);
+	if (stream->raw == NULL) {
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	stream->public.utf8 = parserutils_buffer_create(alloc, pw);
+	if (stream->public.utf8 == NULL) {
+		parserutils_buffer_destroy(stream->raw);
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	stream->public.cursor = 0;
+	stream->public.had_eof = false;
+	stream->done_first_chunk = false;
+
+	/* The stream's internal representation is always UTF-8 */
+	stream->input = parserutils_filter_create("UTF-8", alloc, pw);
+	if (stream->input == NULL) {
+		parserutils_buffer_destroy(stream->public.utf8);
+		parserutils_buffer_destroy(stream->raw);
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	/* Start with no known charset. This also covers an unrecognised
+	 * enc, which would otherwise leave encsrc uninitialised. */
+	stream->mibenum = 0;
+	stream->encsrc = 0;
+
+	if (enc != NULL) {
+		stream->mibenum =
+			parserutils_charset_mibenum_from_name(enc, strlen(enc));
+
+		if (stream->mibenum != 0) {
+			parserutils_error error;
+			parserutils_filter_optparams params;
+
+			params.encoding.name = enc;
+
+			error = parserutils_filter_setopt(stream->input,
+					PARSERUTILS_FILTER_SET_ENCODING,
+					&params);
+			/* PARSERUTILS_INVALID is tolerated here; any other
+			 * failure aborts creation */
+			if (error != PARSERUTILS_OK &&
+					error != PARSERUTILS_INVALID) {
+				parserutils_filter_destroy(stream->input);
+				parserutils_buffer_destroy(stream->public.utf8);
+				parserutils_buffer_destroy(stream->raw);
+				alloc(stream, 0, pw);
+				return NULL;
+			}
+
+			stream->encsrc = encsrc;
+		}
+	}
+
+	stream->csdetect = csdetect;
+
+	stream->alloc = alloc;
+	stream->pw = pw;
+
+	return (parserutils_inputstream *) stream;
+}
+
+/**
+ * Destroy an input stream
+ *
+ * \param stream Input stream to destroy
+ */
+void parserutils_inputstream_destroy(parserutils_inputstream *stream)
+{
+	parserutils_inputstream_private *priv;
+
+	/* Destroying a NULL stream is a no-op */
+	if (stream == NULL)
+		return;
+
+	priv = (parserutils_inputstream_private *) stream;
+
+	/* Tear down the members before releasing the stream itself */
+	parserutils_filter_destroy(priv->input);
+	parserutils_buffer_destroy(priv->public.utf8);
+	parserutils_buffer_destroy(priv->raw);
+
+	priv->alloc(priv, 0, priv->pw);
+}
+
+/**
+ * Append data to an input stream
+ *
+ * \param stream Input stream to append data to
+ * \param data Data to append (in document charset), or NULL to flag EOF
+ * \param len Length, in bytes, of data
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_inputstream_append(
+		parserutils_inputstream *stream,
+		const uint8_t *data, size_t len)
+{
+	parserutils_inputstream_private *priv;
+
+	if (stream == NULL)
+		return PARSERUTILS_BADPARM;
+
+	priv = (parserutils_inputstream_private *) stream;
+
+	/* Real data is queued, unconverted, in the raw buffer */
+	if (data != NULL)
+		return parserutils_buffer_append(priv->raw, data, len);
+
+	/* A NULL data pointer is the caller's end-of-file marker */
+	priv->public.had_eof = true;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Insert data into stream at current location
+ *
+ * \param stream Input stream to insert into
+ * \param data Data to insert (UTF-8 encoded)
+ * \param len Length, in bytes, of data
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_inputstream_insert(
+		parserutils_inputstream *stream,
+		const uint8_t *data, size_t len)
+{
+	parserutils_inputstream_private *priv;
+
+	if (data == NULL || stream == NULL)
+		return PARSERUTILS_BADPARM;
+
+	priv = (parserutils_inputstream_private *) stream;
+
+	/* The data goes straight into the UTF-8 buffer at the cursor; it
+	 * does not pass through the raw buffer or the filter */
+	return parserutils_buffer_insert(priv->public.utf8,
+			priv->public.cursor, data, len);
+}
+
+#define IS_ASCII(x) (((x) & 0x80) == 0)
+
+/**
+ * Look at the character in the stream that starts at
+ * offset bytes from the cursor (slow version)
+ *
+ * \param stream  Stream to look in
+ * \param offset  Byte offset of start of character
+ * \param length  Pointer to location to receive character length (in bytes)
+ * \return Pointer to character data, or EOF or OOD.
+ *
+ * OOD is also returned if \a stream or \a length is NULL, or if refilling
+ * the UTF-8 buffer fails.
+ *
+ * Once the character pointed to by the result of this call has been advanced
+ * past (i.e. parserutils_inputstream_advance has caused the stream cursor to
+ * pass over the character), then no guarantee is made as to the validity of
+ * the data pointed to. Thus, any attempt to dereference the pointer after
+ * advancing past the data it points to is a bug.
+ */
+uintptr_t parserutils_inputstream_peek_slow(parserutils_inputstream *stream,
+		size_t offset, size_t *length)
+{
+	parserutils_inputstream_private *s =
+			(parserutils_inputstream_private *) stream;
+	parserutils_error error = PARSERUTILS_OK;
+	size_t len;
+
+	if (stream == NULL || length == NULL)
+		return PARSERUTILS_INPUTSTREAM_OOD;
+
+	/* There's insufficient data in the buffer, so read some more */
+	if (s->raw->length == 0) {
+		/* No more data to be had */
+		return s->public.had_eof ? PARSERUTILS_INPUTSTREAM_EOF
+					 : PARSERUTILS_INPUTSTREAM_OOD;
+	}
+
+	/* Refill utf8 buffer from raw buffer */
+	error = parserutils_inputstream_refill_buffer(s);
+	if (error != PARSERUTILS_OK)
+		return PARSERUTILS_INPUTSTREAM_OOD;
+
+	/* The refill may have succeeded without producing data at the
+	 * requested position; never read past the end of the buffer */
+	if (s->public.cursor + offset >= s->public.utf8->length) {
+		/* Only report EOF once all the raw input has been consumed */
+		return (s->public.had_eof && s->raw->length == 0)
+				? PARSERUTILS_INPUTSTREAM_EOF
+				: PARSERUTILS_INPUTSTREAM_OOD;
+	}
+
+	/* Now try the read */
+	if (IS_ASCII(s->public.utf8->data[s->public.cursor + offset])) {
+		len = 1;
+	} else {
+		error = parserutils_charset_utf8_char_byte_length(
+			s->public.utf8->data + s->public.cursor + offset,
+			&len);
+
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NEEDDATA)
+			return PARSERUTILS_INPUTSTREAM_OOD;
+
+		if (error == PARSERUTILS_NEEDDATA) {
+			return s->public.had_eof ? PARSERUTILS_INPUTSTREAM_EOF
+						 : PARSERUTILS_INPUTSTREAM_OOD;
+		}
+	}
+
+	*length = len;
+
+	return (uintptr_t) (s->public.utf8->data + s->public.cursor + offset);
+}
+
+#undef IS_ASCII
+
+/**
+ * Read the source charset of the input stream
+ *
+ * \param stream Input stream to query
+ * \param source Pointer to location to receive charset source identifier
+ * \return Pointer to charset name (constant; do not free)
+ */
+const char *parserutils_inputstream_read_charset(
+		parserutils_inputstream *stream, uint32_t *source)
+{
+	parserutils_inputstream_private *priv;
+
+	if (source == NULL || stream == NULL)
+		return NULL;
+
+	priv = (parserutils_inputstream_private *) stream;
+
+	*source = priv->encsrc;
+
+	/* A charset source of 0 is always reported as UTF-8; otherwise the
+	 * name is looked up from the stored MIB enum */
+	return (priv->encsrc == 0)
+			? "UTF-8"
+			: parserutils_charset_mibenum_to_name(priv->mibenum);
+}
+
+/******************************************************************************
+ ******************************************************************************/
+
+/**
+ * Refill the UTF-8 buffer from the raw buffer
+ *
+ * \param stream The inputstream to operate on
+ * \return PARSERUTILS_OK on success
+ */
+parserutils_error parserutils_inputstream_refill_buffer(
+		parserutils_inputstream_private *stream)
+{
+	const uint8_t *raw;
+	uint8_t *utf8;
+	size_t raw_length, utf8_space;
+	parserutils_error error;
+
+	/* If this is the first chunk of data, we must detect the charset and
+	 * strip the BOM, if one exists */
+	if (!stream->done_first_chunk) {
+		parserutils_filter_optparams params;
+
+		/* Give the detection routine, if there is one, the chance to
+		 * override any charset specified at creation time */
+		if (stream->csdetect != NULL) {
+			error = stream->csdetect(stream->raw->data,
+					stream->raw->length,
+					&stream->mibenum, &stream->encsrc);
+			if (error != PARSERUTILS_OK)
+				return error;
+		}
+
+		/* Default to UTF-8, but only if nothing has selected an
+		 * encoding; a charset given at creation must survive */
+		if (stream->mibenum == 0) {
+			stream->mibenum =
+				parserutils_charset_mibenum_from_name("UTF-8",
+						SLEN("UTF-8"));
+			stream->encsrc = 0;
+		}
+
+		if (stream->mibenum == 0)
+			abort();
+
+		error = parserutils_inputstream_strip_bom(stream->mibenum,
+				stream->raw);
+		if (error != PARSERUTILS_OK)
+			return error;
+
+		/* Make sure the filter decodes using the charset chosen */
+		params.encoding.name =
+			parserutils_charset_mibenum_to_name(stream->mibenum);
+
+		error = parserutils_filter_setopt(stream->input,
+				PARSERUTILS_FILTER_SET_ENCODING, &params);
+		if (error != PARSERUTILS_OK)
+			return error;
+
+		stream->done_first_chunk = true;
+	}
+
+	/* Work out how to perform the buffer fill */
+	if (stream->public.cursor == stream->public.utf8->length) {
+		/* Cursor's at the end, so simply reuse the entire buffer.
+		 * Mark it empty now so that cursor and length stay
+		 * consistent if we return early with an error. */
+		stream->public.utf8->length = 0;
+		stream->public.cursor = 0;
+
+		utf8 = stream->public.utf8->data;
+		utf8_space = stream->public.utf8->allocated;
+	} else {
+		/* Cursor's not at the end, so shift data after cursor to the
+		 * bottom of the buffer. If the buffer's still over half full,
+		 * extend it. */
+		memmove(stream->public.utf8->data,
+			stream->public.utf8->data + stream->public.cursor,
+			stream->public.utf8->length - stream->public.cursor);
+
+		stream->public.utf8->length -= stream->public.cursor;
+
+		/* The unread data now starts at offset 0; fix the cursor
+		 * here so that an error below cannot leave it stale */
+		stream->public.cursor = 0;
+
+		if (stream->public.utf8->length >
+				stream->public.utf8->allocated / 2) {
+			error = parserutils_buffer_grow(stream->public.utf8);
+			if (error != PARSERUTILS_OK)
+				return error;
+		}
+
+		utf8 = stream->public.utf8->data + stream->public.utf8->length;
+		utf8_space = stream->public.utf8->allocated -
+				stream->public.utf8->length;
+	}
+
+	raw = stream->raw->data;
+	raw_length = stream->raw->length;
+
+	/* Try to fill utf8 buffer from the raw data */
+	error = parserutils_filter_process_chunk(stream->input,
+			&raw, &raw_length, &utf8, &utf8_space);
+	/* _NOMEM implies that there's more input to read than available space
+	 * in the utf8 buffer. That's fine, so we'll ignore that error. */
+	if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM)
+		return error;
+
+	/* Remove the raw data we've processed from the raw buffer */
+	error = parserutils_buffer_discard(stream->raw, 0,
+			stream->raw->length - raw_length);
+	if (error != PARSERUTILS_OK)
+		return error;
+
+	/* Fix up the utf8 buffer information */
+	stream->public.utf8->length =
+			stream->public.utf8->allocated - utf8_space;
+
+	/* Finally, fix up the cursor */
+	stream->public.cursor = 0;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Strip a BOM from a buffer in the given encoding
+ *
+ * \param mibenum The character set of the buffer
+ * \param buffer The buffer to process
+ * \return PARSERUTILS_OK if there was no BOM to remove, otherwise the
+ * result of discarding the BOM bytes from the buffer
+ */
+parserutils_error parserutils_inputstream_strip_bom(uint16_t mibenum,
+ parserutils_buffer *buffer)
+{
+ /* MIB enums of the encodings of interest, looked up on the first call
+ * and kept for later calls (utf8 == 0 means "not looked up yet") */
+ static uint16_t utf8;
+ static uint16_t utf16;
+ static uint16_t utf16be;
+ static uint16_t utf16le;
+ static uint16_t utf32;
+ static uint16_t utf32be;
+ static uint16_t utf32le;
+
+ if (utf8 == 0) {
+ utf8 = parserutils_charset_mibenum_from_name("UTF-8",
+ SLEN("UTF-8"));
+ utf16 = parserutils_charset_mibenum_from_name("UTF-16",
+ SLEN("UTF-16"));
+ utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
+ SLEN("UTF-16BE"));
+ utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
+ SLEN("UTF-16LE"));
+ utf32 = parserutils_charset_mibenum_from_name("UTF-32",
+ SLEN("UTF-32"));
+ utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
+ SLEN("UTF-32BE"));
+ utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
+ SLEN("UTF-32LE"));
+ }
+
+ /** \todo Handle unmarked UTF-16 and UTF-32. Endianness is specified
+ * by the BOM, if present, or is assumed to be big endian. */
+
+ /* Byte lengths of the BOM in each encoding family */
+#define UTF32_BOM_LEN (4)
+#define UTF16_BOM_LEN (2)
+#define UTF8_BOM_LEN (3)
+
+ /* Only the encodings tested below are handled; utf16 and utf32 are
+ * looked up above but not compared against (see the todo) */
+ if (mibenum == utf8) {
+ if (buffer->length >= UTF8_BOM_LEN &&
+ buffer->data[0] == 0xEF &&
+ buffer->data[1] == 0xBB &&
+ buffer->data[2] == 0xBF) {
+ return parserutils_buffer_discard(
+ buffer, 0, UTF8_BOM_LEN);
+ }
+ } else if (mibenum == utf16be) {
+ if (buffer->length >= UTF16_BOM_LEN &&
+ buffer->data[0] == 0xFE &&
+ buffer->data[1] == 0xFF) {
+ return parserutils_buffer_discard(
+ buffer, 0, UTF16_BOM_LEN);
+ }
+ } else if (mibenum == utf16le) {
+ if (buffer->length >= UTF16_BOM_LEN &&
+ buffer->data[0] == 0xFF &&
+ buffer->data[1] == 0xFE) {
+ return parserutils_buffer_discard(
+ buffer, 0, UTF16_BOM_LEN);
+ }
+ } else if (mibenum == utf32be) {
+ if (buffer->length >= UTF32_BOM_LEN &&
+ buffer->data[0] == 0x00 &&
+ buffer->data[1] == 0x00 &&
+ buffer->data[2] == 0xFE &&
+ buffer->data[3] == 0xFF) {
+ return parserutils_buffer_discard(
+ buffer, 0, UTF32_BOM_LEN);
+ }
+ } else if (mibenum == utf32le) {
+ if (buffer->length >= UTF32_BOM_LEN &&
+ buffer->data[0] == 0xFF &&
+ buffer->data[1] == 0xFE &&
+ buffer->data[2] == 0x00 &&
+ buffer->data[3] == 0x00) {
+ return parserutils_buffer_discard(
+ buffer, 0, UTF32_BOM_LEN);
+ }
+ }
+
+#undef UTF8_BOM_LEN
+#undef UTF16_BOM_LEN
+#undef UTF32_BOM_LEN
+
+ /* No BOM found (or an encoding not handled here): buffer unchanged */
+ return PARSERUTILS_OK;
+}
+
diff --git a/src/parserutils.c b/src/parserutils.c
new file mode 100644
index 0000000..ed9b21f
--- /dev/null
+++ b/src/parserutils.c
@@ -0,0 +1,54 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <parserutils/parserutils.h>
+
+#include "charset/charset.h"
+
+/**
+ * Prepare the ParserUtils library for use.
+ *
+ * Clients _must_ call this before using any other libparserutils function.
+ *
+ * \param aliases_file  Name of the file containing encoding alias data
+ * \param alloc         Memory (de)allocation function
+ * \param pw            Client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if aliases_file or alloc is NULL,
+ *         otherwise the error reported by charset initialisation.
+ */
+parserutils_error parserutils_initialise(const char *aliases_file,
+		parserutils_alloc alloc, void *pw)
+{
+	if (aliases_file != NULL && alloc != NULL) {
+		/* Charset set-up is the only work needed here, so its
+		 * result (PARSERUTILS_OK included) is our result. */
+		return parserutils_charset_initialise(aliases_file, alloc, pw);
+	}
+
+	return PARSERUTILS_BADPARM;
+}
+
+/**
+ * Release the resources held by Libparserutils
+ *
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if alloc is NULL.
+ */
+parserutils_error parserutils_finalise(parserutils_alloc alloc, void *pw)
+{
+	if (alloc != NULL) {
+		/* Any result of charset finalisation is not propagated */
+		parserutils_charset_finalise(alloc, pw);
+
+		return PARSERUTILS_OK;
+	}
+
+	return PARSERUTILS_BADPARM;
+}
+
+
diff --git a/src/utils/Makefile b/src/utils/Makefile
new file mode 100644
index 0000000..e053673
--- /dev/null
+++ b/src/utils/Makefile
@@ -0,0 +1,49 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack.
+# Each nesting level appends ".x" to $(sp); the parent's $(d) is saved in a
+# variable named after the new $(sp) so that it can be restored on exit.
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Manipulate include paths: add this directory to the header search path
+override CFLAGS := $(CFLAGS) -I$(d)
+
+# Sources that live in this directory (names relative to $(d))
+SRCS_$(d) := buffer.c errors.c
+
+# Append to sources for component, prefixing each name with $(d)
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have (every subdirectory with a Makefile)
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack: restore the parent's $(d), then
+# strip the ".x" suffix that was appended to $(sp) above
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/utils/buffer.c b/src/utils/buffer.c
new file mode 100644
index 0000000..21c47fc
--- /dev/null
+++ b/src/utils/buffer.c
@@ -0,0 +1,156 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <string.h>
+
+#include <parserutils/utils/buffer.h>
+
+#define DEFAULT_SIZE (4096)
+
+/**
+ * Allocate a new, empty memory buffer
+ *
+ * The buffer starts with DEFAULT_SIZE bytes of storage and remembers the
+ * allocator and private data for all later (de)allocations.
+ *
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data
+ * \return Pointer to memory buffer, or NULL on memory exhaustion
+ */
+parserutils_buffer *parserutils_buffer_create(parserutils_alloc alloc, void *pw)
+{
+	parserutils_buffer *buf;
+	void *storage;
+
+	buf = alloc(NULL, sizeof(*buf), pw);
+	if (buf == NULL)
+		return NULL;
+
+	storage = alloc(NULL, DEFAULT_SIZE, pw);
+	if (storage == NULL) {
+		/* A zero-size request hands the header back to the allocator */
+		alloc(buf, 0, pw);
+		return NULL;
+	}
+
+	buf->alloc = alloc;
+	buf->pw = pw;
+
+	buf->data = storage;
+	buf->allocated = DEFAULT_SIZE;
+	buf->length = 0;
+
+	return buf;
+}
+
+/**
+ * Free a memory buffer and the storage it owns
+ *
+ * \param buffer  The buffer to destroy (NULL is accepted and ignored)
+ */
+void parserutils_buffer_destroy(parserutils_buffer *buffer)
+{
+	if (buffer != NULL) {
+		void *pw = buffer->pw;
+
+		/* Storage first, then the header that owns the allocator */
+		buffer->alloc(buffer->data, 0, pw);
+		buffer->alloc(buffer, 0, pw);
+	}
+}
+
+/**
+ * Copy data onto the end of a memory buffer, growing it as required
+ *
+ * \param buffer  The buffer to append to
+ * \param data    The data to append
+ * \param len     The length, in bytes, of the data to append
+ * \return PARSERUTILS_OK on success, or the error reported by
+ *         parserutils_buffer_grow() if the buffer could not be extended.
+ */
+parserutils_error parserutils_buffer_append(parserutils_buffer *buffer,
+		const uint8_t *data, size_t len)
+{
+	/* Keep growing until strictly more than len bytes are free */
+	for (;;) {
+		parserutils_error error;
+
+		if (buffer->allocated - buffer->length > len)
+			break;
+
+		error = parserutils_buffer_grow(buffer);
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	memcpy(&buffer->data[buffer->length], data, len);
+	buffer->length += len;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Insert data into a memory buffer
+ *
+ * Existing content from offset onwards is shifted up by len bytes and the
+ * new data is copied into the gap. Inserting at the end of the buffer is
+ * delegated to parserutils_buffer_append().
+ *
+ * \param buffer  The buffer to insert into
+ * \param offset  The offset into the buffer to insert at
+ * \param data    The data to insert
+ * \param len     The length, in bytes, of the data to insert
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if offset lies beyond the end of the data,
+ *         or the error reported by parserutils_buffer_grow().
+ */
+parserutils_error parserutils_buffer_insert(parserutils_buffer *buffer,
+		size_t offset, const uint8_t *data, size_t len)
+{
+	if (offset > buffer->length)
+		return PARSERUTILS_BADPARM;
+
+	if (offset == buffer->length)
+		return parserutils_buffer_append(buffer, data, len);
+
+	/* Grow until strictly more than len bytes are free */
+	while (len >= buffer->allocated - buffer->length) {
+		parserutils_error error = parserutils_buffer_grow(buffer);
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	/* Open a len-byte gap at offset. The tail must land at offset + len;
+	 * moving it to length + len would leave a hole of stale bytes and
+	 * could write beyond the allocation. */
+	memmove(buffer->data + offset + len,
+			buffer->data + offset, buffer->length - offset);
+
+	memcpy(buffer->data + offset, data, len);
+
+	buffer->length += len;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Discard a section of a memory buffer
+ *
+ * The bytes following the discarded section are moved down to close the
+ * gap and the buffer's length is reduced by len.
+ *
+ * \param buffer  The buffer to discard data from
+ * \param offset  The offset into the buffer of the start of the section
+ * \param len     The number of bytes to discard
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if the section does not lie wholly within
+ *         the buffer's data (including offset == length).
+ */
+parserutils_error parserutils_buffer_discard(parserutils_buffer *buffer,
+		size_t offset, size_t len)
+{
+	/* Comparing len against the space after offset (rather than testing
+	 * offset + len) cannot wrap around for very large len. */
+	if (offset >= buffer->length || len > buffer->length - offset)
+		return PARSERUTILS_BADPARM;
+
+	/* Only the bytes after the discarded section need to move; using
+	 * length - len here would run past the end of the data whenever
+	 * offset is non-zero. */
+	memmove(buffer->data + offset, buffer->data + offset + len,
+			buffer->length - (offset + len));
+
+	buffer->length -= len;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Double the amount of space allocated for a memory buffer
+ *
+ * On failure the buffer is left untouched.
+ *
+ * \param buffer  The buffer to extend
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if the allocator could not supply the space.
+ */
+parserutils_error parserutils_buffer_grow(parserutils_buffer *buffer)
+{
+	uint8_t *resized;
+
+	resized = buffer->alloc(buffer->data, buffer->allocated * 2,
+			buffer->pw);
+	if (resized != NULL) {
+		buffer->allocated *= 2;
+		buffer->data = resized;
+
+		return PARSERUTILS_OK;
+	}
+
+	return PARSERUTILS_NOMEM;
+}
+
diff --git a/src/utils/errors.c b/src/utils/errors.c
new file mode 100644
index 0000000..353cda1
--- /dev/null
+++ b/src/utils/errors.c
@@ -0,0 +1,70 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <string.h>
+
+#include <parserutils/errors.h>
+
+/**
+ * Obtain a human-readable description of a parserutils error code
+ *
+ * \param error  The error code to describe
+ * \return Pointer to a constant string describing the error,
+ *         or NULL if the code is not recognised.
+ */
+const char *parserutils_error_to_string(parserutils_error error)
+{
+	switch (error) {
+	case PARSERUTILS_OK:
+		return "No error";
+	case PARSERUTILS_NOMEM:
+		return "Insufficient memory";
+	case PARSERUTILS_BADPARM:
+		return "Bad parameter";
+	case PARSERUTILS_INVALID:
+		return "Invalid input";
+	case PARSERUTILS_FILENOTFOUND:
+		return "File not found";
+	case PARSERUTILS_NEEDDATA:
+		return "Insufficient data";
+	}
+
+	/* Not one of the codes listed above */
+	return NULL;
+}
+
+/**
+ * Map the textual name of an error onto a parserutils error code
+ *
+ * Only the first len bytes of str are compared against each known name,
+ * and names are tried in the order listed below; the first match wins.
+ *
+ * \param str  String containing error name
+ * \param len  Length of string (bytes)
+ * \return Error code, or PARSERUTILS_OK if unknown
+ */
+parserutils_error parserutils_error_from_string(const char *str, size_t len)
+{
+	static const struct {
+		const char *name;
+		parserutils_error code;
+	} known[] = {
+		{ "PARSERUTILS_OK", PARSERUTILS_OK },
+		{ "PARSERUTILS_NOMEM", PARSERUTILS_NOMEM },
+		{ "PARSERUTILS_BADPARM", PARSERUTILS_BADPARM },
+		{ "PARSERUTILS_INVALID", PARSERUTILS_INVALID },
+		{ "PARSERUTILS_FILENOTFOUND", PARSERUTILS_FILENOTFOUND },
+		{ "PARSERUTILS_NEEDDATA", PARSERUTILS_NEEDDATA }
+	};
+	size_t i;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
+		if (strncmp(str, known[i].name, len) == 0)
+			return known[i].code;
+	}
+
+	return PARSERUTILS_OK;
+}
diff --git a/src/utils/utils.h b/src/utils/utils.h
new file mode 100644
index 0000000..5162945
--- /dev/null
+++ b/src/utils/utils.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_utils_h_
+#define parserutils_utils_h_
+
+/* Larger of two values. Each argument appears twice in the expansion, so
+ * arguments with side effects (e.g. i++) may be evaluated twice. */
+#ifndef max
+#define max(a,b) ((a)>(b)?(a):(b))
+#endif
+
+/* Smaller of two values. The same double-evaluation caveat applies. */
+#ifndef min
+#define min(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef SLEN
+/* Calculate length of a string constant. This relies on sizeof, so it is
+ * only meaningful for string literals / arrays, not for char pointers. */
+#define SLEN(s) (sizeof((s)) - 1) /* -1 for '\0' */
+#endif
+
+/* Mark a variable as deliberately unused by assigning it to itself. */
+#ifndef UNUSED
+#define UNUSED(x) ((x)=(x))
+#endif
+
+#endif
diff --git a/test/INDEX b/test/INDEX
new file mode 100644
index 0000000..772c82f
--- /dev/null
+++ b/test/INDEX
@@ -0,0 +1,15 @@
+# Index for testcases
+#
+# Test Description DataDir
+
+charset Charset initialisation/finalisation
+parserutils Library initialisation/finalisation
+aliases Encoding alias handling
+cscodec Charset codec implementation cscodec
+filter Input stream filtering
+inputstream Inputstream handling input
+
+# Regression tests
+regression/cscodec-segv Segfault in charset codecs
+regression/filter-segv Segfault in input filtering
+regression/stream-nomem Inputstream buffer expansion
diff --git a/test/Makefile b/test/Makefile
new file mode 100644
index 0000000..2ed0b44
--- /dev/null
+++ b/test/Makefile
@@ -0,0 +1,80 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack.
+# Each nesting level appends ".x" to $(sp); the parent's $(d) is saved in a
+# variable named after the new $(sp) so that it can be restored on exit.
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Extend toolchain settings: tests may include headers from the library's
+# src/ directory as well as from this directory
+override CFLAGS := $(CFLAGS) -I$(TOP)/src/ -I$(d)
+
+# Tests (names are relative to $(d); each one is built from <name>.c)
+TESTS_$(d) := aliases cscodec charset filter inputstream parserutils
+TESTS_$(d) := $(TESTS_$(d)) regression/cscodec-segv regression/filter-segv \
+ regression/stream-nomem
+
+# Items for top-level makefile to use: the test binaries and their
+# .gcda/.gcno files go on "make clean", the log goes on "make distclean"
+ITEMS_CLEAN := $(ITEMS_CLEAN) \
+ $(addprefix $(d), $(addsuffix $(EXEEXT), $(TESTS_$(d)))) \
+ $(addprefix $(d), $(addsuffix .gcda, $(TESTS_$(d)))) \
+ $(addprefix $(d), $(addsuffix .gcno, $(TESTS_$(d))))
+ITEMS_DISTCLEAN := $(ITEMS_DISTCLEAN) $(d)log
+
+# Targets for top-level makefile to run
+TARGET_TESTS := $(TARGET_TESTS) test_$(d)
+
+# Now we get to hack around so that we know what directory we're in.
+# $(d) no longer exists when running the commands for a target, so we can't
+# simply use it verbatim. Assigning to a variable doesn't really help, as
+# there's no guarantee that someone else hasn't overridden that variable.
+# So, what we do is make the target depend on $(d), then pick it out of the
+# dependency list when running commands. This isn't pretty, but is effective.
+# ($< below is that first prerequisite, i.e. this directory.)
+test_$(d): $(d) $(addprefix $(d), $(TESTS_$(d)))
+ @$(PERL) $(TOP)/$<testrunner.pl $(TOP)/$< $(EXEEXT)
+
+# Build rules for each test binary -- they all depend on the debug library.
+# $(1) is the test's source file and $(2) is the binary to produce. "$$" is
+# used so that those references survive the $(call) and are only expanded
+# when the generated rule is passed through $(eval) below.
+define compile_test
+$(2): $$(TOP)/$$(COMPONENT)-debug.a $(1)
+ @$$(ECHO) $$(ECHOFLAGS) "==> $(1)"
+ @$$(CC) -c -g $$(DEBUGCFLAGS) -o $$@.o $(1)
+ @$$(LD) -g -o $$@ $$@.o $$(LDFLAGS) -lparserutils-debug
+ @$$(RM) $$(RMFLAGS) $$@.o
+
+endef
+
+# Instantiate one compile_test rule per entry in TESTS_$(d)
+$(eval $(foreach TEST,$(addprefix $(d), $(TESTS_$(d))), \
+ $(call compile_test,$(addsuffix .c, $(TEST)),$(TEST))))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/test/README b/test/README
new file mode 100644
index 0000000..7e41abf
--- /dev/null
+++ b/test/README
@@ -0,0 +1,84 @@
+LibParserUtils testcases
+========================
+
+Testcases for LibParserUtils are self-contained binaries which test various
+parts of the library. These may make use of external data files to drive
+the testing.
+
+Testcase command lines
+----------------------
+
+Testcase command lines are in a unified format, thus:
+
+ <aliases_file> [ <data_file> ]
+
+The aliases file parameter will always be specified (as it is required for
+the library to work at all).
+
+The data file parameter is optional and may be provided on a test-by-test
+basis.
+
+Testcase output
+---------------
+
+Testcases may output anything at all to stdout. The final line of the
+output must begin with either PASS or FAIL (case sensitive), indicating
+the success status of the test.
+
+Test Index
+----------
+
+The test sources directory contains a file, named INDEX, which provides an
+index of all available test binaries. Any new test applications should be
+added to this index as they are created.
+
+The test index file format is as follows:
+
+ file = *line
+
+ line = ( entry / comment / blank ) LF
+
+ entry = testname 1*HTAB description [ 1*HTAB datadir ]
+ comment = "#" *non-newline
+ blank = 0<OCTET>
+
+ testname = 1*non-reserved
+ description = 1*non-reserved
+ datadir = 1*non-reserved
+
+ non-newline = VCHAR / WSP
+ non-reserved = VCHAR / SP
+
+Each entry contains a mandatory binary name and description followed by
+an optional data directory specifier. The data directory specifier is
+used to state the name of the directory containing data files for the
+test name. This directory will be searched for within the "data"
+directory in the source tree.
+
+If a data directory is specified, the test binary will be invoked for
+each data file listed within the data directory INDEX, passing the
+filename as the second parameter (<data_file>, above).
+
+Data Index
+----------
+
+Each test data directory contains a file, named INDEX, which provides an
+index of all available test data files.
+
+The data index file format is as follows:
+
+ file = *line
+
+ line = ( entry / comment / blank ) LF
+
+ entry = dataname 1*HTAB description
+ comment = "#" *non-newline
+ blank = 0<OCTET>
+
+ dataname = 1*non-reserved
+ description = 1*non-reserved
+
+ non-newline = VCHAR / WSP
+ non-reserved = VCHAR / SP
+
+Each entry contains a mandatory data file name and description.
diff --git a/test/aliases.c b/test/aliases.c
new file mode 100644
index 0000000..dff31c6
--- /dev/null
+++ b/test/aliases.c
@@ -0,0 +1,62 @@
+#include <stdio.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+
+#include "testutils.h"
+
+/* Dump routine called from main(); declared here under the name that is
+ * actually called, so the call is not an implicit declaration. */
+extern void parserutils_charset_aliases_dump(void);
+
+/**
+ * Allocation callback handed to the library; a thin wrapper over realloc()
+ *
+ * \param ptr  Block to resize or free, or NULL to allocate a new block
+ * \param len  Requested size in bytes; 0 frees ptr
+ * \param pw   Client private data (unused)
+ * \return Pointer to the (re)allocated block, or NULL on failure or
+ *         when len is 0.
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	(void) pw;
+
+	/* A zero-length request means "free". realloc(ptr, 0) is
+	 * implementation-defined, so release the block explicitly. */
+	if (len == 0) {
+		free(ptr);
+		return NULL;
+	}
+
+	return realloc(ptr, len);
+}
+
+/*
+ * Test driver for charset alias handling.
+ *
+ * Expects exactly one argument: the name of the aliases file. Prints a line
+ * beginning with FAIL and returns 1 if a lookup gives the wrong outcome,
+ * otherwise prints PASS and returns 0.
+ */
+int main (int argc, char **argv)
+{
+ parserutils_charset_aliases_canon *c;
+
+ if (argc != 2) {
+ printf("Usage: %s <filename>\n", argv[0]);
+ return 1;
+ }
+
+ /* NOTE(review): the result of the create call is not checked here */
+ parserutils_charset_aliases_create(argv[1], myrealloc, NULL);
+
+ parserutils_charset_aliases_dump();
+
+ /* A name that is not a charset must not be found */
+ c = parserutils_charset_alias_canonicalise("moose", 5);
+ if (c) {
+ printf("FAIL - found invalid encoding 'moose'\n");
+ return 1;
+ }
+
+ /* These two names must be found */
+ c = parserutils_charset_alias_canonicalise("csinvariant", 11);
+ if (c) {
+ printf("%s %d\n", c->name, c->mib_enum);
+ } else {
+ printf("FAIL - failed finding encoding 'csinvariant'\n");
+ return 1;
+ }
+
+ c = parserutils_charset_alias_canonicalise("nats-sefi-add", 13);
+ if (c) {
+ printf("%s %d\n", c->name, c->mib_enum);
+ } else {
+ printf("FAIL - failed finding encoding 'nats-sefi-add'\n");
+ return 1;
+ }
+
+ /* Print the name -> MIB enum and MIB enum -> name lookups for the
+ * last entry found (the values are printed, not checked) */
+ printf("%d\n", parserutils_charset_mibenum_from_name(c->name,
+ strlen(c->name)));
+
+ printf("%s\n", parserutils_charset_mibenum_to_name(c->mib_enum));
+
+ parserutils_charset_aliases_destroy(myrealloc, NULL);
+
+ printf("PASS\n");
+
+ return 0;
+}
diff --git a/test/charset.c b/test/charset.c
new file mode 100644
index 0000000..a793e7e
--- /dev/null
+++ b/test/charset.c
@@ -0,0 +1,31 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "charset/charset.h"
+
+#include "testutils.h"
+
+/**
+ * Allocation callback handed to the library; a thin wrapper over realloc()
+ *
+ * \param ptr  Block to resize or free, or NULL to allocate a new block
+ * \param len  Requested size in bytes; 0 frees ptr
+ * \param pw   Client private data (unused)
+ * \return Pointer to the (re)allocated block, or NULL on failure or
+ *         when len is 0.
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	(void) pw;
+
+	/* A zero-length request means "free". realloc(ptr, 0) is
+	 * implementation-defined, so release the block explicitly. */
+	if (len == 0) {
+		free(ptr);
+		return NULL;
+	}
+
+	return realloc(ptr, len);
+}
+
+/**
+ * Test driver for charset initialisation/finalisation.
+ *
+ * The library calls are made outside assert() so that they still run when
+ * assertions are compiled out, and a failure produces a line beginning
+ * with FAIL.
+ *
+ * \param argc  Argument count; must be 2
+ * \param argv  argv[1] is the name of the aliases file
+ * \return 0 on success (after printing PASS), 1 otherwise.
+ */
+int main(int argc, char **argv)
+{
+	if (argc != 2) {
+		printf("Usage: %s <filename>\n", argv[0]);
+		return 1;
+	}
+
+	if (parserutils_charset_initialise(argv[1], myrealloc, NULL) !=
+			PARSERUTILS_OK) {
+		printf("FAIL - charset initialisation failed\n");
+		return 1;
+	}
+
+	if (parserutils_charset_finalise(myrealloc, NULL) != PARSERUTILS_OK) {
+		printf("FAIL - charset finalisation failed\n");
+		return 1;
+	}
+
+	printf("PASS\n");
+
+	return 0;
+}
diff --git a/test/cscodec.c b/test/cscodec.c
new file mode 100644
index 0000000..d3b1b76
--- /dev/null
+++ b/test/cscodec.c
@@ -0,0 +1,232 @@
+#include <stdio.h>
+#include <string.h>
+
+#include "charset/charset.h"
+#include <parserutils/charset/codec.h>
+
+#include "utils/utils.h"
+
+#include "testutils.h"
+
+/* State shared between main(), handle_line() and run_test() */
+typedef struct line_ctx {
+ /* Codec under test; created in main() */
+ parserutils_charset_codec *codec;
+
+ /* Size of the input area: the value parse_filesize() returned */
+ size_t buflen;
+ /* Number of input bytes accumulated so far for the current testcase */
+ size_t bufused;
+ /* Input area; main() allocates 2 * buflen bytes starting here */
+ uint8_t *buf;
+ /* Size of the expected-output area (set equal to buflen) */
+ size_t explen;
+ /* Number of expected-output bytes accumulated so far */
+ size_t expused;
+ /* Expected-output area; points buflen bytes into buf's allocation */
+ uint8_t *exp;
+
+ /* True while lines are being collected after a "#data" line */
+ bool indata;
+ /* True while lines are being collected after an "#expected" line */
+ bool inexp;
+
+ /* Result the codec call is expected to return, parsed from the
+ * "#expected" line */
+ parserutils_error exp_ret;
+
+ /* Which conversion to run, parsed from the "#data" line */
+ enum { ENCODE, DECODE, BOTH } dir;
+} line_ctx;
+
+/* Per-line callback handed to parse_testfile() */
+static bool handle_line(const char *data, size_t datalen, void *pw);
+/* Run the codec over the collected input and compare with the expectation */
+static void run_test(line_ctx *ctx);
+
+/**
+ * Allocation callback handed to the library; a thin wrapper over realloc()
+ *
+ * \param ptr  Block to resize or free, or NULL to allocate a new block
+ * \param len  Requested size in bytes; 0 frees ptr
+ * \param pw   Client private data (unused)
+ * \return Pointer to the (re)allocated block, or NULL on failure or
+ *         when len is 0.
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	(void) pw;
+
+	/* A zero-length request means "free". realloc(ptr, 0) is
+	 * implementation-defined, so release the block explicitly. */
+	if (len == 0) {
+		free(ptr);
+		return NULL;
+	}
+
+	return realloc(ptr, len);
+}
+
+/**
+ * Test driver: feeds every testcase in a data file through the UTF-8 codec.
+ *
+ * Testcases are collected line by line via handle_line(); the last one in
+ * the file is run here once parsing has finished.
+ *
+ * \param argc  Argument count; must be 3
+ * \param argv  argv[1] is the aliases file, argv[2] the test data file
+ * \return 0 after printing PASS; 1 on a usage error, if parse_filesize()
+ *         returns 0, or if the working buffer cannot be allocated.
+ */
+int main(int argc, char **argv)
+{
+	line_ctx ctx;
+
+	if (argc != 3) {
+		printf("Usage: %s <aliases_file> <filename>\n", argv[0]);
+		return 1;
+	}
+
+	assert(parserutils_charset_initialise(argv[1], myrealloc, NULL) ==
+			PARSERUTILS_OK);
+
+	/* A charset with no codec implementation must be rejected */
+	assert(parserutils_charset_codec_create("NATS-SEFI-ADD",
+			myrealloc, NULL) == NULL);
+
+	ctx.codec = parserutils_charset_codec_create("UTF-8", myrealloc, NULL);
+	assert(ctx.codec != NULL);
+
+	ctx.buflen = parse_filesize(argv[2]);
+	if (ctx.buflen == 0)
+		return 1;
+
+	/* One allocation, split in half: input first, expected output second */
+	ctx.buf = malloc(2 * ctx.buflen);
+	if (ctx.buf == NULL) {
+		printf("Failed allocating %u bytes\n",
+				(unsigned int) ctx.buflen);
+		return 1;
+	}
+
+	ctx.exp = ctx.buf + ctx.buflen;
+	ctx.explen = ctx.buflen;
+
+	ctx.buf[0] = '\0';
+	ctx.exp[0] = '\0';
+	ctx.bufused = 0;
+	ctx.expused = 0;
+	ctx.indata = false;
+	ctx.inexp = false;
+	ctx.exp_ret = PARSERUTILS_OK;
+	/* run_test() reads dir even if the file never supplies a "#data"
+	 * line, so give it the same default handle_line() falls back to */
+	ctx.dir = BOTH;
+
+	assert(parse_testfile(argv[2], handle_line, &ctx) == true);
+
+	/* and run final test */
+	if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n')
+		ctx.bufused -= 1;
+
+	if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n')
+		ctx.expused -= 1;
+
+	run_test(&ctx);
+
+	free(ctx.buf);
+
+	parserutils_charset_codec_destroy(ctx.codec);
+
+	assert(parserutils_charset_finalise(myrealloc, NULL) ==
+			PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
+
+/**
+ * Process one line of the test data file.
+ *
+ * Lines starting with '#' are directives:
+ *   "#data <dir> <mode>"  starts a testcase's input section and selects
+ *                         the direction and the codec error mode;
+ *   "#expected <error>"   starts the expected-output section and records
+ *                         the expected return code;
+ *   "#reset"              resets the codec.
+ * A directive seen while in the expected-output section ends the current
+ * testcase, which is then run. Every other line is appended to whichever
+ * section is currently being collected.
+ *
+ * \param data     The line's bytes
+ * \param datalen  Length of the line in bytes
+ * \param pw       Pointer to the line_ctx
+ * \return true, always
+ */
+bool handle_line(const char *data, size_t datalen, void *pw)
+{
+	line_ctx *ctx = (line_ctx *) pw;
+
+	if (data[0] == '#') {
+		if (ctx->inexp) {
+			/* This marks end of testcase, so run it */
+
+			/* Drop one trailing newline from each section. Either
+			 * section may be empty, so check before indexing
+			 * (main() guards its final run the same way). */
+			if (ctx->bufused > 0 &&
+					ctx->buf[ctx->bufused - 1] == '\n')
+				ctx->bufused -= 1;
+
+			if (ctx->expused > 0 &&
+					ctx->exp[ctx->expused - 1] == '\n')
+				ctx->expused -= 1;
+
+			run_test(ctx);
+
+			ctx->buf[0] = '\0';
+			ctx->exp[0] = '\0';
+			ctx->bufused = 0;
+			ctx->expused = 0;
+			ctx->exp_ret = PARSERUTILS_OK;
+		}
+
+		if (strncasecmp(data+1, "data", 4) == 0) {
+			parserutils_charset_codec_optparams params;
+			/* Skip "#data " to reach the direction keyword */
+			const char *ptr = data + 6;
+
+			ctx->indata = true;
+			ctx->inexp = false;
+
+			if (strncasecmp(ptr, "decode", 6) == 0)
+				ctx->dir = DECODE;
+			else if (strncasecmp(ptr, "encode", 6) == 0)
+				ctx->dir = ENCODE;
+			else
+				ctx->dir = BOTH;
+
+			/* Skip the direction keyword and its separator */
+			ptr += 7;
+
+			if (strncasecmp(ptr, "LOOSE", 5) == 0) {
+				params.error_mode.mode =
+					PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
+			} else if (strncasecmp(ptr, "STRICT", 6) == 0) {
+				params.error_mode.mode =
+					PARSERUTILS_CHARSET_CODEC_ERROR_STRICT;
+			} else {
+				params.error_mode.mode =
+					PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT;
+			}
+
+			assert(parserutils_charset_codec_setopt(ctx->codec,
+				PARSERUTILS_CHARSET_CODEC_ERROR_MODE,
+				(parserutils_charset_codec_optparams *) &params)
+				== PARSERUTILS_OK);
+		} else if (strncasecmp(data+1, "expected", 8) == 0) {
+			ctx->indata = false;
+			ctx->inexp = true;
+
+			/* Skip "#expected " to reach the error name */
+			ctx->exp_ret = parserutils_error_from_string(data + 10,
+					datalen - 10 - 1 /* \n */);
+		} else if (strncasecmp(data+1, "reset", 5) == 0) {
+			ctx->indata = false;
+			ctx->inexp = false;
+
+			parserutils_charset_codec_reset(ctx->codec);
+		}
+	} else {
+		if (ctx->indata) {
+			memcpy(ctx->buf + ctx->bufused, data, datalen);
+			ctx->bufused += datalen;
+		}
+		if (ctx->inexp) {
+			memcpy(ctx->exp + ctx->expused, data, datalen);
+			ctx->expused += datalen;
+		}
+	}
+
+	return true;
+}
+
+/**
+ * Run the collected testcase through the codec and check the outcome.
+ *
+ * Depending on ctx->dir the input is decoded, encoded, or decoded and then
+ * re-encoded. Each codec call must return ctx->exp_ret, and the first
+ * ctx->expused bytes of the output must equal the expected data. The
+ * output and the expectation are printed as hex before being compared.
+ *
+ * \param ctx  Testcase state; the input is ctx->buf[0 .. bufused) and the
+ *             expectation is ctx->exp[0 .. expused)
+ */
+void run_test(line_ctx *ctx)
+{
+	static int testnum;
+	size_t destlen = ctx->bufused * 4;
+	/* One extra byte keeps the array size non-zero when the input section
+	 * is empty; a zero-length VLA is undefined behaviour. */
+	uint8_t dest[destlen + 1];
+	uint8_t *pdest = dest;
+	const uint8_t *psrc = ctx->buf;
+	size_t srclen = ctx->bufused;
+	size_t i;
+
+	/* Bytes the codec does not write are printed and compared below, so
+	 * give them a defined value. */
+	memset(dest, 0, sizeof(dest));
+
+	if (ctx->dir == DECODE) {
+		assert(parserutils_charset_codec_decode(ctx->codec,
+				&psrc, &srclen,
+				&pdest, &destlen) == ctx->exp_ret);
+	} else if (ctx->dir == ENCODE) {
+		assert(parserutils_charset_codec_encode(ctx->codec,
+				&psrc, &srclen,
+				&pdest, &destlen) == ctx->exp_ret);
+	} else {
+		size_t templen = ctx->bufused * 4;
+		uint8_t temp[templen + 1];
+		uint8_t *ptemp = temp;
+
+		memset(temp, 0, sizeof(temp));
+
+		assert(parserutils_charset_codec_decode(ctx->codec,
+				&psrc, &srclen,
+				&ptemp, &templen) == ctx->exp_ret);
+		/* Feed exactly the bytes the decode step produced back in */
+		ptemp = temp;
+		templen = ctx->bufused * 4 - templen;
+		assert(parserutils_charset_codec_encode(ctx->codec,
+				(const uint8_t **) &ptemp, &templen,
+				&pdest, &destlen) == ctx->exp_ret);
+	}
+
+	/* NOTE(review): the loops and memcmp below assume expused does not
+	 * exceed the size of dest (4 * bufused + 1) -- confirm against the
+	 * test data. */
+	printf("%d: Read '", ++testnum);
+	for (i = 0; i < ctx->expused; i++) {
+		printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf],
+				"0123456789abcdef"[dest[i] & 0xf]);
+	}
+	printf("' Expected '");
+	for (i = 0; i < ctx->expused; i++) {
+		printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf],
+				"0123456789abcdef"[ctx->exp[i] & 0xf]);
+	}
+	printf("'\n");
+
+	assert(memcmp(dest, ctx->exp, ctx->expused) == 0);
+}
+
diff --git a/test/data/Aliases b/test/data/Aliases
new file mode 100644
index 0000000..db61ff1
--- /dev/null
+++ b/test/data/Aliases
@@ -0,0 +1,302 @@
+# > Unicode:Files.Aliases
+# Mapping of character set encoding names to their canonical form
+#
+# Lines starting with a '#' are comments, blank lines are ignored.
+#
+# Based on http://www.iana.org/assignments/character-sets and
+# http://www.iana.org/assignments/ianacharset-mib
+#
+# Canonical Form MIBenum Aliases...
+#
+US-ASCII 3 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ASCII ISO646-US ANSI_X3.4-1968 us IBM367 cp367 csASCII
+ISO-10646-UTF-1 27 csISO10646UTF1
+ISO_646.basic:1983 28 ref csISO646basic1983
+INVARIANT 29 csINVARIANT
+ISO_646.irv:1983 30 iso-ir-2 irv csISO2IntlRefVersion
+BS_4730 20 iso-ir-4 ISO646-GB gb uk csISO4UnitedKingdom
+NATS-SEFI 31 iso-ir-8-1 csNATSSEFI
+NATS-SEFI-ADD 32 iso-ir-8-2 csNATSSEFIADD
+NATS-DANO 33 iso-ir-9-1 csNATSDANO
+NATS-DANO-ADD 34 iso-ir-9-2 csNATSDANOADD
+SEN_850200_B 35 iso-ir-10 FI ISO646-FI ISO646-SE se csISO10Swedish
+SEN_850200_C 21 iso-ir-11 ISO646-SE2 se2 csISO11SwedishForNames
+KS_C_5601-1987 36 iso-ir-149 KS_C_5601-1989 KSC_5601 korean csKSC56011987
+ISO-2022-KR 37 csISO2022KR
+EUC-KR 38 csEUCKR EUCKR
+ISO-2022-JP 39 csISO2022JP
+ISO-2022-JP-2 40 csISO2022JP2
+ISO-2022-CN 104
+ISO-2022-CN-EXT 105
+JIS_C6220-1969-jp 41 JIS_C6220-1969 iso-ir-13 katakana x0201-7 csISO13JISC6220jp
+JIS_C6220-1969-ro 42 iso-ir-14 jp ISO646-JP csISO14JISC6220ro
+IT 22 iso-ir-15 ISO646-IT csISO15Italian
+PT 43 iso-ir-16 ISO646-PT csISO16Portuguese
+ES 23 iso-ir-17 ISO646-ES csISO17Spanish
+greek7-old 44 iso-ir-18 csISO18Greek7Old
+latin-greek 45 iso-ir-19 csISO19LatinGreek
+DIN_66003 24 iso-ir-21 de ISO646-DE csISO21German
+NF_Z_62-010_(1973) 46 iso-ir-25 ISO646-FR1 csISO25French
+Latin-greek-1 47 iso-ir-27 csISO27LatinGreek1
+ISO_5427 48 iso-ir-37 csISO5427Cyrillic
+JIS_C6226-1978 49 iso-ir-42 csISO42JISC62261978
+BS_viewdata 50 iso-ir-47 csISO47BSViewdata
+INIS 51 iso-ir-49 csISO49INIS
+INIS-8 52 iso-ir-50 csISO50INIS8
+INIS-cyrillic 53 iso-ir-51 csISO51INISCyrillic
+ISO_5427:1981 54 iso-ir-54 ISO5427Cyrillic1981
+ISO_5428:1980 55 iso-ir-55 csISO5428Greek
+GB_1988-80 56 iso-ir-57 cn ISO646-CN csISO57GB1988
+GB_2312-80 57 iso-ir-58 chinese csISO58GB231280
+NS_4551-1 25 iso-ir-60 ISO646-NO no csISO60DanishNorwegian csISO60Norwegian1
+NS_4551-2 58 ISO646-NO2 iso-ir-61 no2 csISO61Norwegian2
+NF_Z_62-010 26 iso-ir-69 ISO646-FR fr csISO69French
+videotex-suppl 59 iso-ir-70 csISO70VideotexSupp1
+PT2 60 iso-ir-84 ISO646-PT2 csISO84Portuguese2
+ES2 61 iso-ir-85 ISO646-ES2 csISO85Spanish2
+MSZ_7795.3 62 iso-ir-86 ISO646-HU hu csISO86Hungarian
+JIS_C6226-1983 63 iso-ir-87 x0208 JIS_X0208-1983 csISO87JISX0208
+greek7 64 iso-ir-88 csISO88Greek7
+ASMO_449 65 ISO_9036 arabic7 iso-ir-89 csISO89ASMO449
+iso-ir-90 66 csISO90
+JIS_C6229-1984-a 67 iso-ir-91 jp-ocr-a csISO91JISC62291984a
+JIS_C6229-1984-b 68 iso-ir-92 ISO646-JP-OCR-B jp-ocr-b csISO92JISC62991984b
+JIS_C6229-1984-b-add 69 iso-ir-93 jp-ocr-b-add csISO93JIS62291984badd
+JIS_C6229-1984-hand 70 iso-ir-94 jp-ocr-hand csISO94JIS62291984hand
+JIS_C6229-1984-hand-add 71 iso-ir-95 jp-ocr-hand-add csISO95JIS62291984handadd
+JIS_C6229-1984-kana 72 iso-ir-96 csISO96JISC62291984kana
+ISO_2033-1983 73 iso-ir-98 e13b csISO2033
+ANSI_X3.110-1983 74 iso-ir-99 CSA_T500-1983 NAPLPS csISO99NAPLPS
+ISO-8859-1 4 iso-ir-100 ISO_8859-1 ISO_8859-1:1987 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 ISO8859-1
+ISO-8859-2 5 iso-ir-101 ISO_8859-2 ISO_8859-2:1987 latin2 l2 csISOLatin2 8859_2 ISO8859-2
+T.61-7bit 75 iso-ir-102 csISO102T617bit
+T.61-8bit 76 T.61 iso-ir-103 csISO103T618bit
+ISO-8859-3 6 iso-ir-109 ISO_8859-3 ISO_8859-3:1988 latin3 l3 csISOLatin3 8859_3 ISO8859-3
+ISO-8859-4 7 iso-ir-110 ISO_8859-4 ISO_8859-4:1988 latin4 l4 csISOLatin4 8859_4 ISO8859-4
+ECMA-cyrillic 77 iso-ir-111 KOI8-E csISO111ECMACyrillic
+CSA_Z243.4-1985-1 78 iso-ir-121 ISO646-CA csa7-1 ca csISO121Canadian1
+CSA_Z243.4-1985-2 79 iso-ir-122 ISO646-CA2 csa7-2 csISO122Canadian2
+CSA_Z243.4-1985-gr 80 iso-ir-123 csISO123CSAZ24341985gr
+ISO-8859-6 9 iso-ir-127 ISO_8859-6 ISO_8859-6:1987 ECMA-114 ASMO-708 arabic csISOLatinArabic
+ISO-8859-6-E 81 csISO88596E ISO_8859-6-E
+ISO-8859-6-I 82 csISO88596I ISO_8859-6-I
+ISO-8859-7 10 iso-ir-126 ISO_8859-7 ISO_8859-7:1987 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 ISO8859-7
+T.101-G2 83 iso-ir-128 csISO128T101G2
+ISO-8859-8 11 iso-ir-138 ISO_8859-8 ISO_8859-8:1988 hebrew csISOLatinHebrew 8859_8 ISO8859-8
+ISO-8859-8-E 84 csISO88598E ISO_8859-8-E
+ISO-8859-8-I 85 csISO88598I ISO_8859-8-I
+CSN_369103 86 iso-ir-139 csISO139CSN369103
+JUS_I.B1.002 87 iso-ir-141 ISO646-YU js yu csISO141JUSIB1002
+ISO_6937-2-add 14 iso-ir-142 csISOTextComm
+IEC_P27-1 88 iso-ir-143 csISO143IECP271
+ISO-8859-5 8 iso-ir-144 ISO_8859-5 ISO_8859-5:1988 cyrillic csISOLatinCyrillic 8859_5 ISO8859-5
+JUS_I.B1.003-serb 89 iso-ir-146 serbian csISO146Serbian
+JUS_I.B1.003-mac 90 macedonian iso-ir-147 csISO147Macedonian
+ISO-8859-9 12 iso-ir-148 ISO_8859-9 ISO_8859-9:1989 latin5 l5 csISOLatin5 8859_9 ISO8859-9
+greek-ccitt 91 iso-ir-150 csISO150 csISO150GreekCCITT
+NC_NC00-10:81 92 cuba iso-ir-151 ISO646-CU csISO151Cuba
+ISO_6937-2-25 93 iso-ir-152 csISO6937Add
+GOST_19768-74 94 ST_SEV_358-88 iso-ir-153 csISO153GOST1976874
+ISO_8859-supp 95 iso-ir-154 latin1-2-5 csISO8859Supp
+ISO_10367-box 96 iso-ir-155 csISO10367Box
+ISO-8859-10 13 iso-ir-157 l6 ISO_8859-10:1992 csISOLatin6 latin6 8859_10 ISO8859-10
+latin-lap 97 lap iso-ir-158 csISO158Lap
+JIS_X0212-1990 98 x0212 iso-ir-159 csISO159JISX02121990
+DS_2089 99 DS2089 ISO646-DK dk csISO646Danish
+us-dk 100 csUSDK
+dk-us 101 csDKUS
+JIS_X0201 15 X0201 csHalfWidthKatakana
+KSC5636 102 ISO646-KR csKSC5636
+ISO-10646-UCS-2 1000 csUnicode UCS-2 UCS2
+ISO-10646-UCS-4 1001 csUCS4 UCS-4 UCS4
+DEC-MCS 2008 dec csDECMCS
+hp-roman8 2004 roman8 r8 csHPRoman8
+macintosh 2027 mac csMacintosh MACROMAN MAC-ROMAN X-MAC-ROMAN
+IBM037 2028 cp037 ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037
+IBM038 2029 EBCDIC-INT cp038 csIBM038
+IBM273 2030 CP273 csIBM273
+IBM274 2031 EBCDIC-BE CP274 csIBM274
+IBM275 2032 EBCDIC-BR cp275 csIBM275
+IBM277 2033 EBCDIC-CP-DK EBCDIC-CP-NO csIBM277
+IBM278 2034 CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278
+IBM280 2035 CP280 ebcdic-cp-it csIBM280
+IBM281 2036 EBCDIC-JP-E cp281 csIBM281
+IBM284 2037 CP284 ebcdic-cp-es csIBM284
+IBM285 2038 CP285 ebcdic-cp-gb csIBM285
+IBM290 2039 cp290 EBCDIC-JP-kana csIBM290
+IBM297 2040 cp297 ebcdic-cp-fr csIBM297
+IBM420 2041 cp420 ebcdic-cp-ar1 csIBM420
+IBM423 2042 cp423 ebcdic-cp-gr csIBM423
+IBM424 2043 cp424 ebcdic-cp-he csIBM424
+IBM437 2011 cp437 437 csPC8CodePage437
+IBM500 2044 CP500 ebcdic-cp-be ebcdic-cp-ch csIBM500
+IBM775 2087 cp775 csPC775Baltic
+IBM850 2009 cp850 850 csPC850Multilingual
+IBM851 2045 cp851 851 csIBM851
+IBM852 2010 cp852 852 csPCp852
+IBM855 2046 cp855 855 csIBM855
+IBM857 2047 cp857 857 csIBM857
+IBM860 2048 cp860 860 csIBM860
+IBM861 2049 cp861 861 cp-is csIBM861
+IBM862 2013 cp862 862 csPC862LatinHebrew
+IBM863 2050 cp863 863 csIBM863
+IBM864 2051 cp864 csIBM864
+IBM865 2052 cp865 865 csIBM865
+IBM866 2086 cp866 866 csIBM866
+IBM868 2053 CP868 cp-ar csIBM868
+IBM869 2054 cp869 869 cp-gr csIBM869
+IBM870 2055 CP870 ebcdic-cp-roece ebcdic-cp-yu csIBM870
+IBM871 2056 CP871 ebcdic-cp-is csIBM871
+IBM880 2057 cp880 EBCDIC-Cyrillic csIBM880
+IBM891 2058 cp891 csIBM891
+IBM903 2059 cp903 csIBM903
+IBM904 2060 cp904 904 csIBBM904
+IBM905 2061 CP905 ebcdic-cp-tr csIBM905
+IBM918 2062 CP918 ebcdic-cp-ar2 csIBM918
+IBM1026 2063 CP1026 csIBM1026
+EBCDIC-AT-DE 2064 csIBMEBCDICATDE
+EBCDIC-AT-DE-A 2065 csEBCDICATDEA
+EBCDIC-CA-FR 2066 csEBCDICCAFR
+EBCDIC-DK-NO 2067 csEBCDICDKNO
+EBCDIC-DK-NO-A 2068 csEBCDICDKNOA
+EBCDIC-FI-SE 2069 csEBCDICFISE
+EBCDIC-FI-SE-A 2070 csEBCDICFISEA
+EBCDIC-FR 2071 csEBCDICFR
+EBCDIC-IT 2072 csEBCDICIT
+EBCDIC-PT 2073 csEBCDICPT
+EBCDIC-ES 2074 csEBCDICES
+EBCDIC-ES-A 2075 csEBCDICESA
+EBCDIC-ES-S 2076 csEBCDICESS
+EBCDIC-UK 2077 csEBCDICUK
+EBCDIC-US 2078 csEBCDICUS
+UNKNOWN-8BIT 2079 csUnknown8BiT
+MNEMONIC 2080 csMnemonic
+MNEM 2081 csMnem
+VISCII 2082 csVISCII
+VIQR 2083 csVIQR
+KOI8-R 2084 csKOI8R
+KOI8-U 2088
+IBM00858 2089 CCSID00858 CP00858 PC-Multilingual-850+euro
+IBM00924 2090 CCSID00924 CP00924 ebcdic-Latin9--euro
+IBM01140 2091 CCSID01140 CP01140 ebcdic-us-37+euro
+IBM01141 2092 CCSID01141 CP01141 ebcdic-de-273+euro
+IBM01142 2093 CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro
+IBM01143 2094 CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro
+IBM01144 2095 CCSID01144 CP01144 ebcdic-it-280+euro
+IBM01145 2096 CCSID01145 CP01145 ebcdic-es-284+euro
+IBM01146 2097 CCSID01146 CP01146 ebcdic-gb-285+euro
+IBM01147 2098 CCSID01147 CP01147 ebcdic-fr-297+euro
+IBM01148 2099 CCSID01148 CP01148 ebcdic-international-500+euro
+IBM01149 2100 CCSID01149 CP01149 ebcdic-is-871+euro
+Big5-HKSCS 2101
+IBM1047 2102 IBM-1047
+PTCP154 2103 csPTCP154 PT154 CP154 Cyrillic-Asian
+Amiga-1251 2104 Ami1251 Amiga1251 Ami-1251
+KOI7-switched 2105
+UNICODE-1-1 1010 csUnicode11
+SCSU 1011
+UTF-7 1012
+UTF-16BE 1013
+UTF-16LE 1014
+UTF-16 1015
+CESU-8 1016 csCESU-8
+UTF-32 1017
+UTF-32BE 1018
+UTF-32LE 1019
+BOCU-1 1020 csBOCU-1
+UNICODE-1-1-UTF-7 103 csUnicode11UTF7
+UTF-8 106 UNICODE-1-1-UTF-8 UNICODE-2-0-UTF-8 utf8
+ISO-8859-13 109 8859_13 ISO8859-13
+ISO-8859-14 110 iso-ir-199 ISO_8859-14:1998 ISO_8859-14 latin8 iso-celtic l8 8859_14 ISO8859-14
+ISO-8859-15 111 ISO_8859-15 Latin-9 8859_15 ISO8859-15
+ISO-8859-16 112 iso-ir-226 ISO_8859-16:2001 ISO_8859-16 latin10 l10
+GBK 113 CP936 MS936 windows-936
+GB18030 114
+OSD_EBCDIC_DF04_15 115
+OSD_EBCDIC_DF03_IRV 116
+OSD_EBCDIC_DF04_1 117
+JIS_Encoding 16 csJISEncoding
+Shift_JIS 17 MS_Kanji csShiftJIS X-SJIS Shift-JIS
+EUC-JP 18 csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese EUCJP
+Extended_UNIX_Code_Fixed_Width_for_Japanese 19 csEUCFixWidJapanese
+ISO-10646-UCS-Basic 1002 csUnicodeASCII
+ISO-10646-Unicode-Latin1 1003 csUnicodeLatin1 ISO-10646
+ISO-Unicode-IBM-1261 1005 csUnicodeIBM1261
+ISO-Unicode-IBM-1268 1006 csUnicodeIBM1268
+ISO-Unicode-IBM-1276 1007 csUnicodeIBM1276
+ISO-Unicode-IBM-1264 1008 csUnicodeIBM1264
+ISO-Unicode-IBM-1265 1009 csUnicodeIBM1265
+ISO-8859-1-Windows-3.0-Latin-1 2000 csWindows30Latin1
+ISO-8859-1-Windows-3.1-Latin-1 2001 csWindows31Latin1
+ISO-8859-2-Windows-Latin-2 2002 csWindows31Latin2
+ISO-8859-9-Windows-Latin-5 2003 csWindows31Latin5
+Adobe-Standard-Encoding 2005 csAdobeStandardEncoding
+Ventura-US 2006 csVenturaUS
+Ventura-International 2007 csVenturaInternational
+PC8-Danish-Norwegian 2012 csPC8DanishNorwegian
+PC8-Turkish 2014 csPC8Turkish
+IBM-Symbols 2015 csIBMSymbols
+IBM-Thai 2016 csIBMThai
+HP-Legal 2017 csHPLegal
+HP-Pi-font 2018 csHPPiFont
+HP-Math8 2019 csHPMath8
+Adobe-Symbol-Encoding 2020 csHPPSMath
+HP-DeskTop 2021 csHPDesktop
+Ventura-Math 2022 csVenturaMath
+Microsoft-Publishing 2023 csMicrosoftPublishing
+Windows-31J 2024 csWindows31J
+GB2312 2025 csGB2312 EUC-CN EUCCN CN-GB
+Big5 2026 csBig5 BIG-FIVE BIG-5 CN-BIG5 BIG_FIVE
+windows-1250 2250 CP1250 MS-EE
+windows-1251 2251 CP1251 MS-CYRL
+windows-1252 2252 CP1252 MS-ANSI
+windows-1253 2253 CP1253 MS-GREEK
+windows-1254 2254 CP1254 MS-TURK
+windows-1255 2255
+windows-1256 2256 CP1256 MS-ARAB
+windows-1257 2257 CP1257 WINBALTRIM
+windows-1258 2258
+TIS-620 2259
+HZ-GB-2312 2085
+
+# Additional encodings not defined by IANA
+
+# Arbitrary allocations
+#CP737 3001
+#CP853 3002
+#CP856 3003
+CP874 3004 WINDOWS-874
+#CP922 3005
+#CP1046 3006
+#CP1124 3007
+#CP1125 3008 WINDOWS-1125
+#CP1129 3009
+#CP1133 3010 IBM-CP1133
+#CP1161 3011 IBM-1161 IBM1161 CSIBM1161
+#CP1162 3012 IBM-1162 IBM1162 CSIBM1162
+#CP1163 3013 IBM-1163 IBM1163 CSIBM1163
+#GEORGIAN-ACADEMY 3014
+#GEORGIAN-PS 3015
+#KOI8-RU 3016
+#KOI8-T 3017
+#MACARABIC 3018 X-MAC-ARABIC MAC-ARABIC
+#MACCROATIAN 3019 X-MAC-CROATIAN MAC-CROATIAN
+#MACGREEK 3020 X-MAC-GREEK MAC-GREEK
+#MACHEBREW 3021 X-MAC-HEBREW MAC-HEBREW
+#MACICELAND 3022 X-MAC-ICELAND MAC-ICELAND
+#MACROMANIA 3023 X-MAC-ROMANIA MAC-ROMANIA
+#MACTHAI 3024 X-MAC-THAI MAC-THAI
+#MACTURKISH 3025 X-MAC-TURKISH MAC-TURKISH
+#MULELAO-1 3026
+
+# From Unicode Lib
+ISO-IR-182 4000
+ISO-IR-197 4002
+ISO-2022-JP-1 4008
+MACCYRILLIC 4009 X-MAC-CYRILLIC MAC-CYRILLIC
+MACUKRAINE 4010 X-MAC-UKRAINIAN MAC-UKRAINIAN
+MACCENTRALEUROPE 4011 X-MAC-CENTRALEURROMAN MAC-CENTRALEURROMAN
+JOHAB 4012
+ISO-8859-11 4014 iso-ir-166 ISO_8859-11 ISO8859-11 8859_11
+X-CURRENT 4999 X-SYSTEM
+X-ACORN-LATIN1 5001
+X-ACORN-FUZZY 5002
diff --git a/test/data/cscodec/INDEX b/test/data/cscodec/INDEX
new file mode 100644
index 0000000..d6d338a
--- /dev/null
+++ b/test/data/cscodec/INDEX
@@ -0,0 +1,6 @@
+# Index file for charset codec tests
+#
+# Test Description
+
+simple.dat Simple tests, designed to validate testdriver
+UTF-8-test.txt Markus Kuhn's UTF-8 decoding test file
diff --git a/test/data/cscodec/UTF-8-test.txt b/test/data/cscodec/UTF-8-test.txt
new file mode 100644
index 0000000..920e54e
--- /dev/null
+++ b/test/data/cscodec/UTF-8-test.txt
Binary files differ
diff --git a/test/data/cscodec/simple.dat b/test/data/cscodec/simple.dat
new file mode 100644
index 0000000..3e2c7ae
--- /dev/null
+++ b/test/data/cscodec/simple.dat
Binary files differ
diff --git a/test/data/input/INDEX b/test/data/input/INDEX
new file mode 100644
index 0000000..c2c97ea
--- /dev/null
+++ b/test/data/input/INDEX
@@ -0,0 +1,5 @@
+# Index file for inputstream tests
+#
+# Test Description
+
+UTF-8-test.txt Markus Kuhn's UTF-8 decoding test file
diff --git a/test/data/input/UTF-8-test.txt b/test/data/input/UTF-8-test.txt
new file mode 100644
index 0000000..abd16f7
--- /dev/null
+++ b/test/data/input/UTF-8-test.txt
Binary files differ
diff --git a/test/filter.c b/test/filter.c
new file mode 100644
index 0000000..ff4d1e7
--- /dev/null
+++ b/test/filter.c
@@ -0,0 +1,357 @@
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/parserutils.h>
+
+#include "utils/utils.h"
+
+#include "input/filter.h"
+
+#include "testutils.h"
+
+/**
+ * Allocation callback handed to the library calls in this test.
+ *
+ * \param ptr Existing allocation to resize, passed straight to realloc()
+ * \param len Requested size in bytes
+ * \param pw Private data pointer (ignored)
+ * \return Whatever realloc(ptr, len) returns
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+ UNUSED(pw);
+
+ return realloc(ptr, len);
+}
+
+/**
+ * Test driver for the input filter.
+ *
+ * Pushes a series of byte sequences through
+ * parserutils_filter_process_chunk() on a single filter object, checking
+ * the returned status and the bytes left in outbuf for each case. The
+ * filter is reset between cases.
+ *
+ * After every process_chunk call a diagnostic line is printed showing:
+ * the inlen bytes at in, inlen, the (out - outbuf) bytes at the start
+ * of outbuf, and outlen.
+ *
+ * \param argc Must be 2
+ * \param argv argv[1] is handed to parserutils_initialise()
+ * \return 0 after printing "PASS", 1 on incorrect usage. A failed assert
+ * does not return (see testutils.h).
+ */
+int main(int argc, char **argv)
+{
+ parserutils_filter_optparams params;
+ parserutils_filter *input;
+ uint8_t inbuf[64], outbuf[64];
+ size_t inlen, outlen;
+ const uint8_t *in = inbuf;
+ uint8_t *out = outbuf;
+
+ if (argc != 2) {
+ printf("Usage: %s <filename>\n", argv[0]);
+ return 1;
+ }
+
+ /* Initialise library */
+ assert(parserutils_initialise(argv[1], myrealloc, NULL) ==
+ PARSERUTILS_OK);
+
+ /* Create input filter */
+ input = parserutils_filter_create("UTF-8", myrealloc, NULL);
+ assert(input);
+
+ /* Convert filter to UTF-8 encoding */
+ params.encoding.name = "UTF-8";
+ assert(parserutils_filter_setopt(input, PARSERUTILS_FILTER_SET_ENCODING,
+ (parserutils_filter_optparams *) &params) ==
+ PARSERUTILS_OK);
+
+
+ /* Simple case - valid input & output buffer large enough */
+ in = inbuf;
+ out = outbuf;
+ strcpy((char *) inbuf, "hell\xc2\xa0o!");
+ inlen = strlen((const char *) inbuf);
+ outbuf[0] = '\0';
+ outlen = 64;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+ assert(memcmp(outbuf, "hell\xc2\xa0o!",
+ SLEN("hell\xc2\xa0o!")) == 0);
+
+
+ /* Too small an output buffer; no encoding edge cases */
+ in = inbuf;
+ out = outbuf;
+ strcpy((char *) inbuf, "hello!");
+ inlen = strlen((const char *) inbuf);
+ outbuf[0] = '\0';
+ outlen = 5;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_NOMEM);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ /* Offer the rest of the 64-byte buffer: 64 minus the 5 bytes offered
+ * above, plus whatever outlen still reports from that call */
+ outlen = 64 - 5 + outlen;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+ assert(memcmp(outbuf, "hello!",
+ SLEN("hello!")) == 0);
+
+
+ /* Illegal input sequence; output buffer large enough */
+ in = inbuf;
+ out = outbuf;
+ strcpy((char *) inbuf, "hell\x96o!");
+ inlen = strlen((const char *) inbuf);
+ outbuf[0] = '\0';
+ outlen = 64;
+
+ /* Input does loose decoding, converting to U+FFFD if illegal
+ * input is encountered */
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+ assert(memcmp(outbuf, "hell\xef\xbf\xbdo!",
+ SLEN("hell\xef\xbf\xbdo!")) == 0);
+
+
+ /* Input ends mid-sequence */
+ in = inbuf;
+ out = outbuf;
+ strcpy((char *) inbuf, "hell\xc2\xa0o!");
+ /* Withhold the final 3 bytes so the first chunk is "hell\xc2" */
+ inlen = strlen((const char *) inbuf) - 3;
+ outbuf[0] = '\0';
+ outlen = 64;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ /* Make the 3 withheld bytes available for the second call */
+ inlen += 3;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+ assert(memcmp(outbuf, "hell\xc2\xa0o!",
+ SLEN("hell\xc2\xa0o!")) == 0);
+
+
+ /* Input ends mid-sequence, but second attempt has too small a
+ * buffer, but large enough to write out the incomplete character. */
+ in = inbuf;
+ out = outbuf;
+ strcpy((char *) inbuf, "hell\xc2\xa0o!");
+ inlen = strlen((const char *) inbuf) - 3;
+ outbuf[0] = '\0';
+ outlen = 64;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ inlen += 3;
+ outlen = 3;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_NOMEM);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ /* NOTE(review): presumably 7 of the 64 output bytes are in use at
+ * this point ("hell", the two-byte character and 'o') - confirm */
+ outlen = 64 - 7;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+ assert(memcmp(outbuf, "hell\xc2\xa0o!",
+ SLEN("hell\xc2\xa0o!")) == 0);
+
+
+ /* Input ends mid-sequence, but second attempt has too small a
+ * buffer, not large enough to write out the incomplete character. */
+ in = inbuf;
+ out = outbuf;
+ strcpy((char *) inbuf, "hell\xc2\xa0o!");
+ inlen = strlen((const char *) inbuf) - 3;
+ outbuf[0] = '\0';
+ outlen = 64;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ inlen += 3;
+ outlen = 1;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_NOMEM);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ /* NOTE(review): presumably only the 4 bytes of "hell" have been
+ * written so far, leaving 60 of the 64 bytes free - confirm */
+ outlen = 60;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+ assert(memcmp(outbuf, "hell\xc2\xa0o!",
+ SLEN("hell\xc2\xa0o!")) == 0);
+
+
+ /* Input ends mid-sequence, but second attempt contains
+ * invalid character */
+ in = inbuf;
+ out = outbuf;
+ strcpy((char *) inbuf, "hell\xc2\xc2o!");
+ inlen = strlen((const char *) inbuf) - 3;
+ outbuf[0] = '\0';
+ outlen = 64;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ inlen += 3;
+
+ /* Input does loose decoding, converting to U+FFFD if illegal
+ * input is encountered */
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+ assert(memcmp(outbuf, "hell\xef\xbf\xbd\xef\xbf\xbdo!",
+ SLEN("hell\xef\xbf\xbd\xef\xbf\xbdo!")) == 0);
+
+
+ /* Input ends mid-sequence, but second attempt contains another
+ * incomplete character */
+ in = inbuf;
+ out = outbuf;
+ strcpy((char *) inbuf, "hell\xc2\xa0\xc2\xa1o!");
+ /* First chunk is "hell\xc2"; the remaining 5 bytes are released
+ * in two further steps (2 bytes, then 3) */
+ inlen = strlen((const char *) inbuf) - 5;
+ outbuf[0] = '\0';
+ outlen = 64;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ inlen += 2;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ inlen += 3;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+ assert(memcmp(outbuf, "hell\xc2\xa0\xc2\xa1o!",
+ SLEN("hell\xc2\xa0\xc2\xa1o!")) == 0);
+
+
+ /* Input ends mid-sequence, but second attempt contains insufficient
+ * data to complete the incomplete character */
+ in = inbuf;
+ out = outbuf;
+ strcpy((char *) inbuf, "hell\xe2\x80\xa2o!");
+ /* First chunk is "hell\xe2"; the remaining 4 bytes are released
+ * in two further steps (1 byte, then 3) */
+ inlen = strlen((const char *) inbuf) - 4;
+ outbuf[0] = '\0';
+ outlen = 64;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ inlen += 1;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ inlen += 3;
+
+ assert(parserutils_filter_process_chunk(input, &in, &inlen,
+ &out, &outlen) == PARSERUTILS_OK);
+
+ printf("'%.*s' %d '%.*s' %d\n", (int) inlen, in, (int) inlen,
+ (int) (out - ((uint8_t *) outbuf)),
+ outbuf, (int) outlen);
+
+ assert(parserutils_filter_reset(input) == PARSERUTILS_OK);
+
+ assert(memcmp(outbuf, "hell\xe2\x80\xa2o!",
+ SLEN("hell\xe2\x80\xa2o!")) == 0);
+
+
+ /* Clean up */
+ parserutils_filter_destroy(input);
+
+ assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+ printf("PASS\n");
+
+ return 0;
+}
diff --git a/test/inputstream.c b/test/inputstream.c
new file mode 100644
index 0000000..bad3127
--- /dev/null
+++ b/test/inputstream.c
@@ -0,0 +1,97 @@
+#include <inttypes.h>
+#include <stdio.h>
+
+#include <parserutils/parserutils.h>
+#include <parserutils/charset/utf8.h>
+#include <parserutils/input/inputstream.h>
+
+#include "utils/utils.h"
+
+#include "testutils.h"
+
+/**
+ * Allocation callback handed to the library calls in this test.
+ *
+ * \param ptr Existing allocation to resize, passed straight to realloc()
+ * \param len Requested size in bytes
+ * \param pw Private data pointer (ignored)
+ * \return Whatever realloc(ptr, len) returns
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+ UNUSED(pw);
+
+ return realloc(ptr, len);
+}
+
+/**
+ * Test driver for the input stream.
+ *
+ * Reads the file named by argv[2] in CHUNK_SIZE pieces, appending each
+ * piece to a "UTF-8" input stream and draining the stream after every
+ * full chunk. Once the file is exhausted it inserts a short literal,
+ * signals end of data with a zero-length append and drains the stream
+ * until EOF is reported.
+ *
+ * Every file operation is checked: a failed seek/tell or a short read
+ * fails the test instead of silently feeding stale buffer contents to
+ * the stream.
+ *
+ * \param argc Must be 3
+ * \param argv argv[1] is handed to parserutils_initialise(),
+ *             argv[2] names the data file
+ * \return 0 after printing "PASS", 1 on incorrect usage or if the data
+ *         file cannot be opened. A failed assert does not return
+ *         (see testutils.h).
+ */
+int main(int argc, char **argv)
+{
+	parserutils_inputstream *stream;
+	FILE *fp;
+	long fsize;
+	size_t len;
+#define CHUNK_SIZE (4096)
+	uint8_t buf[CHUNK_SIZE];
+	uintptr_t c;
+	size_t clen;
+
+	if (argc != 3) {
+		printf("Usage: %s <aliases_file> <filename>\n", argv[0]);
+		return 1;
+	}
+
+	/* Initialise library */
+	assert(parserutils_initialise(argv[1], myrealloc, NULL) ==
+			PARSERUTILS_OK);
+
+	stream = parserutils_inputstream_create("UTF-8", 1, NULL,
+			myrealloc, NULL);
+	assert(stream != NULL);
+
+	fp = fopen(argv[2], "rb");
+	if (fp == NULL) {
+		printf("Failed opening %s\n", argv[2]);
+		return 1;
+	}
+
+	/* Determine the file length; ftell() reports failure as -1, which
+	 * must not be allowed to turn into an enormous size_t */
+	assert(fseek(fp, 0, SEEK_END) == 0);
+	fsize = ftell(fp);
+	assert(fsize >= 0);
+	assert(fseek(fp, 0, SEEK_SET) == 0);
+
+	len = (size_t) fsize;
+
+	/* Feed whole chunks, draining the stream after each one */
+	while (len >= CHUNK_SIZE) {
+		assert(fread(buf, 1, CHUNK_SIZE, fp) == (size_t) CHUNK_SIZE);
+
+		assert(parserutils_inputstream_append(stream,
+				buf, CHUNK_SIZE) == PARSERUTILS_OK);
+
+		len -= CHUNK_SIZE;
+
+		while ((c = parserutils_inputstream_peek(stream, 0, &clen)) !=
+				PARSERUTILS_INPUTSTREAM_OOD) {
+			parserutils_inputstream_advance(stream, clen);
+		}
+	}
+
+	/* Feed the final, partial chunk (if any) */
+	if (len > 0) {
+		assert(fread(buf, 1, len, fp) == len);
+
+		assert(parserutils_inputstream_append(stream,
+				buf, len) == PARSERUTILS_OK);
+	}
+
+	fclose(fp);
+
+	assert(parserutils_inputstream_insert(stream,
+			(const uint8_t *) "hello!!!",
+			SLEN("hello!!!")) == PARSERUTILS_OK);
+
+	/* A zero-length append marks the end of the input data */
+	assert(parserutils_inputstream_append(stream, NULL, 0) ==
+			PARSERUTILS_OK);
+
+	while ((c = parserutils_inputstream_peek(stream, 0, &clen)) !=
+			PARSERUTILS_INPUTSTREAM_EOF) {
+		parserutils_inputstream_advance(stream, clen);
+	}
+
+	parserutils_inputstream_destroy(stream);
+
+	assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
+
diff --git a/test/parserutils.c b/test/parserutils.c
new file mode 100644
index 0000000..c6d671a
--- /dev/null
+++ b/test/parserutils.c
@@ -0,0 +1,30 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <parserutils/parserutils.h>
+
+#include "testutils.h"
+
+/**
+ * Allocation callback handed to the library calls in this test.
+ *
+ * \param ptr Existing allocation to resize, passed straight to realloc()
+ * \param len Requested size in bytes
+ * \param pw Private data pointer (ignored)
+ * \return Whatever realloc(ptr, len) returns
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+ UNUSED(pw);
+
+ return realloc(ptr, len);
+}
+
+/**
+ * Smoke test: initialise the library and immediately finalise it again.
+ *
+ * \param argc Must be 2
+ * \param argv argv[1] is handed to parserutils_initialise()
+ * \return 0 after printing "PASS", 1 on incorrect usage. A failed assert
+ * does not return (see testutils.h).
+ */
+int main(int argc, char **argv)
+{
+ if (argc != 2) {
+ printf("Usage: %s <filename>\n", argv[0]);
+ return 1;
+ }
+
+ assert(parserutils_initialise(argv[1], myrealloc, NULL) ==
+ PARSERUTILS_OK);
+
+ assert (parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+ printf("PASS\n");
+
+ return 0;
+}
diff --git a/test/regression/cscodec-segv.c b/test/regression/cscodec-segv.c
new file mode 100644
index 0000000..5802fdf
--- /dev/null
+++ b/test/regression/cscodec-segv.c
@@ -0,0 +1,38 @@
+#include <stdio.h>
+
+#include "charset/charset.h"
+#include <parserutils/charset/codec.h>
+
+#include "testutils.h"
+
+/**
+ * Allocation callback handed to the library calls in this test.
+ *
+ * \param ptr Existing allocation to resize, passed straight to realloc()
+ * \param len Requested size in bytes
+ * \param pw Private data pointer (ignored)
+ * \return Whatever realloc(ptr, len) returns
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+ UNUSED(pw);
+
+ return realloc(ptr, len);
+}
+
+/**
+ * Regression test: create a "UTF-8" charset codec and destroy it again
+ * without ever using it.
+ *
+ * \param argc Must be 2
+ * \param argv argv[1] is handed to parserutils_charset_initialise()
+ * \return 0 after printing "PASS", 1 on incorrect usage. A failed assert
+ * does not return (see testutils.h).
+ */
+int main(int argc, char **argv)
+{
+ parserutils_charset_codec *codec;
+
+ if (argc != 2) {
+ printf("Usage: %s <aliases_file>\n", argv[0]);
+ return 1;
+ }
+
+ assert(parserutils_charset_initialise(argv[1], myrealloc, NULL) ==
+ PARSERUTILS_OK);
+
+ codec = parserutils_charset_codec_create("UTF-8", myrealloc, NULL);
+ assert(codec != NULL);
+
+ parserutils_charset_codec_destroy(codec);
+
+ assert(parserutils_charset_finalise(myrealloc, NULL) ==
+ PARSERUTILS_OK);
+
+ printf("PASS\n");
+
+ return 0;
+}
diff --git a/test/regression/filter-segv.c b/test/regression/filter-segv.c
new file mode 100644
index 0000000..761caab
--- /dev/null
+++ b/test/regression/filter-segv.c
@@ -0,0 +1,39 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <parserutils/parserutils.h>
+
+#include "input/filter.h"
+
+#include "testutils.h"
+
+/**
+ * Allocation callback handed to the library calls in this test.
+ *
+ * \param ptr Existing allocation to resize, passed straight to realloc()
+ * \param len Requested size in bytes
+ * \param pw Private data pointer (ignored)
+ * \return Whatever realloc(ptr, len) returns
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+ UNUSED(pw);
+
+ return realloc(ptr, len);
+}
+
+/**
+ * Regression test: create a "UTF-8" input filter and destroy it again
+ * without ever using it.
+ *
+ * \param argc Must be 2
+ * \param argv argv[1] is handed to parserutils_initialise()
+ * \return 0 after printing "PASS", 1 on incorrect usage. A failed assert
+ * does not return (see testutils.h).
+ */
+int main(int argc, char **argv)
+{
+ parserutils_filter *input;
+
+ if (argc != 2) {
+ printf("Usage: %s <filename>\n", argv[0]);
+ return 1;
+ }
+
+ assert(parserutils_initialise(argv[1], myrealloc, NULL) ==
+ PARSERUTILS_OK);
+
+ input = parserutils_filter_create("UTF-8", myrealloc, NULL);
+ assert(input);
+
+ parserutils_filter_destroy(input);
+
+ assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+ printf("PASS\n");
+
+ return 0;
+}
diff --git a/test/regression/stream-nomem.c b/test/regression/stream-nomem.c
new file mode 100644
index 0000000..f62b392
--- /dev/null
+++ b/test/regression/stream-nomem.c
@@ -0,0 +1,94 @@
+#include <stdio.h>
+#include <string.h>
+
+#include <parserutils/parserutils.h>
+#include <parserutils/input/inputstream.h>
+
+#include "utils/utils.h"
+
+#include "testutils.h"
+
+/**
+ * Allocation callback handed to the library calls in this test.
+ *
+ * \param ptr Existing allocation to resize, passed straight to realloc()
+ * \param len Requested size in bytes
+ * \param pw Private data pointer (ignored)
+ * \return Whatever realloc(ptr, len) returns
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+ UNUSED(pw);
+
+ return realloc(ptr, len);
+}
+
+/**
+ * Regression test: append BUFFER_SIZE bytes to a "UTF-8" input stream in
+ * one call, with the three bytes EF BF BD placed at offsets
+ * BUFFER_SIZE - 5 .. BUFFER_SIZE - 3, then signal end of data and drain
+ * the stream until EOF is reported.
+ *
+ * \param argc Must be 2
+ * \param argv argv[1] is handed to parserutils_initialise()
+ * \return 0 after printing "PASS", 1 on incorrect usage. A failed assert
+ * does not return (see testutils.h).
+ */
+int main(int argc, char **argv)
+{
+ parserutils_inputstream *stream;
+
+ /* This is specially calculated so that the inputstream is forced to
+ * reallocate (it assumes that the inputstream's buffer chunk size
+ * is 4k) */
+#define BUFFER_SIZE (4096 + 4)
+ uint8_t input_buffer[BUFFER_SIZE];
+// uint8_t *buffer;
+// size_t buflen;
+ uintptr_t c;
+ size_t clen;
+
+ if (argc != 2) {
+ printf("Usage: %s <aliases_file>\n", argv[0]);
+ return 1;
+ }
+
+ /* Populate the buffer with something sane */
+ memset(input_buffer, 'a', BUFFER_SIZE);
+ /* Now, set up our test data */
+ input_buffer[BUFFER_SIZE - 1] = '5';
+ input_buffer[BUFFER_SIZE - 2] = '4';
+ input_buffer[BUFFER_SIZE - 3] = '\xbd';
+ input_buffer[BUFFER_SIZE - 4] = '\xbf';
+ /* This byte will occupy the 4095th byte in the buffer and
+ * thus cause the entirety of U+FFFD to be buffered until after
+ * the buffer has been enlarged */
+ input_buffer[BUFFER_SIZE - 5] = '\xef';
+ input_buffer[BUFFER_SIZE - 6] = '3';
+ input_buffer[BUFFER_SIZE - 7] = '2';
+ input_buffer[BUFFER_SIZE - 8] = '1';
+
+ assert(parserutils_initialise(argv[1], myrealloc, NULL) ==
+ PARSERUTILS_OK);
+
+ stream = parserutils_inputstream_create("UTF-8", 0,
+ NULL, myrealloc, NULL);
+ assert(stream != NULL);
+
+ assert(parserutils_inputstream_append(stream,
+ input_buffer, BUFFER_SIZE) == PARSERUTILS_OK);
+
+ /* A zero-length append marks the end of the input data */
+ assert(parserutils_inputstream_append(stream, NULL, 0) ==
+ PARSERUTILS_OK);
+
+ /* Drain the stream one character at a time until EOF */
+ while ((c = parserutils_inputstream_peek(stream, 0, &clen)) !=
+ PARSERUTILS_INPUTSTREAM_EOF)
+ parserutils_inputstream_advance(stream, clen);
+
+/*
+ Disabled check. NOTE(review): it uses css_-prefixed names
+ (css_inputstream_claim_buffer, CSS_OK) that nothing else in this file
+ references, so it cannot simply be re-enabled as written - confirm
+ whether it should be ported or dropped.
+
+ assert(css_inputstream_claim_buffer(stream, &buffer, &buflen) ==
+ CSS_OK);
+
+ assert(buflen == BUFFER_SIZE);
+
+ printf("Buffer: '%.*s'\n", 8, buffer + (BUFFER_SIZE - 8));
+
+ assert( buffer[BUFFER_SIZE - 6] == '3' &&
+ buffer[BUFFER_SIZE - 5] == (uint8_t) '\xef' &&
+ buffer[BUFFER_SIZE - 4] == (uint8_t) '\xbf' &&
+ buffer[BUFFER_SIZE - 3] == (uint8_t) '\xbd' &&
+ buffer[BUFFER_SIZE - 2] == '4');
+
+ free(buffer);
+*/
+
+ parserutils_inputstream_destroy(stream);
+
+ assert(parserutils_finalise(myrealloc, NULL) == PARSERUTILS_OK);
+
+ printf("PASS\n");
+
+ return 0;
+}
+
diff --git a/test/testrunner.pl b/test/testrunner.pl
new file mode 100644
index 0000000..1c6c66d
--- /dev/null
+++ b/test/testrunner.pl
@@ -0,0 +1,167 @@
+#!/bin/perl
+#
+# Testcase runner
+#
+# Usage: testrunner <directory> [<executable extension>]
+#
+# Operates upon INDEX files described in the README.
+# Locates and executes testcases, feeding data files to programs
+# as appropriate.
+# Logs testcase output to file.
+# Aborts test sequence on detection of error.
+#
+# A testcase is treated as failed when any of the following holds:
+#   - the last line it wrote to stdout begins with "FAIL"
+#   - it wrote nothing to stdout at all
+#   - its wait status is non-zero (non-zero exit code or death by signal)
+#
+
+use warnings;
+use strict;
+use File::Spec;
+use IPC::Open3;
+
+if (@ARGV < 1) {
+	print "Usage: testrunner.pl <directory> [<exeext>]\n";
+	exit;
+}
+
+# Get directory
+my $directory = shift @ARGV;
+
+# Get EXE extension (if any)
+my $exeext = "";
+$exeext = shift @ARGV if (@ARGV > 0);
+
+# Open log file and /dev/null
+open(LOG, ">", "$directory/log") or die "Failed opening test log";
+open(NULL, "+<", File::Spec->devnull) or die "Failed opening /dev/null";
+
+# Open testcase index
+open(TINDEX, "<", "$directory/INDEX") or die "Failed opening test INDEX";
+
+# Parse testcase index, looking for testcases
+while (my $line = <TINDEX>) {
+	# Skip blank lines and comments
+	next if ($line =~ /^(#.*)?$/);
+
+	# Found one; decompose
+	(my $test, my $desc, my $data) = split /\t+/, $line;
+
+	# Strip whitespace
+	$test =~ s/^\s+|\s+$//g;
+	$desc =~ s/^\s+|\s+$//g;
+	$data =~ s/^\s+|\s+$//g if ($data);
+
+	# Append EXE extension to binary name
+	$test = $test . $exeext;
+
+	print "Test: $desc\n";
+
+	if ($data) {
+		# Testcase has external data files
+
+		# Open datafile index
+		open(DINDEX, "<", "$directory/data/$data/INDEX") or
+			die "Failed opening $directory/data/$data/INDEX";
+
+		# Parse datafile index, looking for datafiles
+		while (my $dentry = <DINDEX>) {
+			next if ($dentry =~ /^(#.*)?$/);
+
+			# Found one; decompose
+			(my $dtest, my $ddesc) = split /\t+/, $dentry;
+
+			# Strip whitespace
+			$dtest =~ s/^\s+|\s+$//g;
+			$ddesc =~ s/^\s+|\s+$//g;
+
+			# Run the testcase once per data file
+			run_testcase(" ==> $test [$data/$dtest]",
+					"$directory/$test",
+					"$directory/data/Aliases",
+					"$directory/data/$data/$dtest");
+		}
+
+		close(DINDEX);
+	} else {
+		# Testcase has no external data files
+		run_testcase(" ==> $test",
+				"$directory/$test",
+				"$directory/data/Aliases");
+	}
+
+	print "\n";
+}
+
+# Clean up
+close(TINDEX);
+
+close(NULL);
+close(LOG);
+
+# Run a single testcase, log its output and judge the result.
+#
+# Arguments: progress message to display, followed by the command to
+#            run (program path and its arguments).
+# Does not return if the testcase failed: the failure is reported and
+# the runner exits with status 1.
+sub run_testcase
+{
+	my ($msg, @command) = @_;
+
+	print LOG "Running " . join(" ", @command) . "\n";
+
+	# Make message fit on an 80 column terminal
+	$msg = $msg . "." x (80 - length($msg) - 8);
+
+	print $msg;
+
+	# Run testcase
+	# NOTE(review): IPC::Open3 documents the "dup this handle" form as
+	# "<&HANDLE"; confirm that "&<NULL" gives the child the intended stdin.
+	my $pid = open3("&<NULL", \*OUT, \*ERR, @command);
+
+	# Assume failure until the testcase says otherwise
+	my $last = "FAIL";
+
+	# Marshal testcase output to log file
+	while (my $output = <OUT>) {
+		print LOG " $output";
+		$last = $output;
+	}
+
+	# Wait for child to finish and remember how it ended
+	waitpid($pid, 0);
+	my $status = $?;
+
+	# A testcase that crashed or returned non-zero has failed, no matter
+	# what it managed to print beforehand
+	my $result = substr($last, 0, 4);
+	$result = "FAIL" if ($status != 0);
+
+	print $result . "\n";
+
+	# Bail, noisily, on failure
+	if ($result eq "FAIL") {
+		# Write any stderr output to the log
+		while (my $errors = <ERR>) {
+			print LOG " $errors";
+		}
+
+		printf LOG " Wait status: 0x%04x\n", $status if ($status != 0);
+
+		print "\n\nFailure detected: " .
+				"consult log file\n\n\n";
+
+		exit(1);
+	}
+}
diff --git a/test/testutils.h b/test/testutils.h
new file mode 100644
index 0000000..c91c5b8
--- /dev/null
+++ b/test/testutils.h
@@ -0,0 +1,123 @@
+#ifndef test_testutils_h_
+#define test_testutils_h_
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Mark a variable as deliberately unused by assigning it to itself.
+ * Only defined here if no earlier header has already provided UNUSED. */
+#ifndef UNUSED
+#define UNUSED(x) ((x) = (x))
+#endif
+
+/* Redefine assert, so we can simply use the standard assert mechanism
+ * within testcases and exit with the right output for the testrunner
+ * to do the right thing. */
+void __assert2(const char *expr, const char *function,
+ const char *file, int line);
+
+/**
+ * Report a failed assertion and terminate the testcase.
+ *
+ * Prints "FAIL - <expr> at line <line>" to stdout and exits with
+ * EXIT_FAILURE; it never returns to the caller.
+ *
+ * NOTE(review): this is a non-static definition inside a header, so a
+ * program that includes testutils.h from more than one translation unit
+ * would define it twice. The double-underscore name also lies in the
+ * namespace C reserves for the implementation.
+ *
+ * \param expr Text of the expression that evaluated to false
+ * \param function Name of the enclosing function (currently not printed)
+ * \param file Name of the source file (currently not printed)
+ * \param line Line number of the failed assertion
+ */
+void __assert2(const char *expr, const char *function,
+ const char *file, int line)
+{
+ UNUSED(function);
+ UNUSED(file);
+
+ printf("FAIL - %s at line %d\n", expr, line);
+
+ exit(EXIT_FAILURE);
+}
+
+/* Evaluates expr exactly once; __assert2 is only called when it is false.
+ * The whole expansion is cast to void so it can be used as a statement. */
+#define assert(expr) \
+ ((void) ((expr) || (__assert2 (#expr, __func__, __FILE__, __LINE__), 0)))
+
+
+/* Callback invoked by parse_testfile() for each non-blank line read:
+ * data points at the line, datalen is the length computed by
+ * parse_strlen(), pw is the caller's private pointer. Returning false
+ * stops parsing. */
+typedef bool (*line_func)(const char *data, size_t datalen, void *pw);
+
+/* Helpers defined further down in this header */
+static size_t parse_strlen(const char *str, size_t limit);
+bool parse_testfile(const char *filename, line_func callback, void *pw);
+size_t parse_filesize(const char *filename);
+
+/**
+ * Drive a line-oriented callback over a testcase data file
+ *
+ * The file is read with fgets() into a 300-byte buffer. Lines whose
+ * first byte is '\n' are skipped; every other line is passed to the
+ * callback together with its length as measured by parse_strlen().
+ * Reading stops as soon as the callback returns false.
+ *
+ * \param filename Name of file to parse
+ * \param callback Function invoked for each non-blank line
+ * \param pw Client-specific private data, passed through to callback
+ * \return true if the whole file was processed, false if the file could
+ *         not be opened or the callback rejected a line.
+ */
+bool parse_testfile(const char *filename, line_func callback, void *pw)
+{
+	char line[300];
+	bool ok = true;
+	FILE *stream = fopen(filename, "rb");
+
+	if (stream == NULL) {
+		printf("Failed opening %s\n", filename);
+		return false;
+	}
+
+	/* ok is tested first so no further line is read after a rejection */
+	while (ok && fgets(line, sizeof line, stream) != NULL) {
+		if (line[0] != '\n') {
+			ok = callback(line,
+					parse_strlen(line, sizeof line), pw);
+		}
+	}
+
+	fclose(stream);
+
+	return ok;
+}
+
+/**
+ * Measure a '\n'-terminated string
+ *
+ * Scans for the first '\n' within the first (limit - 1) bytes. NUL bytes
+ * are not treated specially. The returned length includes the '\n'; if
+ * no '\n' is found within the scanned range the result is limit.
+ *
+ * \param str String to measure length of
+ * \param limit Upper bound on string length
+ * \return String length, or 0 if str is NULL
+ */
+size_t parse_strlen(const char *str, size_t limit)
+{
+	size_t pos;
+
+	if (str == NULL)
+		return 0;
+
+	for (pos = 0; pos < limit - 1; pos++) {
+		if (str[pos] == '\n')
+			break;
+	}
+
+	/* Account for the terminating byte */
+	return pos + 1;
+}
+
+/**
+ * Read the size of a file
+ *
+ * Opens the file in binary mode, seeks to its end and reports the
+ * resulting offset. A failure to open the file is reported on stdout;
+ * a failed seek or tell is reported only through the return value.
+ *
+ * \param filename Name of file to read size of
+ * \return File size (in bytes), or 0 on error
+ */
+size_t parse_filesize(const char *filename)
+{
+	FILE *fp;
+	long end = -1;
+
+	fp = fopen(filename, "rb");
+	if (fp == NULL) {
+		printf("Failed opening %s\n", filename);
+		return 0;
+	}
+
+	/* Only trust ftell() if the seek itself succeeded */
+	if (fseek(fp, 0, SEEK_END) == 0)
+		end = ftell(fp);
+
+	fclose(fp);
+
+	/* ftell() signals failure with -1; never let that wrap around
+	 * into a huge size_t */
+	if (end < 0)
+		return 0;
+
+	return (size_t) end;
+}
+
+
+#endif