summaryrefslogtreecommitdiff
path: root/src/charset
diff options
context:
space:
mode:
authorJohn Mark Bell <jmb@netsurf-browser.org>2008-05-01 16:34:46 +0000
committerJohn Mark Bell <jmb@netsurf-browser.org>2008-05-01 16:34:46 +0000
commit2777a04ed2ba4fd36138b991d66a32a283361f7e (patch)
treeb0c3730533c36ca41402b6d0c5b98413f0a57bee /src/charset
downloadlibparserutils-2777a04ed2ba4fd36138b991d66a32a283361f7e.tar.gz
libparserutils-2777a04ed2ba4fd36138b991d66a32a283361f7e.tar.bz2
Import parser construction utility library
svn path=/trunk/libparserutils/; revision=4111
Diffstat (limited to 'src/charset')
-rw-r--r--src/charset/Makefile49
-rw-r--r--src/charset/aliases.c410
-rw-r--r--src/charset/aliases.h36
-rw-r--r--src/charset/charset.c54
-rw-r--r--src/charset/charset.h24
-rw-r--r--src/charset/codec.c185
-rw-r--r--src/charset/codecs/Makefile46
-rw-r--r--src/charset/codecs/codec_iconv.c683
-rw-r--r--src/charset/codecs/codec_impl.h48
-rw-r--r--src/charset/codecs/codec_utf16.c544
-rw-r--r--src/charset/codecs/codec_utf8.c546
-rw-r--r--src/charset/encodings/Makefile46
-rw-r--r--src/charset/encodings/utf16.c239
-rw-r--r--src/charset/encodings/utf8.c175
-rw-r--r--src/charset/encodings/utf8impl.h339
15 files changed, 3424 insertions, 0 deletions
diff --git a/src/charset/Makefile b/src/charset/Makefile
new file mode 100644
index 0000000..fc34d7c
--- /dev/null
+++ b/src/charset/Makefile
@@ -0,0 +1,49 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack: sp gains one ".x" per nesting level and dirstack_$(sp) remembers the parent's $(d)
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Manipulate include paths: add this directory to the compiler's header search path
+override CFLAGS := $(CFLAGS) -I$(d)
+
+# Sources in this directory (bare file names; the $(d) prefix is applied below)
+SRCS_$(d) := aliases.c charset.c codec.c
+
+# Append to sources for component, prefixing each file name with $(d)
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have (each immediate subdirectory that contains a Makefile)
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack: restore the parent's $(d), then strip the trailing ".x" from sp
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/aliases.c b/src/charset/aliases.c
new file mode 100644
index 0000000..1e7e6ea
--- /dev/null
+++ b/src/charset/aliases.c
@@ -0,0 +1,410 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "utils/utils.h"
+
+struct alias { /* one alias name, chained into a bucket of alias_tab */
+ struct alias *next; /* next alias in the same hash bucket, or NULL */
+ parserutils_charset_aliases_canon *canon; /* canonical form this alias resolves to */
+ uint16_t name_len; /* strlen() of name, excluding the terminator */
+ char name[1]; /* NUL-terminated alias text; the allocation extends past the struct to hold it */
+};
+
+#define HASH_SIZE (43) /* number of buckets in each of the two hash tables below */
+static parserutils_charset_aliases_canon *canon_tab[HASH_SIZE]; /* canonical names, bucketed by parserutils_charset_hash_val() */
+static struct alias *alias_tab[HASH_SIZE]; /* alias names, bucketed by the same hash */
+
+static parserutils_error parserutils_charset_create_alias(const char *alias,
+ parserutils_charset_aliases_canon *c,
+ parserutils_alloc alloc, void *pw); /* allocate an alias and link it into alias_tab */
+static parserutils_charset_aliases_canon *parserutils_charset_create_canon(
+ const char *canon, uint16_t mibenum,
+ parserutils_alloc alloc, void *pw); /* allocate a canonical form and link it into canon_tab */
+static uint32_t parserutils_charset_hash_val(const char *alias, size_t len); /* case-folding hash, result in [0, HASH_SIZE) */
+
+/**
+ * Test whether a character ends a field in the Aliases file
+ *
+ * The value is converted to unsigned char first, because passing a
+ * negative char to the <ctype.h> functions is undefined behaviour.
+ *
+ * \param ch  Character to test
+ * \return true if ch is whitespace or a control character
+ */
+static bool is_field_end(char ch)
+{
+	unsigned char uc = (unsigned char) ch;
+
+	return isspace(uc) || iscntrl(uc);
+}
+
+/**
+ * Create alias data from Aliases file
+ *
+ * Each line that is not empty and does not start with '#' is read as
+ *   <canonical name> <mibenum> [<alias> ...]
+ * with the fields separated by whitespace. A canonical form is created
+ * for the first two fields and one alias for every further field.
+ *
+ * \param filename  The path to the Aliases file
+ * \param alloc     Memory (de)allocation function
+ * \param pw        Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if filename or alloc is NULL,
+ *         PARSERUTILS_FILENOTFOUND if the file could not be opened.
+ *
+ * A line whose canonical form cannot be created is skipped, and a failure
+ * to create an alias abandons the rest of that line; neither is reported
+ * to the caller.
+ */
+parserutils_error parserutils_charset_aliases_create(const char *filename,
+		parserutils_alloc alloc, void *pw)
+{
+	char buf[300];
+	FILE *fp;
+
+	if (filename == NULL || alloc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	fp = fopen(filename, "r");
+	if (fp == NULL)
+		return PARSERUTILS_FILENOTFOUND;
+
+	while (fgets(buf, sizeof buf, fp)) {
+		char *p, *aliases, *mib, *end;
+		parserutils_charset_aliases_canon *cf;
+		size_t linelen;
+
+		if (buf[0] == 0 || buf[0] == '#')
+			/* skip blank lines or comments */
+			continue;
+
+		/* Lose the terminating newline, but only if there is one:
+		 * the final line of the file may lack it, and stripping
+		 * blindly would eat the last character of its last field.
+		 * buf[0] != 0 here, so linelen is at least 1. */
+		linelen = strlen(buf);
+		if (buf[linelen - 1] == '\n')
+			buf[--linelen] = '\0';
+		end = buf + linelen;
+
+		/* find end of canonical form */
+		for (p = buf; *p && !is_field_end(*p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		*p++ = '\0'; /* terminate canonical form */
+
+		/* skip whitespace */
+		for (; *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		mib = p;
+
+		/* find end of mibenum */
+		for (; *p && !is_field_end(*p); p++)
+			; /* do nothing */
+		if (p < end)
+			*p++ = '\0'; /* terminate mibenum */
+
+		cf = parserutils_charset_create_canon(buf, atoi(mib), alloc, pw);
+		if (cf == NULL)
+			continue;
+
+		/* skip whitespace */
+		for (; p < end && *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		aliases = p;
+
+		while (p < end) {
+			/* find end of alias */
+			for (; *p && !is_field_end(*p); p++)
+				; /* do nothing */
+			if (p > end)
+				/* stop if we've gone past the end */
+				break;
+			/* terminate current alias; when p == end this just
+			 * rewrites the line's own terminator */
+			*p++ = '\0';
+
+			if (parserutils_charset_create_alias(aliases, cf,
+					alloc, pw) != PARSERUTILS_OK)
+				break;
+
+			/* in terminating, we may have advanced
+			 * past the end - check this here */
+			if (p >= end)
+				break;
+
+			/* skip whitespace */
+			for (; *p && isspace((unsigned char) *p); p++)
+				; /* do nothing */
+
+			if (p >= end)
+				/* gone past end => stop */
+				break;
+
+			/* update pointer to current alias */
+			aliases = p;
+		}
+	}
+
+	fclose(fp);
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Release every canonical form and alias held in the hash tables
+ *
+ * Each entry is handed back to the allocator with a size of zero, and
+ * every bucket is reset to NULL afterwards.
+ *
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data
+ */
+void parserutils_charset_aliases_destroy(parserutils_alloc alloc, void *pw)
+{
+	int bucket;
+
+	for (bucket = 0; bucket != HASH_SIZE; bucket++) {
+		parserutils_charset_aliases_canon *canon;
+		struct alias *entry;
+
+		/* Canonical forms first; grab the successor before freeing */
+		canon = canon_tab[bucket];
+		while (canon != NULL) {
+			parserutils_charset_aliases_canon *following =
+					canon->next;
+
+			alloc(canon, 0, pw);
+			canon = following;
+		}
+		canon_tab[bucket] = NULL;
+
+		/* Then the aliases hashed to the same bucket index */
+		entry = alias_tab[bucket];
+		while (entry != NULL) {
+			struct alias *following = entry->next;
+
+			alloc(entry, 0, pw);
+			entry = following;
+		}
+		alias_tab[bucket] = NULL;
+	}
+}
+
+/**
+ * Map an encoding name (canonical or alias) to its MIB enum value
+ *
+ * \param alias  The name to look up
+ * \param len    The length of the name
+ * \return The MIB enum value, or 0 if alias is NULL or not known
+ */
+uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len)
+{
+	parserutils_charset_aliases_canon *match = NULL;
+
+	if (alias != NULL)
+		match = parserutils_charset_alias_canonicalise(alias, len);
+
+	return (match != NULL) ? match->mib_enum : 0;
+}
+
+/**
+ * Find the canonical encoding name registered for a MIB enum value
+ *
+ * The buckets of the canonical-name table are scanned in index order and
+ * the first entry carrying the requested value is returned.
+ *
+ * \param mibenum  The MIB enum value
+ * \return Pointer to canonical name, or NULL if not found
+ */
+const char *parserutils_charset_mibenum_to_name(uint16_t mibenum)
+{
+	int bucket;
+
+	for (bucket = 0; bucket != HASH_SIZE; bucket++) {
+		parserutils_charset_aliases_canon *canon = canon_tab[bucket];
+
+		while (canon != NULL) {
+			if (canon->mib_enum == mibenum)
+				return canon->name;
+			canon = canon->next;
+		}
+	}
+
+	return NULL;
+}
+
+/**
+ * Detect if a parserutils_charset is Unicode
+ *
+ * The MIB enum values of the Unicode encodings are looked up by name and
+ * cached in function-local statics. The lookup is repeated on each call
+ * for as long as "UCS-4" resolves to 0.
+ *
+ * \param mibenum  The MIB enum to consider
+ * \return true if a Unicode variant, false otherwise (including when
+ *         mibenum is 0, the "not found" value)
+ */
+bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum)
+{
+	static uint16_t ucs4;
+	static uint16_t ucs2;
+	static uint16_t utf8;
+	static uint16_t utf16;
+	static uint16_t utf16be;
+	static uint16_t utf16le;
+	static uint16_t utf32;
+	static uint16_t utf32be;
+	static uint16_t utf32le;
+
+	/* 0 is what parserutils_charset_mibenum_from_name() returns for an
+	 * unknown name. Any cached value below is also 0 when its lookup
+	 * failed, so without this guard an unknown charset would compare
+	 * equal to it and be reported as Unicode. */
+	if (mibenum == 0)
+		return false;
+
+	if (ucs4 == 0) {
+		ucs4 = parserutils_charset_mibenum_from_name("UCS-4",
+				SLEN("UCS-4"));
+		ucs2 = parserutils_charset_mibenum_from_name("UCS-2",
+				SLEN("UCS-2"));
+		utf8 = parserutils_charset_mibenum_from_name("UTF-8",
+				SLEN("UTF-8"));
+		utf16 = parserutils_charset_mibenum_from_name("UTF-16",
+				SLEN("UTF-16"));
+		utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
+				SLEN("UTF-16BE"));
+		utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
+				SLEN("UTF-16LE"));
+		utf32 = parserutils_charset_mibenum_from_name("UTF-32",
+				SLEN("UTF-32"));
+		utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
+				SLEN("UTF-32BE"));
+		utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
+				SLEN("UTF-32LE"));
+	}
+
+	return (mibenum == ucs4 || mibenum == ucs2 || mibenum == utf8 ||
+			mibenum == utf16 || mibenum == utf16be ||
+			mibenum == utf16le || mibenum == utf32 ||
+			mibenum == utf32be || mibenum == utf32le);
+}
+
+/**
+ * Look up the canonical form for an encoding name
+ *
+ * The name is matched first against the canonical names themselves and
+ * then against the aliases. A candidate matches when its stored length
+ * equals len and strncasecmp() finds no difference.
+ *
+ * \param alias  The name to look up
+ * \param len    The length of the name
+ * \return Pointer to canonical form, or NULL if alias is NULL or not found
+ */
+parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise(
+		const char *alias, size_t len)
+{
+	parserutils_charset_aliases_canon *canon;
+	struct alias *entry;
+	uint32_t bucket;
+
+	if (alias == NULL)
+		return NULL;
+
+	bucket = parserutils_charset_hash_val(alias, len);
+
+	/* Canonical names take priority over aliases */
+	for (canon = canon_tab[bucket]; canon != NULL; canon = canon->next) {
+		if (canon->name_len != len)
+			continue;
+		if (strncasecmp(canon->name, alias, len) == 0)
+			return canon;
+	}
+
+	for (entry = alias_tab[bucket]; entry != NULL; entry = entry->next) {
+		if (entry->name_len != len)
+			continue;
+		if (strncasecmp(entry->name, alias, len) == 0)
+			return entry->canon;
+	}
+
+	return NULL;
+}
+
+
+/**
+ * Allocate an alias entry and link it at the head of its hash bucket
+ *
+ * \param alias  The alias name (NUL-terminated)
+ * \param c      The canonical form the alias refers to
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if alias, c or alloc is NULL,
+ *         PARSERUTILS_NOMEM if the allocation failed
+ */
+parserutils_error parserutils_charset_create_alias(const char *alias,
+		parserutils_charset_aliases_canon *c,
+		parserutils_alloc alloc, void *pw)
+{
+	struct alias *entry;
+	uint32_t bucket;
+	size_t length;
+
+	if (alias == NULL || c == NULL || alloc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	length = strlen(alias);
+
+	/* Room for the struct plus the name and its terminator */
+	entry = alloc(NULL, sizeof(struct alias) + length + 1, pw);
+	if (entry == NULL)
+		return PARSERUTILS_NOMEM;
+
+	entry->canon = c;
+	entry->name_len = length;
+	memcpy(entry->name, alias, length + 1);
+	entry->name[entry->name_len] = '\0';
+
+	bucket = parserutils_charset_hash_val(alias, entry->name_len);
+
+	/* Push onto the front of the bucket's chain */
+	entry->next = alias_tab[bucket];
+	alias_tab[bucket] = entry;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Allocate a canonical form and link it at the head of its hash bucket
+ *
+ * \param canon    The canonical name (NUL-terminated)
+ * \param mibenum  The MIB enum value
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to canonical form, or NULL if canon or alloc is NULL
+ *         or the allocation failed
+ */
+parserutils_charset_aliases_canon *parserutils_charset_create_canon(
+		const char *canon, uint16_t mibenum,
+		parserutils_alloc alloc, void *pw)
+{
+	parserutils_charset_aliases_canon *entry;
+	uint32_t bucket;
+	uint32_t length;
+
+	if (canon == NULL || alloc == NULL)
+		return NULL;
+
+	length = strlen(canon);
+
+	/* Room for the struct plus the name and its terminator */
+	entry = alloc(NULL,
+			sizeof(parserutils_charset_aliases_canon) + length + 1,
+			pw);
+	if (entry == NULL)
+		return NULL;
+
+	entry->mib_enum = mibenum;
+	entry->name_len = length;
+	memcpy(entry->name, canon, (size_t) length + 1);
+	entry->name[length] = '\0';
+
+	bucket = parserutils_charset_hash_val(canon, length);
+
+	/* Push onto the front of the bucket's chain */
+	entry->next = canon_tab[bucket];
+	canon_tab[bucket] = entry;
+
+	return entry;
+}
+
+/**
+ * Hash an encoding name into a bucket index
+ *
+ * Bit 0x20 of every character is cleared before mixing, so names that
+ * differ only in letter case land in the same bucket.
+ *
+ * \param alias  String to hash
+ * \param len    Number of characters of alias to hash
+ * \return Bucket index in the range [0, HASH_SIZE), or 0 if alias is NULL
+ */
+uint32_t parserutils_charset_hash_val(const char *alias, size_t len)
+{
+	uint32_t acc = 5381;
+	size_t idx;
+
+	if (alias == NULL)
+		return 0;
+
+	for (idx = 0; idx < len; idx++)
+		acc = (acc * 33) ^ (alias[idx] & ~0x20); /* case insensitive */
+
+	return acc % HASH_SIZE;
+}
+
+
+#ifndef NDEBUG
+/**
+ * Print every canonical name and alias to stdout, followed by a total
+ *
+ * Each entry is printed as "<bucket index> <name>". The total adds, per
+ * entry, the offset of its name member plus its name length, and then
+ * the element count of each hash table.
+ */
+void parserutils_charset_aliases_dump(void)
+{
+	size_t total = 0;
+	int bucket;
+
+	for (bucket = 0; bucket != HASH_SIZE; bucket++) {
+		parserutils_charset_aliases_canon *canon;
+		struct alias *entry;
+
+		for (canon = canon_tab[bucket]; canon != NULL;
+				canon = canon->next) {
+			printf("%d %s\n", bucket, canon->name);
+			total += offsetof(parserutils_charset_aliases_canon,
+					name);
+			total += canon->name_len;
+		}
+
+		for (entry = alias_tab[bucket]; entry != NULL;
+				entry = entry->next) {
+			printf("%d %s\n", bucket, entry->name);
+			total += offsetof(struct alias, name);
+			total += entry->name_len;
+		}
+	}
+
+	/* NOTE(review): these add the number of buckets, not the byte size
+	 * of the tables - confirm which was intended */
+	total += (sizeof(canon_tab) / sizeof(canon_tab[0]));
+	total += (sizeof(alias_tab) / sizeof(alias_tab[0]));
+
+	printf("%u\n", (unsigned int) total);
+}
+#endif
diff --git a/src/charset/aliases.h b/src/charset/aliases.h
new file mode 100644
index 0000000..9abd2c8
--- /dev/null
+++ b/src/charset/aliases.h
@@ -0,0 +1,36 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_aliases_h_
+#define parserutils_charset_aliases_h_
+
+#include <inttypes.h>
+
+#include <parserutils/charset/mibenum.h>
+
+typedef struct parserutils_charset_aliases_canon { /* canonical form of an encoding name */
+ struct parserutils_charset_aliases_canon *next; /* next entry in a chain of canonical forms, or NULL */
+ uint16_t mib_enum; /* MIB enum value associated with this name */
+ uint16_t name_len; /* length of name, excluding the terminator */
+ char name[1]; /* NUL-terminated canonical name; the allocation extends past the struct to hold it */
+} parserutils_charset_aliases_canon;
+
+/* Load encoding aliases from file; returns PARSERUTILS_OK on success,
+ * an error code otherwise */
+parserutils_error parserutils_charset_aliases_create(const char *filename,
+ parserutils_alloc alloc, void *pw);
+/* Destroy encoding aliases, releasing each entry through alloc */
+void parserutils_charset_aliases_destroy(parserutils_alloc alloc, void *pw);
+
+/* Canonicalise an alias name of the given length; returns NULL if the
+ * name is not known */
+parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise(
+ const char *alias, size_t len);
+
+#ifndef NDEBUG
+void parserutils_charset_aliases_dump(void); /* debug builds only: print all alias data */
+#endif
+
+#endif
diff --git a/src/charset/charset.c b/src/charset/charset.c
new file mode 100644
index 0000000..3ef1a71
--- /dev/null
+++ b/src/charset/charset.c
@@ -0,0 +1,54 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include "charset/aliases.h"
+#include "charset/charset.h"
+
+/**
+ * Prepare the Charset library for use by loading the encoding aliases.
+ *
+ * This _must_ be called before using any libparserutils charset functions
+ *
+ * \param aliases_file  Pointer to name of file containing encoding alias data
+ * \param alloc         Pointer to (de)allocation function
+ * \param pw            Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if aliases_file or alloc is NULL,
+ *         otherwise whatever error loading the aliases produced.
+ */
+parserutils_error parserutils_charset_initialise(const char *aliases_file,
+		parserutils_alloc alloc, void *pw)
+{
+	if (aliases_file != NULL && alloc != NULL) {
+		/* The alias loader's result is the result of initialisation */
+		return parserutils_charset_aliases_create(aliases_file,
+				alloc, pw);
+	}
+
+	return PARSERUTILS_BADPARM;
+}
+
+/**
+ * Release the alias data held by the Charset library
+ *
+ * \param alloc  Pointer to (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if alloc is NULL.
+ */
+parserutils_error parserutils_charset_finalise(parserutils_alloc alloc,
+		void *pw)
+{
+	if (alloc != NULL) {
+		parserutils_charset_aliases_destroy(alloc, pw);
+		return PARSERUTILS_OK;
+	}
+
+	return PARSERUTILS_BADPARM;
+}
+
+
diff --git a/src/charset/charset.h b/src/charset/charset.h
new file mode 100644
index 0000000..4b07577
--- /dev/null
+++ b/src/charset/charset.h
@@ -0,0 +1,24 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_charset_h_
+#define parserutils_charset_charset_h_
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+#include <parserutils/types.h>
+
+/* Initialise the Charset library for use, loading encoding aliases from
+ * aliases_file; must be called before any other charset function */
+parserutils_error parserutils_charset_initialise(const char *aliases_file,
+ parserutils_alloc alloc, void *pw);
+
+/* Clean up after Charset, releasing the loaded alias data through alloc */
+parserutils_error parserutils_charset_finalise(parserutils_alloc alloc,
+ void *pw);
+
+#endif
+
diff --git a/src/charset/codec.c b/src/charset/codec.c
new file mode 100644
index 0000000..5c3fb3a
--- /dev/null
+++ b/src/charset/codec.c
@@ -0,0 +1,185 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "charset/codecs/codec_impl.h"
+
+#ifdef WITH_ICONV_CODEC
+extern parserutils_charset_handler iconv_codec_handler; /* only referenced when the iconv codec is compiled in */
+#endif
+
+extern parserutils_charset_handler charset_utf8_codec_handler;
+extern parserutils_charset_handler charset_utf16_codec_handler;
+
+static parserutils_charset_handler *handler_table[] = { /* searched front to back; the first handler that accepts the charset is used */
+ &charset_utf8_codec_handler,
+ &charset_utf16_codec_handler,
+#ifdef WITH_ICONV_CODEC
+ &iconv_codec_handler, /* listed last, so it is consulted only after the handlers above */
+#endif
+ NULL, /* sentinel that ends the search loop */
+};
+
+/**
+ * Build a codec for the named charset
+ *
+ * The name is canonicalised, the handler table is searched in order for
+ * the first handler that accepts the canonical name, and that handler
+ * creates the codec. The new codec starts in the LOOSE error mode.
+ *
+ * \param charset  Target charset
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec instance, or NULL on failure (bad parameter,
+ *         unknown charset, no handler, or handler failed to create)
+ */
+parserutils_charset_codec *parserutils_charset_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	const parserutils_charset_aliases_canon *canon;
+	parserutils_charset_handler **candidate = handler_table;
+	parserutils_charset_codec *codec;
+
+	if (charset == NULL || alloc == NULL)
+		return NULL;
+
+	/* Resolve the requested name to its canonical form */
+	canon = parserutils_charset_alias_canonicalise(charset,
+			strlen(charset));
+	if (canon == NULL)
+		return NULL;
+
+	/* Walk the table until a handler accepts it or the sentinel is hit */
+	while (*candidate != NULL &&
+			!(*candidate)->handles_charset(canon->name))
+		candidate++;
+
+	if (*candidate == NULL)
+		return NULL;
+
+	codec = (*candidate)->create(canon->name, alloc, pw);
+	if (codec != NULL) {
+		/* Fill in the fields common to every codec */
+		codec->mibenum = canon->mib_enum;
+		codec->errormode = PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE;
+		codec->alloc = alloc;
+		codec->alloc_pw = pw;
+	}
+
+	return codec;
+}
+
+/**
+ * Tear down a charset codec and release its memory
+ *
+ * The codec's own destroy hook runs first; the codec is then returned to
+ * the allocator it was created with. A NULL codec is ignored.
+ *
+ * \param codec  The codec to destroy
+ */
+void parserutils_charset_codec_destroy(parserutils_charset_codec *codec)
+{
+	if (codec != NULL) {
+		codec->handler.destroy(codec);
+		codec->alloc(codec, 0, codec->alloc_pw);
+	}
+}
+
+/**
+ * Set an option on a charset codec
+ *
+ * Only the error-mode option is acted upon; any other type leaves the
+ * codec untouched and still reports success.
+ *
+ * \param codec   The codec to configure
+ * \param type    The codec option type to configure
+ * \param params  Option-specific parameters
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if codec or params is NULL
+ */
+parserutils_error parserutils_charset_codec_setopt(
+		parserutils_charset_codec *codec,
+		parserutils_charset_codec_opttype type,
+		parserutils_charset_codec_optparams *params)
+{
+	if (codec == NULL || params == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (type == PARSERUTILS_CHARSET_CODEC_ERROR_MODE)
+		codec->errormode = params->error_mode.mode;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Convert a chunk of UCS4 data into the codec's charset
+ *
+ * After validating the arguments this forwards to the codec's own encode
+ * handler.
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_BADPARM if any pointer (or *source / *dest) is NULL,
+ *         otherwise the result of the codec's encode handler.
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ */
+parserutils_error parserutils_charset_codec_encode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	if (codec == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (source == NULL || *source == NULL || sourcelen == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (dest == NULL || *dest == NULL || destlen == NULL)
+		return PARSERUTILS_BADPARM;
+
+	return codec->handler.encode(codec, source, sourcelen, dest, destlen);
+}
+
+/**
+ * Convert a chunk of data in the codec's charset into UCS4
+ *
+ * After validating the arguments this forwards to the codec's own decode
+ * handler.
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_BADPARM if any pointer (or *source / *dest) is NULL,
+ *         otherwise the result of the codec's decode handler.
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ *
+ * Call this with a source length of 0 to flush any buffers.
+ */
+parserutils_error parserutils_charset_codec_decode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	if (codec == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (source == NULL || *source == NULL || sourcelen == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (dest == NULL || *dest == NULL || destlen == NULL)
+		return PARSERUTILS_BADPARM;
+
+	return codec->handler.decode(codec, source, sourcelen, dest, destlen);
+}
+
+/**
+ * Reset a charset codec's encoding state
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_BADPARM if codec is NULL,
+ *         otherwise the result of the codec's reset handler
+ */
+parserutils_error parserutils_charset_codec_reset(
+		parserutils_charset_codec *codec)
+{
+	if (codec != NULL)
+		return codec->handler.reset(codec);
+
+	return PARSERUTILS_BADPARM;
+}
+
diff --git a/src/charset/codecs/Makefile b/src/charset/codecs/Makefile
new file mode 100644
index 0000000..6d3b78e
--- /dev/null
+++ b/src/charset/codecs/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack: sp gains one ".x" per nesting level and dirstack_$(sp) remembers the parent's $(d)
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Sources in this directory (bare file names; the $(d) prefix is applied below)
+SRCS_$(d) := codec_iconv.c codec_utf8.c codec_utf16.c
+
+# Append to sources for component, prefixing each file name with $(d)
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have (each immediate subdirectory that contains a Makefile)
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack: restore the parent's $(d), then strip the trailing ".x" from sp
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/codecs/codec_iconv.c b/src/charset/codecs/codec_iconv.c
new file mode 100644
index 0000000..bbe8bc4
--- /dev/null
+++ b/src/charset/codecs/codec_iconv.c
@@ -0,0 +1,683 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/* This codec is hideously slow. Only use it as a last resort */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+/* We put this here rather than at the top as GCC complains
+ * about the source file being empty otherwise. */
+#ifdef WITH_ICONV_CODEC
+
+#include <iconv.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <parserutils/charset/mibenum.h>
+
+#include "charset/codecs/codec_impl.h"
+#include "utils/utils.h"
+
+/**
+ * Iconv-based charset codec
+ *
+ * The base member comes first: the functions in this file cast between
+ * parserutils_charset_codec * and iconv_codec *.
+ */
+typedef struct iconv_codec {
+ parserutils_charset_codec base; /**< Base class */
+
+ iconv_t read_cd; /**< Iconv handle for reading
+ * (codec charset to UCS-4) */
+#define INVAL_BUFSIZE (32)
+ uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up
+ * incomplete input
+ * sequences */
+ size_t inval_len; /**< Number of bytes in inval_buf */
+
+#define READ_BUFSIZE (8)
+ uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
+ * output sequences (decode)
+ */
+ size_t read_len; /**< Number of characters in
+ * read_buf */
+
+ iconv_t write_cd; /**< Iconv handle for writing
+ * (UCS-4 to codec charset) */
+#define WRITE_BUFSIZE (8)
+ uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
+ * output sequences (encode);
+ * drained at the start of
+ * the next encode call
+ */
+ size_t write_len; /**< Number of characters in
+ * write_buf */
+} iconv_codec;
+
+
+static bool iconv_codec_handles_charset(const char *charset); /* true if iconv can open a charset to UCS-4 conversion */
+static parserutils_charset_codec *iconv_codec_create(const char *charset,
+ parserutils_alloc alloc, void *pw);
+static void iconv_codec_destroy (parserutils_charset_codec *codec); /* closes both iconv handles; does not free the codec */
+static parserutils_error iconv_codec_encode(parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static parserutils_error iconv_codec_decode(parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static parserutils_error iconv_codec_reset(parserutils_charset_codec *codec);
+static parserutils_error iconv_codec_output_decoded_char(
+ iconv_codec *c, uint32_t ucs4, uint8_t **dest,
+ size_t *destlen);
+static parserutils_error iconv_codec_read_char(iconv_codec *c,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen); /* called from iconv_codec_decode */
+static parserutils_error iconv_codec_write_char(iconv_codec *c,
+ uint32_t ucs4, uint8_t **dest, size_t *destlen); /* called from iconv_codec_encode once per UCS-4 character */
+
+/**
+ * Report whether iconv can convert the given charset to UCS-4
+ *
+ * A conversion descriptor is opened as a probe and closed again straight
+ * away if the open succeeded.
+ *
+ * \param charset  Charset to test
+ * \return true if handleable, false otherwise
+ */
+bool iconv_codec_handles_charset(const char *charset)
+{
+	iconv_t probe = iconv_open("UCS-4", charset);
+
+	if (probe == (iconv_t) -1)
+		return false;
+
+	iconv_close(probe);
+
+	return true;
+}
+
+/**
+ * Construct an iconv-based codec
+ *
+ * Two conversion descriptors are opened: one from the charset to UCS-4
+ * (for decoding) and one from UCS-4 to the charset (for encoding). If
+ * either cannot be opened, everything acquired so far is released.
+ *
+ * \param charset  The charset to read from / write to
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ */
+parserutils_charset_codec *iconv_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	iconv_codec *self = alloc(NULL, sizeof(iconv_codec), pw);
+
+	if (self == NULL)
+		return NULL;
+
+	self->read_cd = iconv_open("UCS-4", charset);
+	if (self->read_cd == (iconv_t) -1)
+		goto release;
+
+	self->write_cd = iconv_open(charset, "UCS-4");
+	if (self->write_cd == (iconv_t) -1)
+		goto close_read;
+
+	/* All three holding buffers start out empty */
+	self->inval_buf[0] = '\0';
+	self->inval_len = 0;
+
+	self->read_buf[0] = 0;
+	self->read_len = 0;
+
+	self->write_buf[0] = 0;
+	self->write_len = 0;
+
+	/* Finally, populate vtable */
+	self->base.handler.destroy = iconv_codec_destroy;
+	self->base.handler.encode = iconv_codec_encode;
+	self->base.handler.decode = iconv_codec_decode;
+	self->base.handler.reset = iconv_codec_reset;
+
+	return (parserutils_charset_codec *) self;
+
+close_read:
+	iconv_close(self->read_cd);
+release:
+	alloc(self, 0, pw);
+	return NULL;
+}
+
+/**
+ * Release the iconv resources held by an iconv-based codec
+ *
+ * Only the two conversion descriptors are closed here; the codec's
+ * memory itself is not freed by this function.
+ *
+ * \param codec  The codec to destroy
+ */
+void iconv_codec_destroy (parserutils_charset_codec *codec)
+{
+	iconv_codec *self = (iconv_codec *) codec;
+
+	iconv_close(self->read_cd);
+	iconv_close(self->write_cd);
+}
+
+/**
+ * Encode a chunk of UCS4 data into an iconv-based codec's charset
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if a character cannot be represented and the
+ *         codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read). A buffered character that
+ * is rejected as _INVALID is dropped from the buffer, so it is reported
+ * once and not retried on later calls.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * The source is consumed four bytes at a time.
+ * NOTE(review): ::sourcelen is presumably always a multiple of 4 - confirm
+ * against callers, as a shorter tail is not checked for here.
+ */
+parserutils_error iconv_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	iconv_codec *c = (iconv_codec *) codec;
+	const uint32_t *towrite;
+	size_t towritelen;
+	parserutils_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+
+		while (c->write_len > 0) {
+			error = iconv_codec_write_char(c, pwrite[0],
+					dest, destlen);
+			if (error != PARSERUTILS_OK) {
+				/* Move what is still pending to the front of
+				 * write_buf, since the next call restarts
+				 * from write_buf[0]. An invalid character is
+				 * skipped and no longer counted, so that it
+				 * is not reprocessed. */
+				size_t skip =
+					(error == PARSERUTILS_INVALID) ? 1 : 0;
+				size_t i;
+
+				for (i = skip; i < c->write_len; i++)
+					c->write_buf[i - skip] = pwrite[i];
+
+				c->write_len -= skip;
+
+				return error;
+			}
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call */
+	while (*sourcelen > 0) {
+		towrite = (const uint32_t *) (const void *) *source;
+		towritelen = 1;
+
+		/* Output current character(s) */
+		while (towritelen > 0) {
+			error = iconv_codec_write_char(c, towrite[0],
+					dest, destlen);
+
+			if (error != PARSERUTILS_OK) {
+				size_t skip =
+					(error == PARSERUTILS_INVALID) ? 1 : 0;
+				size_t i;
+
+				if (towritelen - skip >= WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen - skip;
+
+				/* Copy pending chars to the start of the
+				 * save area, for processing next call;
+				 * skipping invalid character, if present,
+				 * so it's not reprocessed. */
+				for (i = skip; i < towritelen; i++)
+					c->write_buf[i - skip] = towrite[i];
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return error;
+			}
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Decode a chunk of data in an iconv-based codec's charset into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if a character cannot be represented and the
+ *         codec's error handling mode is set to STRICT.
+ *
+ * The work is done in three stages: first any decoded output held over
+ * from the previous call is written, then any incomplete input sequence
+ * saved by the previous call is completed using the start of the new
+ * chunk, and finally the remaining input is decoded one character at a
+ * time.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error iconv_codec_decode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	iconv_codec *c = (iconv_codec *) codec;
+	parserutils_error error;
+
+	if (c->read_len > 0) {
+		/* Output left over from last decode
+		 * Attempt to finish this here */
+		uint32_t *pread = c->read_buf;
+
+		while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+			/* *dest is a byte pointer with no alignment
+			 * guarantee, so copy the four bytes instead of
+			 * storing through a uint32_t pointer. The value
+			 * is copied verbatim (no byte swapping). */
+			memcpy(*dest, pread, 4);
+
+			*dest += 4;
+			*destlen -= 4;
+
+			pread++;
+			c->read_len--;
+		}
+
+		if (*destlen < c->read_len * 4) {
+			/* Run out of output buffer */
+			size_t i;
+
+			/* Shuffle remaining output down */
+			for (i = 0; i < c->read_len; i++) {
+				c->read_buf[i] = pread[i];
+			}
+
+			return PARSERUTILS_NOMEM;
+		}
+	}
+
+	if (c->inval_len > 0) {
+		/* The last decode ended in an incomplete sequence.
+		 * Fill up inval_buf with data from the start of the
+		 * new chunk and process it. */
+		uint8_t *in = c->inval_buf;
+		size_t ol = c->inval_len;
+		/* Leave one byte spare for the terminator that
+		 * iconv_codec_read_char may append */
+		size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+		size_t orig_l = l;
+
+		memcpy(c->inval_buf + ol, *source, l);
+
+		/* l now counts saved bytes plus newly appended bytes */
+		l += c->inval_len;
+
+		error = iconv_codec_read_char(c,
+				(const uint8_t **) &in, &l, dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			return error;
+		}
+
+		/* And now, fix everything up so the normal processing
+		 * does the right thing: advance the caller's source by
+		 * however many of the newly appended bytes were used
+		 * (l is now the count of bytes left unread in inval_buf). */
+		*source += max((signed) (orig_l - l), 0);
+		*sourcelen -= max((signed) (orig_l - l), 0);
+
+		/* Failed to resolve an incomplete character and
+		 * ran out of buffer space. No recovery strategy
+		 * possible, so explode everywhere. */
+		if ((orig_l + ol) - l == 0)
+			abort();
+
+		/* Handle memory exhaustion case from above */
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	/* The "normal" case: decode the rest of the chunk */
+	while (*sourcelen > 0) {
+		error = iconv_codec_read_char(c,
+				source, sourcelen, dest, destlen);
+		if (error != PARSERUTILS_OK) {
+			return error;
+		}
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Return an iconv-based codec to its initial state
+ *
+ * Both conversion descriptors are reset and every carry-over buffer
+ * (incomplete input, pending decoded output, pending encoded output)
+ * is emptied.
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error iconv_codec_reset(parserutils_charset_codec *codec)
+{
+	iconv_codec *c = (iconv_codec *) codec;
+
+	/* An all-NULL call resets a conversion descriptor */
+	iconv(c->read_cd, NULL, NULL, NULL, NULL);
+	iconv(c->write_cd, NULL, NULL, NULL, NULL);
+
+	/* Discard any incomplete input sequence */
+	c->inval_len = 0;
+	c->inval_buf[0] = '\0';
+
+	/* Discard pending output in both directions */
+	c->read_len = 0;
+	c->read_buf[0] = 0;
+
+	c->write_len = 0;
+	c->write_buf[0] = 0;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Output a UCS4 character
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (big endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small.
+ *
+ * When fewer than four bytes of output space remain, the character is
+ * saved in the codec's read buffer (replacing its previous contents) so
+ * that a later decode call can emit it.
+ */
+parserutils_error iconv_codec_output_decoded_char(iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	if (*destlen < 4) {
+		/* Run out of output buffer */
+
+		c->read_len = 1;
+		c->read_buf[0] = ucs4;
+
+		return PARSERUTILS_NOMEM;
+	}
+
+	/* *dest is a byte pointer with no alignment guarantee, so copy the
+	 * bytes rather than storing through a uint32_t pointer. The value
+	 * is written exactly as passed in. */
+	memcpy(*dest, &ucs4, 4);
+	*dest += 4;
+	*destlen -= 4;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Read a character from the codec's native charset to UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if a character cannot be represented and the
+ *         codec's error handling mode is set to STRICT.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * An incomplete trailing sequence is moved into the codec's inval_buf and
+ * reported as _OK with all input consumed. The process is aborted if the
+ * incomplete sequence (plus its terminator) does not fit in inval_buf.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error iconv_codec_read_char(iconv_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	size_t iconv_ret;
+	const uint8_t *origsrc = *source;
+	size_t origsrclen = *sourcelen;
+	uint32_t ucs4;
+	uint8_t *pucs4 = (uint8_t *) &ucs4;
+	size_t sucs4 = 4;
+	parserutils_error error;
+
+	/* Use iconv to convert a single character
+	 * Side effect: Updates *source to point at next input
+	 * character and *sourcelen to reflect reduced input length
+	 */
+	iconv_ret = iconv(c->read_cd, (char **) source, sourcelen,
+			(char **) (void *) &pucs4, &sucs4);
+
+	/* Success is either a clean return, or a failure that still
+	 * consumed input and filled the whole 4-byte output buffer */
+	if (iconv_ret != (size_t) -1 ||
+			(*source != origsrc && sucs4 == 0)) {
+		/* Read a character */
+		error = iconv_codec_output_decoded_char(c, ucs4, dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			/* output failed; restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (errno == E2BIG) {
+		/* Should never happen */
+		abort();
+	} else if (errno == EINVAL) {
+		/* Incomplete input sequence */
+
+		/* The saved bytes are followed by a terminator, so the
+		 * sequence must be strictly shorter than the buffer;
+		 * otherwise inval_buf[*sourcelen] is out of bounds. */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		/* memmove, since *source may already be inval_buf */
+		memmove(c->inval_buf, (const char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return PARSERUTILS_OK;
+	} else if (errno == EILSEQ) {
+		/* Illegal input sequence */
+		bool found = false;
+		const uint8_t *oldsrc;
+		size_t oldsrclen;
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode ==
+				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+			/* restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+
+			return PARSERUTILS_INVALID;
+		}
+
+		/* Ok, this becomes problematic. The iconv API here
+		 * is particularly unhelpful; *source will point at
+		 * the _start_ of the illegal sequence. This means
+		 * that we must find the end of the sequence */
+
+		/* Search for the start of the next valid input
+		 * sequence (or the end of the input stream) */
+		while (*sourcelen > 1) {
+			pucs4 = (uint8_t *) &ucs4;
+			sucs4 = 4;
+
+			(*source)++;
+			(*sourcelen)--;
+
+			/* Remember where this probe started; the probe
+			 * itself may advance *source */
+			oldsrc = *source;
+			oldsrclen = *sourcelen;
+
+			iconv_ret = iconv(c->read_cd,
+					(char **) source, sourcelen,
+					(char **) (void *) &pucs4, &sucs4);
+			if (iconv_ret != (size_t) -1 || errno != EILSEQ) {
+				found = true;
+				break;
+			}
+		}
+
+		if (found) {
+			/* Found start of next valid sequence; rewind to it
+			 * so that it is decoded by the next call */
+			*source = oldsrc;
+			*sourcelen = oldsrclen;
+		} else {
+			/* Not found - skip last byte in buffer */
+			(*source)++;
+			(*sourcelen)--;
+
+			if (*sourcelen != 0)
+				abort();
+		}
+
+		/* output U+FFFD and continue processing. */
+		error = iconv_codec_output_decoded_char(c,
+				htonl(0xFFFD), dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			/* output failed; restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+		}
+
+		return error;
+	}
+
+	/* NOTE(review): any other errno is reported as success without
+	 * consuming input - confirm that callers cannot spin on this. */
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Convert one UCS4 character to the codec's native charset
+ *
+ * \param c        The codec
+ * \param ucs4     The UCS4 character to write (big endian)
+ * \param dest     Pointer to pointer to output buffer (updated on exit)
+ * \param destlen  Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if character cannot be represented and the
+ *         codec's error handling mode is set to STRICT.
+ *
+ * If the conversion produces no output and the error mode is TRANSLIT or
+ * LOOSE, a replacement character is converted instead: U+FFFD when the
+ * codec's charset is a Unicode one, '?' (0x3F) otherwise.
+ */
+parserutils_error iconv_codec_write_char(iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	uint8_t *const start = *dest;
+	uint8_t *in = (uint8_t *) &ucs4;
+	size_t inlen = 4;
+	size_t ret;
+
+	ret = iconv(c->write_cd, (char **) (void *) &in, &inlen,
+			(char **) dest, destlen);
+
+	if (ret == (size_t) -1) {
+		/* Output buffer is too small */
+		if (errno == E2BIG)
+			return PARSERUTILS_NOMEM;
+
+		/* Illegal or incomplete input: this should never happen */
+		if (errno == EILSEQ || errno == EINVAL)
+			abort();
+	}
+
+	/* Something was written: the character has been dealt with */
+	if (*dest != start)
+		return PARSERUTILS_OK;
+
+	/* Nothing was output */
+	if (c->base.errormode == PARSERUTILS_CHARSET_CODEC_ERROR_STRICT)
+		return PARSERUTILS_INVALID;
+
+	/** \todo transliteration (TRANSLIT currently behaves as LOOSE) */
+	if (c->base.errormode != PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT &&
+			c->base.errormode !=
+				PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE)
+		return PARSERUTILS_OK;
+
+	/* Substitute a replacement character and convert that instead */
+	in = (uint8_t *) &ucs4;
+	inlen = 4;
+
+	if (parserutils_charset_mibenum_is_unicode(c->base.mibenum))
+		ucs4 = htonl(0xFFFD);
+	else
+		ucs4 = htonl(0x3F);
+
+	ret = iconv(c->write_cd, (char **) (void *) &in, &inlen,
+			(char **) dest, destlen);
+
+	if (ret == (size_t) -1) {
+		if (errno == E2BIG)
+			return PARSERUTILS_NOMEM;
+
+		/* Illegal or incomplete input: this should never happen */
+		if (errno == EILSEQ || errno == EINVAL)
+			abort();
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/* Factory entry for the iconv-based codec */
+const parserutils_charset_handler iconv_codec_handler = {
+	.handles_charset = iconv_codec_handles_charset,
+	.create = iconv_codec_create
+};
+
+#endif
diff --git a/src/charset/codecs/codec_impl.h b/src/charset/codecs/codec_impl.h
new file mode 100644
index 0000000..9183594
--- /dev/null
+++ b/src/charset/codecs/codec_impl.h
@@ -0,0 +1,48 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_codecs_codecimpl_h_
+#define parserutils_charset_codecs_codecimpl_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include <parserutils/charset/codec.h>
+
+/**
+ * Core charset codec definition; implementations extend this
+ *
+ * Concrete codecs embed this structure as their first member and are
+ * handed around as pointers to it; operations are dispatched through
+ * the handler table below.
+ */
+struct parserutils_charset_codec {
+ uint16_t mibenum; /**< MIB enum for charset */
+
+ parserutils_charset_codec_errormode errormode; /**< error mode */
+
+ parserutils_alloc alloc; /**< allocation function */
+ void *alloc_pw; /**< private word */
+
+ struct {
+ void (*destroy)(parserutils_charset_codec *codec); /**< Implementation-specific teardown */
+ parserutils_error (*encode)(parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen); /**< Convert UCS4 input to the codec's charset */
+ parserutils_error (*decode)(parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen); /**< Convert input in the codec's charset to UCS4 */
+ parserutils_error (*reset)(parserutils_charset_codec *codec); /**< Clear the codec's conversion state */
+ } handler; /**< Vtable for handler code */
+};
+
+/**
+ * Codec factory component definition
+ *
+ * Each codec implementation exports one of these: a predicate saying
+ * whether it supports a named charset, and a constructor for a codec
+ * instance.
+ */
+typedef struct parserutils_charset_handler {
+ bool (*handles_charset)(const char *charset); /**< true if the named charset is supported */
+ parserutils_charset_codec *(*create)(const char *charset,
+ parserutils_alloc alloc, void *pw); /**< Create a codec instance, or NULL on failure */
+} parserutils_charset_handler;
+
+#endif
diff --git a/src/charset/codecs/codec_utf16.c b/src/charset/codecs/codec_utf16.c
new file mode 100644
index 0000000..0dd7a07
--- /dev/null
+++ b/src/charset/codecs/codec_utf16.c
@@ -0,0 +1,544 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/utf16.h>
+
+#include "charset/codecs/codec_impl.h"
+#include "utils/utils.h"
+
+/**
+ * UTF-16 charset codec
+ *
+ * Extends the generic codec with three carry-over buffers, so that
+ * input or output which does not fit in a single call can be resumed
+ * on the next one.
+ */
+typedef struct charset_utf16_codec {
+ parserutils_charset_codec base; /**< Base class */
+
+#define INVAL_BUFSIZE (32)
+ uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up
+ * incomplete input
+ * sequences */
+ size_t inval_len; /**< Byte length of inval_buf */
+
+#define READ_BUFSIZE (8)
+ uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
+ * output sequences (decode)
+ * (host-endian) */
+ size_t read_len; /**< Character length of read_buf */
+
+#define WRITE_BUFSIZE (8)
+ uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
+ * output sequences (encode)
+ * (host-endian) */
+ size_t write_len; /**< Character length of write_buf */
+
+} charset_utf16_codec;
+
+static bool charset_utf16_codec_handles_charset(const char *charset); /* factory: charset test */
+static parserutils_charset_codec *charset_utf16_codec_create(
+ const char *charset, parserutils_alloc alloc, void *pw); /* factory: constructor */
+static void charset_utf16_codec_destroy (parserutils_charset_codec *codec); /* vtable: destroy */
+static parserutils_error charset_utf16_codec_encode(
+ parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen); /* vtable: UCS4 -> UTF-16 */
+static parserutils_error charset_utf16_codec_decode(
+ parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen); /* vtable: UTF-16 -> UCS4 */
+static parserutils_error charset_utf16_codec_reset(
+ parserutils_charset_codec *codec); /* vtable: reset */
+static inline parserutils_error charset_utf16_codec_read_char(
+ charset_utf16_codec *c,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen); /* decode helper: one character */
+static inline parserutils_error charset_utf16_codec_output_decoded_char(
+ charset_utf16_codec *c,
+ uint32_t ucs4, uint8_t **dest, size_t *destlen); /* decode helper: emit one UCS4 value */
+
+/**
+ * Test whether a charset name refers to UTF-16
+ *
+ * The name is resolved to a MIB enum and compared against the MIB enum
+ * that "UTF-16" resolves to.
+ *
+ * \param charset  Charset to test
+ * \return true if handleable, false otherwise
+ */
+bool charset_utf16_codec_handles_charset(const char *charset)
+{
+	if (parserutils_charset_mibenum_from_name(charset, strlen(charset)) !=
+			parserutils_charset_mibenum_from_name("UTF-16",
+				SLEN("UTF-16")))
+		return false;
+
+	return true;
+}
+
+/**
+ * Construct a UTF-16 codec
+ *
+ * Only the handler table and the carry-over buffers are initialised here;
+ * the remaining base-class fields are left untouched.
+ *
+ * \param charset  The charset to read from / write to (unused)
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ */
+parserutils_charset_codec *charset_utf16_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	charset_utf16_codec *self;
+
+	UNUSED(charset);
+
+	self = alloc(NULL, sizeof(charset_utf16_codec), pw);
+	if (self == NULL)
+		return NULL;
+
+	/* Populate vtable */
+	self->base.handler.destroy = charset_utf16_codec_destroy;
+	self->base.handler.encode = charset_utf16_codec_encode;
+	self->base.handler.decode = charset_utf16_codec_decode;
+	self->base.handler.reset = charset_utf16_codec_reset;
+
+	/* Start with all carry-over buffers empty */
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	return (parserutils_charset_codec *) self;
+}
+
+/**
+ * Tear down a UTF-16 codec
+ *
+ * This implementation performs no work of its own.
+ *
+ * \param codec  The codec to destroy
+ */
+void charset_utf16_codec_destroy(parserutils_charset_codec *codec)
+{
+	/* Nothing to release */
+	UNUSED(codec);
+}
+
+/**
+ * Encode a chunk of UCS4 data into utf16
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if a character cannot be represented and the
+ *         codec's error handling mode is set to STRICT.
+ *
+ * The source is read as a sequence of 4-byte big-endian UCS4 values.
+ * NOTE(review): ::sourcelen is assumed to be a multiple of 4; a shorter
+ * tail would make the "*sourcelen -= 4" step wrap around - confirm that
+ * all callers guarantee this.
+ *
+ * The process is aborted if a value cannot be converted to UTF-16.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf16_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf16_codec *c = (charset_utf16_codec *) codec;
+	uint32_t ucs4;
+	uint32_t *towrite;
+	size_t towritelen;
+	parserutils_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+		uint8_t buf[4];
+		size_t len;
+
+		while (c->write_len > 0) {
+			error = parserutils_charset_utf16_from_ucs4(
+					pwrite[0], buf, &len);
+			if (error != PARSERUTILS_OK)
+				abort();
+
+			if (*destlen < len) {
+				/* Insufficient output buffer space;
+				 * shuffle what is left to the front */
+				for (len = 0; len < c->write_len; len++)
+					c->write_buf[len] = pwrite[len];
+
+				return PARSERUTILS_NOMEM;
+			}
+
+			memcpy(*dest, buf, len);
+
+			*dest += len;
+			*destlen -= len;
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call */
+	while (*sourcelen > 0) {
+		uint32_t be;
+
+		/* *source is a byte pointer with no alignment guarantee,
+		 * so copy the bytes out rather than loading through a
+		 * uint32_t pointer */
+		memcpy(&be, *source, sizeof(be));
+		ucs4 = ntohl(be);
+		towrite = &ucs4;
+		towritelen = 1;
+
+		/* Output current characters */
+		while (towritelen > 0) {
+			uint8_t buf[4];
+			size_t len;
+
+			error = parserutils_charset_utf16_from_ucs4(
+					towrite[0], buf, &len);
+			if (error != PARSERUTILS_OK)
+				abort();
+
+			if (*destlen < len) {
+				/* Insufficient output space */
+				if (towritelen >= WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen;
+
+				/* Copy pending chars to save area, for
+				 * processing next call. */
+				for (len = 0; len < towritelen; len++)
+					c->write_buf[len] = towrite[len];
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return PARSERUTILS_NOMEM;
+			}
+
+			memcpy(*dest, buf, len);
+
+			*dest += len;
+			*destlen -= len;
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Decode a chunk of utf16 data into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if a character cannot be represented and the
+ *         codec's error handling mode is set to STRICT.
+ *
+ * The work is done in three stages: first any decoded output held over
+ * from the previous call is written, then any incomplete input sequence
+ * saved by the previous call is completed using the start of the new
+ * chunk, and finally the remaining input is decoded one character at a
+ * time. Output is written as 4-byte big-endian values.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error charset_utf16_codec_decode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf16_codec *c = (charset_utf16_codec *) codec;
+	parserutils_error error;
+
+	if (c->read_len > 0) {
+		/* Output left over from last decode */
+		uint32_t *pread = c->read_buf;
+
+		while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+			/* *dest is a byte pointer with no alignment
+			 * guarantee, so copy the bytes rather than
+			 * storing through a uint32_t pointer */
+			uint32_t be = htonl(pread[0]);
+
+			memcpy(*dest, &be, sizeof(be));
+
+			*dest += 4;
+			*destlen -= 4;
+
+			pread++;
+			c->read_len--;
+		}
+
+		if (*destlen < c->read_len * 4) {
+			/* Ran out of output buffer */
+			size_t i;
+
+			/* Shuffle remaining output down */
+			for (i = 0; i < c->read_len; i++)
+				c->read_buf[i] = pread[i];
+
+			return PARSERUTILS_NOMEM;
+		}
+	}
+
+	if (c->inval_len > 0) {
+		/* The last decode ended in an incomplete sequence.
+		 * Fill up inval_buf with data from the start of the
+		 * new chunk and process it. */
+		uint8_t *in = c->inval_buf;
+		size_t ol = c->inval_len;
+		/* Leave one byte spare for the terminator that
+		 * charset_utf16_codec_read_char may append */
+		size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+		size_t orig_l = l;
+
+		memcpy(c->inval_buf + ol, *source, l);
+
+		/* l now counts saved bytes plus newly appended bytes */
+		l += c->inval_len;
+
+		error = charset_utf16_codec_read_char(c,
+				(const uint8_t **) &in, &l, dest, destlen);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+			return error;
+		}
+
+		/* And now, fix up source pointers: advance by however
+		 * many of the newly appended bytes were used (l is now
+		 * the count of bytes left unread in inval_buf) */
+		*source += max((signed) (orig_l - l), 0);
+		*sourcelen -= max((signed) (orig_l - l), 0);
+
+		/* Failed to resolve an incomplete character and
+		 * ran out of buffer space. No recovery strategy
+		 * possible, so explode everywhere. */
+		if ((orig_l + ol) - l == 0)
+			abort();
+
+		/* Report memory exhaustion case from above */
+		if (error != PARSERUTILS_OK)
+			return error;
+	}
+
+	/* Finally, the "normal" case; process all outstanding characters */
+	while (*sourcelen > 0) {
+		error = charset_utf16_codec_read_char(c,
+				source, sourcelen, dest, destlen);
+		if (error != PARSERUTILS_OK) {
+			return error;
+		}
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Return a utf16 codec to its initial state
+ *
+ * Every carry-over buffer (incomplete input, pending decoded output,
+ * pending encoded output) is emptied.
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error charset_utf16_codec_reset(parserutils_charset_codec *codec)
+{
+	charset_utf16_codec *self = (charset_utf16_codec *) codec;
+
+	/* Discard any incomplete input sequence */
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	/* Discard pending output in both directions */
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	return PARSERUTILS_OK;
+}
+
+
+/**
+ * Read a character from the UTF-16 to UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if a character cannot be represented and the
+ *         codec's error handling mode is set to STRICT.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * An incomplete trailing sequence is moved into the codec's inval_buf and
+ * all input is consumed. The process is aborted if the incomplete
+ * sequence (plus its terminator) does not fit in inval_buf.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf16_codec_read_char(charset_utf16_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	uint32_t ucs4;
+	size_t sucs4;
+	parserutils_error error;
+
+	/* Convert a single character */
+	error = parserutils_charset_utf16_to_ucs4(*source, *sourcelen,
+			&ucs4, &sucs4);
+	if (error == PARSERUTILS_OK) {
+		/* Read a character */
+		error = charset_utf16_codec_output_decoded_char(c,
+				ucs4, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += sucs4;
+			*sourcelen -= sucs4;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (error == PARSERUTILS_NEEDDATA) {
+		/* Incomplete input sequence */
+
+		/* The saved bytes are followed by a terminator, so the
+		 * sequence must be strictly shorter than the buffer;
+		 * otherwise inval_buf[*sourcelen] is out of bounds. */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		/* memmove, since *source may already be inval_buf */
+		memmove(c->inval_buf, (char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return PARSERUTILS_OK;
+	} else if (error == PARSERUTILS_INVALID) {
+		/* Illegal input sequence */
+		uint32_t nextchar;
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode ==
+				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+			return PARSERUTILS_INVALID;
+		}
+
+		/* Find next valid UTF-16 sequence.
+		 * We're processing client-provided data, so let's
+		 * be paranoid about its validity. */
+		error = parserutils_charset_utf16_next_paranoid(
+				*source, *sourcelen, 0, &nextchar);
+		if (error != PARSERUTILS_OK) {
+			if (error == PARSERUTILS_NEEDDATA) {
+				/* Need more data to be sure */
+
+				/* Same bound as above: room is needed
+				 * for the terminator too */
+				if (*sourcelen >= INVAL_BUFSIZE)
+					abort();
+
+				memmove(c->inval_buf, (char *) *source,
+						*sourcelen);
+				c->inval_buf[*sourcelen] = '\0';
+				c->inval_len = *sourcelen;
+
+				*source += *sourcelen;
+				*sourcelen = 0;
+
+				/* Everything has been consumed already */
+				nextchar = 0;
+			} else {
+				return error;
+			}
+		}
+
+		/* output U+FFFD and continue processing. */
+		error = charset_utf16_codec_output_decoded_char(c,
+				0xFFFD, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += nextchar;
+			*sourcelen -= nextchar;
+		}
+
+		return error;
+	}
+
+	/* Any other result from the conversion is reported as success
+	 * without consuming input */
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Output a UCS4 character
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (host endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small.
+ *
+ * The character is written in big-endian byte order. When fewer than four
+ * bytes of output space remain, it is instead saved (host endian) in the
+ * codec's read buffer, replacing its previous contents, so that a later
+ * decode call can emit it.
+ */
+parserutils_error charset_utf16_codec_output_decoded_char(
+		charset_utf16_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	uint32_t be;
+
+	if (*destlen < 4) {
+		/* Run out of output buffer */
+		c->read_len = 1;
+		c->read_buf[0] = ucs4;
+
+		return PARSERUTILS_NOMEM;
+	}
+
+	/* *dest is a byte pointer with no alignment guarantee, so copy the
+	 * bytes rather than storing through a uint32_t pointer */
+	be = htonl(ucs4);
+	memcpy(*dest, &be, sizeof(be));
+	*dest += 4;
+	*destlen -= 4;
+
+	return PARSERUTILS_OK;
+}
+
+
+/* Factory entry for the UTF-16 codec */
+const parserutils_charset_handler charset_utf16_codec_handler = {
+	.handles_charset = charset_utf16_codec_handles_charset,
+	.create = charset_utf16_codec_create
+};
diff --git a/src/charset/codecs/codec_utf8.c b/src/charset/codecs/codec_utf8.c
new file mode 100644
index 0000000..838d051
--- /dev/null
+++ b/src/charset/codecs/codec_utf8.c
@@ -0,0 +1,546 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include <parserutils/charset/mibenum.h>
+
+#include "charset/codecs/codec_impl.h"
+#include "charset/encodings/utf8impl.h"
+#include "utils/utils.h"
+
+/**
+ * UTF-8 charset codec
+ *
+ * Extends the generic codec with three small staging buffers that carry
+ * state from one encode/decode call to the next. The base structure is
+ * the first member because codec pointers are cast between
+ * parserutils_charset_codec and charset_utf8_codec in this file.
+ */
+typedef struct charset_utf8_codec {
+ parserutils_charset_codec base; /**< Base class */
+
+#define INVAL_BUFSIZE (32)
+ uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up
+ * incomplete input
+ * sequences */
+ size_t inval_len; /**< Byte length of inval_buf */
+
+#define READ_BUFSIZE (8)
+ uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
+ * output sequences (decode)
+ * (host-endian) */
+ size_t read_len; /**< Character length of read_buf */
+
+#define WRITE_BUFSIZE (8)
+ uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
+ * output sequences (encode)
+ * (host-endian) */
+ size_t write_len; /**< Character length of write_buf */
+
+} charset_utf8_codec;
+
+static bool charset_utf8_codec_handles_charset(const char *charset);
+static parserutils_charset_codec *charset_utf8_codec_create(const char *charset,
+ parserutils_alloc alloc, void *pw);
+static void charset_utf8_codec_destroy (parserutils_charset_codec *codec);
+static parserutils_error charset_utf8_codec_encode(
+ parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static parserutils_error charset_utf8_codec_decode(
+ parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static parserutils_error charset_utf8_codec_reset(
+ parserutils_charset_codec *codec);
+static inline parserutils_error charset_utf8_codec_read_char(
+ charset_utf8_codec *c,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static inline parserutils_error charset_utf8_codec_output_decoded_char(
+ charset_utf8_codec *c,
+ uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Determine whether this codec handles a specific charset
+ *
+ * The name is resolved to a MIB enum and compared with the MIB enum of
+ * "UTF-8", so any name that resolves to the same value is accepted.
+ *
+ * \param charset  Charset to test (NULL is treated as "not handled")
+ * \return true if handleable, false otherwise
+ */
+bool charset_utf8_codec_handles_charset(const char *charset)
+{
+	/* strlen(NULL) is undefined; a missing name can never match */
+	if (charset == NULL)
+		return false;
+
+	return parserutils_charset_mibenum_from_name(charset,
+				strlen(charset)) ==
+			parserutils_charset_mibenum_from_name("UTF-8",
+				SLEN("UTF-8"));
+}
+
+/**
+ * Create a utf8 codec
+ *
+ * Allocates the codec through the client's allocator, marks its three
+ * staging buffers as empty and installs the UTF-8 implementations of the
+ * destroy / encode / decode / reset operations.
+ *
+ * \param charset  The charset to read from / write to (ignored here)
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ */
+parserutils_charset_codec *charset_utf8_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw)
+{
+	charset_utf8_codec *self;
+
+	UNUSED(charset);
+
+	self = alloc(NULL, sizeof(*self), pw);
+	if (self == NULL)
+		return NULL;
+
+	/* Wire up the operations table */
+	self->base.handler.destroy = charset_utf8_codec_destroy;
+	self->base.handler.encode = charset_utf8_codec_encode;
+	self->base.handler.decode = charset_utf8_codec_decode;
+	self->base.handler.reset = charset_utf8_codec_reset;
+
+	/* Nothing is pending in any direction yet */
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	return (parserutils_charset_codec *) self;
+}
+
+/**
+ * Destroy a utf8 codec
+ *
+ * All of the UTF-8 codec's buffers are embedded in the codec structure,
+ * so there is nothing to release here and the argument is deliberately
+ * ignored.
+ * NOTE(review): presumably the caller frees the structure itself; confirm
+ * against the generic codec destroy path.
+ *
+ * \param codec The codec to destroy
+ */
+void charset_utf8_codec_destroy (parserutils_charset_codec *codec)
+{
+ UNUSED(codec);
+}
+
+/**
+ * Encode a chunk of UCS4 data into utf8
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data (UCS4, big endian)
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * Input is consumed in whole four-byte characters only: a trailing
+ * fragment of one to three bytes is left in ::source / ::sourcelen for
+ * the caller to deal with.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Any conversion failure other than lack of output space aborts the
+ * process.
+ */
+parserutils_error charset_utf8_codec_encode(parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	charset_utf8_codec *c = (charset_utf8_codec *) codec;
+	uint32_t ucs4;
+	uint32_t *towrite;
+	size_t towritelen;
+	parserutils_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+
+		while (c->write_len > 0) {
+			UTF8_FROM_UCS4(pwrite[0], dest, destlen, error);
+			if (error != PARSERUTILS_OK) {
+				if (error != PARSERUTILS_NOMEM)
+					abort();
+
+				/* Insufficient output buffer space; move the
+				 * unwritten characters to the front of the
+				 * save area for the next call. */
+				for (uint32_t len = 0;
+						len < c->write_len; len++) {
+					c->write_buf[len] = pwrite[len];
+				}
+
+				return PARSERUTILS_NOMEM;
+			}
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call. Only complete four-byte
+	 * units are read, so a short tail can neither be over-read nor make
+	 * *sourcelen wrap around. */
+	while (*sourcelen >= sizeof(ucs4)) {
+		/* Byte-wise copy: *source carries no alignment guarantee */
+		memcpy(&ucs4, *source, sizeof(ucs4));
+		ucs4 = ntohl(ucs4);
+		towrite = &ucs4;
+		towritelen = 1;
+
+		/* Output current characters */
+		while (towritelen > 0) {
+			UTF8_FROM_UCS4(towrite[0], dest, destlen, error);
+			if (error != PARSERUTILS_OK) {
+				if (error != PARSERUTILS_NOMEM)
+					abort();
+
+				/* Insufficient output space */
+				if (towritelen >= WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen;
+
+				/* Copy pending chars to save area, for
+				 * processing next call. */
+				for (uint32_t len = 0; len < towritelen; len++)
+					c->write_buf[len] = towrite[len];
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return PARSERUTILS_NOMEM;
+			}
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Decode a chunk of utf8 data into UCS4
+ *
+ * Works in three phases: first drain any decoded characters left in the
+ * read buffer by an earlier call, then try to complete an input sequence
+ * that was cut short at the end of the previous chunk, and finally
+ * convert the new input one character at a time.
+ *
+ * \param codec The codec to use
+ * \param source Pointer to pointer to source data
+ * \param sourcelen Pointer to length (in bytes) of source data
+ * \param dest Pointer to pointer to output buffer
+ * \param destlen Pointer to length (in bytes) of output buffer
+ * \return PARSERUTILS_OK on success,
+ * PARSERUTILS_NOMEM if output buffer is too small,
+ * PARSERUTILS_INVALID if a character cannot be represented and the
+ * codec's error handling mode is set to STRICT,
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+parserutils_error charset_utf8_codec_decode(parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen)
+{
+ charset_utf8_codec *c = (charset_utf8_codec *) codec;
+ parserutils_error error;
+
+ if (c->read_len > 0) {
+ /* Output left over from last decode. Characters are only
+ * written while the whole remaining backlog fits in the
+ * output buffer.
+ * NOTE(review): the store below goes through a uint32_t
+ * pointer, so it assumes *dest is suitably aligned. */
+ uint32_t *pread = c->read_buf;
+
+ while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+ *((uint32_t *) (void *) *dest) = htonl(pread[0]);
+
+ *dest += 4;
+ *destlen -= 4;
+
+ pread++;
+ c->read_len--;
+ }
+
+ if (*destlen < c->read_len * 4) {
+ /* Ran out of output buffer */
+ size_t i;
+
+ /* Shuffle remaining output down */
+ for (i = 0; i < c->read_len; i++)
+ c->read_buf[i] = pread[i];
+
+ return PARSERUTILS_NOMEM;
+ }
+ }
+
+ if (c->inval_len > 0) {
+ /* The last decode ended in an incomplete sequence.
+ * Fill up inval_buf with data from the start of the
+ * new chunk and process it.
+ * ol is the number of bytes already held, l the number
+ * borrowed from the new chunk (one byte of inval_buf is
+ * kept spare for the terminator). */
+ uint8_t *in = c->inval_buf;
+ size_t ol = c->inval_len;
+ size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+ size_t orig_l = l;
+
+ memcpy(c->inval_buf + ol, *source, l);
+
+ l += c->inval_len;
+
+ error = charset_utf8_codec_read_char(c,
+ (const uint8_t **) &in, &l, dest, destlen);
+ if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM) {
+ return error;
+ }
+
+ /* And now, fix up source pointers: l now holds what is left
+ * unread in the combined buffer, so orig_l - l is the part of
+ * the new chunk that was consumed (clamped at zero). */
+ *source += max((signed) (orig_l - l), 0);
+ *sourcelen -= max((signed) (orig_l - l), 0);
+
+ /* Failed to resolve an incomplete character and
+ * ran out of buffer space. No recovery strategy
+ * possible, so explode everywhere. */
+ if ((orig_l + ol) - l == 0)
+ abort();
+
+ /* Report memory exhaustion case from above */
+ if (error != PARSERUTILS_OK)
+ return error;
+ }
+
+ /* Finally, the "normal" case; process all outstanding characters */
+ while (*sourcelen > 0) {
+ error = charset_utf8_codec_read_char(c,
+ source, sourcelen, dest, destlen);
+ if (error != PARSERUTILS_OK) {
+ return error;
+ }
+ }
+
+ return PARSERUTILS_OK;
+}
+
+/**
+ * Clear a utf8 codec's encoding state
+ *
+ * Marks the incomplete-input, pending-decode and pending-encode buffers
+ * as empty, discarding whatever they held.
+ *
+ * \param codec  The codec to reset
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error charset_utf8_codec_reset(parserutils_charset_codec *codec)
+{
+	charset_utf8_codec *self = (charset_utf8_codec *) codec;
+
+	/* Drop output that was waiting for buffer space */
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	/* Drop any half-received input sequence */
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	return PARSERUTILS_OK;
+}
+
+
+/**
+ * Read one character from UTF-8 input and output it as UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small,
+ *         PARSERUTILS_INVALID if a character cannot be represented and the
+ *         codec's error handling mode is set to STRICT
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ *
+ * In the case of the result being _INVALID, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the decoding process.
+ *
+ * An incomplete trailing sequence is moved into the codec's inval buffer,
+ * the whole of the remaining input is claimed and _OK is returned. The
+ * process is aborted if that tail does not fit in the buffer together
+ * with its terminator.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+parserutils_error charset_utf8_codec_read_char(charset_utf8_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	uint32_t ucs4;
+	size_t sucs4;
+	parserutils_error error;
+
+	/* Convert a single character. The macro uses its arguments
+	 * unparenthesised, so it is handed plain identifiers. */
+	{
+		const uint8_t *src = *source;
+		size_t srclen = *sourcelen;
+		uint32_t *uptr = &ucs4;
+		size_t *usptr = &sucs4;
+		UTF8_TO_UCS4(src, srclen, uptr, usptr, error);
+	}
+	if (error == PARSERUTILS_OK) {
+		/* Read a character */
+		error = charset_utf8_codec_output_decoded_char(c,
+				ucs4, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += sucs4;
+			*sourcelen -= sucs4;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (error == PARSERUTILS_NEEDDATA) {
+		/* Incomplete input sequence. The tail plus its terminator
+		 * must fit, hence >= rather than >. */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		/* memmove, because the decoder may pass inval_buf itself
+		 * as the source */
+		memmove(c->inval_buf, *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return PARSERUTILS_OK;
+	} else if (error == PARSERUTILS_INVALID) {
+		/* Illegal input sequence */
+		uint32_t nextchar;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode ==
+				PARSERUTILS_CHARSET_CODEC_ERROR_STRICT) {
+			/* Clear inval buffer */
+			c->inval_buf[0] = '\0';
+			c->inval_len = 0;
+
+			return PARSERUTILS_INVALID;
+		}
+
+		/* Find next valid UTF-8 sequence.
+		 * We're processing client-provided data, so let's
+		 * be paranoid about its validity. */
+		{
+			const uint8_t *src = *source;
+			size_t srclen = *sourcelen;
+			uint32_t off = 0;
+			uint32_t *ncptr = &nextchar;
+
+			UTF8_NEXT_PARANOID(src, srclen, off, ncptr, error);
+		}
+		if (error != PARSERUTILS_OK) {
+			if (error == PARSERUTILS_NEEDDATA) {
+				/* Need more data to be sure; same bound as
+				 * for the incomplete-sequence case above */
+				if (*sourcelen >= INVAL_BUFSIZE)
+					abort();
+
+				memmove(c->inval_buf, *source, *sourcelen);
+				c->inval_buf[*sourcelen] = '\0';
+				c->inval_len = *sourcelen;
+
+				*source += *sourcelen;
+				*sourcelen = 0;
+
+				nextchar = 0;
+			} else {
+				return error;
+			}
+		}
+
+		/* Clear inval buffer.
+		 * NOTE(review): this also discards the bytes stashed by the
+		 * NEEDDATA branch just above, so that tail is dropped in
+		 * favour of the single U+FFFD emitted below. */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* output U+FFFD and continue processing. */
+		error = charset_utf8_codec_output_decoded_char(c,
+				0xFFFD, dest, destlen);
+		if (error == PARSERUTILS_OK || error == PARSERUTILS_NOMEM) {
+			/* output succeeded; update source pointers */
+			*source += nextchar;
+			*sourcelen -= nextchar;
+		}
+
+		return error;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Output a UCS4 character
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (host endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_NOMEM if output buffer is too small
+ *
+ * On success the character is stored at ::dest in network byte order and
+ * ::dest / ::destlen are advanced by four bytes. If fewer than four bytes
+ * of output space remain, nothing is written: the character is parked in
+ * the codec's read buffer (replacing its previous content) and _NOMEM is
+ * returned, so that the next decode call can emit it first.
+ */
+parserutils_error charset_utf8_codec_output_decoded_char(charset_utf8_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	uint32_t be;
+
+	if (*destlen < 4) {
+		/* Run out of output buffer; keep the character for later */
+		c->read_len = 1;
+		c->read_buf[0] = ucs4;
+
+		return PARSERUTILS_NOMEM;
+	}
+
+	/* *dest is a byte pointer with no alignment guarantee, so copy the
+	 * four bytes rather than storing through a uint32_t pointer. */
+	be = htonl(ucs4);
+	memcpy(*dest, &be, sizeof(be));
+	*dest += 4;
+	*destlen -= 4;
+
+	return PARSERUTILS_OK;
+}
+
+
+const parserutils_charset_handler charset_utf8_codec_handler = {
+ charset_utf8_codec_handles_charset, /* does a charset name resolve to UTF-8? */
+ charset_utf8_codec_create /* allocates and initialises a UTF-8 codec */
+};
+
diff --git a/src/charset/encodings/Makefile b/src/charset/encodings/Makefile
new file mode 100644
index 0000000..47d9210
--- /dev/null
+++ b/src/charset/encodings/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack (sp gains one ".x" suffix per nesting level and the parent's $(d) is saved under the new sp)
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Sources found in this directory
+SRCS_$(d) := utf8.c utf16.c
+
+# Append to sources for component, prefixing each file with this directory
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have (every immediate subdirectory that carries a Makefile)
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack (restore the parent's $(d); basename strips the ".x" suffix added above)
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/encodings/utf16.c b/src/charset/encodings/utf16.c
new file mode 100644
index 0000000..95dc64f
--- /dev/null
+++ b/src/charset/encodings/utf16.c
@@ -0,0 +1,239 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-16 manipulation functions (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/utf16.h>
+
+/**
+ * Convert a UTF-16 sequence into a single UCS4 character
+ *
+ * The input is read as host-endian 16 bit code units.
+ *
+ * \param s     The sequence to process
+ * \param len   Length of sequence, in bytes
+ * \param ucs4  Pointer to location to receive UCS4 character (host endian)
+ * \param clen  Pointer to location to receive byte length of UTF-16 sequence
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if any pointer is NULL,
+ *         PARSERUTILS_NEEDDATA if the sequence is cut short by ::len,
+ *         PARSERUTILS_INVALID if the sequence starts with a low surrogate
+ *         or a high surrogate is not followed by a low surrogate
+ *
+ * ::ucs4 and ::clen are only written when PARSERUTILS_OK is returned.
+ */
+parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s,
+		size_t len, uint32_t *ucs4, size_t *clen)
+{
+	uint16_t hi, lo;
+
+	if (s == NULL || ucs4 == NULL || clen == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (len < 2)
+		return PARSERUTILS_NEEDDATA;
+
+	/* Byte-wise copies: s carries no alignment guarantee */
+	memcpy(&hi, s, sizeof(hi));
+
+	if (hi < 0xD800 || hi > 0xDFFF) {
+		/* Not a surrogate: the code unit is the character */
+		*ucs4 = hi;
+		*clen = 2;
+
+		return PARSERUTILS_OK;
+	}
+
+	if (hi > 0xDBFF) {
+		/* A low surrogate cannot start a character */
+		return PARSERUTILS_INVALID;
+	}
+
+	/* High surrogate: a low surrogate must follow */
+	if (len < 4)
+		return PARSERUTILS_NEEDDATA;
+
+	memcpy(&lo, s + 2, sizeof(lo));
+
+	if (lo < 0xDC00 || lo > 0xDFFF)
+		return PARSERUTILS_INVALID;
+
+	/* Each surrogate carries ten payload bits; the pair encodes the
+	 * code point's offset above U+10000. */
+	*ucs4 = 0x10000 + ((((uint32_t) hi & 0x3ff) << 10) |
+			((uint32_t) lo & 0x3ff));
+	*clen = 4;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Convert a single UCS4 character into a UTF-16 sequence
+ *
+ * The output is written as host-endian 16 bit code units.
+ *
+ * \param ucs4  The character to process (0 <= c <= 0x10FFFF) (host endian)
+ * \param s     Pointer to 4 byte long output buffer
+ * \param len   Pointer to location to receive length of multibyte sequence
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if a pointer is NULL,
+ *         PARSERUTILS_INVALID if ::ucs4 is beyond U+10FFFF
+ *
+ * Nothing is written unless PARSERUTILS_OK is returned.
+ */
+parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
+		size_t *len)
+{
+	uint16_t units[2];
+	size_t l;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (ucs4 < 0x10000) {
+		units[0] = (uint16_t) ucs4;
+		l = 2;
+	} else if (ucs4 < 0x110000) {
+		/* Split the 20 bit offset above U+10000 into two halves of
+		 * ten bits, one per surrogate. */
+		const uint32_t v = ucs4 - 0x10000;
+
+		units[0] = (uint16_t) (0xD800 | (v >> 10));
+		units[1] = (uint16_t) (0xDC00 | (v & 0x3ff));
+		l = 4;
+	} else {
+		return PARSERUTILS_INVALID;
+	}
+
+	/* Byte-wise copy: s carries no alignment guarantee */
+	memcpy(s, units, l);
+	*len = l;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-16 string
+ *
+ * Every code unit outside the surrogate range counts as one character;
+ * a code unit inside it is taken to start a two-unit character. No
+ * validation of surrogate pairing is performed. A single odd byte left
+ * over at the end of the buffer is counted as one character without
+ * being read.
+ *
+ * \param s    The string
+ * \param max  Maximum length, in bytes
+ * \param len  Pointer to location to receive length of string
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if a pointer is NULL
+ */
+parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max,
+		size_t *len)
+{
+	size_t off = 0;
+	size_t l = 0;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	while (off < max) {
+		uint16_t unit;
+
+		if (max - off < sizeof(unit)) {
+			/* Incomplete trailing unit: count it, as before,
+			 * but do not read beyond the buffer */
+			l++;
+			break;
+		}
+
+		/* Byte-wise copy: s carries no alignment guarantee */
+		memcpy(&unit, s + off, sizeof(unit));
+
+		if (unit < 0xD800 || 0xDFFF < unit)
+			off += 2;
+		else
+			off += 4;
+
+		l++;
+	}
+
+	*len = l;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Calculate the length (in bytes) of a UTF-16 character
+ *
+ * Only the first code unit is inspected: anything in the surrogate range
+ * is reported as a four byte character, everything else as two bytes.
+ *
+ * \param s    Pointer to start of character
+ * \param len  Pointer to location to receive length
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
+		size_t *len)
+{
+	const uint16_t *unit = (const uint16_t *) (const void *) s;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	*len = (0xD800 <= *unit && *unit <= 0xDFFF) ? 4 : 2;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find previous legal UTF-16 char in string
+ *
+ * Looks at the code unit immediately before ::off: if it is a low
+ * surrogate the previous character is taken to be a surrogate pair,
+ * otherwise a single code unit.
+ *
+ * \param s        The string
+ * \param off      Offset in the string to start at, in bytes
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if a pointer is NULL
+ */
+parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff)
+{
+	uint16_t unit;
+
+	if (s == NULL || prevoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (off < 2) {
+		/* Nothing (complete) precedes the start position */
+		*prevoff = 0;
+
+		return PARSERUTILS_OK;
+	}
+
+	/* Fetch the unit that ends just before off, not the one before
+	 * the start of the string. Byte-wise copy: no alignment guarantee. */
+	memcpy(&unit, s + off - 2, sizeof(unit));
+
+	if (unit < 0xDC00 || unit > 0xDFFF)
+		*prevoff = off - 2;
+	else
+		*prevoff = (off < 4) ? 0 : off - 4;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find next legal UTF-16 char in string
+ *
+ * Looks at the code unit following the one at ::off: a low surrogate
+ * there is the second half of the character at ::off and is skipped;
+ * anything else starts the next character.
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character (::len if there is none)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if a pointer is NULL or ::off is not
+ *         inside the string
+ */
+parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	uint16_t unit;
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (len - off < 4) {
+		/* No complete code unit follows the current one */
+		*nextoff = len;
+
+		return PARSERUTILS_OK;
+	}
+
+	/* Examine the unit after the one at off (relative to off, not to
+	 * the start of the string). Byte-wise copy: no alignment guarantee. */
+	memcpy(&unit, s + off + 2, sizeof(unit));
+
+	if (unit < 0xDC00 || unit > 0xDFFF)
+		*nextoff = off + 2;
+	else
+		*nextoff = (len - off < 6) ? len : off + 4;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find next legal UTF-16 char in string
+ *
+ * Scans forward from the code unit after ::off until it finds either a
+ * non-surrogate code unit or a high surrogate that is immediately
+ * followed by a low surrogate. Unpaired surrogates are skipped.
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if a pointer is NULL or ::off is not
+ *         inside the string,
+ *         PARSERUTILS_NEEDDATA if the string ends before a legal
+ *         character could be identified
+ */
+parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
+		uint32_t len, uint32_t off, uint32_t *nextoff)
+{
+	if (s == NULL || off >= len || nextoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	while (1) {
+		uint16_t unit, trail;
+
+		/* Need the unit at off plus a complete candidate after it */
+		if (len - off < 4)
+			return PARSERUTILS_NEEDDATA;
+
+		/* Byte-wise copies: s carries no alignment guarantee */
+		memcpy(&unit, s + off + 2, sizeof(unit));
+
+		if (unit < 0xD800 || unit > 0xDFFF) {
+			/* Ordinary code unit: a character on its own */
+			*nextoff = off + 2;
+			break;
+		}
+
+		if (unit <= 0xDBFF) {
+			/* High surrogate: only legal with a low surrogate
+			 * right behind it */
+			if (len - off < 6)
+				return PARSERUTILS_NEEDDATA;
+
+			memcpy(&trail, s + off + 4, sizeof(trail));
+
+			if (trail >= 0xDC00 && trail <= 0xDFFF) {
+				/* The pair starts at the high surrogate */
+				*nextoff = off + 2;
+				break;
+			}
+		}
+
+		/* Unpaired high surrogate, or a low surrogate (second half
+		 * of the character at off, or a stray): step over it. This
+		 * always makes progress, so the loop terminates. */
+		off += 2;
+	}
+
+	return PARSERUTILS_OK;
+}
+
diff --git a/src/charset/encodings/utf8.c b/src/charset/encodings/utf8.c
new file mode 100644
index 0000000..5b4ba95
--- /dev/null
+++ b/src/charset/encodings/utf8.c
@@ -0,0 +1,175 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-8 manipulation functions (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/utf8.h>
+#include "charset/encodings/utf8impl.h"
+
+/** Number of continuation bytes for a given start byte
+ *
+ * Indexed by the byte value, sixteen entries per row:
+ * 0x00-0xBF map to 0, 0xC0-0xDF to 1, 0xE0-0xEF to 2, 0xF0-0xF7 to 3,
+ * 0xF8-0xFB to 4 and 0xFC-0xFF to 5. Bytes 0x80-0xBF (continuation
+ * bytes) and 0xFE/0xFF are not singled out by the table.
+ */
+const uint8_t numContinuations[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
+};
+
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * This is the function form of the UTF8_TO_UCS4 macro, which does all of
+ * the work and leaves its status in the local error variable. Among other
+ * results, the macro reports PARSERUTILS_BADPARM for a NULL pointer,
+ * PARSERUTILS_NEEDDATA when the sequence is shorter than its lead byte
+ * announces (or empty) and PARSERUTILS_INVALID for a byte that cannot
+ * start a sequence.
+ *
+ * \param s The sequence to process
+ * \param len Length of sequence
+ * \param ucs4 Pointer to location to receive UCS4 character (host endian)
+ * \param clen Pointer to location to receive byte length of UTF-8 sequence
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
+ uint32_t *ucs4, size_t *clen)
+{
+ parserutils_error error;
+
+ UTF8_TO_UCS4(s, len, ucs4, clen, error);
+
+ return error;
+}
+
+/**
+ * Convert a single UCS4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * This is the function form of the UTF8_FROM_UCS4 macro, which does all
+ * of the work and leaves its status in the local error variable.
+ *
+ * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s Pointer to pointer to output buffer, updated on exit
+ * \param len Pointer to length, in bytes, of output buffer, updated on exit
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4,
+ uint8_t **s, size_t *len)
+{
+ parserutils_error error;
+
+ UTF8_FROM_UCS4(ucs4, s, len, error);
+
+ return error;
+}
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * This is the function form of the UTF8_LENGTH macro, which does all of
+ * the work and leaves its status in the local error variable.
+ *
+ * \param s The string
+ * \param max Maximum length
+ * \param len Pointer to location to receive length of string
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
+ size_t *len)
+{
+ parserutils_error error;
+
+ UTF8_LENGTH(s, max, len, error);
+
+ return error;
+}
+
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * This is the function form of the UTF8_CHAR_BYTE_LENGTH macro, which
+ * does all of the work and leaves its status in the local error variable.
+ *
+ * \param s Pointer to start of character
+ * \param len Pointer to location to receive length
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
+ size_t *len)
+{
+ parserutils_error error;
+
+ UTF8_CHAR_BYTE_LENGTH(s, len, error);
+
+ return error;
+}
+
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * This is the function form of the UTF8_PREV macro, which does all of
+ * the work and leaves its status in the local error variable.
+ *
+ * \param s The string
+ * \param off Offset in the string to start at
+ * \param prevoff Pointer to location to receive offset of first byte of
+ * previous legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
+ uint32_t *prevoff)
+{
+ parserutils_error error;
+
+ UTF8_PREV(s, off, prevoff, error);
+
+ return error;
+}
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * This is the function form of the UTF8_NEXT macro, which does all of
+ * the work and leaves its status in the local error variable. Use the
+ * _paranoid variant for input that has not been validated.
+ *
+ * \param s The string (assumed valid)
+ * \param len Maximum offset in string
+ * \param off Offset in the string to start at
+ * \param nextoff Pointer to location to receive offset of first byte of
+ * next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
+ uint32_t off, uint32_t *nextoff)
+{
+ parserutils_error error;
+
+ UTF8_NEXT(s, len, off, nextoff, error);
+
+ return error;
+}
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * This is the function form of the UTF8_NEXT_PARANOID macro, which does
+ * all of the work and leaves its status in the local error variable.
+ *
+ * \param s The string (assumed to be of dubious validity)
+ * \param len Maximum offset in string
+ * \param off Offset in the string to start at
+ * \param nextoff Pointer to location to receive offset of first byte of
+ * next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s,
+ uint32_t len, uint32_t off, uint32_t *nextoff)
+{
+ parserutils_error error;
+
+ UTF8_NEXT_PARANOID(s, len, off, nextoff, error);
+
+ return error;
+}
+
diff --git a/src/charset/encodings/utf8impl.h b/src/charset/encodings/utf8impl.h
new file mode 100644
index 0000000..1ca9de7
--- /dev/null
+++ b/src/charset/encodings/utf8impl.h
@@ -0,0 +1,339 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_encodings_utf8impl_h_
+#define parserutils_charset_encodings_utf8impl_h_
+
+/** \file
+ * UTF-8 manipulation macros (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+/** Number of continuation bytes for a given start byte */
+extern const uint8_t numContinuations[256];
+
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This macro conforms to RFC2279, however.
+ *
+ * The expansion declares block-local variables named c, min and n and
+ * evaluates its arguments more than once, so arguments should be simple
+ * expressions that do not use those names.
+ *
+ * On any error, *ucs4 and *clen are left unwritten.
+ *
+ * \param s The sequence to process
+ * \param len Length of sequence
+ * \param ucs4 Pointer to location to receive UCS4 character (host endian)
+ * \param clen Pointer to location to receive byte length of UTF-8 sequence
+ * \param error Location to receive error code:
+ * PARSERUTILS_OK on success,
+ * PARSERUTILS_BADPARM if s, ucs4 or clen is NULL,
+ * PARSERUTILS_NEEDDATA if len is 0 or less than the length
+ * announced by the lead byte,
+ * PARSERUTILS_INVALID for a bad lead or continuation byte,
+ * an overlong form, a surrogate, or 0xFFFE/0xFFFF
+ */
+#define UTF8_TO_UCS4(s, len, ucs4, clen, error) \
+do { \
+ uint32_t c, min; \
+ uint8_t n; \
+ \
+ error = PARSERUTILS_OK; \
+ \
+ if (s == NULL || ucs4 == NULL || clen == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ if (len == 0) { \
+ error = PARSERUTILS_NEEDDATA; \
+ break; \
+ } \
+ \
+ c = s[0]; \
+ \
+ /* Classify the lead byte: n is the total sequence length, min the */ \
+ /* smallest value that genuinely needs n bytes; c keeps only the */ \
+ /* payload bits of the lead byte */ \
+ if (c < 0x80) { \
+ n = 1; \
+ min = 0; \
+ } else if ((c & 0xE0) == 0xC0) { \
+ c &= 0x1F; \
+ n = 2; \
+ min = 0x80; \
+ } else if ((c & 0xF0) == 0xE0) { \
+ c &= 0x0F; \
+ n = 3; \
+ min = 0x800; \
+ } else if ((c & 0xF8) == 0xF0) { \
+ c &= 0x07; \
+ n = 4; \
+ min = 0x10000; \
+ } else if ((c & 0xFC) == 0xF8) { \
+ c &= 0x03; \
+ n = 5; \
+ min = 0x200000; \
+ } else if ((c & 0xFE) == 0xFC) { \
+ c &= 0x01; \
+ n = 6; \
+ min = 0x4000000; \
+ } else { \
+ error = PARSERUTILS_INVALID; \
+ break; \
+ } \
+ \
+ if (len < n) { \
+ error = PARSERUTILS_NEEDDATA; \
+ break; \
+ } \
+ \
+ /* Fold in six payload bits from each continuation byte */ \
+ for (uint8_t i = 1; i < n; i++) { \
+ uint32_t t = s[i]; \
+ \
+ if ((t & 0xC0) != 0x80) { \
+ /* This break only leaves the for loop; the error */ \
+ /* test below then skips the range checks and store */ \
+ error = PARSERUTILS_INVALID; \
+ break; \
+ } \
+ \
+ c <<= 6; \
+ c |= t & 0x3F; \
+ } \
+ \
+ if (error == PARSERUTILS_OK) { \
+ /* Detect overlong sequences, surrogates and fffe/ffff */ \
+ if (c < min || (c >= 0xD800 && c <= 0xDFFF) || \
+ c == 0xFFFE || c == 0xFFFF) { \
+ error = PARSERUTILS_INVALID; \
+ break; \
+ } \
+ \
+ *ucs4 = c; \
+ *clen = n; \
+ } \
+} while(0)
+
+/**
+ * Convert a single UCS4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This macro conforms to RFC2279, however.
+ *
+ * The expansion declares block-local variables named buf and l and
+ * evaluates its arguments more than once. The ucs4 argument is shifted
+ * right in place while encoding multibyte sequences, so it must be a
+ * modifiable lvalue whose value is not needed afterwards.
+ *
+ * No check for surrogates or 0xFFFE/0xFFFF is made here.
+ *
+ * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s Pointer to pointer to output buffer, updated on exit
+ * \param len Pointer to length, in bytes, of output buffer, updated on exit
+ * \param error Location to receive error code:
+ * PARSERUTILS_OK on success,
+ * PARSERUTILS_BADPARM if s, *s or len is NULL,
+ * PARSERUTILS_INVALID if ucs4 exceeds 0x7FFFFFFF,
+ * PARSERUTILS_NOMEM if *len is smaller than the number of
+ * bytes required (nothing is written in that case)
+ */
+#define UTF8_FROM_UCS4(ucs4, s, len, error) \
+do { \
+ uint8_t *buf; \
+ uint8_t l = 0; \
+ \
+ error = PARSERUTILS_OK; \
+ \
+ if (s == NULL || *s == NULL || len == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ /* l is the number of bytes needed to encode ucs4 */ \
+ if (ucs4 < 0x80) { \
+ l = 1; \
+ } else if (ucs4 < 0x800) { \
+ l = 2; \
+ } else if (ucs4 < 0x10000) { \
+ l = 3; \
+ } else if (ucs4 < 0x200000) { \
+ l = 4; \
+ } else if (ucs4 < 0x4000000) { \
+ l = 5; \
+ } else if (ucs4 <= 0x7FFFFFFF) { \
+ l = 6; \
+ } else { \
+ error = PARSERUTILS_INVALID; \
+ break; \
+ } \
+ \
+ if (l > *len) { \
+ error = PARSERUTILS_NOMEM; \
+ break; \
+ } \
+ \
+ buf = *s; \
+ \
+ if (l == 1) { \
+ buf[0] = (uint8_t) ucs4; \
+ } else { \
+ /* Emit continuation bytes last-to-first, six bits at a time */ \
+ for (uint8_t i = l; i > 1; i--) { \
+ buf[i - 1] = 0x80 | (ucs4 & 0x3F); \
+ ucs4 >>= 6; \
+ } \
+ /* Lead byte: top l bits set, remaining payload in the low bits */ \
+ buf[0] = ~((1 << (8 - l)) - 1) | ucs4; \
+ } \
+ \
+ *s += l; \
+ *len -= l; \
+} while(0)
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * Each lead byte is counted as one character and the pointer is advanced
+ * by the sequence length that lead byte announces; continuation bytes are
+ * not inspected. A sequence that starts before the bound is counted even
+ * if its announced length runs past the bound.
+ *
+ * The expansion declares block-local variables named end and l, evaluates
+ * its arguments more than once, and advances s in place, so s must be a
+ * modifiable lvalue.
+ *
+ * \param s The string
+ * \param max Maximum length
+ * \param len Pointer to location to receive length of string
+ * \param error Location to receive error code:
+ * PARSERUTILS_OK on success,
+ * PARSERUTILS_BADPARM if s or len is NULL,
+ * PARSERUTILS_INVALID if a byte in lead position is not a
+ * valid lead byte (*len is left unwritten)
+ */
+#define UTF8_LENGTH(s, max, len, error) \
+do { \
+ const uint8_t *end; \
+ size_t l = 0; \
+ \
+ error = PARSERUTILS_OK; \
+ \
+ if (s == NULL || len == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ /* Form the end pointer only once s is known to be non-NULL */ \
+ end = s + max; \
+ \
+ while (s < end) { \
+ uint32_t c = s[0]; \
+ \
+ if ((c & 0x80) == 0x00) \
+ s += 1; \
+ else if ((c & 0xE0) == 0xC0) \
+ s += 2; \
+ else if ((c & 0xF0) == 0xE0) \
+ s += 3; \
+ else if ((c & 0xF8) == 0xF0) \
+ s += 4; \
+ else if ((c & 0xFC) == 0xF8) \
+ s += 5; \
+ else if ((c & 0xFE) == 0xFC) \
+ s += 6; \
+ else { \
+ /* Leaves the while loop; the store below is skipped */ \
+ error = PARSERUTILS_INVALID; \
+ break; \
+ } \
+ \
+ l++; \
+ } \
+ \
+ if (error == PARSERUTILS_OK) \
+ *len = l; \
+} while(0)
+
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * The length is derived solely from the first byte, via a lookup in the
+ * numContinuations table; the following bytes are not examined.
+ *
+ * \param s Pointer to start of character
+ * \param len Pointer to location to receive length
+ * \param error Location to receive error code:
+ * PARSERUTILS_OK on success,
+ * PARSERUTILS_BADPARM if s or len is NULL
+ */
+#define UTF8_CHAR_BYTE_LENGTH(s, len, error) \
+do { \
+ if (s == NULL || len == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ *len = numContinuations[s[0]] + 1 /* Start byte */; \
+ \
+ error = PARSERUTILS_OK; \
+} while(0)
+
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * The off argument is decremented in place, so it must be a modifiable
+ * lvalue. The byte at the initial off is not examined; the scan starts
+ * with the byte before it. If off is 0 on entry, 0 is reported.
+ *
+ * \param s The string
+ * \param off Offset in the string to start at
+ * \param prevoff Pointer to location to receive offset of first byte of
+ * previous legal character
+ * \param error Location to receive error code:
+ * PARSERUTILS_OK on success,
+ * PARSERUTILS_BADPARM if s or prevoff is NULL
+ */
+#define UTF8_PREV(s, off, prevoff, error) \
+do { \
+ if (s == NULL || prevoff == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ /* Step back over continuation bytes (10xxxxxx), stopping at the */ \
+ /* first non-continuation byte or at offset 0 */ \
+ while (off != 0 && (s[--off] & 0xC0) == 0x80) \
+ /* do nothing */; \
+ \
+ *prevoff = off; \
+ \
+ error = PARSERUTILS_OK; \
+} while(0)
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * The off argument is incremented in place, so it must be a modifiable
+ * lvalue. The reported offset equals len when no further non-continuation
+ * byte is found before len.
+ *
+ * \param s The string (assumed valid)
+ * \param len Maximum offset in string
+ * \param off Offset in the string to start at
+ * \param nextoff Pointer to location to receive offset of first byte of
+ * next legal character
+ * \param error Location to receive error code:
+ * PARSERUTILS_OK on success,
+ * PARSERUTILS_BADPARM if s or nextoff is NULL, or off >= len
+ */
+#define UTF8_NEXT(s, len, off, nextoff, error) \
+do { \
+ if (s == NULL || off >= len || nextoff == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ /* Skip current start byte (if present - may be mid-sequence) */\
+ if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) \
+ off++; \
+ \
+ /* Skip any continuation bytes (10xxxxxx), without passing len */ \
+ while (off < len && (s[off] & 0xC0) == 0x80) \
+ off++; \
+ \
+ *nextoff = off; \
+ \
+ error = PARSERUTILS_OK; \
+} while(0)
+
+/**
+ * Skip to start of next sequence in UTF-8 input
+ *
+ * Unlike UTF8_NEXT, the continuation bytes announced by the start byte are
+ * checked: the reported offset is that of the first byte which is not a
+ * valid continuation of the current sequence. When off points at a
+ * continuation byte, the offset simply moves on by one.
+ *
+ * The expansion declares a block-local variable named c and modifies off
+ * in place, so off must be a modifiable lvalue. On error, *nextoff is left
+ * unwritten.
+ *
+ * \param s The string (assumed to be of dubious validity)
+ * \param len Maximum offset in string
+ * \param off Offset in the string to start at
+ * \param nextoff Pointer to location to receive offset of first byte of
+ * next legal character
+ * \param error Location to receive error code:
+ * PARSERUTILS_OK on success,
+ * PARSERUTILS_BADPARM if s or nextoff is NULL, or off >= len,
+ * PARSERUTILS_NEEDDATA if off + continuation count + 1 >= len
+ */
+#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error) \
+do { \
+ uint8_t c; \
+ \
+ error = PARSERUTILS_OK; \
+ \
+ if (s == NULL || off >= len || nextoff == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ c = s[off]; \
+ \
+ /* If we're mid-sequence, simply advance to next byte */ \
+ if (!(c < 0x80 || (c & 0xC0) == 0xC0)) { \
+ off++; \
+ } else { \
+ uint32_t nCont = numContinuations[c]; \
+ uint32_t nToSkip; \
+ \
+ /* NOTE(review): with >= this also reports NEEDDATA when the */ \
+ /* character at off ends exactly at len, whereas UTF8_NEXT */ \
+ /* yields nextoff == len in that situation - confirm this */ \
+ /* is intended */ \
+ if (off + nCont + 1 >= len) { \
+ error = PARSERUTILS_NEEDDATA; \
+ break; \
+ } \
+ \
+ /* Verify continuation bytes */ \
+ for (nToSkip = 1; nToSkip <= nCont; nToSkip++) { \
+ if ((s[off + nToSkip] & 0xC0) != 0x80) \
+ break; \
+ } \
+ \
+ /* Skip over the valid bytes */ \
+ off += nToSkip; \
+ } \
+ \
+ *nextoff = off; \
+} while(0)
+
+#endif