summaryrefslogtreecommitdiff
path: root/src/charset
diff options
context:
space:
mode:
Diffstat (limited to 'src/charset')
-rw-r--r--src/charset/Makefile53
-rw-r--r--src/charset/aliases.c361
-rw-r--r--src/charset/aliases.h42
-rw-r--r--src/charset/codec.c186
-rw-r--r--src/charset/codec.h153
-rw-r--r--src/charset/codec_iconv.c837
-rw-r--r--src/charset/codec_impl.h51
-rw-r--r--src/charset/codec_utf8.c620
-rw-r--r--src/charset/detect.c673
-rw-r--r--src/charset/detect.h22
10 files changed, 2998 insertions, 0 deletions
diff --git a/src/charset/Makefile b/src/charset/Makefile
new file mode 100644
index 0000000..62817b3
--- /dev/null
+++ b/src/charset/Makefile
@@ -0,0 +1,53 @@
+# Makefile for libhubbub: src/charset objects (charset aliases, codecs, detection)
+#
+# Toolchain is exported by top-level makefile
+#
+# Top-level makefile also exports the following variables:
+#
+# COMPONENT Name of component
+# EXPORT Absolute path of export directory
+# TOP Absolute path of source tree root
+#
+# The top-level makefile requires the following targets to exist:
+#
+# clean Clean source tree
+# debug Create a debug binary
+# distclean Fully clean source tree, back to pristine condition
+# export Export distributable components to ${EXPORT}
+# release Create a release binary
+# setup Perform any setup required prior to compilation
+# test Execute any test cases
+
+# Manipulate include paths (adds this directory to the header search path)
+CFLAGS += -I$(CURDIR)
+
+# Objects (base names only; the targets below add the build directory and .o suffix)
+OBJS = aliases codec codec_iconv codec_utf8 detect
+
+.PHONY: clean debug distclean export release setup test
+
+# Targets (distclean, setup, export and test have no recipe in this directory; they are listed above as required by the top-level makefile)
+release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS)))
+
+debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS)))
+
+clean:
+ -@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS}))
+ -@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS}))
+
+distclean:
+
+setup:
+
+export:
+
+test:
+
+# Pattern rules (Release objects are compiled with -DNDEBUG, Debug objects with -g)
+../Release/%.o: %.c
+ @${ECHO} ${ECHOFLAGS} "==> $<"
+ @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $<
+
+../Debug/%.o: %.c
+ @${ECHO} ${ECHOFLAGS} "==> $<"
+ @${CC} -c -g ${CFLAGS} -o $@ $<
diff --git a/src/charset/aliases.c b/src/charset/aliases.c
new file mode 100644
index 0000000..dcf6de2
--- /dev/null
+++ b/src/charset/aliases.c
@@ -0,0 +1,361 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+
+struct alias { /* One alias name; entries are chained from alias_tab */
+ struct alias *next; /* Next alias in the same hash bucket */
+ hubbub_aliases_canon *canon; /* Canonical form this alias resolves to */
+ uint16_t name_len; /* strlen() of name, as stored by hubbub_create_alias() */
+ char name[1]; /* Alias text; the allocation is sized to hold the whole string */
+};
+
+#define HASH_SIZE (43) /* Number of buckets in each of the two tables below */
+static hubbub_aliases_canon *canon_tab[HASH_SIZE]; /* Canonical names, chained per hubbub_hash_val() bucket */
+static struct alias *alias_tab[HASH_SIZE]; /* Alias names, chained per hubbub_hash_val() bucket */
+
+static hubbub_error hubbub_create_alias(const char *alias,
+ hubbub_aliases_canon *c, hubbub_alloc alloc, void *pw);
+static hubbub_aliases_canon *hubbub_create_canon(const char *canon,
+ uint16_t mibenum, hubbub_alloc alloc, void *pw);
+static uint32_t hubbub_hash_val(const char *alias, size_t len);
+
+/**
+ * Advance past a run of whitespace
+ *
+ * \param p  Position within a NUL-terminated line
+ * \return Pointer to the first non-whitespace character at or after p
+ *         (possibly the terminating NUL)
+ */
+static char *hubbub_aliases_skip_space(char *p)
+{
+	/* <ctype.h> functions require an unsigned char value */
+	while (*p != '\0' && isspace((unsigned char) *p))
+		p++;
+
+	return p;
+}
+
+/**
+ * Advance to the end of the token starting at p
+ *
+ * \param p  Position within a NUL-terminated line
+ * \return Pointer to the first whitespace or control character at or
+ *         after p (possibly the terminating NUL)
+ */
+static char *hubbub_aliases_skip_token(char *p)
+{
+	while (*p != '\0' && !isspace((unsigned char) *p) &&
+			!iscntrl((unsigned char) *p))
+		p++;
+
+	return p;
+}
+
+/**
+ * Create alias data from Aliases file
+ *
+ * Each non-comment line has the form
+ *
+ *   <canonical name> <mibenum> [<alias> ...]
+ *
+ * with fields separated by whitespace. Lines starting with '#' are
+ * comments. Lines lacking a mibenum field are skipped, as are entries
+ * whose storage cannot be allocated; neither is reported as an error.
+ *
+ * \param filename  The path to the Aliases file
+ * \param alloc     Memory (de)allocation function
+ * \param pw        Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if filename or alloc is NULL,
+ *         HUBBUB_FILENOTFOUND if the file cannot be opened.
+ */
+hubbub_error hubbub_aliases_create(const char *filename,
+		hubbub_alloc alloc, void *pw)
+{
+	char buf[300];
+	FILE *fp;
+
+	if (filename == NULL || alloc == NULL)
+		return HUBBUB_BADPARM;
+
+	fp = fopen(filename, "r");
+	if (fp == NULL)
+		return HUBBUB_FILENOTFOUND;
+
+	while (fgets(buf, sizeof buf, fp) != NULL) {
+		char *p, *mib, *end;
+		hubbub_aliases_canon *cf;
+
+		if (buf[0] == '\0' || buf[0] == '#')
+			/* skip blank lines or comments */
+			continue;
+
+		/* Lose the terminating newline, but only if there is one:
+		 * the final line of the file (or an over-long line) has
+		 * none, and must not lose its last real character. */
+		buf[strcspn(buf, "\n")] = '\0';
+		end = buf + strlen(buf);
+
+		/* find end of canonical form */
+		p = hubbub_aliases_skip_token(buf);
+		if (p >= end)
+			continue;
+		*p++ = '\0';	/* terminate canonical form */
+
+		/* mibenum is the next field */
+		p = hubbub_aliases_skip_space(p);
+		if (p >= end)
+			continue;
+		mib = p;
+
+		p = hubbub_aliases_skip_token(p);
+		if (p < end)
+			*p++ = '\0';	/* terminate mibenum */
+
+		/* strtol, unlike atoi, has defined behaviour on overflow */
+		cf = hubbub_create_canon(buf,
+				(uint16_t) strtol(mib, NULL, 10), alloc, pw);
+		if (cf == NULL)
+			continue;
+
+		/* everything left on the line is a list of aliases */
+		while (p < end) {
+			char *alias;
+
+			p = hubbub_aliases_skip_space(p);
+			if (p >= end)
+				break;
+			alias = p;
+
+			p = hubbub_aliases_skip_token(p);
+			if (p < end)
+				*p++ = '\0';	/* terminate current alias */
+
+			if (hubbub_create_alias(alias, cf,
+					alloc, pw) != HUBBUB_OK)
+				break;
+		}
+	}
+
+	fclose(fp);
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Release every canonical-name and alias entry and empty both tables
+ *
+ * \param alloc  Memory (de)allocation function; called with a size of 0
+ *               for each entry
+ * \param pw     Pointer to client-specific private data
+ */
+void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw)
+{
+	int bucket;
+
+	for (bucket = 0; bucket != HASH_SIZE; bucket++) {
+		hubbub_aliases_canon *canon = canon_tab[bucket];
+		struct alias *entry;
+
+		/* Canonical names first: grab the link before freeing */
+		while (canon != NULL) {
+			hubbub_aliases_canon *following = canon->next;
+
+			alloc(canon, 0, pw);
+			canon = following;
+		}
+		canon_tab[bucket] = NULL;
+
+		/* Then the aliases hashed to the same bucket index */
+		entry = alias_tab[bucket];
+		while (entry != NULL) {
+			struct alias *following = entry->next;
+
+			alloc(entry, 0, pw);
+			entry = following;
+		}
+		alias_tab[bucket] = NULL;
+	}
+}
+
+/**
+ * Look up the MIB enum value for an encoding name
+ *
+ * \param alias  The name (canonical or alias) to look up
+ * \param len    The length of the name, in bytes
+ * \return The MIB enum value, or 0 if alias is NULL or unknown
+ */
+uint16_t hubbub_mibenum_from_name(const char *alias, size_t len)
+{
+	hubbub_aliases_canon *canon = NULL;
+
+	if (alias != NULL)
+		canon = hubbub_alias_canonicalise(alias, len);
+
+	return (canon != NULL) ? canon->mib_enum : 0;
+}
+
+/**
+ * Find the canonical encoding name registered for a MIB enum value
+ *
+ * Every bucket of the canonical-name table is searched in turn.
+ *
+ * \param mibenum  The MIB enum value
+ * \return Pointer to canonical name, or NULL if not found
+ */
+const char *hubbub_mibenum_to_name(uint16_t mibenum)
+{
+	int bucket;
+
+	for (bucket = 0; bucket != HASH_SIZE; bucket++) {
+		hubbub_aliases_canon *entry = canon_tab[bucket];
+
+		while (entry != NULL) {
+			if (entry->mib_enum == mibenum)
+				return entry->name;
+			entry = entry->next;
+		}
+	}
+
+	return NULL;
+}
+
+
+/**
+ * Resolve an encoding name to its canonical form
+ *
+ * The canonical-name table is consulted first, then the alias table.
+ * Names are compared case-insensitively and must match in length.
+ *
+ * \param alias  The name to resolve
+ * \param len    The length of the name, in bytes
+ * \return Pointer to canonical form or NULL if not found
+ */
+hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias,
+		size_t len)
+{
+	hubbub_aliases_canon *canon;
+	struct alias *entry;
+	uint32_t bucket;
+
+	if (alias == NULL)
+		return NULL;
+
+	bucket = hubbub_hash_val(alias, len);
+
+	/* The name may already be a canonical one */
+	canon = canon_tab[bucket];
+	while (canon != NULL) {
+		if (canon->name_len == len &&
+				strncasecmp(canon->name, alias, len) == 0)
+			return canon;
+		canon = canon->next;
+	}
+
+	/* Otherwise try it as an alias */
+	entry = alias_tab[bucket];
+	while (entry != NULL) {
+		if (entry->name_len == len &&
+				strncasecmp(entry->name, alias, len) == 0)
+			return entry->canon;
+		entry = entry->next;
+	}
+
+	return NULL;
+}
+
+
+/**
+ * Allocate an alias entry and link it into the alias table
+ *
+ * \param alias  The alias name (NUL-terminated)
+ * \param c      The canonical form the alias maps to
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if alias, c or alloc is NULL,
+ *         HUBBUB_NOMEM if the entry cannot be allocated
+ */
+hubbub_error hubbub_create_alias(const char *alias, hubbub_aliases_canon *c,
+		hubbub_alloc alloc, void *pw)
+{
+	struct alias *entry;
+	struct alias **bucket;
+
+	if (alias == NULL || c == NULL || alloc == NULL)
+		return HUBBUB_BADPARM;
+
+	/* The name is stored inline, after the fixed part of the entry */
+	entry = alloc(NULL, sizeof(struct alias) + strlen(alias) + 1, pw);
+	if (entry == NULL)
+		return HUBBUB_NOMEM;
+
+	entry->canon = c;
+	entry->name_len = strlen(alias);
+	strcpy(entry->name, alias);
+	entry->name[entry->name_len] = '\0';
+
+	/* Push onto the head of the relevant hash chain */
+	bucket = &alias_tab[hubbub_hash_val(alias, entry->name_len)];
+	entry->next = *bucket;
+	*bucket = entry;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Allocate a canonical-form entry and link it into the canonical table
+ *
+ * \param canon    The canonical name (NUL-terminated)
+ * \param mibenum  The MIB enum value
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to canonical form, or NULL if canon or alloc is NULL
+ *         or the entry cannot be allocated
+ */
+hubbub_aliases_canon *hubbub_create_canon(const char *canon,
+		uint16_t mibenum, hubbub_alloc alloc, void *pw)
+{
+	hubbub_aliases_canon *entry;
+	hubbub_aliases_canon **bucket;
+	uint32_t len;
+
+	if (canon == NULL || alloc == NULL)
+		return NULL;
+
+	len = strlen(canon);
+
+	/* The name is stored inline, after the fixed part of the entry */
+	entry = alloc(NULL, sizeof(hubbub_aliases_canon) + len + 1, pw);
+	if (entry == NULL)
+		return NULL;
+
+	entry->mib_enum = mibenum;
+	entry->name_len = len;
+	strcpy(entry->name, canon);
+	entry->name[len] = '\0';
+
+	/* Push onto the head of the relevant hash chain */
+	bucket = &canon_tab[hubbub_hash_val(canon, len)];
+	entry->next = *bucket;
+	*bucket = entry;
+
+	return entry;
+}
+
+/**
+ * Hash function
+ *
+ * Bit 0x20 of every byte is cleared before mixing, so names that differ
+ * only in the case of ASCII letters hash to the same bucket.
+ *
+ * \param alias  String to hash
+ * \param len    Number of bytes of alias to hash
+ * \return Bucket index in the range [0, HASH_SIZE), or 0 if alias is NULL
+ */
+uint32_t hubbub_hash_val(const char *alias, size_t len)
+{
+	uint32_t hash = 5381;
+	size_t i;
+
+	if (alias == NULL)
+		return 0;
+
+	for (i = 0; i < len; i++)
+		hash = (hash * 33) ^ (alias[i] & ~0x20); /* case insensitive */
+
+	return hash % HASH_SIZE;
+}
+
+
+#ifndef NDEBUG
+/**
+ * Dump all alias data to stdout
+ *
+ * One line is printed per entry ("<bucket> <name>"), followed by a final
+ * line holding a running size total. The total adds, per entry, the
+ * offset of the name field plus the name length, and then the number of
+ * slots (not bytes) in each table.
+ */
+void hubbub_aliases_dump(void)
+{
+	size_t total = 0;
+	int bucket;
+
+	for (bucket = 0; bucket != HASH_SIZE; bucket++) {
+		hubbub_aliases_canon *canon = canon_tab[bucket];
+		struct alias *entry = alias_tab[bucket];
+
+		while (canon != NULL) {
+			printf("%d %s\n", bucket, canon->name);
+			total += offsetof(hubbub_aliases_canon, name) +
+					canon->name_len;
+			canon = canon->next;
+		}
+
+		while (entry != NULL) {
+			printf("%d %s\n", bucket, entry->name);
+			total += offsetof(struct alias, name) +
+					entry->name_len;
+			entry = entry->next;
+		}
+	}
+
+	total += (sizeof(canon_tab) / sizeof(canon_tab[0]));
+	total += (sizeof(alias_tab) / sizeof(alias_tab[0]));
+
+	printf("%u\n", (unsigned int) total);
+}
+#endif
diff --git a/src/charset/aliases.h b/src/charset/aliases.h
new file mode 100644
index 0000000..e0505d0
--- /dev/null
+++ b/src/charset/aliases.h
@@ -0,0 +1,42 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_charset_aliases_h_
+#define hubbub_charset_aliases_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+
+typedef struct hubbub_aliases_canon { /* Canonical form of a charset name */
+ struct hubbub_aliases_canon *next; /* Next entry in the same hash chain */
+ uint16_t mib_enum; /* MIB enum value assigned to this charset */
+ uint16_t name_len; /* Length of name, excluding the terminating NUL */
+ char name[1]; /* Canonical name; entries are allocated large enough for the whole string */
+} hubbub_aliases_canon;
+
+/* Load encoding aliases from file */
+hubbub_error hubbub_aliases_create(const char *filename,
+ hubbub_alloc alloc, void *pw);
+/* Destroy encoding aliases */
+void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw);
+
+/* Convert an encoding alias to a MIB enum value */
+uint16_t hubbub_mibenum_from_name(const char *alias, size_t len);
+/* Convert a MIB enum value into an encoding alias */
+const char *hubbub_mibenum_to_name(uint16_t mibenum);
+
+/* Canonicalise an alias name */
+hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias,
+ size_t len);
+
+#ifndef NDEBUG
+void hubbub_aliases_dump(void);
+#endif
+
+#endif
diff --git a/src/charset/codec.c b/src/charset/codec.c
new file mode 100644
index 0000000..12a1bdc
--- /dev/null
+++ b/src/charset/codec.c
@@ -0,0 +1,186 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <string.h>
+
+#include "charset/aliases.h"
+
+#include "codec_impl.h"
+
+extern hubbub_charsethandler hubbub_iconv_codec_handler;
+extern hubbub_charsethandler hubbub_utf8_codec_handler;
+
+static hubbub_charsethandler *handler_table[] = { /* Searched in order by hubbub_charsetcodec_create(); first handler to accept the charset wins */
+ &hubbub_utf8_codec_handler,
+ &hubbub_iconv_codec_handler,
+ NULL, /* End-of-table marker */
+};
+
+/**
+ * Create a charset codec
+ *
+ * The charset name is canonicalised, then offered to each registered
+ * handler in table order; the first one that accepts it creates the
+ * codec. The new codec starts with no filter and the LOOSE error mode.
+ *
+ * \param charset  Target charset
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec instance, or NULL on failure
+ */
+hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset,
+		hubbub_alloc alloc, void *pw)
+{
+	const hubbub_aliases_canon *canon;
+	hubbub_charsethandler **candidate = handler_table;
+	hubbub_charsetcodec *codec;
+
+	if (charset == NULL || alloc == NULL)
+		return NULL;
+
+	/* Handlers are only ever asked about canonical names */
+	canon = hubbub_alias_canonicalise(charset, strlen(charset));
+	if (canon == NULL)
+		return NULL;
+
+	/* Walk the table until a handler claims the charset */
+	while (*candidate != NULL &&
+			!(*candidate)->handles_charset(canon->name))
+		candidate++;
+
+	/* Ran off the end: nobody handles it */
+	if (*candidate == NULL)
+		return NULL;
+
+	codec = (*candidate)->create(canon->name, alloc, pw);
+	if (codec == NULL)
+		return NULL;
+
+	/* Fill in the handler-independent fields */
+	codec->mibenum = canon->mib_enum;
+
+	codec->filter = NULL;
+	codec->filter_pw = NULL;
+
+	codec->errormode = HUBBUB_CHARSETCODEC_ERROR_LOOSE;
+
+	codec->alloc = alloc;
+	codec->alloc_pw = pw;
+
+	return codec;
+}
+
+/**
+ * Destroy a charset codec
+ *
+ * The handler's own destructor runs first, then the codec's memory is
+ * returned through the allocator it was created with.
+ *
+ * \param codec  The codec to destroy (NULL is ignored)
+ */
+void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec)
+{
+	if (codec != NULL) {
+		codec->handler.destroy(codec);
+
+		codec->alloc(codec, 0, codec->alloc_pw);
+	}
+}
+
+/**
+ * Configure a charset codec
+ *
+ * An option type other than those handled below leaves the codec
+ * unchanged and still yields HUBBUB_OK.
+ *
+ * \param codec   The codec to configure
+ * \param type    The codec option type to configure
+ * \param params  Option-specific parameters
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if codec or params is NULL
+ */
+hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec,
+		hubbub_charsetcodec_opttype type,
+		hubbub_charsetcodec_optparams *params)
+{
+	if (codec == NULL || params == NULL)
+		return HUBBUB_BADPARM;
+
+	if (type == HUBBUB_CHARSETCODEC_FILTER_FUNC) {
+		codec->filter = params->filter_func.filter;
+		codec->filter_pw = params->filter_func.pw;
+	} else if (type == HUBBUB_CHARSETCODEC_ERROR_MODE) {
+		codec->errormode = params->error_mode.mode;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Encode a chunk of UCS4 data into a codec's charset
+ *
+ * Arguments are validated here; the conversion itself is delegated to
+ * the codec's handler.
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success, appropriate error otherwise.
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ */
+hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	if (codec == NULL)
+		return HUBBUB_BADPARM;
+
+	if (source == NULL || *source == NULL || sourcelen == NULL)
+		return HUBBUB_BADPARM;
+
+	if (dest == NULL || *dest == NULL || destlen == NULL)
+		return HUBBUB_BADPARM;
+
+	return codec->handler.encode(codec, source, sourcelen, dest, destlen);
+}
+
+/**
+ * Decode a chunk of data in a codec's charset into UCS4
+ *
+ * Arguments are validated here; the conversion itself is delegated to
+ * the codec's handler.
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success, appropriate error otherwise.
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ *
+ * Call this with a source length of 0 to flush any buffers.
+ */
+hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	if (codec == NULL)
+		return HUBBUB_BADPARM;
+
+	if (source == NULL || *source == NULL || sourcelen == NULL)
+		return HUBBUB_BADPARM;
+
+	if (dest == NULL || *dest == NULL || destlen == NULL)
+		return HUBBUB_BADPARM;
+
+	return codec->handler.decode(codec, source, sourcelen, dest, destlen);
+}
+
+/**
+ * Clear a charset codec's encoding state
+ *
+ * If a filter is registered it is asked to reset itself first, by being
+ * called with HUBBUB_CHARSETCODEC_NULL as the character and NULL output
+ * pointers. The filter's result is not examined. The handler's own reset
+ * then runs and supplies the return value.
+ *
+ * \param codec  The codec to reset
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec)
+{
+	if (codec == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Reset filter. Hand it the private data it was registered with,
+	 * exactly as when it is invoked on a character: without that
+	 * pointer a filter has no way to locate the state it must clear. */
+	if (codec->filter != NULL)
+		codec->filter(HUBBUB_CHARSETCODEC_NULL, NULL, NULL,
+				codec->filter_pw);
+
+	return codec->handler.reset(codec);
+}
+
diff --git a/src/charset/codec.h b/src/charset/codec.h
new file mode 100644
index 0000000..4cd94d8
--- /dev/null
+++ b/src/charset/codec.h
@@ -0,0 +1,153 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_charset_codec_h_
+#define hubbub_charset_codec_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+
+typedef struct hubbub_charsetcodec hubbub_charsetcodec;
+
+#define HUBBUB_CHARSETCODEC_NULL (0xffffffffU) /* Passed to a filter in place of a character to ask it to reset */
+
+/**
+ * Type of charset codec filter function
+ *
+ * \param c UCS4 character (in host byte order) or
+ * HUBBUB_CHARSETCODEC_NULL to reset
+ * \param output Pointer to location to store output buffer location
+ * \param outputlen Pointer to location to store output buffer length
+ * \param pw Pointer to client-specific private data
+ * \return HUBBUB_OK on success, or appropriate error otherwise.
+ *
+ * The output buffer is owned by the filter code and will not be freed by
+ * any charset codec. It should contain the replacement UCS4 character(s)
+ * for the input. The replacement characters should be in host byte order.
+ * The contents of *output and *outputlen on entry are ignored and these
+ * will be filled in by the filter code.
+ *
+ * A codec may modify the output buffer in place: the iconv codec's encoder
+ * converts each returned character to big-endian within the buffer before
+ * writing it, so the filter must not rely on the contents surviving the
+ * call unchanged.
+ *
+ * Filters may elect to replace the input character with no output. In this
+ * case, *output should be set to NULL and *outputlen should be set to 0 and
+ * HUBBUB_OK should be returned.
+ *
+ * The output length is in terms of the number of UCS4 characters in the
+ * output buffer. i.e.:
+ *
+ * for (size_t i = 0; i < outputlen; i++) {
+ * dest[curchar++] = output[i];
+ * }
+ *
+ * would copy the contents of the filter output buffer to the codec's output
+ * buffer.
+ */
+typedef hubbub_error (*hubbub_charsetcodec_filter)(uint32_t c,
+ uint32_t **output, size_t *outputlen, void *pw);
+
+/**
+ * Charset codec error mode
+ *
+ * A codec's error mode determines its behaviour in the face of:
+ *
+ * + characters which are unrepresentable in the destination charset (if
+ * encoding data) or which cannot be converted to UCS4 (if decoding data).
+ * + invalid byte sequences (both encoding and decoding)
+ *
+ * The options provide a choice between the following approaches:
+ *
+ * + draconian, "stop processing" ("strict")
+ * + "replace the unrepresentable character with something else" ("loose")
+ * + "attempt to transliterate, or replace if unable" ("translit")
+ *
+ * The default error mode is "loose".
+ *
+ *
+ * In the "loose" case, the replacement character will depend upon:
+ *
+ * + Whether the operation was encoding or decoding
+ * + If encoding, what the destination charset is.
+ *
+ * If decoding, the replacement character will be:
+ *
+ * U+FFFD (REPLACEMENT CHARACTER)
+ *
+ * If encoding, the replacement character will be:
+ *
+ * U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32)
+ * U+FFFD (REPLACEMENT CHARACTER) otherwise.
+ *
+ *
+ * In the "translit" case, the codec will attempt to transliterate into
+ * the destination charset, if encoding. If decoding, or if transliteration
+ * fails, this option is identical to "loose".
+ */
+typedef enum hubbub_charsetcodec_errormode {
+ /** Abort processing if unrepresentable character encountered */
+ HUBBUB_CHARSETCODEC_ERROR_STRICT = 0,
+ /** Replace unrepresentable characters with single alternate
+ * (the mode a newly created codec starts in) */
+ HUBBUB_CHARSETCODEC_ERROR_LOOSE = 1,
+ /** Transliterate unrepresentable characters, if possible */
+ HUBBUB_CHARSETCODEC_ERROR_TRANSLIT = 2,
+} hubbub_charsetcodec_errormode;
+
+/**
+ * Charset codec option types
+ *
+ * The type selects which member of hubbub_charsetcodec_optparams is read
+ * by hubbub_charsetcodec_setopt().
+ */
+typedef enum hubbub_charsetcodec_opttype {
+ /** Register codec filter function (reads the filter_func member) */
+ HUBBUB_CHARSETCODEC_FILTER_FUNC = 0,
+ /** Set codec error mode (reads the error_mode member) */
+ HUBBUB_CHARSETCODEC_ERROR_MODE = 1,
+} hubbub_charsetcodec_opttype;
+
+/**
+ * Charset codec option parameters
+ *
+ * Only the member matching the option type passed to
+ * hubbub_charsetcodec_setopt() is read.
+ */
+typedef union hubbub_charsetcodec_optparams {
+ /** Parameters for filter function setting
+ * (HUBBUB_CHARSETCODEC_FILTER_FUNC) */
+ struct {
+ /** Filter function */
+ hubbub_charsetcodec_filter filter;
+ /** Client-specific private data */
+ void *pw;
+ } filter_func;
+
+ /** Parameters for error mode setting
+ * (HUBBUB_CHARSETCODEC_ERROR_MODE) */
+ struct {
+ /** The desired error handling mode */
+ hubbub_charsetcodec_errormode mode;
+ } error_mode;
+} hubbub_charsetcodec_optparams;
+
+
+/* Create a charset codec */
+hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset,
+ hubbub_alloc alloc, void *pw);
+/* Destroy a charset codec */
+void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec);
+
+/* Configure a charset codec */
+hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec,
+ hubbub_charsetcodec_opttype type,
+ hubbub_charsetcodec_optparams *params);
+
+/* Encode a chunk of UCS4 data into a codec's charset */
+hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+
+/* Decode a chunk of data in a codec's charset into UCS4 */
+hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+
+/* Reset a charset codec */
+hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec);
+
+#endif
diff --git a/src/charset/codec_iconv.c b/src/charset/codec_iconv.c
new file mode 100644
index 0000000..097e82a
--- /dev/null
+++ b/src/charset/codec_iconv.c
@@ -0,0 +1,837 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/* This codec is hideously slow. Only use it as a last resort */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <iconv.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include "charset/aliases.h"
+#include "utils/utils.h"
+
+#include "codec_impl.h"
+
+/**
+ * A note on endianness:
+ *
+ * UCS4 is big-endian by default. Therefore, this codec reads and writes
+ * big-endian values. This is fine, and causes no problems. However, to
+ * make life easier for client-supplied filter code, character values passed
+ * to a filter and those read back from a filter are in host-endian.
+ * Therefore, we need to convert from big-endian to host-endian when passing
+ * characters to a filter and perform the reverse translation when reading
+ * characters back.
+ */
+
+/**
+ * Iconv-based charset codec
+ *
+ * Holds one iconv handle per direction plus small carry-over buffers that
+ * keep state between successive encode/decode calls.
+ */
+typedef struct hubbub_iconv_codec {
+ hubbub_charsetcodec base; /**< Base class (first member: the codec
+ * is cast to and from
+ * hubbub_charsetcodec *) */
+
+ iconv_t read_cd; /**< Iconv handle for reading
+ * (charset to UCS-4) */
+#define INVAL_BUFSIZE (32)
+ uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up
+ * incomplete input
+ * sequences */
+ size_t inval_len; /**< Number of bytes in inval_buf */
+
+#define READ_BUFSIZE (8)
+ uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
+ * output sequences (decode)
+ */
+ size_t read_len; /**< Number of characters in
+ * read_buf */
+
+ iconv_t write_cd; /**< Iconv handle for writing
+ * (UCS-4 to charset) */
+#define WRITE_BUFSIZE (8)
+ uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
+ * output sequences (encode);
+ * holds big-endian,
+ * already-filtered characters
+ */
+ size_t write_len; /**< Number of characters in
+ * write_buf */
+} hubbub_iconv_codec;
+
+
+static bool hubbub_iconv_codec_handles_charset(const char *charset);
+static hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset,
+ hubbub_alloc alloc, void *pw);
+static void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec);
+static hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec);
+static hubbub_error hubbub_iconv_codec_filter_decoded_char(
+ hubbub_iconv_codec *c, uint32_t ucs4, uint8_t **dest,
+ size_t *destlen);
+static bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c);
+static hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c,
+ uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Determine whether this codec handles a specific charset
+ *
+ * The check is made by trying to open a conversion from the charset to
+ * UCS-4; the probe descriptor is closed again straight away.
+ *
+ * \param charset  Charset to test
+ * \return true if handleable, false otherwise
+ */
+bool hubbub_iconv_codec_handles_charset(const char *charset)
+{
+	iconv_t probe = iconv_open("UCS-4", charset);
+
+	if (probe == (iconv_t) -1)
+		return false;
+
+	iconv_close(probe);
+
+	return true;
+}
+
+/**
+ * Create an iconv-based codec
+ *
+ * Two iconv descriptors are opened, one per direction. If either cannot
+ * be opened, everything acquired so far is released again.
+ *
+ * \param charset  The charset to read from / write to
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ */
+hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_iconv_codec *c = alloc(NULL, sizeof(hubbub_iconv_codec), pw);
+
+	if (c == NULL)
+		return NULL;
+
+	c->read_cd = iconv_open("UCS-4", charset);
+	if (c->read_cd == (iconv_t) -1)
+		goto free_codec;
+
+	c->write_cd = iconv_open(charset, "UCS-4");
+	if (c->write_cd == (iconv_t) -1)
+		goto close_read;
+
+	/* All carry-over buffers start out empty */
+	c->inval_buf[0] = '\0';
+	c->inval_len = 0;
+
+	c->read_buf[0] = 0;
+	c->read_len = 0;
+
+	c->write_buf[0] = 0;
+	c->write_len = 0;
+
+	/* Finally, populate vtable */
+	c->base.handler.destroy = hubbub_iconv_codec_destroy;
+	c->base.handler.encode = hubbub_iconv_codec_encode;
+	c->base.handler.decode = hubbub_iconv_codec_decode;
+	c->base.handler.reset = hubbub_iconv_codec_reset;
+
+	return (hubbub_charsetcodec *) c;
+
+close_read:
+	iconv_close(c->read_cd);
+free_codec:
+	alloc(c, 0, pw);
+
+	return NULL;
+}
+
+/**
+ * Destroy an iconv-based codec
+ *
+ * Only the two iconv descriptors are closed here; the codec structure
+ * itself is not freed by this function.
+ *
+ * \param codec  The codec to destroy
+ */
+void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec)
+{
+	hubbub_iconv_codec *self = (hubbub_iconv_codec *) codec;
+
+	iconv_close(self->read_cd);
+	iconv_close(self->write_cd);
+}
+
+/**
+ * Encode a chunk of UCS4 data into an iconv-based codec's charset
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_NOMEM if output buffer is too small,
+ *         HUBBUB_INVALID if a character cannot be represented and the
+ *         codec's error handling mode is set to STRICT,
+ *         <any_other_error> as a result of the failure of the
+ *         client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call. This buffered data is post-filtering,
+ * so will not be refiltered on the next call.
+ *
+ * In the case of the filter function failing, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the encoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately before the location pointed to by
+ * ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * Source data is consumed in whole 4-byte UCS4 units. A trailing fragment
+ * of fewer than 4 bytes is left unread, with ::sourcelen reporting it.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec;
+	hubbub_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+
+		while (c->write_len > 0) {
+			error = hubbub_iconv_codec_write_char(c, pwrite[0],
+					dest, destlen);
+			if (error != HUBBUB_OK) {
+				/* Drop an invalid character, so as to
+				 * avoid reprocessing it next time */
+				if (error == HUBBUB_INVALID) {
+					pwrite++;
+					c->write_len--;
+				}
+
+				/* Whatever is still pending must start at
+				 * write_buf[0], because that is where the
+				 * next call begins reading */
+				memmove(c->write_buf, pwrite, c->write_len *
+						sizeof(c->write_buf[0]));
+
+				return error;
+			}
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call. Only whole UCS4
+	 * characters are consumed, so a short trailing fragment can neither
+	 * be over-read nor make *sourcelen wrap around. */
+	while (*sourcelen >= 4) {
+		uint32_t in;
+		const uint32_t *towrite = &in;
+		size_t towritelen = 1;
+		size_t i;
+
+		/* *source is a byte pointer with no alignment guarantee,
+		 * so copy the character out rather than dereferencing it
+		 * as a uint32_t */
+		memcpy(&in, *source, sizeof(in));
+
+		/* Run character we're about to output through the
+		 * registered filter, so it can replace it, if it sees
+		 * fit to do so */
+		if (c->base.filter != NULL) {
+			uint32_t *replacement;
+
+			error = c->base.filter(ntohl(in),
+					&replacement, &towritelen,
+					c->base.filter_pw);
+			if (error != HUBBUB_OK) {
+				/* Don't eat character -- filter failed,
+				 * so nothing gets written or buffered.
+				 * It's up to the client to ensure that
+				 * the filter works in the case where it
+				 * reprocesses this character after the
+				 * fault is fixed up. */
+
+				return error;
+			}
+
+			/* Convert filter output to big endian UCS4 */
+			for (i = 0; i < towritelen; i++) {
+				replacement[i] = htonl(replacement[i]);
+			}
+
+			towrite = (const uint32_t *) replacement;
+		}
+
+		/* Output current character(s) */
+		while (towritelen > 0) {
+			error = hubbub_iconv_codec_write_char(c, towrite[0],
+					dest, destlen);
+
+			if (error != HUBBUB_OK) {
+				/* An invalid character is dropped, so it's
+				 * not reprocessed; on any other error the
+				 * failed character is retried next call */
+				size_t skip = (error == HUBBUB_INVALID) ? 1 : 0;
+				size_t pending = towritelen - skip;
+
+				if (pending > WRITE_BUFSIZE)
+					abort();
+
+				/* Copy pending chars to the start of the
+				 * save area, for processing next call */
+				for (i = 0; i < pending; i++) {
+					c->write_buf[i] = towrite[skip + i];
+				}
+
+				c->write_len = pending;
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return error;
+			}
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Decode a chunk of data in an iconv-based codec's charset into UCS4
+ *
+ * \param codec The codec to use
+ * \param source Pointer to pointer to source data
+ * \param sourcelen Pointer to length (in bytes) of source data
+ * \param dest Pointer to pointer to output buffer
+ * \param destlen Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success,
+ * HUBBUB_NOMEM if output buffer is too small,
+ * HUBBUB_INVALID if a character cannot be represented and the
+ * codec's error handling mode is set to STRICT,
+ * <any_other_error> as a result of the failure of the
+ * client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ * This buffered data is post-filtering, so will not be refiltered on the
+ * next call.
+ *
+ * In the case of the result being _INVALID or the filter function failing,
+ * ::source will point _at_ the last input character read; nothing will be
+ * written or buffered for the failed character. It is up to the client to
+ * fix the cause of the failure and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately at or before the location pointed
+ * to by ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen)
+{
+ hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec;
+ hubbub_error error;
+
+ /* Stage 1: flush output that did not fit on the previous call */
+ if (c->read_len > 0) {
+ /* Output left over from last decode
+ * Attempt to finish this here */
+ uint32_t *pread = c->read_buf;
+
+ /* Each iteration requires room for everything still pending,
+ * so the buffered characters are emitted all together or not
+ * at all. They are copied verbatim (no byte swapping here). */
+ while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+ *((uint32_t *) (void *) *dest) = pread[0];
+
+ *dest += 4;
+ *destlen -= 4;
+
+ pread++;
+ c->read_len--;
+ }
+
+ if (*destlen < c->read_len * 4) {
+ /* Run out of output buffer */
+ size_t i;
+
+ /* Shuffle remaining output down */
+ for (i = 0; i < c->read_len; i++) {
+ c->read_buf[i] = pread[i];
+ }
+
+ return HUBBUB_NOMEM;
+ }
+ }
+
+ /* Stage 2: complete a multibyte sequence split across calls */
+ if (c->inval_len > 0) {
+ /* The last decode ended in an incomplete sequence.
+ * Fill up inval_buf with data from the start of the
+ * new chunk and process it. */
+ uint8_t *in = c->inval_buf;
+ size_t ol = c->inval_len;
+ /* Leave one byte spare in inval_buf for the terminator
+ * that read_char appends when it re-buffers input */
+ size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+ size_t orig_l = l;
+
+ memcpy(c->inval_buf + ol, *source, l);
+
+ /* l is now the total length of old + newly appended bytes */
+ l += c->inval_len;
+
+ error = hubbub_iconv_codec_read_char(c,
+ (const uint8_t **) &in, &l, dest, destlen);
+ if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
+ return error;
+ }
+
+
+ /* And now, fix everything up so the normal processing
+ * does the right thing. */
+ /* On return l is the count of bytes read_char left unread, so
+ * (orig_l - l) is how many of the newly appended source bytes
+ * were used; it is clamped at zero via the signed cast. */
+ *source += max((signed) (orig_l - l), 0);
+ *sourcelen -= max((signed) (orig_l - l), 0);
+
+ /* Failed to resolve an incomplete character and
+ * ran out of buffer space. No recovery strategy
+ * possible, so explode everywhere. */
+ if ((orig_l + ol) - l == 0)
+ abort();
+
+ /* Handle memory exhaustion case from above */
+ if (error != HUBBUB_OK)
+ return error;
+ }
+
+ /* Stage 3: normal case; decode one character at a time until the
+ * input is exhausted or read_char reports anything other than OK */
+ while (*sourcelen > 0) {
+ error = hubbub_iconv_codec_read_char(c,
+ source, sourcelen, dest, destlen);
+ if (error != HUBBUB_OK) {
+ return error;
+ }
+ }
+
+ return HUBBUB_OK;
+}
+
+/**
+ * Clear an iconv-based codec's encoding state
+ *
+ * \param codec The codec to reset
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec)
+{
+	hubbub_iconv_codec *self = (hubbub_iconv_codec *) codec;
+
+	/* Forget any partial input sequence and any buffered output */
+	self->inval_len = 0;
+	self->read_len = 0;
+	self->write_len = 0;
+
+	self->inval_buf[0] = '\0';
+	self->read_buf[0] = 0;
+	self->write_buf[0] = 0;
+
+	/* An all-NULL iconv() call puts a conversion descriptor back
+	 * into its initial state; do so for both directions. */
+	iconv(self->read_cd, NULL, NULL, NULL, NULL);
+	iconv(self->write_cd, NULL, NULL, NULL, NULL);
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Feed a UCS4 character through the registered filter and output the result
+ *
+ * \param c Codec to use
+ * \param ucs4 UCS4 character (big endian)
+ * \param dest Pointer to pointer to output buffer
+ * \param destlen Pointer to output buffer length
+ * \return HUBBUB_OK on success,
+ * HUBBUB_NOMEM if output buffer is too small,
+ * <any_other_error> as a result of the failure of the
+ * client-provided filter function.
+ */
+hubbub_error hubbub_iconv_codec_filter_decoded_char(hubbub_iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	uint32_t *repl;
+	size_t nrepl;
+	size_t idx;
+	hubbub_error err;
+
+	/* No filter registered: the character passes through untouched
+	 * (it is already big endian, so no byte swapping is needed). */
+	if (c->base.filter == NULL) {
+		if (*destlen < 4) {
+			/* No room; park it for the next decode call */
+			c->read_buf[0] = ucs4;
+			c->read_len = 1;
+
+			return HUBBUB_NOMEM;
+		}
+
+		*((uint32_t *) (void *) *dest) = ucs4;
+		*dest += 4;
+		*destlen -= 4;
+
+		return HUBBUB_OK;
+	}
+
+	/* The filter works in host byte order */
+	err = c->base.filter(ntohl(ucs4), &repl, &nrepl, c->base.filter_pw);
+	if (err != HUBBUB_OK)
+		return err;
+
+	/* The replacement sequence is emitted as a unit: if it does not
+	 * fit in its entirety, buffer the whole lot (converted back to
+	 * big endian) and report exhaustion.
+	 * NOTE(review): nrepl is not checked against the capacity of
+	 * read_buf before copying -- confirm filters cannot exceed it. */
+	if (*destlen < nrepl * 4) {
+		c->read_len = nrepl;
+
+		for (idx = 0; idx < nrepl; idx++)
+			c->read_buf[idx] = htonl(repl[idx]);
+
+		return HUBBUB_NOMEM;
+	}
+
+	for (idx = 0; idx < nrepl; idx++) {
+		*((uint32_t *) (void *) *dest) = htonl(repl[idx]);
+
+		*dest += 4;
+		*destlen -= 4;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Detect if a codec's charset is Unicode capable
+ *
+ * \param c Codec to consider
+ * \return true if a Unicode variant, false otherwise
+ */
+bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c)
+{
+	/* MIB enums of the Unicode charsets, resolved on first use.
+	 * Slot 0 (UCS-4) doubles as the "not yet looked up" marker. */
+	static uint16_t unicode_mibs[9];
+	size_t i;
+
+	if (unicode_mibs[0] == 0) {
+		unicode_mibs[0] = hubbub_mibenum_from_name("UCS-4",
+				SLEN("UCS-4"));
+		unicode_mibs[1] = hubbub_mibenum_from_name("UCS-2",
+				SLEN("UCS-2"));
+		unicode_mibs[2] = hubbub_mibenum_from_name("UTF-8",
+				SLEN("UTF-8"));
+		unicode_mibs[3] = hubbub_mibenum_from_name("UTF-16",
+				SLEN("UTF-16"));
+		unicode_mibs[4] = hubbub_mibenum_from_name("UTF-16BE",
+				SLEN("UTF-16BE"));
+		unicode_mibs[5] = hubbub_mibenum_from_name("UTF-16LE",
+				SLEN("UTF-16LE"));
+		unicode_mibs[6] = hubbub_mibenum_from_name("UTF-32",
+				SLEN("UTF-32"));
+		unicode_mibs[7] = hubbub_mibenum_from_name("UTF-32BE",
+				SLEN("UTF-32BE"));
+		unicode_mibs[8] = hubbub_mibenum_from_name("UTF-32LE",
+				SLEN("UTF-32LE"));
+	}
+
+	/* Unicode capable iff the codec's MIB enum matches any entry */
+	for (i = 0; i < sizeof(unicode_mibs) / sizeof(unicode_mibs[0]); i++) {
+		if (c->base.mibenum == unicode_mibs[i])
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * Read a character from the codec's native charset to UCS4 (big endian)
+ *
+ * \param c The codec
+ * \param source Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen Pointer to length of source buffer (updated on exit)
+ * \param dest Pointer to pointer to output buffer (updated on exit)
+ * \param destlen Pointer to length of output buffer (updated on exit)
+ * \return HUBBUB_OK on success,
+ * HUBBUB_NOMEM if output buffer is too small,
+ * HUBBUB_INVALID if a character cannot be represented and the
+ * codec's error handling mode is set to STRICT,
+ * <any_other_error> as a result of the failure of the
+ * client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ * This buffered data is post-filtering, so will not be refiltered on the
+ * next call.
+ *
+ * In the case of the result being _INVALID or the filter function failing,
+ * ::source will point _at_ the last input character read; nothing will be
+ * written or buffered for the failed character. It is up to the client to
+ * fix the cause of the failure and retry the decoding process.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	size_t iconv_ret;
+	const uint8_t *origsrc = *source;
+	size_t origsrclen = *sourcelen;
+	uint32_t ucs4;
+	uint8_t *pucs4 = (uint8_t *) &ucs4;
+	size_t sucs4 = 4;
+	hubbub_error error;
+
+	/* Use iconv to convert a single character
+	 * Side effect: Updates *source to point at next input
+	 * character and *sourcelen to reflect reduced input length
+	 *
+	 * The output area is exactly one UCS4 unit (4 bytes), which is
+	 * what limits the conversion to a single character.
+	 */
+	iconv_ret = iconv(c->read_cd, (char **) source, sourcelen,
+			(char **) (void *) &pucs4, &sucs4);
+
+	/* Success is either a clean return, or a failure that nonetheless
+	 * consumed input and filled the 4-byte output area completely. */
+	if (iconv_ret != (size_t) -1 ||
+			(*source != origsrc && sucs4 == 0)) {
+		/* Read a character */
+		error = hubbub_iconv_codec_filter_decoded_char(c,
+				ucs4, dest, destlen);
+		if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
+			/* filter function failed; restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (errno == E2BIG) {
+		/* Should never happen */
+		abort();
+	} else if (errno == EINVAL) {
+		/* Incomplete input sequence */
+
+		/* The pending bytes are stored together with a trailing
+		 * NUL, so at most INVAL_BUFSIZE - 1 of them fit. Using
+		 * ">" here would let a length of exactly INVAL_BUFSIZE
+		 * through and write the terminator one past the end of
+		 * inval_buf. */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		/* memmove, as *source may itself point into inval_buf
+		 * when called to complete a previously split sequence */
+		memmove(c->inval_buf, (const char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		/* Claim everything we've buffered */
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return HUBBUB_OK;
+	} else if (errno == EILSEQ) {
+		/* Illegal input sequence */
+		bool found = false;
+		const uint8_t *oldsrc;
+		size_t oldsrclen;
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) {
+			/* restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+
+			return HUBBUB_INVALID;
+		}
+
+		/* Ok, this becomes problematic. The iconv API here
+		 * is particularly unhelpful; *source will point at
+		 * the _start_ of the illegal sequence. This means
+		 * that we must find the end of the sequence */
+
+		/* Search for the start of the next valid input
+		 * sequence (or the end of the input stream) */
+		while (*sourcelen > 1) {
+			pucs4 = (uint8_t *) &ucs4;
+			sucs4 = 4;
+
+			/* Step over one byte and probe from there */
+			(*source)++;
+			(*sourcelen)--;
+
+			oldsrc = *source;
+			oldsrclen = *sourcelen;
+
+			iconv_ret = iconv(c->read_cd,
+					(char **) source, sourcelen,
+					(char **) (void *) &pucs4, &sucs4);
+			if (iconv_ret != (size_t) -1 || errno != EILSEQ) {
+				/* Anything other than "illegal sequence"
+				 * marks a plausible resync point */
+				found = true;
+				break;
+			}
+		}
+
+		if (found) {
+			/* Found start of next valid sequence; rewind to
+			 * it so the caller decodes it normally (the
+			 * probe's output is discarded). */
+			*source = oldsrc;
+			*sourcelen = oldsrclen;
+		} else {
+			/* Not found - skip last byte in buffer */
+			(*source)++;
+			(*sourcelen)--;
+
+			if (*sourcelen != 0)
+				abort();
+		}
+
+		/* output U+FFFD and continue processing. */
+		error = hubbub_iconv_codec_filter_decoded_char(c,
+				htonl(0xFFFD), dest, destlen);
+		if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
+			/* filter function failed; restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+		}
+
+		return error;
+	}
+
+	/* iconv failed with an errno we don't specifically handle */
+	return HUBBUB_OK;
+}
+
+/**
+ * Write a UCS4 character in a codec's native charset
+ *
+ * \param c The codec
+ * \param ucs4 The UCS4 character to write (big endian)
+ * \param dest Pointer to pointer to output buffer (updated on exit)
+ * \param destlen Pointer to length of output buffer (updated on exit)
+ * \return HUBBUB_OK on success,
+ * HUBBUB_NOMEM if output buffer is too small,
+ * HUBBUB_INVALID if character cannot be represented and the
+ * codec's error handling mode is set to STRICT.
+ */
+hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c,
+ uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+ size_t iconv_ret;
+ uint8_t *pucs4 = (uint8_t *) &ucs4;
+ size_t sucs4 = 4;
+ uint8_t *origdest = *dest;
+
+ /* Convert exactly one 4-byte UCS4 unit; iconv advances *dest and
+ * reduces *destlen by however much it writes */
+ iconv_ret = iconv(c->write_cd, (char **) (void *) &pucs4,
+ &sucs4, (char **) dest, destlen);
+
+ if (iconv_ret == (size_t) -1 && errno == E2BIG) {
+ /* Output buffer is too small */
+ return HUBBUB_NOMEM;
+ } else if (iconv_ret == (size_t) -1 && errno == EILSEQ) {
+ /* Illegal multibyte sequence */
+ /* This should never happen */
+ abort();
+ } else if (iconv_ret == (size_t) -1 && errno == EINVAL) {
+ /* Incomplete input character */
+ /* This should never happen */
+ abort();
+ } else if (*dest == origdest) {
+ /* Nothing was output */
+ /* The output pointer did not move, which this code treats as
+ * "character not representable in the target charset" */
+ switch (c->base.errormode) {
+ case HUBBUB_CHARSETCODEC_ERROR_STRICT:
+ return HUBBUB_INVALID;
+
+ case HUBBUB_CHARSETCODEC_ERROR_TRANSLIT:
+ /** \todo transliteration */
+ /* Until implemented, deliberately falls through to LOOSE */
+ case HUBBUB_CHARSETCODEC_ERROR_LOOSE:
+ {
+ pucs4 = (uint8_t *) &ucs4;
+ sucs4 = 4;
+
+ /* Substitute U+FFFD for Unicode charsets, '?' (0x3F)
+ * otherwise, and try the write again */
+ ucs4 = hubbub_iconv_codec_is_unicode(c)
+ ? htonl(0xFFFD) : htonl(0x3F);
+
+ iconv_ret = iconv(c->write_cd,
+ (char **) (void *) &pucs4, &sucs4,
+ (char **) dest, destlen);
+
+ if (iconv_ret == (size_t) -1 && errno == E2BIG) {
+ return HUBBUB_NOMEM;
+ } else if (iconv_ret == (size_t) -1 &&
+ errno == EILSEQ) {
+ /* Illegal multibyte sequence */
+ /* This should never happen */
+ abort();
+ } else if (iconv_ret == (size_t) -1 &&
+ errno == EINVAL) {
+ /* Incomplete input character */
+ /* This should never happen */
+ abort();
+ }
+ }
+ break;
+ }
+ }
+
+ return HUBBUB_OK;
+}
+
+/* Factory entry for the iconv-based codec */
+const hubbub_charsethandler hubbub_iconv_codec_handler = {
+	.handles_charset = hubbub_iconv_codec_handles_charset,
+	.create = hubbub_iconv_codec_create
+};
diff --git a/src/charset/codec_impl.h b/src/charset/codec_impl.h
new file mode 100644
index 0000000..eb5116b
--- /dev/null
+++ b/src/charset/codec_impl.h
@@ -0,0 +1,51 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_charset_codecimpl_h_
+#define hubbub_charset_codecimpl_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include "codec.h"
+
+/**
+ * Core charset codec definition; implementations extend this
+ */
+struct hubbub_charsetcodec {
+ uint16_t mibenum; /**< MIB enum for charset */
+
+ hubbub_charsetcodec_filter filter; /**< filter function */
+ void *filter_pw; /**< filter private word */
+
+ hubbub_charsetcodec_errormode errormode; /**< error mode */
+
+ hubbub_alloc alloc; /**< allocation function */
+ void *alloc_pw; /**< private word */
+
+ /* Implementation entry points. Each concrete codec fills these in
+ * from its create function; encode and decode share a signature
+ * taking in/out source and destination buffer pointers and lengths. */
+ struct {
+ void (*destroy)(hubbub_charsetcodec *codec);
+ hubbub_error (*encode)(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+ hubbub_error (*decode)(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+ hubbub_error (*reset)(hubbub_charsetcodec *codec);
+ } handler; /**< Vtable for handler code */
+};
+
+/**
+ * Codec factory component definition
+ */
+typedef struct hubbub_charsethandler {
+ /** Test whether this implementation handles the named charset */
+ bool (*handles_charset)(const char *charset);
+ /** Create a codec for the named charset using the given allocator
+ * and private word; yields a codec pointer, or NULL on failure */
+ hubbub_charsetcodec *(*create)(const char *charset,
+ hubbub_alloc alloc, void *pw);
+} hubbub_charsethandler;
+
+#endif
diff --git a/src/charset/codec_utf8.c b/src/charset/codec_utf8.c
new file mode 100644
index 0000000..86d667f
--- /dev/null
+++ b/src/charset/codec_utf8.c
@@ -0,0 +1,620 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include "charset/aliases.h"
+#include "utils/utf8.h"
+#include "utils/utils.h"
+
+#include "codec_impl.h"
+
+/**
+ * UTF-8 charset codec
+ */
+typedef struct hubbub_utf8_codec {
+ hubbub_charsetcodec base; /**< Base class */
+
+#define INVAL_BUFSIZE (32)
+ uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up
+ * incomplete input
+ * sequences */
+ size_t inval_len; /**< Byte length of inval_buf */
+
+#define READ_BUFSIZE (8)
+ uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
+ * output sequences (decode)
+ * (host-endian) */
+ size_t read_len; /**< Character length of read_buf */
+
+#define WRITE_BUFSIZE (8)
+ uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
+ * output sequences (encode)
+ * (host-endian) */
+ size_t write_len; /**< Character length of write_buf */
+
+} hubbub_utf8_codec;
+
+/* Forward declarations of this file's static codec entry points and
+ * decode helpers */
+static bool hubbub_utf8_codec_handles_charset(const char *charset);
+static hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset,
+ hubbub_alloc alloc, void *pw);
+static void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec);
+static hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec);
+static hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_utf8_codec_filter_decoded_char(
+ hubbub_utf8_codec *c,
+ uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Determine whether this codec handles a specific charset
+ *
+ * \param charset Charset to test
+ * \return true if handleable, false otherwise
+ */
+bool hubbub_utf8_codec_handles_charset(const char *charset)
+{
+	/* Compare by MIB enum so that any alias of UTF-8 is accepted */
+	const uint16_t requested =
+			hubbub_mibenum_from_name(charset, strlen(charset));
+	const uint16_t utf8 =
+			hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"));
+
+	return requested == utf8;
+}
+
+/**
+ * Create a utf8 codec
+ *
+ * \param charset The charset to read from / write to
+ * \param alloc Memory (de)allocation function
+ * \param pw Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ */
+hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_utf8_codec *self;
+
+	/* This codec only ever handles UTF-8, so the name is irrelevant */
+	UNUSED(charset);
+
+	self = alloc(NULL, sizeof *self, pw);
+	if (self == NULL)
+		return NULL;
+
+	/* Hook up the implementation's entry points */
+	self->base.handler.destroy = hubbub_utf8_codec_destroy;
+	self->base.handler.encode = hubbub_utf8_codec_encode;
+	self->base.handler.decode = hubbub_utf8_codec_decode;
+	self->base.handler.reset = hubbub_utf8_codec_reset;
+
+	/* Start with all three carry-over buffers empty */
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	return &self->base;
+}
+
+/**
+ * Destroy a utf8 codec
+ *
+ * \param codec The codec to destroy
+ */
+void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec)
+{
+ /* Nothing to release here: this function performs no work beyond
+ * marking its parameter as used. */
+ UNUSED(codec);
+}
+
+/**
+ * Encode a chunk of UCS4 data into utf8
+ *
+ * \param codec The codec to use
+ * \param source Pointer to pointer to source data
+ * \param sourcelen Pointer to length (in bytes) of source data
+ * \param dest Pointer to pointer to output buffer
+ * \param destlen Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success,
+ * HUBBUB_NOMEM if output buffer is too small,
+ * HUBBUB_INVALID if a character cannot be represented and the
+ * codec's error handling mode is set to STRICT,
+ * <any_other_error> as a result of the failure of the
+ * client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call. This buffered data is post-filtering,
+ * so will not be refiltered on the next call.
+ *
+ * In the case of the filter function failing, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the encoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately before the location pointed to by
+ * ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec;
+	hubbub_error err;
+
+	/* First drain anything the previous call could not write */
+	if (c->write_len > 0) {
+		const size_t pending = c->write_len;
+		size_t done;
+
+		for (done = 0; done < pending; done++) {
+			uint8_t utf8[6];
+			size_t nbytes;
+
+			err = hubbub_utf8_from_ucs4(c->write_buf[done],
+					utf8, &nbytes);
+			if (err != HUBBUB_OK)
+				abort();
+
+			if (*destlen < nbytes) {
+				/* Still no room: move the unwritten tail
+				 * to the front of the save area */
+				const size_t left = pending - done;
+
+				memmove(c->write_buf, c->write_buf + done,
+						left * sizeof(c->write_buf[0]));
+				c->write_len = left;
+
+				return HUBBUB_NOMEM;
+			}
+
+			memcpy(*dest, utf8, nbytes);
+
+			*dest += nbytes;
+			*destlen -= nbytes;
+		}
+
+		c->write_len = 0;
+	}
+
+	/* Then consume the input, one big-endian UCS4 unit at a time */
+	while (*sourcelen > 0) {
+		uint32_t cp = ntohl(*((uint32_t *) (void *) *source));
+		uint32_t *out = &cp;
+		size_t nout = 1;
+		size_t k;
+
+		/* Give the registered filter the chance to substitute
+		 * its own sequence for this character */
+		if (c->base.filter != NULL) {
+			err = c->base.filter(cp, &out, &nout,
+					c->base.filter_pw);
+			if (err != HUBBUB_OK)
+				return err;
+		}
+
+		for (k = 0; k < nout; k++) {
+			uint8_t utf8[6];
+			size_t nbytes;
+
+			err = hubbub_utf8_from_ucs4(out[k], utf8, &nbytes);
+			if (err != HUBBUB_OK)
+				abort();
+
+			if (*destlen < nbytes) {
+				/* Out of space: stash what is left of this
+				 * character's output for the next call */
+				const size_t left = nout - k;
+				size_t j;
+
+				if (left >= WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = left;
+
+				for (j = 0; j < left; j++)
+					c->write_buf[j] = out[k + j];
+
+				/* The source character now lives in the
+				 * save area, so mark it as consumed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return HUBBUB_NOMEM;
+			}
+
+			memcpy(*dest, utf8, nbytes);
+
+			*dest += nbytes;
+			*destlen -= nbytes;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Decode a chunk of utf8 data into UCS4
+ *
+ * \param codec The codec to use
+ * \param source Pointer to pointer to source data
+ * \param sourcelen Pointer to length (in bytes) of source data
+ * \param dest Pointer to pointer to output buffer
+ * \param destlen Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success,
+ * HUBBUB_NOMEM if output buffer is too small,
+ * HUBBUB_INVALID if a character cannot be represented and the
+ * codec's error handling mode is set to STRICT,
+ * <any_other_error> as a result of the failure of the
+ * client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ * This buffered data is post-filtering, so will not be refiltered on the
+ * next call.
+ *
+ * In the case of the result being _INVALID or the filter function failing,
+ * ::source will point _at_ the last input character read; nothing will be
+ * written or buffered for the failed character. It is up to the client to
+ * fix the cause of the failure and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately at or before the location pointed
+ * to by ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen)
+{
+ hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec;
+ hubbub_error error;
+
+ /* Stage 1: flush output that did not fit on the previous call */
+ if (c->read_len > 0) {
+ /* Output left over from last decode */
+ uint32_t *pread = c->read_buf;
+
+ /* Each iteration requires room for everything still pending,
+ * so the buffered characters are emitted all together or not
+ * at all. read_buf is host-endian, hence the htonl. */
+ while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+ *((uint32_t *) (void *) *dest) = htonl(pread[0]);
+
+ *dest += 4;
+ *destlen -= 4;
+
+ pread++;
+ c->read_len--;
+ }
+
+ if (*destlen < c->read_len * 4) {
+ /* Ran out of output buffer */
+ size_t i;
+
+ /* Shuffle remaining output down */
+ for (i = 0; i < c->read_len; i++)
+ c->read_buf[i] = pread[i];
+
+ return HUBBUB_NOMEM;
+ }
+ }
+
+ /* Stage 2: complete a sequence that was split across calls */
+ if (c->inval_len > 0) {
+ /* The last decode ended in an incomplete sequence.
+ * Fill up inval_buf with data from the start of the
+ * new chunk and process it. */
+ uint8_t *in = c->inval_buf;
+ size_t ol = c->inval_len;
+ /* Leave one byte spare in inval_buf for the terminator
+ * that read_char appends when it re-buffers input */
+ size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+ size_t orig_l = l;
+
+ memcpy(c->inval_buf + ol, *source, l);
+
+ /* l is now the total length of old + newly appended bytes */
+ l += c->inval_len;
+
+ error = hubbub_utf8_codec_read_char(c,
+ (const uint8_t **) &in, &l, dest, destlen);
+ if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
+ return error;
+ }
+
+ /* And now, fix up source pointers */
+ /* On return l is the count of bytes read_char left unread, so
+ * (orig_l - l) is how many of the newly appended source bytes
+ * were used; it is clamped at zero via the signed cast. */
+ *source += max((signed) (orig_l - l), 0);
+ *sourcelen -= max((signed) (orig_l - l), 0);
+
+ /* Failed to resolve an incomplete character and
+ * ran out of buffer space. No recovery strategy
+ * possible, so explode everywhere. */
+ if ((orig_l + ol) - l == 0)
+ abort();
+
+ /* Report memory exhaustion case from above */
+ if (error != HUBBUB_OK)
+ return error;
+ }
+
+ /* Finally, the "normal" case; process all outstanding characters */
+ while (*sourcelen > 0) {
+ error = hubbub_utf8_codec_read_char(c,
+ source, sourcelen, dest, destlen);
+ if (error != HUBBUB_OK) {
+ return error;
+ }
+ }
+
+ return HUBBUB_OK;
+}
+
+/**
+ * Clear a utf8 codec's encoding state
+ *
+ * \param codec The codec to reset
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec)
+{
+	hubbub_utf8_codec *self = (hubbub_utf8_codec *) codec;
+
+	/* Discard any partial input sequence and any buffered output
+	 * carried over from earlier encode / decode calls */
+	self->inval_len = 0;
+	self->read_len = 0;
+	self->write_len = 0;
+
+	self->inval_buf[0] = '\0';
+	self->read_buf[0] = 0;
+	self->write_buf[0] = 0;
+
+	return HUBBUB_OK;
+}
+
+
+/**
+ * Read a character from the UTF-8 to UCS4 (big endian)
+ *
+ * \param c The codec
+ * \param source Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen Pointer to length of source buffer (updated on exit)
+ * \param dest Pointer to pointer to output buffer (updated on exit)
+ * \param destlen Pointer to length of output buffer (updated on exit)
+ * \return HUBBUB_OK on success,
+ * HUBBUB_NOMEM if output buffer is too small,
+ * HUBBUB_INVALID if a character cannot be represented and the
+ * codec's error handling mode is set to STRICT,
+ * <any_other_error> as a result of the failure of the
+ * client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ * This buffered data is post-filtering, so will not be refiltered on the
+ * next call.
+ *
+ * In the case of the result being _INVALID or the filter function failing,
+ * ::source will point _at_ the last input character read; nothing will be
+ * written or buffered for the failed character. It is up to the client to
+ * fix the cause of the failure and retry the decoding process.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	uint32_t ucs4;
+	size_t sucs4;
+	hubbub_error error;
+
+	/* Convert a single character */
+	error = hubbub_utf8_to_ucs4(*source, *sourcelen, &ucs4, &sucs4);
+	if (error == HUBBUB_OK) {
+		/* Read a character */
+		error = hubbub_utf8_codec_filter_decoded_char(c,
+				ucs4, dest, destlen);
+		if (error == HUBBUB_OK || error == HUBBUB_NOMEM) {
+			/* filter function succeeded; update source pointers */
+			*source += sucs4;
+			*sourcelen -= sucs4;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (error == HUBBUB_NEEDDATA) {
+		/* Incomplete input sequence */
+
+		/* The pending bytes are stored together with a trailing
+		 * NUL, so at most INVAL_BUFSIZE - 1 of them fit. Using
+		 * ">" here would let a length of exactly INVAL_BUFSIZE
+		 * through and write the terminator one past the end of
+		 * inval_buf. */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		/* memmove, as *source may itself point into inval_buf
+		 * when called to complete a previously split sequence */
+		memmove(c->inval_buf, (char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		/* Claim everything we've buffered */
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return HUBBUB_OK;
+	} else if (error == HUBBUB_INVALID) {
+		/* Illegal input sequence */
+		uint32_t nextchar;
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* Strict errormode; simply flag invalid character.
+		 * The source pointers have not been touched. */
+		if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) {
+			return HUBBUB_INVALID;
+		}
+
+		/* Find next valid UTF-8 sequence.
+		 * We're processing client-provided data, so let's
+		 * be paranoid about its validity. */
+		error = hubbub_utf8_next_paranoid(*source, *sourcelen,
+				0, &nextchar);
+		if (error != HUBBUB_OK) {
+			if (error == HUBBUB_NEEDDATA) {
+				/* Need more data to be sure */
+
+				/* Same capacity rule as above: leave
+				 * room for the trailing NUL */
+				if (*sourcelen >= INVAL_BUFSIZE)
+					abort();
+
+				memmove(c->inval_buf, (char *) *source,
+						*sourcelen);
+				c->inval_buf[*sourcelen] = '\0';
+				c->inval_len = *sourcelen;
+
+				*source += *sourcelen;
+				*sourcelen = 0;
+
+				/* Input already claimed, so there is
+				 * nothing further to skip below */
+				nextchar = 0;
+			} else {
+				return error;
+			}
+		}
+
+		/* output U+FFFD and continue processing. */
+		error = hubbub_utf8_codec_filter_decoded_char(c,
+				0xFFFD, dest, destlen);
+		if (error == HUBBUB_OK || error == HUBBUB_NOMEM) {
+			/* filter function succeeded; update source pointers */
+			*source += nextchar;
+			*sourcelen -= nextchar;
+		}
+
+		return error;
+	}
+
+	/* Any other result from the UTF-8 converter is not treated
+	 * as an error here */
+	return HUBBUB_OK;
+}
+
+/**
+ * Feed a UCS4 character through the registered filter and output the result
+ *
+ * \param c Codec to use
+ * \param ucs4 UCS4 character (host endian)
+ * \param dest Pointer to pointer to output buffer
+ * \param destlen Pointer to output buffer length
+ * \return HUBBUB_OK on success,
+ * HUBBUB_NOMEM if output buffer is too small,
+ * <any_other_error> as a result of the failure of the
+ * client-provided filter function.
+ */
+hubbub_error hubbub_utf8_codec_filter_decoded_char(hubbub_utf8_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	if (c->base.filter != NULL) {
+		uint32_t *rep;
+		size_t replen;
+		hubbub_error error;
+
+		/* Let the filter substitute its own (host-endian)
+		 * sequence for this character */
+		error = c->base.filter(ucs4, &rep, &replen,
+				c->base.filter_pw);
+		if (error != HUBBUB_OK) {
+			return error;
+		}
+
+		/* Each iteration requires room for the whole remainder,
+		 * so the replacement is written in full or not at all */
+		while (replen > 0 && *destlen >= replen * 4) {
+			*((uint32_t *) (void *) *dest) = htonl(*rep);
+
+			*dest += 4;
+			*destlen -= 4;
+
+			rep++;
+			replen--;
+		}
+
+		if (*destlen < replen * 4) {
+			/* Run out of output buffer */
+			size_t i;
+
+			/* The save area holds READ_BUFSIZE characters;
+			 * a longer replacement would overrun it. Bail
+			 * out, as the encode path does for write_buf. */
+			if (replen > READ_BUFSIZE)
+				abort();
+
+			/* Buffer remaining output (kept host-endian;
+			 * decode converts when flushing) */
+			c->read_len = replen;
+
+			for (i = 0; i < replen; i++) {
+				c->read_buf[i] = rep[i];
+			}
+
+			return HUBBUB_NOMEM;
+		}
+
+	} else {
+		if (*destlen < 4) {
+			/* Run out of output buffer */
+			c->read_len = 1;
+			c->read_buf[0] = ucs4;
+
+			return HUBBUB_NOMEM;
+		}
+
+		/* Output is big endian UCS4 */
+		*((uint32_t *) (void *) *dest) = htonl(ucs4);
+		*dest += 4;
+		*destlen -= 4;
+	}
+
+	return HUBBUB_OK;
+}
+
+
+/* Factory entry for the native UTF-8 codec */
+const hubbub_charsethandler hubbub_utf8_codec_handler = {
+	.handles_charset = hubbub_utf8_codec_handles_charset,
+	.create = hubbub_utf8_codec_create
+};
diff --git a/src/charset/detect.c b/src/charset/detect.c
new file mode 100644
index 0000000..8ff3b87
--- /dev/null
+++ b/src/charset/detect.c
@@ -0,0 +1,673 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "utils/utils.h"
+
+#include "detect.h"
+
+/* Forward declarations of the file-local helpers defined later in this
+ * file. */
+static uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len);
+static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len);
+static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
+ const uint8_t *end);
+static uint16_t hubbub_charset_parse_content(const uint8_t *value,
+ uint32_t valuelen);
+static bool hubbub_charset_get_attribute(const uint8_t **data,
+ const uint8_t *end,
+ const uint8_t **name, uint32_t *namelen,
+ const uint8_t **value, uint32_t *valuelen);
+
+/**
+ * Look up the fallback charset
+ *
+ * \return MIB enum of Windows-1252, or of ISO-8859-1 if the Windows-1252
+ *         lookup yields 0
+ */
+static uint16_t hubbub_charset_fallback(void)
+{
+	uint16_t mib = hubbub_mibenum_from_name("Windows-1252",
+			SLEN("Windows-1252"));
+
+	if (mib == 0)
+		mib = hubbub_mibenum_from_name("ISO-8859-1",
+				SLEN("ISO-8859-1"));
+
+	return mib;
+}
+
+/**
+ * Test whether a MIB enum names UTF-16 or UTF-32 (with or without an
+ * explicit byte order)
+ *
+ * \param mib  MIB enum to test
+ * \return true if mib matches one of the UTF-16/UTF-32 names
+ */
+static bool hubbub_charset_is_utf16_or_utf32(uint16_t mib)
+{
+	if (mib == hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16")))
+		return true;
+	if (mib == hubbub_mibenum_from_name("UTF-16LE", SLEN("UTF-16LE")))
+		return true;
+	if (mib == hubbub_mibenum_from_name("UTF-16BE", SLEN("UTF-16BE")))
+		return true;
+	if (mib == hubbub_mibenum_from_name("UTF-32", SLEN("UTF-32")))
+		return true;
+	if (mib == hubbub_mibenum_from_name("UTF-32LE", SLEN("UTF-32LE")))
+		return true;
+	if (mib == hubbub_mibenum_from_name("UTF-32BE", SLEN("UTF-32BE")))
+		return true;
+
+	return false;
+}
+
+/**
+ * Determine the charset of a chunk of document data
+ *
+ * \param data     Pointer to pointer to buffer containing data
+ * \param len      Pointer to buffer length
+ * \param mibenum  Pointer to location to store MIB enum representing charset
+ * \param source   Pointer to location to receive charset source
+ * \return HUBBUB_OK on success, HUBBUB_BADPARM if any argument is NULL
+ *
+ * If a byte order mark is encountered at the start of the buffer, the data
+ * pointer is moved to the first byte after the BOM and the length is
+ * reduced to match.
+ *
+ * The larger a chunk of data fed to this routine, the better, as it allows
+ * charset autodetection access to a larger dataset for analysis.
+ */
+hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len,
+		uint16_t *mibenum, hubbub_charset_source *source)
+{
+	uint16_t found = 0;
+
+	if (data == NULL || *data == NULL || len == NULL ||
+			mibenum == NULL || source == NULL)
+		return HUBBUB_BADPARM;
+
+	/* With fewer than 4 bytes there is nothing to inspect; go straight
+	 * to the default encoding. */
+	if (*len >= 4) {
+		/* A BOM, if present, is conclusive */
+		found = hubbub_charset_read_bom(data, len);
+		if (found != 0) {
+			*mibenum = found;
+			*source = HUBBUB_CHARSET_DOCUMENT;
+
+			return HUBBUB_OK;
+		}
+
+		/* Otherwise, look for a meta charset inside the document */
+		found = hubbub_charset_scan_meta(*data, *len);
+		if (found != 0) {
+			/* ISO-8859-1 is treated as Windows-1252 (falling
+			 * back to ISO-8859-1 if that is unavailable) */
+			if (found == hubbub_mibenum_from_name("ISO-8859-1",
+					SLEN("ISO-8859-1")))
+				found = hubbub_charset_fallback();
+
+			/* A meta charset naming a non-ASCII-compatible
+			 * encoding is not trusted: such a document should
+			 * have carried a BOM (handled above), and the meta
+			 * element was just read with an ASCII-only parser,
+			 * so the document cannot be what it claims. In that
+			 * case the claim is ignored and the decision is left
+			 * to autodetection / the default below. */
+			if (!hubbub_charset_is_utf16_or_utf32(found)) {
+				*mibenum = found;
+				*source = HUBBUB_CHARSET_DOCUMENT;
+
+				return HUBBUB_OK;
+			}
+		}
+
+		/* No usable charset was specified within the document */
+
+		/** \todo Charset autodetection */
+	}
+
+	/* Nothing detected, so use the default fallback */
+	*mibenum = hubbub_charset_fallback();
+	*source = HUBBUB_CHARSET_DEFAULT;
+
+	return HUBBUB_OK;
+}
+
+
+/**
+ * Check whether a buffer starts with a UTF Byte Order Mark
+ *
+ * \param data  Pointer to pointer to buffer containing data
+ * \param len   Pointer to buffer length
+ * \return MIB enum representing encoding described by BOM, or 0 if not found
+ *
+ * When a BOM is recognised, the data pointer is moved to the first byte
+ * after it and the length is reduced by the size of the BOM.
+ */
+uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len)
+{
+	const uint8_t *b;
+	const char *name;
+	size_t namelen;
+	size_t bomlen;
+
+	if (data == NULL || *data == NULL || len == NULL)
+		return 0;
+
+	/* All four leading bytes may be inspected below */
+	if (*len < 4)
+		return 0;
+
+	b = *data;
+
+	/* The 4-byte UTF-32 marks are tested before the 2-byte UTF-16 ones,
+	 * because the UTF-32LE mark begins with the UTF-16LE mark. */
+	if (b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) {
+		name = "UTF-32BE";
+		namelen = SLEN("UTF-32BE");
+		bomlen = 4;
+	} else if (b[0] == 0xFF && b[1] == 0xFE &&
+			b[2] == 0x00 && b[3] == 0x00) {
+		name = "UTF-32LE";
+		namelen = SLEN("UTF-32LE");
+		bomlen = 4;
+	} else if (b[0] == 0xFE && b[1] == 0xFF) {
+		name = "UTF-16BE";
+		namelen = SLEN("UTF-16BE");
+		bomlen = 2;
+	} else if (b[0] == 0xFF && b[1] == 0xFE) {
+		name = "UTF-16LE";
+		namelen = SLEN("UTF-16LE");
+		bomlen = 2;
+	} else if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) {
+		name = "UTF-8";
+		namelen = SLEN("UTF-8");
+		bomlen = 3;
+	} else {
+		return 0;
+	}
+
+	/* Step over the BOM before resolving the charset name */
+	*data += bomlen;
+	*len -= bomlen;
+
+	return hubbub_mibenum_from_name(name, namelen);
+}
+
+/*
+ * Scanning helpers.
+ *
+ * PEEK and ADVANCE are deliberately unhygienic: they read the enclosing
+ * function's `pos` and `end` variables, ADVANCE moves `pos`, and ADVANCE
+ * makes the enclosing function return 0 when no match can be found.
+ *
+ * Bounds are checked by comparing the number of bytes remaining against
+ * the literal's length, rather than by computing `end - SLEN(a)`, which
+ * would form a pointer before the start of a short buffer.
+ */
+
+/* True if the literal `a` occurs (ignoring case) at pos, with at least one
+ * further byte of data following it. */
+#define PEEK(a) \
+	(pos < end && (size_t) (end - pos) > SLEN(a) && \
+	strncasecmp((const char *) pos, (a), SLEN(a)) == 0)
+
+/* Move pos forward to the next occurrence of the literal `a`; return 0
+ * from the enclosing function if too little data remains for a match. */
+#define ADVANCE(a) \
+	do { \
+		for (;;) { \
+			if (pos >= end || \
+					(size_t) (end - pos) <= SLEN(a)) \
+				return 0; \
+			if (strncasecmp((const char *) pos, (a), \
+					SLEN(a)) == 0) \
+				break; \
+			pos++; \
+		} \
+	} while (0)
+
+/* True for tab, LF, VT, FF, CR and space. The argument is evaluated more
+ * than once, so it must not have side effects. */
+#define ISSPACE(a) \
+	((a) == 0x09 || (a) == 0x0a || (a) == 0x0b || \
+	(a) == 0x0c || (a) == 0x0d || (a) == 0x20)
+
+/**
+ * Search for a meta charset within a buffer of data
+ *
+ * \param data Pointer to buffer containing data
+ * \param len Length of buffer in data
+ * \return MIB enum representing encoding, or 0 if none found
+ *
+ * At most the first 512 bytes of the buffer are examined.
+ *
+ * The PEEK and ADVANCE macros used below operate on the local variables
+ * pos and end. ADVANCE contains a return statement, so it can make this
+ * function return 0 directly.
+ *
+ * NOTE(review): the numbered/lettered step comments appear to mirror an
+ * external algorithm description (presumably the HTML5 encoding prescan)
+ * -- confirm against that text.
+ */
+uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len)
+{
+ const uint8_t *pos = data;
+ const uint8_t *end;
+ uint16_t mibenum;
+
+ if (data == NULL)
+ return 0;
+
+ /* Limit the scan to the first 512 bytes (or the whole buffer, if
+ * shorter) */
+ end = pos + min(512, len);
+
+ /* 1. */
+ while (pos < end) {
+ /* a */
+ /* Comment: move to its "-->" terminator */
+ if (PEEK("<!--")) {
+ pos += SLEN("<!--");
+ ADVANCE("-->");
+ /* b */
+ } else if (PEEK("<meta")) {
+ if (pos + SLEN("<meta") >= end - 1)
+ return 0;
+
+ /* Only treat this as a meta element if "<meta" is followed
+ * by whitespace */
+ if (ISSPACE(*(pos + SLEN("<meta")))) {
+ /* 1 */
+ pos += SLEN("<meta");
+
+ mibenum = hubbub_charset_parse_attributes(
+ &pos, end);
+ if (mibenum != 0)
+ return mibenum;
+
+ if (pos >= end)
+ return 0;
+ }
+ /* c */
+ /* "</" or "<" followed by an ASCII letter (the & ~0x20 folds
+ * lower case onto upper case before the range test) */
+ } else if ((PEEK("</") && (pos < end - 3 &&
+ (0x41 <= (*(pos + 2) & ~ 0x20) &&
+ (*(pos + 2) & ~ 0x20) <= 0x5A))) ||
+ (pos < end - 2 && *pos == '<' &&
+ (0x41 <= (*(pos + 1) & ~ 0x20) &&
+ (*(pos + 1) & ~ 0x20) <= 0x5A))) {
+
+ /* skip '<' */
+ pos++;
+
+ /* 1. */
+ /* Skip forward to the first whitespace, '>' or '<' */
+ while (pos < end) {
+ if (ISSPACE(*pos) ||
+ *pos == '>' || *pos == '<')
+ break;
+ pos++;
+ }
+
+ if (pos >= end)
+ return 0;
+
+ /* 3 */
+ if (*pos != '<') {
+ const uint8_t *n;
+ const uint8_t *v;
+ uint32_t nl, vl;
+
+ /* Consume this tag's attributes, discarding them */
+ while (hubbub_charset_get_attribute(&pos, end,
+ &n, &nl, &v, &vl))
+ ; /* do nothing */
+ /* 2 */
+ } else
+ /* Stopped on '<': go round again from it, skipping
+ * the pos++ at the bottom of the loop */
+ continue;
+ /* d */
+ } else if (PEEK("<!") || PEEK("</") || PEEK("<?")) {
+ pos++;
+ ADVANCE(">");
+ }
+
+ /* e - do nothing */
+
+ /* 2 */
+ pos++;
+ }
+
+ return 0;
+}
+
+/**
+ * Walk the attributes of a meta tag looking for a charset declaration
+ *
+ * \param pos  Pointer to pointer to current location (updated on exit)
+ * \param end  Pointer to end of data stream
+ * \return MIB enum of detected encoding, or 0 if none found
+ *
+ * A charset is taken either from a non-empty charset attribute (with
+ * surrounding whitespace stripped) or from a non-empty content attribute,
+ * via hubbub_charset_parse_content.
+ */
+uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
+		const uint8_t *end)
+{
+	const uint8_t *name;
+	const uint8_t *value;
+	uint32_t namelen, valuelen;
+
+	if (pos == NULL || *pos == NULL || end == NULL)
+		return 0;
+
+	/* 2 */
+	while (hubbub_charset_get_attribute(pos, end,
+			&name, &namelen, &value, &valuelen)) {
+		const char *attr = (const char *) name;
+		uint16_t mibenum = 0;
+
+		/* 3 */
+		if (valuelen == 0) {
+			/* c - attributes without a value are of no interest */
+		} else if (namelen == SLEN("charset") &&
+				strncasecmp(attr, "charset",
+					SLEN("charset")) == 0) {
+			/* a - strip whitespace from both ends of the value */
+			while (ISSPACE(*value)) {
+				value++;
+				valuelen--;
+			}
+
+			while (valuelen > 0 && ISSPACE(value[valuelen - 1]))
+				valuelen--;
+
+			mibenum = hubbub_mibenum_from_name(
+					(const char *) value, valuelen);
+		} else if (namelen == SLEN("content") &&
+				strncasecmp(attr, "content",
+					SLEN("content")) == 0) {
+			/* b */
+			mibenum = hubbub_charset_parse_content(value,
+					valuelen);
+		}
+
+		if (mibenum != 0)
+			return mibenum;
+
+		/* 1 - move on to the next whitespace character */
+		for (; *pos < end; (*pos)++) {
+			if (ISSPACE(**pos))
+				break;
+		}
+
+		if (*pos >= end)
+			return 0;
+	}
+
+	return 0;
+}
+
+/**
+ * Parse a content= attribute's value
+ *
+ * \param value     Attribute's value
+ * \param valuelen  Length of value
+ * \return MIB enum of detected encoding, or 0 if none found
+ *
+ * The value is expected to look like `<anything>; charset = <name>`, where
+ * <name> is either quoted (single or double quotes) or runs up to the next
+ * whitespace character / the end of the value. An unterminated quoted name
+ * yields 0.
+ */
+uint16_t hubbub_charset_parse_content(const uint8_t *value,
+		uint32_t valuelen)
+{
+	const uint8_t *end;
+	const uint8_t *tentative;
+	uint32_t tentative_len = 0;
+
+	if (value == NULL)
+		return 0;
+
+	end = value + valuelen;
+
+	/* 1 - find the first ';' */
+	while (value < end && *value != ';')
+		value++;
+
+	if (value >= end)
+		return 0;
+
+	/* skip ';' */
+	value++;
+
+	/* 2 */
+	while (value < end && ISSPACE(*value))
+		value++;
+
+	if (value >= end)
+		return 0;
+
+	/* 3 - "charset" must come next, with at least one byte after it.
+	 * The remaining length is checked first so that value is never
+	 * advanced beyond end. */
+	if ((size_t) (end - value) <= SLEN("charset") ||
+			strncasecmp((const char *) value,
+				"charset", SLEN("charset")) != 0)
+		return 0;
+
+	value += SLEN("charset");
+
+	/* 4 */
+	while (value < end && ISSPACE(*value))
+		value++;
+
+	if (value >= end)
+		return 0;
+
+	/* 5 */
+	if (*value != '=')
+		return 0;
+	/* skip '=' */
+	value++;
+
+	/* 6 */
+	while (value < end && ISSPACE(*value))
+		value++;
+
+	if (value >= end)
+		return 0;
+
+	/* 7 */
+	if (*value == '"' || *value == '\'') {
+		/* a, b - quoted: the name runs up to the matching quote */
+		const uint8_t quote = *value;
+
+		tentative = value + 1;
+
+		while (++value < end && *value != quote)
+			tentative_len++;
+
+		/* 9 - no closing quote */
+		if (value >= end)
+			return 0;
+	} else {
+		/* c - unquoted: the name runs up to the next whitespace */
+		tentative = value;
+
+		while (value < end && !ISSPACE(*value)) {
+			value++;
+			tentative_len++;
+		}
+	}
+
+	/* 8 */
+	return hubbub_mibenum_from_name((const char *) tentative,
+			tentative_len);
+}
+
+/**
+ * Extract an attribute from the data stream
+ *
+ * \param data      Pointer to pointer to current location (updated on exit)
+ * \param end       Pointer to end of data stream
+ * \param name      Pointer to location to receive attribute name
+ * \param namelen   Pointer to location to receive attribute name length
+ * \param value     Pointer to location to receive attribute value
+ * \param valuelen  Pointer to location to receive attribute value length
+ * \return true if attribute extracted, false otherwise.
+ *
+ * Note: The caller should heed the returned lengths; these are the only
+ * indicator that useful content resides in name or value.
+ *
+ * Every path that returns after inspecting the input stores the position
+ * reached into *data, so that a caller looping on this function always
+ * makes progress. When false is returned because a '<' was found at the
+ * start of an attribute, *data is left one byte before that '<'.
+ */
+bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end,
+		const uint8_t **name, uint32_t *namelen,
+		const uint8_t **value, uint32_t *valuelen)
+{
+	const uint8_t *pos;
+
+	if (data == NULL || *data == NULL || end == NULL || name == NULL ||
+			namelen == NULL || value == NULL || valuelen == NULL)
+		return false;
+
+	pos = *data;
+
+	/* 1. Skip leading spaces or '/' characters */
+	while (pos < end && (ISSPACE(*pos) || *pos == '/'))
+		pos++;
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	/* 2. Invalid element open character */
+	if (*pos == '<') {
+		/* Step back one byte; the callers advance by one afterwards
+		 * and so land on the '<' again */
+		pos--;
+		*data = pos;
+		return false;
+	}
+
+	/* 3. End of element */
+	if (*pos == '>') {
+		*data = pos;
+		return false;
+	}
+
+	/* 4. Initialise name & value to empty string */
+	*name = pos;
+	*namelen = 0;
+	*value = (const uint8_t *) "";
+	*valuelen = 0;
+
+	/* 5. Extract name */
+	while (pos < end) {
+		/* a, b */
+		if (*pos == '=' || ISSPACE(*pos))
+			break;
+
+		/* c */
+		if (*pos == '/' || *pos == '<' || *pos == '>') {
+			/* Record how far we got. Without this the caller
+			 * would be handed the same attribute again on its
+			 * next call and never terminate. */
+			*data = pos;
+			return true;
+		}
+
+		/* d is handled by strncasecmp in _parse_attributes */
+
+		/* e */
+		(*namelen)++;
+
+		/* 6 */
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	if (ISSPACE(*pos)) {
+		/* 7. Skip trailing spaces */
+		while (pos < end && ISSPACE(*pos))
+			pos++;
+
+		if (pos >= end) {
+			*data = pos;
+			return false;
+		}
+
+		/* 8. Must be '=' */
+		if (*pos != '=') {
+			/* Attribute without a value; leave *data on the
+			 * last space skipped */
+			pos--;
+			*data = pos;
+			return true;
+		}
+	}
+
+	/* 9. Skip '=' */
+	pos++;
+
+	/* 10. Skip any spaces after '=' */
+	while (pos < end && ISSPACE(*pos))
+		pos++;
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	/* 11. Extract value, if quoted */
+	/* a */
+	if (*pos == '\'' || *pos == '"') {
+		/* 1 */
+		const uint8_t *quote = pos;
+
+		/* 2 */
+		while (++pos < end) {
+			/* 3 */
+			if (*pos == *quote) {
+				*value = (quote + 1);
+				*data = ++pos;
+				return true;
+			}
+
+			/* 4 is handled by strncasecmp */
+
+			/* 5 */
+			(*valuelen)++;
+
+			/* 6 */
+		}
+
+		/* Ran out of data before the closing quote */
+		*data = pos;
+		return false;
+	}
+
+	/* b */
+	if (*pos == '<' || *pos == '>') {
+		*data = pos;
+		return true;
+	}
+
+	/* c is handled by strncasecmp */
+
+	/* d */
+	*value = pos;
+
+	/* 12. Extract unquoted value */
+	while (pos < end) {
+		/* a */
+		if (ISSPACE(*pos) || *pos == '<' || *pos == '>') {
+			*data = pos;
+			return true;
+		}
+
+		/* b is handled by strncasecmp */
+
+		/* c */
+		(*valuelen)++;
+
+		/* 13. Advance */
+		pos++;
+	}
+
+	/* Ran out of data before the value was terminated */
+	*data = pos;
+	return false;
+}
diff --git a/src/charset/detect.h b/src/charset/detect.h
new file mode 100644
index 0000000..854a8d6
--- /dev/null
+++ b/src/charset/detect.h
@@ -0,0 +1,22 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_charset_detect_h_
+#define hubbub_charset_detect_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+/* Extract a charset from a chunk of data.
+ *
+ * If a byte order mark is found at the start of the buffer, *data and *len
+ * are updated to skip it. The MIB enum of the chosen charset is written to
+ * *mibenum and where it came from is written to *source. */
+hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len,
+ uint16_t *mibenum, hubbub_charset_source *source);
+
+#endif
+