4 files changed, 214 insertions, 0 deletions
diff --git a/include/parserutils/charset/codec.h b/include/parserutils/charset/codec.h
new file mode 100644
index 0000000..ca98db5
--- /dev/null
+++ b/include/parserutils/charset/codec.h
@@ -0,0 +1,114 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_codec_h_
+#define parserutils_charset_codec_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+/* Opaque codec type: this header only forward-declares the struct */
+typedef struct parserutils_charset_codec parserutils_charset_codec;
+/* NOTE(review): presumably a "no codec" sentinel -- confirm with callers */
+#define PARSERUTILS_CHARSET_CODEC_NULL (0xffffffffU)
+
+/**
+ * Charset codec error mode
+ *
+ * A codec's error mode determines its behaviour in the face of:
+ *
+ * + characters which are unrepresentable in the destination charset (if
+ *   encoding data) or which cannot be converted to UCS4 (if decoding data).
+ * + invalid byte sequences (both encoding and decoding)
+ *
+ * The options provide a choice between the following approaches:
+ *
+ * + draconian, "stop processing" ("strict")
+ * + "replace the unrepresentable character with something else" ("loose")
+ * + "attempt to transliterate, or replace if unable" ("translit")
+ *
+ * The default error mode is "loose"; another mode is selected by setting the
+ * PARSERUTILS_CHARSET_CODEC_ERROR_MODE codec option.
+ *
+ * In the "loose" case, the replacement character will depend upon:
+ *
+ * + Whether the operation was encoding or decoding
+ * + If encoding, what the destination charset is.
+ *
+ * If decoding, the replacement character will be:
+ *
+ *     U+FFFD (REPLACEMENT CHARACTER)
+ *
+ * If encoding, the replacement character will be:
+ *
+ *     U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32)
+ *     U+FFFD (REPLACEMENT CHARACTER) otherwise.
+ *
+ *
+ * In the "translit" case, the codec will attempt to transliterate into
+ * the destination charset, if encoding. If decoding, or if transliteration
+ * fails, this option is identical to "loose".
+ */
+typedef enum parserutils_charset_codec_errormode {
+	/** Stop on unrepresentable characters or invalid byte sequences */
+	PARSERUTILS_CHARSET_CODEC_ERROR_STRICT   = 0,
+	/** Substitute one replacement character (U+FFFD or U+003F) */
+	PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE    = 1,
+	/** Transliterate when encoding if possible, else act as "loose" */
+	PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT = 2,
+} parserutils_charset_codec_errormode;
+
+/**
+ * Charset codec option types ("type" of parserutils_charset_codec_setopt)
+ */
+typedef enum parserutils_charset_codec_opttype {
+	/** Set codec error mode (parameters: optparams.error_mode) */
+	PARSERUTILS_CHARSET_CODEC_ERROR_MODE  = 1,
+} parserutils_charset_codec_opttype;
+
+/**
+ * Charset codec option parameters ("params" of ..._codec_setopt)
+ */
+typedef union parserutils_charset_codec_optparams {
+	/** Parameters for the PARSERUTILS_CHARSET_CODEC_ERROR_MODE option */
+	struct {
+		/** The desired error handling mode */
+		parserutils_charset_codec_errormode mode;
+	} error_mode;
+} parserutils_charset_codec_optparams;
+
+
+/* Create a codec for the given charset name; returns a codec pointer */
+parserutils_charset_codec *parserutils_charset_codec_create(const char *charset,
+		parserutils_alloc alloc, void *pw);
+/* Destroy a charset codec (void: no status is reported to the caller) */
+void parserutils_charset_codec_destroy(parserutils_charset_codec *codec);
+
+/* Configure a charset codec: set option "type" using "params" */
+parserutils_error parserutils_charset_codec_setopt(
+		parserutils_charset_codec *codec,
+		parserutils_charset_codec_opttype type, 
+		parserutils_charset_codec_optparams *params);
+
+/* Encode a chunk of UCS4 data (*source) into a codec's charset (*dest) */
+parserutils_error parserutils_charset_codec_encode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+
+/* Decode a chunk of data in a codec's charset (*source) into UCS4 (*dest) */
+parserutils_error parserutils_charset_codec_decode(
+		parserutils_charset_codec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+
+/* Reset a charset codec; the outcome is reported as a parserutils_error */
+parserutils_error parserutils_charset_codec_reset(
+		parserutils_charset_codec *codec);
+
+#endif
diff --git a/include/parserutils/charset/mibenum.h b/include/parserutils/charset/mibenum.h
new file mode 100644
index 0000000..8b3ac9d
--- /dev/null
+++ b/include/parserutils/charset/mibenum.h
@@ -0,0 +1,24 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_mibenum_h_
+#define parserutils_charset_mibenum_h_
+
+#include <inttypes.h>
+#include <stdbool.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+
+/* Convert an encoding alias (alias, of length len) to a MIB enum value */
+uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len);
+/* Convert a MIB enum value into an encoding alias (read-only string) */
+const char *parserutils_charset_mibenum_to_name(uint16_t mibenum);
+/* Report whether a MIB enum value represents a Unicode variant */
+bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum);
+
+#endif
diff --git a/include/parserutils/charset/utf16.h b/include/parserutils/charset/utf16.h
new file mode 100644
index 0000000..6569d6e
--- /dev/null
+++ b/include/parserutils/charset/utf16.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-16 manipulation functions (interface).
+ */
+
+#ifndef parserutils_charset_utf16_h_
+#define parserutils_charset_utf16_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+/* Conversion between UTF-16 bytes (s) and a UCS4 value (ucs4) */
+parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s, 
+		size_t len, uint32_t *ucs4, size_t *clen);
+parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, 
+		uint8_t *s, size_t *len);
+/* Length queries on s; the answer is presumably stored through len */
+parserutils_error parserutils_charset_utf16_length(const uint8_t *s, 
+		size_t max, size_t *len);
+parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
+		size_t *len);
+/* Presumably locate the character before/after offset off within s */
+parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, 
+		uint32_t off, uint32_t *prevoff);
+parserutils_error parserutils_charset_utf16_next(const uint8_t *s, 
+		uint32_t len, uint32_t off, uint32_t *nextoff);
+/* NOTE(review): "paranoid" variant of next; differences not shown here */
+parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
+		uint32_t len, uint32_t off, uint32_t *nextoff);
+
+#endif
+
diff --git a/include/parserutils/charset/utf8.h b/include/parserutils/charset/utf8.h
new file mode 100644
index 0000000..16e012e
--- /dev/null
+++ b/include/parserutils/charset/utf8.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-8 manipulation functions (interface).
+ */
+
+#ifndef parserutils_charset_utf8_h_
+#define parserutils_charset_utf8_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+/* UTF-8 <-> UCS4 conversion; note that from_ucs4 takes s as uint8_t ** */
+parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen);
+parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4, uint8_t **s,
+		size_t *len);
+/* Length queries on s; the answer is presumably stored through len */
+parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
+		size_t *len);
+parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
+		size_t *len);
+/* Presumably locate the character before/after offset off within s */
+parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff);
+parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff);
+/* NOTE(review): "paranoid" variant of next; differences not shown here */
+parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s, 
+		uint32_t len, uint32_t off, uint32_t *nextoff);
+
+#endif
+