4 files changed, 799 insertions, 0 deletions
diff --git a/src/charset/encodings/Makefile b/src/charset/encodings/Makefile
new file mode 100644
index 0000000..47d9210
--- /dev/null
+++ b/src/charset/encodings/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT		The name of the component
+# EXPORT		The location of the export directory
+# TOP			The location of the source tree root
+# RELEASEDIR		The place to put release objects
+# DEBUGDIR		The place to put debug objects
+#
+# do_include		Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR			The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN		The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN	The list of items to remove for "make distclean"
+# TARGET_TESTS		The list of target names to run for "make test"
+#
+# SOURCES		The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push the parent's $(d) onto the directory stack ($(sp) gains one ".x" per level), then enter $(DIR)
+sp             := $(sp).x
+dirstack_$(sp) := $(d)
+d              := $(DIR)
+
+# Sources found in this directory
+SRCS_$(d) := utf8.c utf16.c
+
+# Append them, prefixed with $(d), to the sources for the component
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have (every file matching $(d)*/Makefile)
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop the directory stack: restore the saved $(d), then drop the last ".x" from $(sp)
+d  := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/encodings/utf16.c b/src/charset/encodings/utf16.c
new file mode 100644
index 0000000..95dc64f
--- /dev/null
+++ b/src/charset/encodings/utf16.c
@@ -0,0 +1,239 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-16 manipulation functions (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/utf16.h>
+
+/**
+ * Convert a UTF-16 sequence into a single UCS4 character
+ *
+ * \param s     The sequence to process (16-bit code units in host byte order)
+ * \param len   Length of sequence, in bytes
+ * \param ucs4  Pointer to location to receive UCS4 character (host endian)
+ * \param clen  Pointer to location to receive byte length of UTF-16 sequence
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s,
+		size_t len, uint32_t *ucs4, size_t *clen)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+
+	if (s == NULL || ucs4 == NULL || clen == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (len < 2)
+		return PARSERUTILS_NEEDDATA;
+
+	if (*ss < 0xD800 || *ss > 0xDFFF) {
+		*ucs4 = *ss;
+		*clen = 2;
+	} else if (*ss <= 0xDBFF) {
+		/* High surrogate: a low surrogate must follow */
+		if (len < 4)
+			return PARSERUTILS_NEEDDATA;
+		if (ss[1] < 0xDC00 || ss[1] > 0xDFFF)
+			return PARSERUTILS_INVALID;
+		*ucs4 = 0x10000 + ((((uint32_t) ss[0] & 0x3ff) << 10) |
+				(ss[1] & 0x3ff));
+		*clen = 4;
+	} else {
+		return PARSERUTILS_INVALID; /* Unpaired low surrogate */
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Convert a single UCS4 character into a UTF-16 sequence
+ *
+ * \param ucs4  The character to process (0 <= c <= 0x10FFFF) (host endian)
+ * \param s     Pointer to 4 byte long output buffer
+ * \param len   Pointer to location to receive length of multibyte sequence
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
+		size_t *len)
+{
+	uint16_t *ss = (uint16_t *) (void *) s;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (ucs4 < 0x10000) {
+		*ss = (uint16_t) ucs4;
+		*len = 2;
+	} else if (ucs4 < 0x110000) {
+		/* Surrogate pair: split (ucs4 - 0x10000) into 10 + 10 bits */
+		ss[0] = (uint16_t) (0xD800 | ((ucs4 - 0x10000) >> 10));
+		ss[1] = (uint16_t) (0xDC00 | (ucs4 & 0x3ff));
+		*len = 4;
+	} else {
+		/* Beyond U+10FFFF: not representable in UTF-16 */
+		return PARSERUTILS_INVALID;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-16 string
+ *
+ * \param s    The string
+ * \param max  Maximum length, in bytes (a trailing odd byte is ignored)
+ * \param len  Pointer to location to receive length of string
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max,
+		size_t *len)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+	size_t units = max / 2;	/* Whole code units available within max */
+	size_t i = 0, l = 0;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	while (i < units) {
+		if (ss[i] < 0xD800 || 0xDFFF < ss[i])
+			i++;
+		else
+			i += 2;	/* Any surrogate counts as half of a pair */
+
+		l++;
+	}
+
+	*len = l;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Determine the byte length of the UTF-16 character starting at \a s
+ *
+ * \param s    Pointer to start of character
+ * \param len  Location to receive the length, in bytes (2 or 4)
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
+		size_t *len)
+{
+	uint16_t unit;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	unit = *((const uint16_t *) (const void *) s);
+
+	/* Any surrogate code unit counts as one half of a 4 byte pair */
+	*len = (0xD800 <= unit && unit <= 0xDFFF) ? 4 : 2;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find previous legal UTF-16 char in string
+ *
+ * \param s        The string (start of the buffer; indexed using \a off)
+ * \param off      Byte offset in the string to start at
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+
+	if (s == NULL || prevoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (off < 2)
+		*prevoff = 0;
+	else if (ss[off / 2 - 1] < 0xDC00 || ss[off / 2 - 1] > 0xDFFF)
+		*prevoff = off - 2;	/* Unit before off is not a low surrogate */
+	else
+		*prevoff = (off < 4) ? 0 : off - 4;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find next legal UTF-16 char in string
+ *
+ * \param s        The string (assumed valid; start of buffer, indexed by off)
+ * \param len      Maximum offset in string
+ * \param off      Byte offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (len - off < 4)	/* No complete code unit follows the one at off */
+		*nextoff = len;
+	else if (ss[off / 2 + 1] < 0xD800 || ss[off / 2 + 1] > 0xDBFF)
+		*nextoff = off + 2;
+	else
+		*nextoff = (len - off < 6) ? len : off + 4;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find next legal UTF-16 char in string
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Byte offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
+		uint32_t len, uint32_t off, uint32_t *nextoff)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	while (1) {
+		const uint16_t *cur = ss + off / 2;	/* Code unit at off */
+		if (len - off < 4) {
+			return PARSERUTILS_NEEDDATA;
+		} else if (cur[1] < 0xD800 || cur[1] > 0xDFFF) {
+			*nextoff = off + 2;
+			break;
+		} else if (cur[1] <= 0xDBFF) {
+			if (len - off < 6)
+				return PARSERUTILS_NEEDDATA;
+			if (cur[2] >= 0xDC00 && cur[2] <= 0xDFFF) {
+				*nextoff = off + 4;
+				break;
+			}
+		}
+
+		/* Unpaired surrogate (high or low): step over it and retry */
+		off += 2;
+	}
+
+	return PARSERUTILS_OK;
+}
+
diff --git a/src/charset/encodings/utf8.c b/src/charset/encodings/utf8.c
new file mode 100644
index 0000000..5b4ba95
--- /dev/null
+++ b/src/charset/encodings/utf8.c
@@ -0,0 +1,175 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-8 manipulation functions (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/utf8.h>
+#include "charset/encodings/utf8impl.h"
+
+/** Number of continuation bytes implied by each possible start byte value */
+const uint8_t numContinuations[256] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x00 - 0x0F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x10 - 0x1F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x20 - 0x2F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x30 - 0x3F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x40 - 0x4F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x50 - 0x5F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x60 - 0x6F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x70 - 0x7F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x80 - 0x8F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0x90 - 0x9F */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0xA0 - 0xAF */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,	/* 0xB0 - 0xBF */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0xC0 - 0xCF */
+	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,	/* 0xD0 - 0xDF */
+	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,	/* 0xE0 - 0xEF */
+	3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,	/* 0xF0 - 0xFF */
+};
+
+/**
+ * Decode one UTF-8 multibyte sequence into a UCS4 character
+ *
+ * Function form of the UTF8_TO_UCS4 macro. RFC3629 removed the encoding of
+ * UCS values outside the UTF-16 plane; this function follows RFC2279.
+ *
+ * \param s     Sequence to decode
+ * \param len   Length of sequence
+ * \param ucs4  Location to receive the UCS4 character (host endian)
+ * \param clen  Location to receive the byte length of the UTF-8 sequence
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen)
+{
+	parserutils_error status;
+
+	UTF8_TO_UCS4(s, len, ucs4, clen, status);
+
+	return status;
+}
+
+/**
+ * Encode one UCS4 character as a UTF-8 multibyte sequence
+ *
+ * Function form of the UTF8_FROM_UCS4 macro. RFC3629 removed the encoding of
+ * UCS values outside the UTF-16 plane; this function follows RFC2279.
+ *
+ * \param ucs4  Character to encode (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s     Pointer to pointer to output buffer, updated on exit
+ * \param len   Pointer to length, in bytes, of output buffer, updated on exit
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4,
+		uint8_t **s, size_t *len)
+{
+	parserutils_error status;
+
+	UTF8_FROM_UCS4(ucs4, s, len, status);
+
+	return status;
+}
+
+/**
+ * Count the characters in a bounded UTF-8 string (wraps UTF8_LENGTH)
+ *
+ * \param s    String to measure
+ * \param max  Maximum length
+ * \param len  Location to receive the length of the string, in characters
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
+		size_t *len)
+{
+	parserutils_error status;
+
+	UTF8_LENGTH(s, max, len, status);
+
+	return status;
+}
+
+/**
+ * Determine the byte length of a UTF-8 character (wraps UTF8_CHAR_BYTE_LENGTH)
+ *
+ * \param s    Pointer to start of character
+ * \param len  Location to receive the length, in bytes
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
+		size_t *len)
+{
+	parserutils_error status;
+
+	UTF8_CHAR_BYTE_LENGTH(s, len, status);
+
+	return status;
+}
+
+/**
+ * Locate the previous legal UTF-8 character in a string (wraps UTF8_PREV)
+ *
+ * \param s        The string
+ * \param off      Offset in the string to search backwards from
+ * \param prevoff  Location to receive the offset of the first byte of the
+ *                 previous legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff)
+{
+	parserutils_error status;
+
+	UTF8_PREV(s, off, prevoff, status);
+
+	return status;
+}
+
+/**
+ * Locate the next legal UTF-8 character in a string (wraps UTF8_NEXT)
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to search forwards from
+ * \param nextoff  Location to receive the offset of the first byte of the
+ *                 next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	parserutils_error status;
+
+	UTF8_NEXT(s, len, off, nextoff, status);
+
+	return status;
+}
+
+/**
+ * Locate the next legal UTF-8 character (wraps UTF8_NEXT_PARANOID)
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to search forwards from
+ * \param nextoff  Location to receive the offset of the first byte of the
+ *                 next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s,
+		uint32_t len, uint32_t off, uint32_t *nextoff)
+{
+	parserutils_error status;
+
+	UTF8_NEXT_PARANOID(s, len, off, nextoff, status);
+
+	return status;
+}
+
diff --git a/src/charset/encodings/utf8impl.h b/src/charset/encodings/utf8impl.h
new file mode 100644
index 0000000..1ca9de7
--- /dev/null
+++ b/src/charset/encodings/utf8impl.h
@@ -0,0 +1,339 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_encodings_utf8impl_h_
+#define parserutils_charset_encodings_utf8impl_h_
+
+/** \file
+ * UTF-8 manipulation macros (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+/** Number of continuation bytes for a given start byte */
+extern const uint8_t numContinuations[256];
+
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This macro conforms to RFC2279, however.
+ *
+ * \param s      The sequence to process
+ * \param len    Length of sequence, in bytes
+ * \param ucs4   Pointer to location to receive UCS4 character (host endian)
+ * \param clen   Pointer to location to receive byte length of UTF-8 sequence
+ * \param error  Location to receive error code (assigned on every path)
+ */
+#define UTF8_TO_UCS4(s, len, ucs4, clen, error)				\
+do {									\
+	uint32_t c, min;						\
+	uint8_t n;							\
+	/* *ucs4 and *clen are only written on success */		\
+	error = PARSERUTILS_OK;						\
+									\
+	if (s == NULL || ucs4 == NULL || clen == NULL) {		\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	if (len == 0) {							\
+		error = PARSERUTILS_NEEDDATA;				\
+		break;							\
+	}								\
+	/* Lead byte gives length n and smallest non-overlong value */	\
+	c = s[0];							\
+									\
+	if (c < 0x80) {							\
+		n = 1;							\
+		min = 0;						\
+	} else if ((c & 0xE0) == 0xC0) {				\
+		c &= 0x1F;						\
+		n = 2;							\
+		min = 0x80;						\
+	} else if ((c & 0xF0) == 0xE0) {				\
+		c &= 0x0F;						\
+		n = 3;							\
+		min = 0x800;						\
+	} else if ((c & 0xF8) == 0xF0) {				\
+		c &= 0x07;						\
+		n = 4;							\
+		min = 0x10000;						\
+	} else if ((c & 0xFC) == 0xF8) {				\
+		c &= 0x03;						\
+		n = 5;							\
+		min = 0x200000;						\
+	} else if ((c & 0xFE) == 0xFC) {				\
+		c &= 0x01;						\
+		n = 6;							\
+		min = 0x4000000;					\
+	} else {							\
+		error = PARSERUTILS_INVALID;				\
+		break;							\
+	}								\
+									\
+	if (len < n) {							\
+		error = PARSERUTILS_NEEDDATA;				\
+		break;							\
+	}								\
+	/* Fold in the continuation bytes, 6 bits apiece */		\
+	for (uint8_t i = 1; i < n; i++) {				\
+		uint32_t t = s[i];					\
+									\
+		if ((t & 0xC0) != 0x80) {				\
+			error = PARSERUTILS_INVALID;			\
+			break;						\
+		}							\
+									\
+		c <<= 6;						\
+		c |= t & 0x3F;						\
+	}								\
+									\
+	if (error == PARSERUTILS_OK) {					\
+		/* Detect overlong sequences, surrogates and fffe/ffff */ \
+		if (c < min || (c >= 0xD800 && c <= 0xDFFF) ||		\
+				c == 0xFFFE || c == 0xFFFF) {		\
+			error = PARSERUTILS_INVALID;			\
+			break;						\
+		}							\
+									\
+		*ucs4 = c;						\
+		*clen = n;						\
+	}								\
+} while(0)
+
+/**
+ * Convert a single UCS4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This macro conforms to RFC2279, however.
+ *
+ * \param ucs4   The character to process (0 <= c <= 0x7FFFFFFF) (host endian);
+ *               must be a modifiable lvalue, it is shifted while encoding
+ * \param s      Pointer to pointer to output buffer, updated on exit
+ * \param len    Pointer to length, in bytes, of output buffer, updated on exit
+ * \param error  Location to receive error code (assigned on every path)
+ */
+#define UTF8_FROM_UCS4(ucs4, s, len, error)				\
+do {									\
+	uint8_t *buf;							\
+	uint8_t l = 0;							\
+									\
+	error = PARSERUTILS_OK;						\
+									\
+	if (s == NULL || *s == NULL || len == NULL) {			\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+	/* Pick the sequence length l from the size of ucs4 */		\
+	if (ucs4 < 0x80) {						\
+		l = 1;							\
+	} else if (ucs4 < 0x800) {					\
+		l = 2;							\
+	} else if (ucs4 < 0x10000) {					\
+		l = 3;							\
+	} else if (ucs4 < 0x200000) {					\
+		l = 4;							\
+	} else if (ucs4 < 0x4000000) {					\
+		l = 5;							\
+	} else if (ucs4 <= 0x7FFFFFFF) {				\
+		l = 6;							\
+	} else {							\
+		error = PARSERUTILS_INVALID;				\
+		break;							\
+	}								\
+	/* The sequence must fit in the *len bytes left at *s */	\
+	if (l > *len) {							\
+		error = PARSERUTILS_NOMEM;				\
+		break;							\
+	}								\
+									\
+	buf = *s;							\
+	/* Tail bytes are written last-to-first, consuming ucs4 */	\
+	if (l == 1) {							\
+		buf[0] = (uint8_t) ucs4;				\
+	} else {							\
+		for (uint8_t i = l; i > 1; i--) {			\
+			buf[i - 1] = 0x80 | (ucs4 & 0x3F);		\
+			ucs4 >>= 6;					\
+		}							\
+		buf[0] = ~((1 << (8 - l)) - 1) | ucs4;			\
+	}								\
+	/* Advance the output pointer, shrink the space left */		\
+	*s += l;							\
+	*len -= l;							\
+} while(0)
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * \param s      The string (advanced by the macro as it scans)
+ * \param max    Maximum length, in bytes
+ * \param len    Pointer to location to receive length of string
+ * \param error  Location to receive error code (assigned on every path)
+ */
+#define UTF8_LENGTH(s, max, len, error)					\
+do {									\
+	const uint8_t *end = s + max;					\
+	size_t l = 0; /* size_t, to match *len */			\
+									\
+	error = PARSERUTILS_OK;						\
+									\
+	if (s == NULL || len == NULL) {					\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+	/* Step by the length each lead byte announces */		\
+	while (s < end) {						\
+		uint32_t c = s[0];					\
+									\
+		if ((c & 0x80) == 0x00)					\
+			s += 1;						\
+		else if ((c & 0xE0) == 0xC0)				\
+			s += 2;						\
+		else if ((c & 0xF0) == 0xE0)				\
+			s += 3;						\
+		else if ((c & 0xF8) == 0xF0)				\
+			s += 4;						\
+		else if ((c & 0xFC) == 0xF8)				\
+			s += 5;						\
+		else if ((c & 0xFE) == 0xFC)				\
+			s += 6;						\
+		else {							\
+			error = PARSERUTILS_INVALID;			\
+			break;						\
+		}							\
+									\
+		l++;							\
+	}								\
+									\
+	if (error == PARSERUTILS_OK)					\
+		*len = l;						\
+} while(0)
+
+/**
+ * Determine the byte length of a UTF-8 character from its start byte
+ *
+ * \param s      Pointer to start of character
+ * \param len    Pointer to location to receive length, in bytes
+ * \param error  Location to receive error code
+ */
+#define UTF8_CHAR_BYTE_LENGTH(s, len, error)				\
+do {									\
+	/* A NULL in either pointer is a parameter error */		\
+	if (s != NULL && len != NULL) {					\
+		/* One start byte, then its continuation bytes */	\
+		*len = 1 + numContinuations[s[0]];			\
+		error = PARSERUTILS_OK;					\
+	} else {							\
+		error = PARSERUTILS_BADPARM;				\
+	}								\
+} while(0)
+
+/**
+ * Locate the previous legal UTF-8 character in a string
+ *
+ * \param s        The string
+ * \param off      Offset in the string to start at (decremented in place)
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \param error    Location to receive error code
+ */
+#define UTF8_PREV(s, off, prevoff, error)				\
+do {									\
+	if (s == NULL || prevoff == NULL) {				\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+	/* Back up to a non-continuation byte, or to offset 0 */	\
+	while (off != 0) {						\
+		if ((s[--off] & 0xC0) != 0x80)				\
+			break;						\
+	}								\
+	*prevoff = off;							\
+	error = PARSERUTILS_OK;						\
+} while(0)
+
+/**
+ * Locate the next legal UTF-8 character in a string
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at (incremented in place)
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \param error    Location to receive error code
+ */
+#define UTF8_NEXT(s, len, off, nextoff, error)				\
+do {									\
+	if (s == NULL || off >= len || nextoff == NULL) {		\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	/* Step over a start byte at off (absent if mid-sequence) */	\
+	if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0)			\
+		off++;							\
+									\
+	/* Then consume the continuation bytes that follow */		\
+	for (; off < len && (s[off] & 0xC0) == 0x80; off++)		\
+		/* nothing to do */;					\
+									\
+	*nextoff = off;							\
+	error = PARSERUTILS_OK;						\
+} while(0)
+
+/**
+ * Skip to start of next sequence in UTF-8 input
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at (advanced in place)
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character (written unless BADPARM/NEEDDATA)
+ * \param error    Location to receive error code (assigned on every path)
+ */
+#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error)			\
+do {									\
+	uint8_t c;							\
+									\
+	error = PARSERUTILS_OK;						\
+									\
+	if (s == NULL || off >= len || nextoff == NULL) {		\
+		error = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	c = s[off];							\
+									\
+	/* If we're mid-sequence, simply advance to next byte */	\
+	if (!(c < 0x80 || (c & 0xC0) == 0xC0)) {			\
+		off++;							\
+	} else {							\
+		uint32_t nCont = numContinuations[c];			\
+		uint32_t nToSkip;					\
+		/* NOTE(review): >= needs a byte past the end; ok? */	\
+		if (off + nCont + 1 >= len) {				\
+			error = PARSERUTILS_NEEDDATA;			\
+			break;						\
+		}							\
+									\
+		/* Verify continuation bytes */				\
+		for (nToSkip = 1; nToSkip <= nCont; nToSkip++) {	\
+			if ((s[off + nToSkip] & 0xC0) != 0x80)		\
+				break;					\
+		}							\
+									\
+		/* Skip over the valid bytes */				\
+		off += nToSkip;						\
+	}								\
+									\
+	*nextoff = off;							\
+} while(0)
+
+#endif