1 files changed, 339 insertions, 0 deletions
diff --git a/src/charset/encodings/utf8impl.h b/src/charset/encodings/utf8impl.h
new file mode 100644
index 0000000..1ca9de7
--- /dev/null
+++ b/src/charset/encodings/utf8impl.h
@@ -0,0 +1,339 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_encodings_utf8impl_h_
+#define parserutils_charset_encodings_utf8impl_h_
+
+/** \file
+ * UTF-8 manipulation macros (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+/** Number of continuation bytes for each start byte value (defined elsewhere) */
+extern const uint8_t numContinuations[256];
+
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS4 character
+ *
+ * Conforms to RFC2279, so 5- and 6-byte forms (removed by RFC3629) are
+ * accepted. Each argument is expanded more than once: avoid side effects.
+ *
+ * \param s      The sequence to process
+ * \param len    Length of sequence, in bytes
+ * \param ucs4   Pointer to location to receive UCS4 character (host endian)
+ * \param clen   Pointer to location to receive byte length of UTF-8 sequence
+ * \param error  Lvalue to receive error code (OK, BADPARM, NEEDDATA, INVALID)
+ */
+#define UTF8_TO_UCS4(s, len, ucs4, clen, error)				\
+do {									\
+	uint32_t c, min;						\
+	uint8_t n;							\
+									\
+	(error) = PARSERUTILS_OK;					\
+									\
+	if ((s) == NULL || (ucs4) == NULL || (clen) == NULL) {		\
+		(error) = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	if ((len) == 0) {						\
+		(error) = PARSERUTILS_NEEDDATA;				\
+		break;							\
+	}								\
+									\
+	c = (s)[0];							\
+	/* Start byte fixes the length and the smallest legal value */	\
+	if (c < 0x80) {							\
+		n = 1;							\
+		min = 0;						\
+	} else if ((c & 0xE0) == 0xC0) {				\
+		c &= 0x1F;						\
+		n = 2;							\
+		min = 0x80;						\
+	} else if ((c & 0xF0) == 0xE0) {				\
+		c &= 0x0F;						\
+		n = 3;							\
+		min = 0x800;						\
+	} else if ((c & 0xF8) == 0xF0) {				\
+		c &= 0x07;						\
+		n = 4;							\
+		min = 0x10000;						\
+	} else if ((c & 0xFC) == 0xF8) {				\
+		c &= 0x03;						\
+		n = 5;							\
+		min = 0x200000;						\
+	} else if ((c & 0xFE) == 0xFC) {				\
+		c &= 0x01;						\
+		n = 6;							\
+		min = 0x4000000;					\
+	} else {							\
+		(error) = PARSERUTILS_INVALID;				\
+		break;							\
+	}								\
+									\
+	if ((len) < n) {						\
+		(error) = PARSERUTILS_NEEDDATA;				\
+		break;							\
+	}								\
+									\
+	for (uint8_t i = 1; i < n; i++) {				\
+		uint32_t t = (s)[i];					\
+									\
+		if ((t & 0xC0) != 0x80) {				\
+			(error) = PARSERUTILS_INVALID;			\
+			break;						\
+		}							\
+									\
+		c <<= 6;						\
+		c |= t & 0x3F;						\
+	}								\
+									\
+	if ((error) == PARSERUTILS_OK) {				\
+		/* Detect overlong sequences, surrogates and fffe/ffff */ \
+		if (c < min || (c >= 0xD800 && c <= 0xDFFF) ||		\
+				c == 0xFFFE || c == 0xFFFF) {		\
+			(error) = PARSERUTILS_INVALID;			\
+			break;						\
+		}							\
+									\
+		*(ucs4) = c;						\
+		*(clen) = n;						\
+	}								\
+} while(0)
+
+/**
+ * Convert a single UCS4 character into a UTF-8 multibyte sequence
+ *
+ * Conforms to RFC2279, so 5- and 6-byte forms (removed by RFC3629) may be
+ * emitted. \a ucs4 must be a modifiable lvalue; multi-byte output shifts it.
+ *
+ * \param ucs4   The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s      Pointer to pointer to output buffer, updated on exit
+ * \param len    Pointer to length, in bytes, of output buffer, updated on exit
+ * \param error  Lvalue to receive error code (OK, BADPARM, INVALID, NOMEM)
+ */
+#define UTF8_FROM_UCS4(ucs4, s, len, error)				\
+do {									\
+	uint8_t *buf;							\
+	uint8_t l = 0;							\
+									\
+	(error) = PARSERUTILS_OK;					\
+									\
+	if ((s) == NULL || *(s) == NULL || (len) == NULL) {		\
+		(error) = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	if ((ucs4) < 0x80) {						\
+		l = 1;							\
+	} else if ((ucs4) < 0x800) {					\
+		l = 2;							\
+	} else if ((ucs4) < 0x10000) {					\
+		l = 3;							\
+	} else if ((ucs4) < 0x200000) {					\
+		l = 4;							\
+	} else if ((ucs4) < 0x4000000) {				\
+		l = 5;							\
+	} else if ((ucs4) <= 0x7FFFFFFF) {				\
+		l = 6;							\
+	} else {							\
+		(error) = PARSERUTILS_INVALID;				\
+		break;							\
+	}								\
+									\
+	if (l > *(len)) {						\
+		(error) = PARSERUTILS_NOMEM;				\
+		break;							\
+	}								\
+									\
+	buf = *(s);							\
+	/* Trailing bytes are written last-to-first, lead byte last */	\
+	if (l == 1) {							\
+		buf[0] = (uint8_t) (ucs4);				\
+	} else {							\
+		for (uint8_t i = l; i > 1; i--) {			\
+			buf[i - 1] = 0x80 | ((ucs4) & 0x3F);		\
+			(ucs4) >>= 6;					\
+		}							\
+		buf[0] = ~((1 << (8 - l)) - 1) | (ucs4);		\
+	}								\
+									\
+	*(s) += l;							\
+	*(len) -= l;							\
+} while(0)
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * \param s      The string (modifiable pointer; advanced past counted bytes)
+ * \param max    Maximum length, in bytes
+ * \param len    Pointer to location to receive length of string
+ * \param error  Lvalue to receive error code (OK, BADPARM, INVALID)
+ */
+#define UTF8_LENGTH(s, max, len, error)					\
+do {									\
+	const uint8_t *end;						\
+	int l = 0;							\
+									\
+	(error) = PARSERUTILS_OK;					\
+	if ((s) == NULL || (len) == NULL) {				\
+		(error) = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	end = (s) + (max); /* s is known to be non-NULL here */		\
+	while ((s) < end) {						\
+		uint32_t c = (s)[0];					\
+									\
+		if ((c & 0x80) == 0x00)					\
+			(s) += 1;					\
+		else if ((c & 0xE0) == 0xC0)				\
+			(s) += 2;					\
+		else if ((c & 0xF0) == 0xE0)				\
+			(s) += 3;					\
+		else if ((c & 0xF8) == 0xF0)				\
+			(s) += 4;					\
+		else if ((c & 0xFC) == 0xF8)				\
+			(s) += 5;					\
+		else if ((c & 0xFE) == 0xFC)				\
+			(s) += 6;					\
+		else {							\
+			(error) = PARSERUTILS_INVALID;			\
+			break;						\
+		}							\
+									\
+		l++;							\
+	}								\
+									\
+	if ((error) == PARSERUTILS_OK)					\
+		*(len) = l;						\
+} while(0)
+
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * \param s      Pointer to start of character (only the first byte is read)
+ * \param len    Pointer to location to receive length
+ * \param error  Lvalue to receive error code (OK, BADPARM)
+ */
+#define UTF8_CHAR_BYTE_LENGTH(s, len, error)				\
+do {									\
+	if ((s) == NULL || (len) == NULL) {				\
+		(error) = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	/* Start byte plus continuations; cast keeps index in 0..255 */	\
+	*(len) = numContinuations[(uint8_t) (s)[0]] + 1;		\
+	(error) = PARSERUTILS_OK;					\
+} while(0)
+
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * \param s        The string
+ * \param off      Offset in the string to start at (lvalue; modified in place)
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \param error    Lvalue to receive error code (OK, BADPARM)
+ */
+#define UTF8_PREV(s, off, prevoff, error)				\
+do {									\
+	if ((s) == NULL || (prevoff) == NULL) {				\
+		(error) = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	while ((off) != 0 && ((s)[--(off)] & 0xC0) == 0x80)		\
+		/* step back over continuation bytes (10xxxxxx) */;	\
+									\
+	*(prevoff) = (off);						\
+									\
+	(error) = PARSERUTILS_OK;					\
+} while(0)
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at (lvalue; modified in place)
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \param error    Lvalue to receive error code (OK, BADPARM)
+ */
+#define UTF8_NEXT(s, len, off, nextoff, error)				\
+do {									\
+	if ((s) == NULL || (off) >= (len) || (nextoff) == NULL) {	\
+		(error) = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	/* Skip current start byte (if present - may be mid-sequence) */\
+	if ((s)[(off)] < 0x80 || ((s)[(off)] & 0xC0) == 0xC0)		\
+		(off)++;						\
+									\
+	while ((off) < (len) && ((s)[(off)] & 0xC0) == 0x80)		\
+		(off)++;						\
+									\
+	*(nextoff) = (off);						\
+									\
+	(error) = PARSERUTILS_OK;					\
+} while(0)
+
+/**
+ * Skip to start of next sequence in UTF-8 input
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at (lvalue; modified in place)
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \param error    Lvalue to receive error code (OK, BADPARM, NEEDDATA)
+ */
+#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error)			\
+do {									\
+	uint8_t c;							\
+									\
+	(error) = PARSERUTILS_OK;					\
+									\
+	if ((s) == NULL || (off) >= (len) || (nextoff) == NULL) {	\
+		(error) = PARSERUTILS_BADPARM;				\
+		break;							\
+	}								\
+									\
+	c = (s)[(off)];							\
+									\
+	/* If we're mid-sequence, simply advance to next byte */	\
+	if (!(c < 0x80 || (c & 0xC0) == 0xC0)) {			\
+		(off)++;						\
+	} else {							\
+		uint32_t nCont = numContinuations[c];			\
+		uint32_t nToSkip;					\
+		/* NOTE(review): >= also hits a sequence ending at len; confirm */ \
+		if ((off) + nCont + 1 >= (len)) {			\
+			(error) = PARSERUTILS_NEEDDATA;			\
+			break;						\
+		}							\
+									\
+		/* Verify continuation bytes */				\
+		for (nToSkip = 1; nToSkip <= nCont; nToSkip++) {	\
+			if (((s)[(off) + nToSkip] & 0xC0) != 0x80)	\
+				break;					\
+		}							\
+									\
+		/* Skip over the valid bytes */				\
+		(off) += nToSkip;					\
+	}								\
+									\
+	*(nextoff) = (off);						\
+} while(0)
+
+#endif