summaryrefslogtreecommitdiff
path: root/src/charset/encodings
diff options
context:
space:
mode:
Diffstat (limited to 'src/charset/encodings')
-rw-r--r-- src/charset/encodings/Makefile 46
-rw-r--r-- src/charset/encodings/utf16.c 239
-rw-r--r-- src/charset/encodings/utf8.c 175
-rw-r--r-- src/charset/encodings/utf8impl.h 339
4 files changed, 799 insertions, 0 deletions
diff --git a/src/charset/encodings/Makefile b/src/charset/encodings/Makefile
new file mode 100644
index 0000000..47d9210
--- /dev/null
+++ b/src/charset/encodings/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack ($(sp) gains one ".x" suffix per level and names the slot that saves $(d))
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Sources in this directory (bare file names; the $(d) prefix is added below)
+SRCS_$(d) := utf8.c utf16.c
+
+# Append to sources for component, prefixing each file name with $(d)
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have (every immediate subdirectory of $(d) that contains a Makefile)
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack: restore the saved $(d), then strip the last ".x" suffix from $(sp)
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/charset/encodings/utf16.c b/src/charset/encodings/utf16.c
new file mode 100644
index 0000000..95dc64f
--- /dev/null
+++ b/src/charset/encodings/utf16.c
@@ -0,0 +1,239 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-16 manipulation functions (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/utf16.h>
+
+/**
+ * Convert a UTF-16 sequence into a single UCS4 character
+ *
+ * The input is read as host-endian 16-bit code units. A code unit outside
+ * the surrogate range is returned unchanged. A high surrogate
+ * (0xD800-0xDBFF) must be followed by a low surrogate (0xDC00-0xDFFF); the
+ * pair is combined into a code point in the range 0x10000-0x10FFFF.
+ *
+ * The output locations are only written when PARSERUTILS_OK is returned.
+ *
+ * \param s     The sequence to process
+ * \param len   Length of sequence, in bytes
+ * \param ucs4  Pointer to location to receive UCS4 character (host endian)
+ * \param clen  Pointer to location to receive byte length of UTF-16 sequence
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if any pointer argument is NULL,
+ *         PARSERUTILS_NEEDDATA if the input is too short for the character,
+ *         PARSERUTILS_INVALID if the input contains an unpaired surrogate
+ */
+parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s,
+		size_t len, uint32_t *ucs4, size_t *clen)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+
+	if (s == NULL || ucs4 == NULL || clen == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (len < 2)
+		return PARSERUTILS_NEEDDATA;
+
+	if (ss[0] < 0xD800 || ss[0] > 0xDFFF) {
+		/* Not a surrogate: the code unit is the code point */
+		*ucs4 = ss[0];
+		*clen = 2;
+	} else if (ss[0] <= 0xDBFF) {
+		/* High surrogate: a low surrogate must follow */
+		if (len < 4)
+			return PARSERUTILS_NEEDDATA;
+
+		if (ss[1] < 0xDC00 || ss[1] > 0xDFFF)
+			return PARSERUTILS_INVALID;
+
+		/* Each surrogate carries 10 bits of (code point - 0x10000) */
+		*ucs4 = 0x10000 + ((((uint32_t) ss[0] & 0x3ff) << 10) |
+				((uint32_t) ss[1] & 0x3ff));
+		*clen = 4;
+	} else {
+		/* Low surrogate with no preceding high surrogate */
+		return PARSERUTILS_INVALID;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Convert a single UCS4 character into a UTF-16 sequence
+ *
+ * Values below 0x10000 are written unchanged as one host-endian code unit
+ * (no check is made for values in the surrogate range). Values in the range
+ * 0x10000-0x10FFFF are written as a high/low surrogate pair.
+ *
+ * \param ucs4  The character to process (0 <= c <= 0x10FFFF) (host endian)
+ * \param s     Pointer to 4 byte long output buffer
+ * \param len   Pointer to location to receive length, in bytes, of the
+ *              sequence written (2 or 4)
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if any pointer argument is NULL,
+ *         PARSERUTILS_INVALID if the character cannot be encoded in UTF-16
+ */
+parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4, uint8_t *s,
+		size_t *len)
+{
+	uint16_t *ss = (uint16_t *) (void *) s;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (ucs4 < 0x10000) {
+		ss[0] = (uint16_t) ucs4;
+		*len = 2;
+	} else if (ucs4 < 0x110000) {
+		/* Split the 20-bit offset from 0x10000 across the pair:
+		 * top 10 bits in the high surrogate, low 10 in the low */
+		uint32_t v = ucs4 - 0x10000;
+
+		ss[0] = (uint16_t) (0xD800 | (v >> 10));
+		ss[1] = (uint16_t) (0xDC00 | (v & 0x3ff));
+		*len = 4;
+	} else {
+		return PARSERUTILS_INVALID;
+	}
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-16 string
+ *
+ * Any code unit in the surrogate range (0xD800-0xDFFF) is taken to start a
+ * two-unit character; pairing is not validated. Only whole code units are
+ * examined, so a trailing odd byte in \a max is ignored.
+ *
+ * \param s    The string
+ * \param max  Maximum length, in bytes
+ * \param len  Pointer to location to receive length of string
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if any pointer argument is NULL
+ */
+parserutils_error parserutils_charset_utf16_length(const uint8_t *s, size_t max,
+		size_t *len)
+{
+	const uint16_t *ss = (const uint16_t *) (const void *) s;
+	size_t units;
+	size_t i = 0;
+	size_t count = 0;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Never read a code unit that straddles the end of the buffer */
+	units = max / 2;
+
+	while (i < units) {
+		if (ss[i] < 0xD800 || 0xDFFF < ss[i])
+			i += 1;
+		else
+			i += 2;
+
+		count++;
+	}
+
+	*len = count;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Report how many bytes the UTF-16 character starting at \a s occupies
+ *
+ * Only the first (host-endian) code unit is inspected: a unit in the
+ * surrogate range 0xD800-0xDFFF gives 4, anything else gives 2.
+ *
+ * \param s    Pointer to start of character
+ * \param len  Pointer to location to receive length, in bytes
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if either pointer is NULL
+ */
+parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
+		size_t *len)
+{
+	uint16_t unit;
+
+	if (s == NULL || len == NULL)
+		return PARSERUTILS_BADPARM;
+
+	unit = *(const uint16_t *) (const void *) s;
+
+	*len = (0xD800 <= unit && unit <= 0xDFFF) ? 4 : 2;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find previous legal UTF-16 char in string
+ *
+ * The code unit immediately before \a off is inspected: if it is a low
+ * surrogate (0xDC00-0xDFFF) the previous character is taken to be a
+ * surrogate pair and the result is \a off - 4 (clamped to 0), otherwise
+ * the result is \a off - 2. The high half of a pair is not validated.
+ *
+ * \param s        The string
+ * \param off      Offset in the string to start at, in bytes (assumed to
+ *                 be a multiple of 2)
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if any pointer argument is NULL
+ */
+parserutils_error parserutils_charset_utf16_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff)
+{
+	const uint16_t *prev;
+
+	if (s == NULL || prevoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (off < 2) {
+		/* Nothing (or less than one unit) precedes the offset */
+		*prevoff = 0;
+		return PARSERUTILS_OK;
+	}
+
+	/* Look at the unit just before off, not before the string start */
+	prev = (const uint16_t *) (const void *) (s + off - 2);
+
+	if (*prev < 0xDC00 || *prev > 0xDFFF)
+		*prevoff = off - 2;
+	else
+		*prevoff = (off < 4) ? 0 : off - 4;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find next legal UTF-16 char in string
+ *
+ * The code unit at \a off is inspected: a high surrogate (0xD800-0xDBFF)
+ * is taken to start a 4-byte pair, anything else a 2-byte character. When
+ * fewer than two whole code units remain, the result is \a len.
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at, in bytes (assumed to
+ *                 be a multiple of 2)
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if a pointer is NULL or \a off >= \a len
+ */
+parserutils_error parserutils_charset_utf16_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	const uint16_t *cur;
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	if (len - off < 4) {
+		/* The character at off is the last one in the string */
+		*nextoff = len;
+		return PARSERUTILS_OK;
+	}
+
+	/* Examine the unit at off itself; at least 4 bytes remain, so
+	 * off + 4 cannot exceed len */
+	cur = (const uint16_t *) (const void *) (s + off);
+
+	if (*cur < 0xD800 || *cur > 0xDBFF)
+		*nextoff = off + 2;
+	else
+		*nextoff = off + 4;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Find next legal UTF-16 char in string
+ *
+ * The character starting at \a off is skipped, validating it on the way:
+ *  - a non-surrogate code unit is skipped as a 2-byte character;
+ *  - a high surrogate followed by a low surrogate is skipped as a 4-byte
+ *    pair;
+ *  - an unpaired surrogate (of either kind) is skipped as a single unit so
+ *    that the caller can resynchronise on the following unit.
+ *
+ * As well as the character being skipped, one whole code unit must be
+ * available after it; otherwise PARSERUTILS_NEEDDATA is returned.
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at, in bytes (assumed to
+ *                 be a multiple of 2)
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK on success,
+ *         PARSERUTILS_BADPARM if a pointer is NULL or \a off >= \a len,
+ *         PARSERUTILS_NEEDDATA if more input is required to decide
+ */
+parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
+		uint32_t len, uint32_t off, uint32_t *nextoff)
+{
+	const uint16_t *cur;
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return PARSERUTILS_BADPARM;
+
+	/* Need the unit at off plus one whole unit after it */
+	if (len - off < 4)
+		return PARSERUTILS_NEEDDATA;
+
+	cur = (const uint16_t *) (const void *) (s + off);
+
+	if (0xD800 <= cur[0] && cur[0] <= 0xDBFF) {
+		/* High surrogate: need the pair plus one unit after it */
+		if (len - off < 6)
+			return PARSERUTILS_NEEDDATA;
+
+		if (0xDC00 <= cur[1] && cur[1] <= 0xDFFF) {
+			*nextoff = off + 4;
+			return PARSERUTILS_OK;
+		}
+	}
+
+	/* Non-surrogate, or an unpaired surrogate: step over one unit.
+	 * Every path above terminates, so malformed input cannot hang. */
+	*nextoff = off + 2;
+
+	return PARSERUTILS_OK;
+}
+
diff --git a/src/charset/encodings/utf8.c b/src/charset/encodings/utf8.c
new file mode 100644
index 0000000..5b4ba95
--- /dev/null
+++ b/src/charset/encodings/utf8.c
@@ -0,0 +1,175 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-8 manipulation functions (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/utf8.h>
+#include "charset/encodings/utf8impl.h"
+
+/** Number of continuation bytes for a given start byte
+ *
+ * Indexed by byte value (16 entries per row):
+ *   0x00-0xBF -> 0 (ASCII bytes and continuation bytes)
+ *   0xC0-0xDF -> 1
+ *   0xE0-0xEF -> 2
+ *   0xF0-0xF7 -> 3
+ *   0xF8-0xFB -> 4
+ *   0xFC-0xFF -> 5
+ *
+ * The table performs no validation: 0xFE and 0xFF, which UTF8_TO_UCS4
+ * rejects as start bytes, are listed with 5 continuation bytes.
+ */
+const uint8_t numContinuations[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
+};
+
+/**
+ * Decode one UCS4 character from the start of a UTF-8 byte sequence
+ *
+ * Sequences of up to six bytes are understood (RFC2279), even though
+ * RFC3629 no longer permits encodings of values beyond the UTF-16 range.
+ * This is a function wrapper around the UTF8_TO_UCS4 macro.
+ *
+ * \param s     The sequence to process
+ * \param len   Length of sequence, in bytes
+ * \param ucs4  Pointer to location to receive UCS4 character (host endian)
+ * \param clen  Pointer to location to receive byte length of UTF-8 sequence
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen)
+{
+	parserutils_error status = PARSERUTILS_OK;
+
+	/* The macro reports its outcome through the final argument */
+	UTF8_TO_UCS4(s, len, ucs4, clen, status);
+
+	return status;
+}
+
+/**
+ * Encode one UCS4 character as a UTF-8 byte sequence
+ *
+ * Sequences of up to six bytes may be produced (RFC2279), even though
+ * RFC3629 no longer permits encodings of values beyond the UTF-16 range.
+ * This is a function wrapper around the UTF8_FROM_UCS4 macro.
+ *
+ * \param ucs4  The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s     Pointer to pointer to output buffer, updated on exit
+ * \param len   Pointer to length, in bytes, of output buffer, updated on exit
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4,
+		uint8_t **s, size_t *len)
+{
+	parserutils_error status = PARSERUTILS_OK;
+
+	/* The macro reports its outcome through the final argument */
+	UTF8_FROM_UCS4(ucs4, s, len, status);
+
+	return status;
+}
+
+/**
+ * Count the characters in a bounded UTF-8 string
+ *
+ * This is a function wrapper around the UTF8_LENGTH macro.
+ *
+ * \param s    The string
+ * \param max  Maximum length, in bytes
+ * \param len  Pointer to location to receive length of string
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
+		size_t *len)
+{
+	parserutils_error status = PARSERUTILS_OK;
+
+	/* The macro advances its copy of s; the caller's pointer is unaffected */
+	UTF8_LENGTH(s, max, len, status);
+
+	return status;
+}
+
+/**
+ * Report how many bytes the UTF-8 character starting at \a s occupies
+ *
+ * This is a function wrapper around the UTF8_CHAR_BYTE_LENGTH macro.
+ *
+ * \param s    Pointer to start of character
+ * \param len  Pointer to location to receive length, in bytes
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
+		size_t *len)
+{
+	parserutils_error status = PARSERUTILS_OK;
+
+	/* The macro reports its outcome through the final argument */
+	UTF8_CHAR_BYTE_LENGTH(s, len, status);
+
+	return status;
+}
+
+/**
+ * Locate the start of the UTF-8 character preceding a given offset
+ *
+ * This is a function wrapper around the UTF8_PREV macro.
+ *
+ * \param s        The string
+ * \param off      Offset in the string to start at
+ * \param prevoff  Pointer to location to receive offset of first byte of
+ *                 previous legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff)
+{
+	parserutils_error status = PARSERUTILS_OK;
+
+	/* The macro steps off backwards in place; off is our own copy */
+	UTF8_PREV(s, off, prevoff, status);
+
+	return status;
+}
+
+/**
+ * Locate the start of the UTF-8 character following a given offset
+ *
+ * This is a function wrapper around the UTF8_NEXT macro.
+ *
+ * \param s        The string (assumed valid)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	parserutils_error status = PARSERUTILS_OK;
+
+	/* The macro advances off in place; off is our own copy */
+	UTF8_NEXT(s, len, off, nextoff, status);
+
+	return status;
+}
+
+/**
+ * Locate the start of the next UTF-8 sequence in possibly-invalid input
+ *
+ * This is a function wrapper around the UTF8_NEXT_PARANOID macro.
+ *
+ * \param s        The string (assumed to be of dubious validity)
+ * \param len      Maximum offset in string
+ * \param off      Offset in the string to start at
+ * \param nextoff  Pointer to location to receive offset of first byte of
+ *                 next legal character
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s,
+		uint32_t len, uint32_t off, uint32_t *nextoff)
+{
+	parserutils_error status = PARSERUTILS_OK;
+
+	/* The macro advances off in place; off is our own copy */
+	UTF8_NEXT_PARANOID(s, len, off, nextoff, status);
+
+	return status;
+}
+
diff --git a/src/charset/encodings/utf8impl.h b/src/charset/encodings/utf8impl.h
new file mode 100644
index 0000000..1ca9de7
--- /dev/null
+++ b/src/charset/encodings/utf8impl.h
@@ -0,0 +1,339 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_encodings_utf8impl_h_
+#define parserutils_charset_encodings_utf8impl_h_
+
+/** \file
+ * UTF-8 manipulation macros (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+/** Number of continuation bytes for a given start byte */
+extern const uint8_t numContinuations[256];
+
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This macro conforms to RFC2279, however.
+ *
+ * The first byte selects a sequence length of 1 to 6 bytes. The result is
+ * rejected with PARSERUTILS_INVALID if the first byte is not a valid start
+ * byte, if any following byte is not of the form 10xxxxxx, if the value
+ * could have been encoded in fewer bytes (overlong), or if the value is a
+ * surrogate (0xD800-0xDFFF), 0xFFFE or 0xFFFF. PARSERUTILS_NEEDDATA is
+ * reported when \a len is 0 or shorter than the sequence length, and
+ * PARSERUTILS_BADPARM when any pointer argument is NULL.
+ *
+ * *ucs4 and *clen are written only when \a error is PARSERUTILS_OK.
+ *
+ * Arguments are substituted textually and may be evaluated several times,
+ * so they should be free of side effects. The expansion declares block-local
+ * variables named c, min, n, i and t.
+ *
+ * \param s The sequence to process
+ * \param len Length of sequence
+ * \param ucs4 Pointer to location to receive UCS4 character (host endian)
+ * \param clen Pointer to location to receive byte length of UTF-8 sequence
+ * \param error Location to receive error code
+ */
+#define UTF8_TO_UCS4(s, len, ucs4, clen, error) \
+do { \
+ uint32_t c, min; \
+ uint8_t n; \
+ \
+ error = PARSERUTILS_OK; \
+ \
+ if (s == NULL || ucs4 == NULL || clen == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ if (len == 0) { \
+ error = PARSERUTILS_NEEDDATA; \
+ break; \
+ } \
+ \
+ c = s[0]; \
+ \
+ if (c < 0x80) { \
+ n = 1; \
+ min = 0; \
+ } else if ((c & 0xE0) == 0xC0) { \
+ c &= 0x1F; \
+ n = 2; \
+ min = 0x80; \
+ } else if ((c & 0xF0) == 0xE0) { \
+ c &= 0x0F; \
+ n = 3; \
+ min = 0x800; \
+ } else if ((c & 0xF8) == 0xF0) { \
+ c &= 0x07; \
+ n = 4; \
+ min = 0x10000; \
+ } else if ((c & 0xFC) == 0xF8) { \
+ c &= 0x03; \
+ n = 5; \
+ min = 0x200000; \
+ } else if ((c & 0xFE) == 0xFC) { \
+ c &= 0x01; \
+ n = 6; \
+ min = 0x4000000; \
+ } else { \
+ error = PARSERUTILS_INVALID; \
+ break; \
+ } \
+ \
+ if (len < n) { \
+ error = PARSERUTILS_NEEDDATA; \
+ break; \
+ } \
+ \
+ for (uint8_t i = 1; i < n; i++) { \
+ uint32_t t = s[i]; \
+ \
+ if ((t & 0xC0) != 0x80) { \
+ error = PARSERUTILS_INVALID; \
+ break; \
+ } \
+ \
+ c <<= 6; \
+ c |= t & 0x3F; \
+ } \
+ \
+ if (error == PARSERUTILS_OK) { \
+ /* Detect overlong sequences, surrogates and fffe/ffff */ \
+ if (c < min || (c >= 0xD800 && c <= 0xDFFF) || \
+ c == 0xFFFE || c == 0xFFFF) { \
+ error = PARSERUTILS_INVALID; \
+ break; \
+ } \
+ \
+ *ucs4 = c; \
+ *clen = n; \
+ } \
+} while(0)
+
+/**
+ * Convert a single UCS4 character into a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This macro conforms to RFC2279, however.
+ *
+ * Between 1 and 6 bytes are written at *s. On success *s is advanced past
+ * the bytes written and *len is reduced by the same amount. Nothing is
+ * written when an error is reported: PARSERUTILS_BADPARM if \a s, *s or
+ * \a len is NULL, PARSERUTILS_INVALID if \a ucs4 exceeds 0x7FFFFFFF, and
+ * PARSERUTILS_NOMEM if *len is smaller than the sequence length. No check
+ * is made for surrogate values.
+ *
+ * \a ucs4 must be a modifiable lvalue: for multibyte sequences it is
+ * shifted right in place while the bytes are produced, so its value on
+ * exit is not the character passed in. Other arguments are substituted
+ * textually and may be evaluated several times. The expansion declares
+ * block-local variables named buf, l and i.
+ *
+ * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s Pointer to pointer to output buffer, updated on exit
+ * \param len Pointer to length, in bytes, of output buffer, updated on exit
+ * \param error Location to receive error code
+ */
+#define UTF8_FROM_UCS4(ucs4, s, len, error) \
+do { \
+ uint8_t *buf; \
+ uint8_t l = 0; \
+ \
+ error = PARSERUTILS_OK; \
+ \
+ if (s == NULL || *s == NULL || len == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ if (ucs4 < 0x80) { \
+ l = 1; \
+ } else if (ucs4 < 0x800) { \
+ l = 2; \
+ } else if (ucs4 < 0x10000) { \
+ l = 3; \
+ } else if (ucs4 < 0x200000) { \
+ l = 4; \
+ } else if (ucs4 < 0x4000000) { \
+ l = 5; \
+ } else if (ucs4 <= 0x7FFFFFFF) { \
+ l = 6; \
+ } else { \
+ error = PARSERUTILS_INVALID; \
+ break; \
+ } \
+ \
+ if (l > *len) { \
+ error = PARSERUTILS_NOMEM; \
+ break; \
+ } \
+ \
+ buf = *s; \
+ \
+ if (l == 1) { \
+ buf[0] = (uint8_t) ucs4; \
+ } else { \
+ for (uint8_t i = l; i > 1; i--) { \
+ buf[i - 1] = 0x80 | (ucs4 & 0x3F); \
+ ucs4 >>= 6; \
+ } \
+ buf[0] = ~((1 << (8 - l)) - 1) | ucs4; \
+ } \
+ \
+ *s += l; \
+ *len -= l; \
+} while(0)
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * Characters are counted by inspecting start bytes only; the bytes that
+ * follow a start byte are skipped without being validated. A byte that is
+ * not a valid start byte (0x80-0xBF, 0xFE, 0xFF) in start position yields
+ * PARSERUTILS_INVALID, in which case *len is not written.
+ *
+ * \a s must be a modifiable lvalue: it is advanced in place as the string
+ * is scanned. Arguments are substituted textually and may be evaluated
+ * several times. The expansion declares block-local variables named end,
+ * l and c.
+ *
+ * \param s The string
+ * \param max Maximum length, in bytes
+ * \param len Pointer to location to receive length of string
+ * \param error Location to receive error code
+ */
+#define UTF8_LENGTH(s, max, len, error) \
+do { \
+	const uint8_t *end; \
+	size_t l = 0; /* same type as *len, so large counts fit */ \
+	\
+	error = PARSERUTILS_OK; \
+	\
+	if (s == NULL || len == NULL) { \
+		error = PARSERUTILS_BADPARM; \
+		break; \
+	} \
+	\
+	/* Only form the end pointer once s is known to be non-NULL */ \
+	end = s + max; \
+	\
+	while (s < end) { \
+		uint32_t c = s[0]; \
+		\
+		if ((c & 0x80) == 0x00) \
+			s += 1; \
+		else if ((c & 0xE0) == 0xC0) \
+			s += 2; \
+		else if ((c & 0xF0) == 0xE0) \
+			s += 3; \
+		else if ((c & 0xF8) == 0xF0) \
+			s += 4; \
+		else if ((c & 0xFC) == 0xF8) \
+			s += 5; \
+		else if ((c & 0xFE) == 0xFC) \
+			s += 6; \
+		else { \
+			error = PARSERUTILS_INVALID; \
+			break; \
+		} \
+		\
+		l++; \
+	} \
+	\
+	if (error == PARSERUTILS_OK) \
+		*len = l; \
+} while(0)
+
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * The length is the numContinuations entry for the first byte, plus one
+ * for the first byte itself. Only that byte is read; the sequence is not
+ * validated. PARSERUTILS_BADPARM is reported if either pointer is NULL.
+ *
+ * \param s Pointer to start of character
+ * \param len Pointer to location to receive length
+ * \param error Location to receive error code
+ */
+#define UTF8_CHAR_BYTE_LENGTH(s, len, error) \
+do { \
+ if (s == NULL || len == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ *len = numContinuations[s[0]] + 1 /* Start byte */; \
+ \
+ error = PARSERUTILS_OK; \
+} while(0)
+
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * Steps backwards from \a off over continuation bytes (10xxxxxx) until a
+ * byte that is not a continuation byte is found, or offset 0 is reached.
+ * With \a off equal to 0 the result is 0.
+ *
+ * \a off must be a modifiable lvalue: it is decremented in place and holds
+ * the same value as *prevoff on exit. PARSERUTILS_BADPARM is reported if
+ * either pointer is NULL.
+ *
+ * \param s The string
+ * \param off Offset in the string to start at
+ * \param prevoff Pointer to location to receive offset of first byte of
+ * previous legal character
+ * \param error Location to receive error code
+ */
+#define UTF8_PREV(s, off, prevoff, error) \
+do { \
+ if (s == NULL || prevoff == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ while (off != 0 && (s[--off] & 0xC0) == 0x80) \
+ /* do nothing */; \
+ \
+ *prevoff = off; \
+ \
+ error = PARSERUTILS_OK; \
+} while(0)
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * If the byte at \a off is a start byte it is skipped; any continuation
+ * bytes (10xxxxxx) that follow are then skipped as well, stopping at
+ * \a len. The result may therefore equal \a len.
+ *
+ * \a off must be a modifiable lvalue: it is advanced in place and holds
+ * the same value as *nextoff on exit. PARSERUTILS_BADPARM is reported if
+ * a pointer is NULL or if \a off >= \a len.
+ *
+ * \param s The string (assumed valid)
+ * \param len Maximum offset in string
+ * \param off Offset in the string to start at
+ * \param nextoff Pointer to location to receive offset of first byte of
+ * next legal character
+ * \param error Location to receive error code
+ */
+#define UTF8_NEXT(s, len, off, nextoff, error) \
+do { \
+ if (s == NULL || off >= len || nextoff == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ /* Skip current start byte (if present - may be mid-sequence) */\
+ if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) \
+ off++; \
+ \
+ while (off < len && (s[off] & 0xC0) == 0x80) \
+ off++; \
+ \
+ *nextoff = off; \
+ \
+ error = PARSERUTILS_OK; \
+} while(0)
+
+/**
+ * Skip to start of next sequence in UTF-8 input
+ *
+ * If the byte at \a off is a continuation byte (10xxxxxx), the offset is
+ * advanced by exactly one byte. Otherwise the byte is treated as a start
+ * byte: the number of continuation bytes it announces is looked up in
+ * numContinuations and the following bytes are checked in turn. The offset
+ * is advanced past the start byte and the valid continuation bytes, so it
+ * stops at the first byte that is not a continuation byte if the sequence
+ * is cut short.
+ *
+ * PARSERUTILS_NEEDDATA is reported, and *nextoff left unwritten, when
+ * off + announced continuation bytes + 1 >= \a len; this includes the case
+ * where the sequence would end exactly at \a len.
+ * NOTE(review): confirm that requiring a byte beyond the sequence is
+ * intended rather than an off-by-one.
+ *
+ * \a off must be a modifiable lvalue: it is advanced in place.
+ * PARSERUTILS_BADPARM is reported if a pointer is NULL or if
+ * \a off >= \a len. The expansion declares block-local variables named c,
+ * nCont and nToSkip.
+ *
+ * \param s The string (assumed to be of dubious validity)
+ * \param len Maximum offset in string
+ * \param off Offset in the string to start at
+ * \param nextoff Pointer to location to receive offset of first byte of
+ * next legal character
+ * \param error Location to receive error code
+ */
+#define UTF8_NEXT_PARANOID(s, len, off, nextoff, error) \
+do { \
+ uint8_t c; \
+ \
+ error = PARSERUTILS_OK; \
+ \
+ if (s == NULL || off >= len || nextoff == NULL) { \
+ error = PARSERUTILS_BADPARM; \
+ break; \
+ } \
+ \
+ c = s[off]; \
+ \
+ /* If we're mid-sequence, simply advance to next byte */ \
+ if (!(c < 0x80 || (c & 0xC0) == 0xC0)) { \
+ off++; \
+ } else { \
+ uint32_t nCont = numContinuations[c]; \
+ uint32_t nToSkip; \
+ \
+ if (off + nCont + 1 >= len) { \
+ error = PARSERUTILS_NEEDDATA; \
+ break; \
+ } \
+ \
+ /* Verify continuation bytes */ \
+ for (nToSkip = 1; nToSkip <= nCont; nToSkip++) { \
+ if ((s[off + nToSkip] & 0xC0) != 0x80) \
+ break; \
+ } \
+ \
+ /* Skip over the valid bytes */ \
+ off += nToSkip; \
+ } \
+ \
+ *nextoff = off; \
+} while(0)
+
+#endif