diff options
author | John Mark Bell <jmb@netsurf-browser.org> | 2007-06-23 22:40:25 +0000 |
---|---|---|
committer | John Mark Bell <jmb@netsurf-browser.org> | 2007-06-23 22:40:25 +0000 |
commit | 7b30a5520cfb56e651f0eb4da85a3e07747da7dc (patch) | |
tree | 5d6281c071c089e1e7a8ae6f8044cecaf6a7db16 /src/utils | |
download | libhubbub-7b30a5520cfb56e651f0eb4da85a3e07747da7dc.tar.gz libhubbub-7b30a5520cfb56e651f0eb4da85a3e07747da7dc.tar.bz2 |
Import hubbub -- an HTML parsing library.
Plenty of work still to do (like tree generation ;)
svn path=/trunk/hubbub/; revision=3359
Diffstat (limited to 'src/utils')
-rw-r--r-- | src/utils/Makefile | 53 | ||||
-rw-r--r-- | src/utils/dict.c | 219 | ||||
-rw-r--r-- | src/utils/dict.h | 31 | ||||
-rw-r--r-- | src/utils/errors.c | 70 | ||||
-rw-r--r-- | src/utils/utf8.c | 368 | ||||
-rw-r--r-- | src/utils/utf8.h | 38 | ||||
-rw-r--r-- | src/utils/utils.h | 28 |
7 files changed, 807 insertions, 0 deletions
diff --git a/src/utils/Makefile b/src/utils/Makefile new file mode 100644 index 0000000..59b5512 --- /dev/null +++ b/src/utils/Makefile @@ -0,0 +1,53 @@
# Makefile for libhubbub
#
# Toolchain is exported by top-level makefile
#
# Top-level makefile also exports the following variables:
#
# COMPONENT	Name of component
# EXPORT	Absolute path of export directory
# TOP		Absolute path of source tree root
#
# The top-level makefile requires the following targets to exist:
#
# clean		Clean source tree
# debug		Create a debug binary
# distclean	Fully clean source tree, back to pristine condition
# export	Export distributable components to ${EXPORT}
# release	Create a release binary
# setup		Perform any setup required prior to compilation
# test		Execute any test cases

# Manipulate include paths
CFLAGS += -I$(CURDIR)

# Objects (base names; the .o suffix and output directory are added below)
OBJS = dict errors utf8

# Object files for each build flavour
RELEASE_OBJS = $(addprefix ../Release/, $(addsuffix .o, $(OBJS)))
DEBUG_OBJS = $(addprefix ../Debug/, $(addsuffix .o, $(OBJS)))

.PHONY: clean debug distclean export release setup test

# Targets
release: $(RELEASE_OBJS)

debug: $(DEBUG_OBJS)

# Removal failures are ignored (leading '-') and the commands are not echoed
clean:
	-@$(RM) $(RMFLAGS) $(RELEASE_OBJS)
	-@$(RM) $(RMFLAGS) $(DEBUG_OBJS)

# Nothing to do for the remaining required targets in this directory
distclean:

setup:

export:

test:

# Pattern rules
# Release objects are built with assertions disabled (-DNDEBUG)
../Release/%.o: %.c
	@$(ECHO) $(ECHOFLAGS) "==> $<"
	@$(CC) -c $(CFLAGS) -DNDEBUG -o $@ $<

# Debug objects are built with debugging information (-g)
../Debug/%.o: %.c
	@$(ECHO) $(ECHOFLAGS) "==> $<"
	@$(CC) -c -g $(CFLAGS) -o $@ $<
diff --git a/src/utils/dict.c b/src/utils/dict.c new file mode 100644 index 0000000..f50ffab --- /dev/null +++ b/src/utils/dict.c @@ -0,0 +1,219 @@
/*
 * This file is part of Hubbub.
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <stdbool.h> + +#include "utils/dict.h" + +/** Node in a dictionary tree */ +typedef struct hubbub_dict_node { + uint8_t split; /**< Data to split on */ + struct hubbub_dict_node *lt; /**< Subtree for data less than + * split */ + struct hubbub_dict_node *eq; /**< Subtree for data equal to split + * If split == '\0', this stores the + * pointer to the actual data, not a + * subtree */ + struct hubbub_dict_node *gt; /**< Subtree for data greater than + * split */ +} hubbub_dict_node; + +/** Dictionary object */ +struct hubbub_dict { + hubbub_dict_node *dict; /**< Root of tree */ + + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Pointer to client data */ +}; + +static void hubbub_dict_destroy_internal(hubbub_dict *dict, + hubbub_dict_node *root); +static hubbub_dict_node *hubbub_dict_insert_internal(hubbub_dict *dict, + hubbub_dict_node *parent, const char *key, + const void *value); + + +/** + * Create a dictionary + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to dictionary instance, or NULL on error + */ +hubbub_dict *hubbub_dict_create(hubbub_alloc alloc, void *pw) +{ + hubbub_dict *dict; + + if (alloc == NULL) + return NULL; + + dict = alloc(NULL, sizeof(hubbub_dict), pw); + if (dict == NULL) + return NULL; + + dict->dict = NULL; + + dict->alloc = alloc; + dict->pw = pw; + + return dict; +} + +/** + * Destroy a dictionary + * + * \param dict Dictionary to destroy + */ +void hubbub_dict_destroy(hubbub_dict *dict) +{ + if (dict == NULL) + return; + + hubbub_dict_destroy_internal(dict, dict->dict); + + dict->alloc(dict, 0, dict->pw); +} + +/** + * Helper routine for dictionary destruction + * + * \param dict Dictionary being destroyed + * \param root Root node of dictionary 
(sub)tree to destroy + */ +void hubbub_dict_destroy_internal(hubbub_dict *dict, hubbub_dict_node *root) +{ + if (root == NULL) + return; + + hubbub_dict_destroy_internal(dict, root->lt); + if (root->split != '\0') + hubbub_dict_destroy_internal(dict, root->eq); + hubbub_dict_destroy_internal(dict, root->gt); + + dict->alloc(root, 0, dict->pw); +} + +/** + * Insert a key-value pair into a dictionary + * + * \param dict Dictionary to insert into + * \param key Key string + * \param value Value to associate with key (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_dict_insert(hubbub_dict *dict, const char *key, + const void *value) +{ + if (dict == NULL || key == NULL) + return HUBBUB_BADPARM; + + dict->dict = hubbub_dict_insert_internal(dict, dict->dict, + key, value); + + return HUBBUB_OK; +} + +/** + * Helper routine for insertion into dictionary + * + * \param dict Dictionary being inserted into + * \param parent Parent node of subtree to insert into + * \param key Key string + * \param value Value to associate with key + * \return Pointer to root of tree created + */ +hubbub_dict_node *hubbub_dict_insert_internal(hubbub_dict *dict, + hubbub_dict_node *parent, const char *key, const void *value) +{ + if (parent == NULL) { + parent = dict->alloc(NULL, + sizeof(hubbub_dict_node), dict->pw); + if (parent == NULL) + return NULL; + parent->split = (uint8_t) key[0]; + parent->lt = parent->eq = parent->gt = NULL; + } + + if ((uint8_t) key[0] < parent->split) { + parent->lt = hubbub_dict_insert_internal(dict, + parent->lt, key, value); + } else if ((uint8_t) key[0] == parent->split) { + if (key[0] == '\0') { + parent->eq = (hubbub_dict_node *) value; + } else { + parent->eq = hubbub_dict_insert_internal(dict, + parent->eq, ++key, value); + } + } else { + parent->gt = hubbub_dict_insert_internal(dict, + parent->gt, key, value); + } + + return parent; +} + +/** + * Step-wise search for a key in a dictionary + * + * \param 
dict Dictionary to search + * \param c Character to look for + * \param result Pointer to location for result + * \param context Pointer to location for search context + * \return HUBBUB_OK if key found, + * HUBBUB_NEEDDATA if more steps are required + * HUBBUB_INVALID if nothing matches + * + * The value pointed to by ::context must be NULL for the first call. + * Thereafter, pass in the same value as returned by the previous call. + * The context is opaque to the caller and should not be inspected. + * + * The location pointed to by ::result will be set to NULL unless a match + * is found. + */ +hubbub_error hubbub_dict_search_step(hubbub_dict *dict, uint8_t c, + const void **result, void **context) +{ + bool match = false; + hubbub_dict_node *p; + + if (dict == NULL || result == NULL || context == NULL) + return HUBBUB_BADPARM; + + *result = NULL; + + if (*context == NULL) { + p = dict->dict; + } else { + p = (hubbub_dict_node *) *context; + } + + while (p != NULL) { + if (c < p->split) { + p = p->lt; + } else if (c == p->split) { + if (p->split == '\0') { + match = true; + p = NULL; + } else if (p->eq != NULL && p->eq->split == '\0') { + match = true; + *result = (const void *) p->eq->eq; + p = p->eq; + } else { + p = p->eq; + } + + break; + } else { + p = p->gt; + } + } + + *context = (void *) p; + + return (match) ? HUBBUB_OK : + (p == NULL) ? HUBBUB_INVALID : HUBBUB_NEEDDATA; +} diff --git a/src/utils/dict.h b/src/utils/dict.h new file mode 100644 index 0000000..2cde01d --- /dev/null +++ b/src/utils/dict.h @@ -0,0 +1,31 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_utils_dict_h_ +#define hubbub_utils_dict_h_ + +#include <inttypes.h> + +#include <hubbub/errors.h> +#include <hubbub/hubbub.h> + +typedef struct hubbub_dict hubbub_dict; + +/* Create a dictionary */ +hubbub_dict *hubbub_dict_create(hubbub_alloc alloc, void *pw); +/* Destroy a dictionary */ +void hubbub_dict_destroy(hubbub_dict *dict); + +/* Insert a key-value pair into a dictionary */ +hubbub_error hubbub_dict_insert(hubbub_dict *dict, const char *key, + const void *value); + +/* Step-wise search for a key in a dictionary */ +hubbub_error hubbub_dict_search_step(hubbub_dict *dict, uint8_t c, + const void **result, void **context); + +#endif diff --git a/src/utils/errors.c b/src/utils/errors.c new file mode 100644 index 0000000..e57ba6a --- /dev/null +++ b/src/utils/errors.c @@ -0,0 +1,70 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <string.h> + +#include <hubbub/errors.h> + +/** + * Convert a hubbub error code to a string + * + * \param error The error code to convert + * \return Pointer to string representation of error, or NULL if unknown. 
*/
const char *hubbub_error_to_string(hubbub_error error)
{
	/* No default case: codes not listed here fall out of the switch
	 * and yield NULL */
	switch (error) {
	case HUBBUB_OK:
		return "No error";
	case HUBBUB_NOMEM:
		return "Insufficient memory";
	case HUBBUB_BADPARM:
		return "Bad parameter";
	case HUBBUB_INVALID:
		return "Invalid input";
	case HUBBUB_FILENOTFOUND:
		return "File not found";
	case HUBBUB_NEEDDATA:
		return "Insufficient data";
	}

	return NULL;
}

/**
 * Convert a string representation of an error name to a hubbub error code
 *
 * The name must match exactly: a proper prefix of an error name (such as
 * "HUBBUB_N") is treated as unknown rather than matching the first name
 * that starts with it.
 *
 * \param str  String containing error name (need not be NUL terminated)
 * \param len  Length of string (bytes)
 * \return Hubbub error code, or HUBBUB_OK if unknown (including when
 *         \a str is NULL)
 */
hubbub_error hubbub_error_from_string(const char *str, size_t len)
{
	static const struct {
		const char *name;
		hubbub_error code;
	} known[] = {
		{ "HUBBUB_OK",           HUBBUB_OK },
		{ "HUBBUB_NOMEM",        HUBBUB_NOMEM },
		{ "HUBBUB_BADPARM",      HUBBUB_BADPARM },
		{ "HUBBUB_INVALID",      HUBBUB_INVALID },
		{ "HUBBUB_FILENOTFOUND", HUBBUB_FILENOTFOUND },
		{ "HUBBUB_NEEDDATA",     HUBBUB_NEEDDATA },
	};
	const char *nul;
	size_t i;

	if (str == NULL)
		return HUBBUB_OK;

	/* If the string is NUL terminated before len bytes, only the part
	 * up to the terminator counts (as strncmp would have treated it) */
	nul = memchr(str, '\0', len);
	if (nul != NULL)
		len = (size_t) (nul - str);

	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++) {
		/* Lengths must agree, otherwise a prefix would match */
		if (strlen(known[i].name) == len &&
				strncmp(str, known[i].name, len) == 0)
			return known[i].code;
	}

	return HUBBUB_OK;
}
diff --git a/src/utils/utf8.c b/src/utils/utf8.c new file mode 100644 index 0000000..062d629 --- /dev/null +++ b/src/utils/utf8.c @@ -0,0 +1,368 @@
/*
 * This file is part of Hubbub.
 * Licensed under the MIT License,
 *                http://www.opensource.org/licenses/mit-license.php
 * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
 */

/** \file
 * UTF-8 manipulation functions (implementation).
+ */ + +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> + +#include "utils/utf8.h" + +/** Number of continuation bytes for a given start byte */ +static const uint8_t numContinuations[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, +}; + +/** + * Convert a UTF-8 multibyte sequence into a single UCS4 character + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This function conforms to RFC2279, however. 
+ * + * \param s The sequence to process + * \param len Length of sequence + * \param ucs4 Pointer to location to receive UCS4 character (host endian) + * \param clen Pointer to location to receive byte length of UTF-8 sequence + * \return HUBBUB_OK on success, appropriate error otherwise + */ +inline hubbub_error hubbub_utf8_to_ucs4(const uint8_t *s, size_t len, + uint32_t *ucs4, size_t *clen) +{ + if (s == NULL || ucs4 == NULL || clen == NULL) + return HUBBUB_BADPARM; + + if (len == 0) + return HUBBUB_NEEDDATA; + + if (*s < 0x80) { + *ucs4 = *s; + *clen = 1; + } else if ((*s & 0xE0) == 0xC0) { + if (len < 2) + return HUBBUB_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80) + return HUBBUB_INVALID; + else { + *ucs4 = ((*s & 0x1F) << 6) | (*(s+1) & 0x3F); + *clen = 2; + } + } else if ((*s & 0xF0) == 0xE0) { + if (len < 3) + return HUBBUB_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80 || + (*(s+2) & 0xC0) != 0x80) + return HUBBUB_INVALID; + else { + *ucs4 = ((*s & 0x0F) << 12) | + ((*(s+1) & 0x3F) << 6) | + (*(s+2) & 0x3F); + *clen = 3; + } + } else if ((*s & 0xF8) == 0xF0) { + if (len < 4) + return HUBBUB_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80 || + (*(s+2) & 0xC0) != 0x80 || + (*(s+3) & 0xC0) != 0x80) + return HUBBUB_INVALID; + else { + *ucs4 = ((*s & 0x0F) << 18) | + ((*(s+1) & 0x3F) << 12) | + ((*(s+2) & 0x3F) << 6) | + (*(s+3) & 0x3F); + *clen = 4; + } + } else if ((*s & 0xFC) == 0xF8) { + if (len < 5) + return HUBBUB_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80 || + (*(s+2) & 0xC0) != 0x80 || + (*(s+3) & 0xC0) != 0x80 || + (*(s+4) & 0xC0) != 0x80) + return HUBBUB_INVALID; + else { + *ucs4 = ((*s & 0x0F) << 24) | + ((*(s+1) & 0x3F) << 18) | + ((*(s+2) & 0x3F) << 12) | + ((*(s+3) & 0x3F) << 6) | + (*(s+4) & 0x3F); + *clen = 5; + } + } else if ((*s & 0xFE) == 0xFC) { + if (len < 6) + return HUBBUB_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80 || + (*(s+2) & 0xC0) != 0x80 || + (*(s+3) & 0xC0) != 0x80 || + (*(s+4) & 0xC0) != 0x80 || + (*(s+5) & 0xC0) != 0x80) + 
return HUBBUB_INVALID; + else { + *ucs4 = ((*s & 0x0F) << 28) | + ((*(s+1) & 0x3F) << 24) | + ((*(s+2) & 0x3F) << 18) | + ((*(s+3) & 0x3F) << 12) | + ((*(s+4) & 0x3F) << 6) | + (*(s+5) & 0x3F); + *clen = 6; + } + } else { + return HUBBUB_INVALID; + } + + return HUBBUB_OK; +} + +/** + * Convert a single UCS4 character into a UTF-8 multibyte sequence + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This function conforms to RFC2279, however. + * + * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian) + * \param s Pointer to 6 byte long output buffer + * \param len Pointer to location to receive length of multibyte sequence + * \return HUBBUB_OK on success, appropriate error otherwise + */ +inline hubbub_error hubbub_utf8_from_ucs4(uint32_t ucs4, uint8_t *s, + size_t *len) +{ + uint32_t l = 0; + + if (s == NULL || len == NULL) + return HUBBUB_BADPARM; + else if (ucs4 < 0x80) { + *s = (uint8_t) ucs4; + l = 1; + } else if (ucs4 < 0x800) { + *s = 0xC0 | ((ucs4 >> 6) & 0x1F); + *(s+1) = 0x80 | (ucs4 & 0x3F); + l = 2; + } else if (ucs4 < 0x10000) { + *s = 0xE0 | ((ucs4 >> 12) & 0xF); + *(s+1) = 0x80 | ((ucs4 >> 6) & 0x3F); + *(s+2) = 0x80 | (ucs4 & 0x3F); + l = 3; + } else if (ucs4 < 0x200000) { + *s = 0xF0 | ((ucs4 >> 18) & 0x7); + *(s+1) = 0x80 | ((ucs4 >> 12) & 0x3F); + *(s+2) = 0x80 | ((ucs4 >> 6) & 0x3F); + *(s+3) = 0x80 | (ucs4 & 0x3F); + l = 4; + } else if (ucs4 < 0x4000000) { + *s = 0xF8 | ((ucs4 >> 24) & 0x3); + *(s+1) = 0x80 | ((ucs4 >> 18) & 0x3F); + *(s+2) = 0x80 | ((ucs4 >> 12) & 0x3F); + *(s+3) = 0x80 | ((ucs4 >> 6) & 0x3F); + *(s+4) = 0x80 | (ucs4 & 0x3F); + l = 5; + } else if (ucs4 <= 0x7FFFFFFF) { + *s = 0xFC | ((ucs4 >> 30) & 0x1); + *(s+1) = 0x80 | ((ucs4 >> 24) & 0x3F); + *(s+2) = 0x80 | ((ucs4 >> 18) & 0x3F); + *(s+3) = 0x80 | ((ucs4 >> 12) & 0x3F); + *(s+4) = 0x80 | ((ucs4 >> 6) & 0x3F); + *(s+5) = 0x80 | (ucs4 & 0x3F); + l = 6; + } else { + return HUBBUB_INVALID; + } + + *len = l; + + 
return HUBBUB_OK; +} + +/** + * Calculate the length (in characters) of a bounded UTF-8 string + * + * \param s The string + * \param max Maximum length + * \param len Pointer to location to receive length of string + * \return HUBBUB_OK on success, appropriate error otherwise + */ +inline hubbub_error hubbub_utf8_length(const uint8_t *s, size_t max, + size_t *len) +{ + const uint8_t *end = s + max; + int l = 0; + + if (s == NULL || len == NULL) + return HUBBUB_BADPARM; + + while (s < end) { + if ((*s & 0x80) == 0x00) + s += 1; + else if ((*s & 0xE0) == 0xC0) + s += 2; + else if ((*s & 0xF0) == 0xE0) + s += 3; + else if ((*s & 0xF8) == 0xF0) + s += 4; + else if ((*s & 0xFC) == 0xF8) + s += 5; + else if ((*s & 0xFE) == 0xFC) + s += 6; + else + return HUBBUB_INVALID; + l++; + } + + *len = l; + + return HUBBUB_OK; +} + +/** + * Calculate the length (in bytes) of a UTF-8 character + * + * \param s Pointer to start of character + * \param len Pointer to location to receive length + * \return HUBBUB_OK on success, appropriate error otherwise + */ +inline hubbub_error hubbub_utf8_char_byte_length(const uint8_t *s, + size_t *len) +{ + if (s == NULL || len == NULL) + return HUBBUB_BADPARM; + + *len = numContinuations[s[0]] + 1 /* Start byte */; + + return HUBBUB_OK; +} + +/** + * Find previous legal UTF-8 char in string + * + * \param s The string + * \param off Offset in the string to start at + * \param prevoff Pointer to location to receive offset of first byte of + * previous legal character + * \return HUBBUB_OK on success, appropriate error otherwise + */ +inline hubbub_error hubbub_utf8_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff) +{ + if (s == NULL || prevoff == NULL) + return HUBBUB_BADPARM; + + while (off != 0 && (s[--off] & 0xC0) == 0x80) + /* do nothing */; + + *prevoff = off; + + return HUBBUB_OK; +} + +/** + * Find next legal UTF-8 char in string + * + * \param s The string (assumed valid) + * \param len Maximum offset in string + * \param off 
Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return HUBBUB_OK on success, appropriate error otherwise + */ +inline hubbub_error hubbub_utf8_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff) +{ + if (s == NULL || off >= len || nextoff == NULL) + return HUBBUB_BADPARM; + + /* Skip current start byte (if present - may be mid-sequence) */ + if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) + off++; + + while (off < len && (s[off] & 0xC0) == 0x80) + off++; + + *nextoff = off; + + return HUBBUB_OK; +} + +/** + * Find next legal UTF-8 char in string + * + * \param s The string (assumed to be of dubious validity) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return HUBBUB_OK on success, appropriate error otherwise + */ +inline hubbub_error hubbub_utf8_next_paranoid(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff) +{ + bool valid; + + if (s == NULL || off >= len || nextoff == NULL) + return HUBBUB_BADPARM; + + /* Skip current start byte (if present - may be mid-sequence) */ + if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) + off++; + + while (1) { + /* Find next possible start byte */ + while (off < len && (s[off] & 0xC0) == 0x80) + off++; + + /* Ran off end of data */ + if (off == len || off + numContinuations[s[off]] >= len) + return HUBBUB_NEEDDATA; + + /* Found if start byte is ascii, + * or next n bytes are valid continuations */ + valid = true; + + switch (numContinuations[s[off]]) { + case 5: + valid &= ((s[off + 5] & 0xC0) == 0x80); + case 4: + valid &= ((s[off + 4] & 0xC0) == 0x80); + case 3: + valid &= ((s[off + 3] & 0xC0) == 0x80); + case 2: + valid &= ((s[off + 2] & 0xC0) == 0x80); + case 1: + valid &= ((s[off + 1] & 0xC0) == 0x80); + case 0: + valid &= (s[off + 0] < 0x80); + } + + if (valid) + 
break; + + /* Otherwise, skip this (invalid) start byte and try again */ + off++; + } + + *nextoff = off; + + return HUBBUB_OK; +} + diff --git a/src/utils/utf8.h b/src/utils/utf8.h new file mode 100644 index 0000000..8836338 --- /dev/null +++ b/src/utils/utf8.h @@ -0,0 +1,38 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +/** \file + * UTF-8 manipulation functions (interface). + */ + +#ifndef hubbub_utils_utf8_h_ +#define hubbub_utils_utf8_h + +#include <inttypes.h> + +#include <hubbub/errors.h> + +inline hubbub_error hubbub_utf8_to_ucs4(const uint8_t *s, size_t len, + uint32_t *ucs4, size_t *clen); +inline hubbub_error hubbub_utf8_from_ucs4(uint32_t ucs4, uint8_t *s, + size_t *len); + +inline hubbub_error hubbub_utf8_length(const uint8_t *s, size_t max, + size_t *len); +inline hubbub_error hubbub_utf8_char_byte_length(const uint8_t *s, + size_t *len); + +inline hubbub_error hubbub_utf8_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff); +inline hubbub_error hubbub_utf8_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff); + +inline hubbub_error hubbub_utf8_next_paranoid(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff); + +#endif + diff --git a/src/utils/utils.h b/src/utils/utils.h new file mode 100644 index 0000000..a1e0230 --- /dev/null +++ b/src/utils/utils.h @@ -0,0 +1,28 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_utils_h_ +#define hubbub_utils_h_ + +#ifndef max +#define max(a,b) ((a)>(b)?(a):(b)) +#endif + +#ifndef min +#define min(a,b) ((a)<(b)?(a):(b)) +#endif + +#ifndef SLEN +/* Calculate length of a string constant */ +#define SLEN(s) (sizeof((s)) - 1) /* -1 for '\0' */ +#endif + +#ifndef UNUSED +#define UNUSED(x) ((x)=(x)) +#endif + +#endif |