From 7b30a5520cfb56e651f0eb4da85a3e07747da7dc Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sat, 23 Jun 2007 22:40:25 +0000 Subject: Import hubbub -- an HTML parsing library. Plenty of work still to do (like tree generation ;) svn path=/trunk/hubbub/; revision=3359 --- src/input/Makefile | 53 +++++ src/input/filter.c | 380 ++++++++++++++++++++++++++++++++ src/input/filter.h | 57 +++++ src/input/inputstream.c | 479 ++++++++++++++++++++++++++++++++++++++++ src/input/inputstream.h | 98 +++++++++ src/input/streamimpl.h | 77 +++++++ src/input/utf8_stream.c | 567 ++++++++++++++++++++++++++++++++++++++++++++++++ 7 files changed, 1711 insertions(+) create mode 100644 src/input/Makefile create mode 100644 src/input/filter.c create mode 100644 src/input/filter.h create mode 100644 src/input/inputstream.c create mode 100644 src/input/inputstream.h create mode 100644 src/input/streamimpl.h create mode 100644 src/input/utf8_stream.c (limited to 'src/input') diff --git a/src/input/Makefile b/src/input/Makefile new file mode 100644 index 0000000..8b06c63 --- /dev/null +++ b/src/input/Makefile @@ -0,0 +1,53 @@ +# Makefile for libhubbub +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test cases + +# Manipulate include paths +CFLAGS += -I$(CURDIR) + +# Objects +OBJS = filter inputstream utf8_stream + +.PHONY: clean debug distclean export release setup test + +# Targets +release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS))) + +debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS))) + +clean: + -@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS})) + +distclean: + +setup: + +export: + +test: + +# Pattern rules +../Release/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $< + +../Debug/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@ $< diff --git a/src/input/filter.c b/src/input/filter.c new file mode 100644 index 0000000..5ac5391 --- /dev/null +++ b/src/input/filter.c @@ -0,0 +1,380 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include +#include +#include + +#include "charset/aliases.h" +#include "charset/codec.h" +#include "utils/utils.h" + +#include "input/filter.h" + + +/** Input filter */ +struct hubbub_filter { + hubbub_charsetcodec *read_codec; /**< Read codec */ + hubbub_charsetcodec *write_codec; /**< Write codec */ + + uint32_t filter_output[2]; /**< Filter output buffer */ + uint32_t last_filter_char; /**< Last filtered character */ + + uint32_t pivot_buf[64]; /**< Conversion pivot buffer */ + + bool leftover; /**< Data remains from last call */ + uint8_t *pivot_left; /**< Remaining pivot to write */ + size_t pivot_len; /**< Length of pivot remaining */ + + struct { + uint16_t encoding; /**< Input encoding */ + } settings; /**< Filter settings */ + + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Client private data */ +}; + +static hubbub_error hubbub_filter_set_defaults(hubbub_filter *input); +static hubbub_error hubbub_filter_set_encoding(hubbub_filter *input, + const char *enc); +static hubbub_error read_character_filter(uint32_t c, + uint32_t **output, size_t *outputlen, void *pw); + +/** + * Create an input filter + * + * \param int_enc Desired encoding of document + * \param alloc Function used to (de)allocate data + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to filter instance, or NULL on failure + */ +hubbub_filter *hubbub_filter_create(const char *int_enc, + hubbub_alloc alloc, void *pw) +{ + hubbub_filter *filter; + + if (alloc == NULL) + return NULL; + + filter = alloc(NULL, sizeof(*filter), pw); + if (!filter) + return NULL; + + filter->last_filter_char = 0; + + filter->leftover = false; + filter->pivot_left = NULL; + filter->pivot_len = 0; + + filter->alloc = alloc; + filter->pw = pw; + + if (hubbub_filter_set_defaults(filter) != HUBBUB_OK) { + filter->alloc(filter, 0, pw); + return NULL; + } + + filter->write_codec = hubbub_charsetcodec_create(int_enc, alloc, pw); + if (filter->write_codec == NULL) { + if (filter->read_codec != NULL) + hubbub_charsetcodec_destroy(filter->read_codec); + filter->alloc(filter, 0, pw); + return NULL; + } + + return filter; +} + +/** + * Destroy an input filter + * + * \param input Pointer to filter instance + */ +void hubbub_filter_destroy(hubbub_filter *input) +{ + if (input == NULL) + return; + + if (input->read_codec != NULL) + hubbub_charsetcodec_destroy(input->read_codec); + + if (input->write_codec != NULL) + hubbub_charsetcodec_destroy(input->write_codec); + + input->alloc(input, 0, input->pw); + + return; +} + +/** + * Configure an input filter + * + * \param input Pointer to filter instance + * \param type Input option type to configure + * \param params Option-specific parameters + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_setopt(hubbub_filter *input, + hubbub_filter_opttype type, + hubbub_filter_optparams *params) +{ + hubbub_error error = HUBBUB_OK; + + if (input == NULL || params == NULL) + return HUBBUB_BADPARM; + + switch (type) { + case HUBBUB_FILTER_SET_ENCODING: + error = hubbub_filter_set_encoding(input, + params->encoding.name); + break; + } + + return error; +} + +/** + * Process a chunk of data + * + * \param input Pointer to filter instance + * \param data Pointer to pointer to input buffer + * \param len Pointer to length of input buffer + * \param output Pointer to pointer to output buffer + * \param outlen Pointer to length of output buffer + * \return HUBBUB_OK on success, appropriate error otherwise + * + * Call this with an input buffer length of 0 to flush any buffers. + */ +hubbub_error hubbub_filter_process_chunk(hubbub_filter *input, + const uint8_t **data, size_t *len, + uint8_t **output, size_t *outlen) +{ + hubbub_error read_error, write_error; + + if (input == NULL || data == NULL || *data == NULL || len == NULL || + output == NULL || *output == NULL || outlen == NULL) + return HUBBUB_BADPARM; + + if (input->leftover) { + /* Some data left to be written from last call */ + + /* Attempt to flush the remaining data. */ + write_error = hubbub_charsetcodec_encode(input->write_codec, + (const uint8_t **) &input->pivot_left, + &input->pivot_len, + output, outlen); + + if (write_error != HUBBUB_OK) { + return write_error; + } + + /* And clear leftover */ + input->pivot_left = NULL; + input->pivot_len = 0; + input->leftover = false; + } + + while (*len > 0) { + size_t pivot_len = sizeof(input->pivot_buf); + uint8_t *pivot = (uint8_t *) input->pivot_buf; + + read_error = hubbub_charsetcodec_decode(input->read_codec, + data, len, + (uint8_t **) &pivot, &pivot_len); + + pivot = (uint8_t *) input->pivot_buf; + pivot_len = sizeof(input->pivot_buf) - pivot_len; + + if (pivot_len > 0) { + write_error = hubbub_charsetcodec_encode( + input->write_codec, + (const uint8_t **) &pivot, + &pivot_len, + output, outlen); + + if (write_error != HUBBUB_OK) { + input->leftover = true; + input->pivot_left = pivot; + input->pivot_len = pivot_len; + + return write_error; + } + } + + if (read_error != HUBBUB_OK && read_error != HUBBUB_NOMEM) + return read_error; + } + + return HUBBUB_OK; +} + +/** + * Reset an input filter's state + * + * \param input The input filter to reset + * \param HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_reset(hubbub_filter *input) +{ + hubbub_error error; + + if (input == NULL) + return HUBBUB_BADPARM; + + /* Clear pivot buffer leftovers */ + input->pivot_left = NULL; + input->pivot_len = 0; + input->leftover = false; + + /* Reset read codec */ + error = hubbub_charsetcodec_reset(input->read_codec); + if (error != HUBBUB_OK) + return error; + + /* Reset write codec */ + error = hubbub_charsetcodec_reset(input->write_codec); + if (error != HUBBUB_OK) + return error; + + return HUBBUB_OK; +} + +/** + * Set an input filter's default settings + * + * \param input Input filter to configure + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_set_defaults(hubbub_filter *input) +{ + hubbub_error error; + + if (input == NULL) + return HUBBUB_BADPARM; + + input->read_codec = NULL; + input->write_codec = NULL; + input->settings.encoding = 0; + error = hubbub_filter_set_encoding(input, "ISO-8859-1"); + if (error != HUBBUB_OK) + return error; + + return HUBBUB_OK; +} + +/** + * Set an input filter's encoding + * + * \param input Input filter to configure + * \param enc Encoding name + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_set_encoding(hubbub_filter *input, + const char *enc) +{ + const char *old_enc; + uint16_t mibenum; + hubbub_error error; + hubbub_charsetcodec_optparams params; + + if (input == NULL || enc == NULL) + return HUBBUB_BADPARM; + + mibenum = hubbub_mibenum_from_name(enc, strlen(enc)); + if (mibenum == 0) + return HUBBUB_INVALID; + + /* Exit early if we're already using this encoding */ + if (input->settings.encoding == mibenum) + return HUBBUB_OK; + + old_enc = hubbub_mibenum_to_name(input->settings.encoding); + if (old_enc == NULL) + old_enc = "ISO-8859-1"; + + if (input->read_codec != NULL) + hubbub_charsetcodec_destroy(input->read_codec); + + input->read_codec = hubbub_charsetcodec_create(enc, input->alloc, + input->pw); + if (input->read_codec == NULL) + return HUBBUB_NOMEM; + + /* Register filter function */ + params.filter_func.filter = read_character_filter; + params.filter_func.pw = (void *) input; + error = hubbub_charsetcodec_setopt(input->read_codec, + HUBBUB_CHARSETCODEC_FILTER_FUNC, + (hubbub_charsetcodec_optparams *) ¶ms); + if (error != HUBBUB_OK) + return error; + + input->settings.encoding = mibenum; + + return HUBBUB_OK; +} + +/** + * Character filter function for read characters + * + * \param c The read character (UCS4 - host byte order) + * \param output Pointer to pointer to output buffer (filled on exit) + * \param outputlen Pointer to output buffer length (filled on exit) + * \param pw Pointer to client-specific private data. + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error read_character_filter(uint32_t c, uint32_t **output, + size_t *outputlen, void *pw) +{ + hubbub_filter *input = (hubbub_filter *) pw; + size_t len; + + if (output == NULL || outputlen == NULL || pw == NULL) + return HUBBUB_BADPARM; + + /* Line ending normalisation: + * CRLF -> LF (trap CR and let LF through unmodified) + * CR -> LF (trap CR and convert to LF if not CRLF) + * LF -> LF (leave LF alone) + */ + +#define NUL (0x00000000) +#define CR (0x0000000D) +#define LF (0x0000000A) +#define REP (0x0000FFFD) + + if (c == NUL) { + /* Replace NUL (U+0000) characters in input with U+FFFD */ + input->filter_output[0] = REP; + len = 1; + } else if (c == CR) { + /* Trap CR characters */ + len = 0; + } else if (input->last_filter_char == CR && c != LF) { + /* Last char was CR and this isn't LF => CR -> LF */ + input->filter_output[0] = LF; + input->filter_output[1] = c; + len = 2; + } else { + /* Let character through unchanged */ + input->filter_output[0] = c; + len = 1; + } + +#undef NUL +#undef CR +#undef LF +#undef REP + + input->last_filter_char = c; + + *output = input->filter_output; + *outputlen = len; + + return HUBBUB_OK; +} diff --git a/src/input/filter.h b/src/input/filter.h new file mode 100644 index 0000000..6650e09 --- /dev/null +++ b/src/input/filter.h @@ -0,0 +1,57 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_input_filter_h_ +#define hubbub_input_filter_h_ + +#include + +#include +#include + +typedef struct hubbub_filter hubbub_filter; + +/** + * Input filter option types + */ +typedef enum hubbub_filter_opttype { + HUBBUB_FILTER_SET_ENCODING = 0, +} hubbub_filter_opttype; + +/** + * Input filter option parameters + */ +typedef union hubbub_filter_optparams { + /** Parameters for encoding setting */ + struct { + /** Encoding name */ + const char *name; + } encoding; +} hubbub_filter_optparams; + + +/* Create an input filter */ +hubbub_filter *hubbub_filter_create(const char *int_enc, + hubbub_alloc alloc, void *pw); +/* Destroy an input filter */ +void hubbub_filter_destroy(hubbub_filter *input); + +/* Configure an input filter */ +hubbub_error hubbub_filter_setopt(hubbub_filter *input, + hubbub_filter_opttype type, + hubbub_filter_optparams *params); + +/* Process a chunk of data */ +hubbub_error hubbub_filter_process_chunk(hubbub_filter *input, + const uint8_t **data, size_t *len, + uint8_t **output, size_t *outlen); + +/* Reset an input filter's state */ +hubbub_error hubbub_filter_reset(hubbub_filter *input); + +#endif + diff --git a/src/input/inputstream.c b/src/input/inputstream.c new file mode 100644 index 0000000..f82d279 --- /dev/null +++ b/src/input/inputstream.c @@ -0,0 +1,479 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include + +#include "charset/aliases.h" +#include "input/streamimpl.h" + +/** + * Buffer moving claimant context + */ +struct hubbub_inputstream_bm_handler { + hubbub_inputstream_buffermoved handler; /**< Handler function */ + void *pw; /**< Client private data */ + + struct hubbub_inputstream_bm_handler *next; + struct hubbub_inputstream_bm_handler *prev; +}; + +extern hubbub_streamhandler utf8stream; + +static hubbub_streamhandler *handler_table[] = { + &utf8stream, + NULL +}; + +/** + * Create an input stream + * + * \param enc Document charset, or NULL to autodetect + * \param int_enc Desired encoding of document + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to stream instance, or NULL on failure + */ +hubbub_inputstream *hubbub_inputstream_create(const char *enc, + const char *int_enc, hubbub_alloc alloc, void *pw) +{ + hubbub_inputstream *stream; + hubbub_streamhandler **handler; + + if (int_enc == NULL || alloc == NULL) + return NULL; + + /* Search for handler class */ + for (handler = handler_table; *handler != NULL; handler++) { + if ((*handler)->uses_encoding(int_enc)) + break; + } + + /* None found */ + if ((*handler) == NULL) + return NULL; + + stream = (*handler)->create(enc, int_enc, alloc, pw); + if (stream == NULL) + return NULL; + + stream->handlers = NULL; + + stream->alloc = alloc; + stream->pw = pw; + + return stream; +} + +/** + * Destroy an input stream + * + * \param stream Input stream to destroy + */ +void hubbub_inputstream_destroy(hubbub_inputstream *stream) +{ + hubbub_inputstream_bm_handler *h, *i; + + if (stream == NULL) + return; + + for (h = stream->handlers; h; h = i) { + i = h->next; + + stream->alloc(h, 0, stream->pw); + } + + stream->destroy(stream); +} + +/** + * Append data to an input stream + * + * \param stream Input stream to append data to + * \param data Data to append (in document charset), or NULL to flag EOF + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream, + const uint8_t *data, size_t len) +{ + if (stream == NULL) + return HUBBUB_BADPARM; + + /* Calling this if we've disowned the buffer is foolish */ + if (stream->buffer == NULL) + return HUBBUB_INVALID; + + return stream->append(stream, data, len); +} + +/** + * Insert data into stream at current location + * + * \param stream Input stream to insert into + * \param data Data to insert (UTF-8 encoded) + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream, + const uint8_t *data, size_t len) +{ + if (stream == NULL || data == NULL) + return HUBBUB_BADPARM; + + /* Calling this if we've disowned the buffer is foolish */ + if (stream->buffer == NULL) + return HUBBUB_INVALID; + + return stream->insert(stream, data, len); +} + +/** + * Look at the next character in the stream + * + * \param stream Stream to look in + * \return UCS4 (host-endian) character code, or EOF or OOD. + */ +uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_INPUTSTREAM_OOD; + + return stream->peek(stream);; +} + +/** + * Retrieve the byte index and length of the current character in the stream + * + * \param stream Stream to look in + * \param len Pointer to location to receive byte length of character + * \return Byte index of current character from start of stream, + * or (uint32_t) -1 on error + */ +uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream, + size_t *len) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || len == NULL || stream->buffer == NULL) + return (uint32_t) -1; + + return stream->cur_pos(stream, len); +} + +/** + * Convert the current character to lower case + * + * \param stream Stream to look in + */ +void hubbub_inputstream_lowercase(hubbub_inputstream *stream) +{ + if (stream == NULL || stream->buffer == NULL) + return; + + stream->lowercase(stream); +} + +/** + * Convert the current character to upper case + * + * \param stream Stream to look in + */ +void hubbub_inputstream_uppercase(hubbub_inputstream *stream) +{ + if (stream == NULL || stream->buffer == NULL) + return; + + stream->uppercase(stream); +} + +/** + * Advance the stream's current position + * + * \param stream The stream whose position to advance + */ +void hubbub_inputstream_advance(hubbub_inputstream *stream) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || stream->buffer == NULL) + return; + + if (stream->cursor == stream->buffer_len) + return; + + stream->advance(stream); +} + +/** + * Push a character back onto the stream + * + * \param stream Stream to push back to + * \param character UCS4 (host-endian) codepoint to push back + * \return HUBBUB_OK on success, appropriate error otherwise + * + * Note that this doesn't actually modify the data in the stream. + * It works by ensuring that the character located just before the + * current stream location is the same as ::character. If it is, + * then the stream pointer is moved back. If it is not, then an + * error is returned and the stream pointer remains unmodified. + */ +hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream, + uint32_t character) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_BADPARM; + + if (stream->cursor == 0) + return HUBBUB_INVALID; + + return stream->push_back(stream, character); +} + +/** + * Rewind the input stream by a number of bytes + * + * \param stream Stream to rewind + * \param n Number of bytes to go back + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n) +{ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_BADPARM; + + if (stream->cursor < n) + return HUBBUB_INVALID; + + stream->cursor -= n; + + return HUBBUB_OK; +} + +/** + * Claim ownership of an input stream's buffer + * + * \param stream Input stream whose buffer to claim + * \param buffer Pointer to location to receive buffer pointer + * \param len Pointer to location to receive byte length of buffer + * \return HUBBUB_OK on success, appropriate error otherwise. + * + * Once the buffer has been claimed by a client, the input stream disclaims + * all ownership rights (and invalidates any internal references it may have + * to the buffer). Therefore, the only input stream call which may be made + * after calling this function is to destroy the input stream. Therefore, + * unless the stream pointer is located at EOF, this call will return an + * error. + */ +hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream, + uint8_t **buffer, size_t *len) +{ + if (stream == NULL || buffer == NULL || len == NULL) + return HUBBUB_BADPARM; + + if (stream->had_eof == false || + stream->cursor != stream->buffer_len) + return HUBBUB_INVALID; + + *buffer = stream->buffer; + *len = stream->buffer_len; + + stream->buffer = NULL; + + return HUBBUB_OK; +} + +/** + * Register interest in buffer moved events + * + * \param stream Input stream to register interest with + * \param handler Pointer to handler function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_register_movehandler( + hubbub_inputstream *stream, + hubbub_inputstream_buffermoved handler, void *pw) +{ + hubbub_inputstream_bm_handler *h; + + if (stream == NULL || handler == NULL) + return HUBBUB_BADPARM; + + h = stream->alloc(NULL, sizeof(hubbub_inputstream_bm_handler), + stream->pw); + if (h == NULL) + return HUBBUB_NOMEM; + + h->handler = handler; + h->pw = pw; + + h->prev = NULL; + h->next = stream->handlers; + + if (stream->handlers) + stream->handlers->prev = h; + stream->handlers = h; + + /* And notify claimant of current buffer location */ + handler(stream->buffer, stream->buffer_len, pw); + + return HUBBUB_OK; +} + +/** + * Deregister interest in buffer moved events + * + * \param stream Input stream to deregister from + * \param handler Pointer to handler function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_deregister_movehandler( + hubbub_inputstream *stream, + hubbub_inputstream_buffermoved handler, void *pw) +{ + hubbub_inputstream_bm_handler *h; + + if (stream == NULL || handler == NULL) + return HUBBUB_BADPARM; + + for (h = stream->handlers; h; h = h->next) { + if (h->handler == handler && h->pw == pw) + break; + } + + if (h == NULL) + return HUBBUB_INVALID; + + if (h->next) + h->next->prev = h->prev; + if (h->prev) + h->prev->next = h->next; + else + stream->handlers = h->next; + + stream->alloc(h, 0, stream->pw); + + return HUBBUB_OK; +} + +/** + * Case insensitively compare a pair of ranges in the input stream + * + * \param stream Input stream to look in + * \param r1 Offset of start of first range + * \param r2 Offset of start of second range + * \param len Byte length of ranges + * \return 0 if ranges match, non-zero otherwise + */ +int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len) +{ + if (stream == NULL || stream->buffer == NULL) + return 1; /* arbitrary */ + + return stream->cmp_range_ci(stream, r1, r2, len); +} + +/** + * Case sensitively compare a pair of ranges in the input stream + * + * \param stream Input stream to look in + * \param r1 Offset of start of first range + * \param r2 Offset of start of second range + * \param len Byte length of ranges + * \return 0 if ranges match, non-zero otherwise + */ +int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len) +{ + if (stream == NULL || stream->buffer == NULL) + return 1; /* arbitrary */ + + return stream->cmp_range_cs(stream, r1, r2, len); +} + +/** + * Case sensitively compare a range of input stream against an ASCII string + * + * \param stream Input stream to look in + * \param off Offset of range start + * \param len Byte length of range + * \param data Comparison string + * \param dlen Byte length of comparison string + * \return 0 if match, non-zero otherwise + */ +int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream, + uint32_t off, size_t len, const char *data, size_t dlen) +{ + if (stream == NULL || stream->buffer == NULL) + return 1; /* arbitrary */ + + return stream->cmp_range_ascii(stream, off, len, data, dlen); +} + +/** + * Replace a range of bytes in the input stream with a single character + * + * \param stream Input stream containing data + * \param start Offset of start of range to replace + * \param len Length (in bytes) of range to replace + * \param ucs4 UCS4 (host endian) encoded replacement character + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4) +{ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_BADPARM; + + if (start >= stream->buffer_len) + return HUBBUB_INVALID; + + if (start < stream->cursor) + return HUBBUB_INVALID; + + return stream->replace_range(stream, start, len, ucs4); +} + +/** + * Read the document charset + * + * \param stream Input stream to query + * \param source Pointer to location to receive charset source + * \return Pointer to charset name (constant; do not free), or NULL if unknown + */ +const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream, + hubbub_charset_source *source) +{ + if (stream == NULL || source == NULL) + return NULL; + + *source = stream->encsrc; + + if (stream->encsrc == HUBBUB_CHARSET_UNKNOWN) + return NULL; + + return hubbub_mibenum_to_name(stream->mibenum); +} + +/** + * Inform interested parties that the buffer has moved + * + * \param stream Input stream + */ +void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream) +{ + hubbub_inputstream_bm_handler *h; + + if (stream == NULL) + return; + + for (h = stream->handlers; h; h = h->next) + h->handler(stream->buffer, stream->buffer_len, h->pw); +} + diff --git a/src/input/inputstream.h b/src/input/inputstream.h new file mode 100644 index 0000000..5325d14 --- /dev/null +++ b/src/input/inputstream.h @@ -0,0 +1,98 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_input_inputstream_h_ +#define hubbub_input_inputstream_h_ + +#include + +#include +#include +#include + +typedef struct hubbub_inputstream hubbub_inputstream; + +/* EOF pseudo-character */ +#define HUBBUB_INPUTSTREAM_EOF (0xFFFFFFFFU) +/* Out-of-data indicator */ +#define HUBBUB_INPUTSTREAM_OOD (0xFFFFFFFEU) + +/* Type of input stream buffer moved handler function */ +typedef void (*hubbub_inputstream_buffermoved)(const uint8_t *buffer, + size_t len, void *pw); + +/* Create an input stream */ +hubbub_inputstream *hubbub_inputstream_create(const char *enc, + const char *int_enc, hubbub_alloc alloc, void *pw); +/* Destroy an input stream */ +void hubbub_inputstream_destroy(hubbub_inputstream *stream); + +/* Append data to an input stream */ +hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream, + const uint8_t *data, size_t len); +/* Insert data into stream at current location */ +hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream, + const uint8_t *data, size_t len); + +/* Look at the next character in the stream */ +uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream); + +/* Retrieve the byte index and length of the current character in the stream */ +uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream, size_t *len); + +/* Convert the current character to lowercase */ +void hubbub_inputstream_lowercase(hubbub_inputstream *stream); + +/* Convert the current character to uppercase */ +void hubbub_inputstream_uppercase(hubbub_inputstream *stream); + +/* Advance the stream's current position */ +void hubbub_inputstream_advance(hubbub_inputstream *stream); + +/* Push a character back onto the stream */ +hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream, + uint32_t character); + +/* Rewind the input stream by a number of bytes */ +hubbub_error hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n); + +/* Claim ownership of an input stream's buffer */ +hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream, + uint8_t **buffer, size_t *len); + +/* Register interest in buffer moved events */ +hubbub_error hubbub_inputstream_register_movehandler( + hubbub_inputstream *stream, + hubbub_inputstream_buffermoved handler, void *pw); + +/* Deregister interest in buffer moved events */ +hubbub_error hubbub_inputstream_deregister_movehandler( + hubbub_inputstream *stream, + hubbub_inputstream_buffermoved handler, void *pw); + +/* Case insensitively compare a pair of ranges in the input stream */ +int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len); + +/* Case sensitively compare a pair of ranges in the input stream */ +int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len); + +/* Case sensitively compare a range of input stream against an ASCII string */ +int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream, + uint32_t off, size_t len, const char *data, size_t dlen); + +/* Replace a range of bytes in the input stream with a single character */ +hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4); + +/* Read the document charset */ +const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream, + hubbub_charset_source *source); + +#endif + diff --git a/src/input/streamimpl.h b/src/input/streamimpl.h new file mode 100644 index 0000000..f44f6da --- /dev/null +++ b/src/input/streamimpl.h @@ -0,0 +1,77 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef hubbub_input_streamimpl_h_ +#define hubbub_input_streamimpl_h_ + +#include + +#include + +#include "input/filter.h" +#include "input/inputstream.h" + +typedef struct hubbub_inputstream_bm_handler hubbub_inputstream_bm_handler; + +/** + * Input stream definition: implementations extend this + */ +struct hubbub_inputstream { + uint8_t *buffer; /**< Document buffer */ + size_t buffer_len; /**< Amount of data in buffer */ + size_t buffer_alloc; /**< Allocated size of buffer */ + + uint32_t cursor; /**< Byte offset of current position */ + + bool had_eof; /**< Whether EOF has been reached */ + + uint16_t mibenum; /**< MIB enum for charset, or 0 */ + hubbub_charset_source encsrc; /**< Charset source */ + + hubbub_filter *input; /**< Charset conversion filter */ + + hubbub_inputstream_bm_handler *handlers; /**< List of buffer + * moved handlers */ + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Client private data */ + + void (*destroy)(hubbub_inputstream *stream); + hubbub_error (*append)(hubbub_inputstream *stream, + const uint8_t *data, size_t len); + hubbub_error (*insert)(hubbub_inputstream *stream, + const uint8_t *data, size_t len); + uint32_t (*peek)(hubbub_inputstream *stream); + uint32_t (*cur_pos)(hubbub_inputstream *stream, size_t *len); + void (*lowercase)(hubbub_inputstream *stream); + void (*uppercase)(hubbub_inputstream *stream); + void (*advance)(hubbub_inputstream *stream); + hubbub_error (*push_back)(hubbub_inputstream *stream, + uint32_t character); + int (*cmp_range_ci)(hubbub_inputstream *stream, uint32_t r1, + uint32_t r2, size_t len); + int (*cmp_range_cs)(hubbub_inputstream *stream, uint32_t r1, + uint32_t r2, size_t len); + int (*cmp_range_ascii)(hubbub_inputstream *stream, + uint32_t off, size_t len, + const char *data, size_t dlen); + hubbub_error (*replace_range)(hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4); +}; + +/** + * Input stream factory component definition + */ +typedef struct hubbub_streamhandler { + bool (*uses_encoding)(const char *int_enc); + hubbub_inputstream *(*create)(const char *enc, const char *int_enc, + hubbub_alloc alloc, void *pw); +} hubbub_streamhandler; + +/* Notification of stream buffer moving */ +void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream); + +#endif diff --git a/src/input/utf8_stream.c b/src/input/utf8_stream.c new file mode 100644 index 0000000..5d08993 --- /dev/null +++ b/src/input/utf8_stream.c @@ -0,0 +1,567 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include + +#include "charset/aliases.h" +#include "charset/detect.h" +#include "input/streamimpl.h" +#include "utils/utf8.h" +#include "utils/utils.h" + +#define BUFFER_CHUNK (4096) + +static bool hubbub_utf8stream_uses_encoding(const char *int_enc); +static hubbub_inputstream *hubbub_utf8stream_create(const char *enc, + const char *int_enc, hubbub_alloc alloc, void *pw); +static void hubbub_utf8stream_destroy(hubbub_inputstream *stream); +static hubbub_error hubbub_utf8stream_append(hubbub_inputstream *stream, + const uint8_t *data, size_t len); +static hubbub_error hubbub_utf8stream_insert(hubbub_inputstream *stream, + const uint8_t *data, size_t len); +static uint32_t hubbub_utf8stream_peek(hubbub_inputstream *stream); +static uint32_t hubbub_utf8stream_cur_pos(hubbub_inputstream *stream, + size_t *len); +static void hubbub_utf8stream_lowercase(hubbub_inputstream *stream); +static void hubbub_utf8stream_uppercase(hubbub_inputstream *stream); +static void hubbub_utf8stream_advance(hubbub_inputstream *stream); +static hubbub_error hubbub_utf8stream_push_back(hubbub_inputstream *stream, + uint32_t character); +static int hubbub_utf8stream_compare_range_ci(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len); +static int hubbub_utf8stream_compare_range_cs(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len); +static int hubbub_utf8stream_compare_range_ascii(hubbub_inputstream *stream, + uint32_t off, size_t len, const char *data, size_t dlen); +static hubbub_error hubbub_utf8stream_replace_range( + hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4); + +/** + * Determine whether a stream implementation uses an internal encoding + * + * \param int_enc The desired encoding + * \return true if handled, false otherwise + */ +bool hubbub_utf8stream_uses_encoding(const char *int_enc) +{ + return (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) == + hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"))); +} + +/** + * Create an input stream + * + * \param enc Document charset, or NULL if unknown + * \param int_enc Desired encoding of document + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to stream instance, or NULL on failure + */ +hubbub_inputstream *hubbub_utf8stream_create(const char *enc, + const char *int_enc, hubbub_alloc alloc, void *pw) +{ + hubbub_inputstream *stream; + + if (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) != + hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"))) + return NULL; + + stream = alloc(NULL, sizeof(hubbub_inputstream), pw); + if (stream == NULL) + return NULL; + + stream->buffer = alloc(NULL, BUFFER_CHUNK, pw); + if (stream->buffer == NULL) { + alloc(stream, 0, pw); + return NULL; + } + + stream->buffer_len = 0; + stream->buffer_alloc = BUFFER_CHUNK; + + stream->cursor = 0; + + stream->had_eof = false; + + stream->input = hubbub_filter_create(int_enc, alloc, pw); + if (stream->input == NULL) { + alloc(stream->buffer, 0, pw); + alloc(stream, 0, pw); + return NULL; + } + + if (enc != NULL) { + hubbub_error error; + hubbub_filter_optparams params; + + stream->mibenum = hubbub_mibenum_from_name(enc, strlen(enc)); + + if (stream->mibenum != 0) { + params.encoding.name = enc; + + error = hubbub_filter_setopt(stream->input, + HUBBUB_FILTER_SET_ENCODING, ¶ms); + if (error != HUBBUB_OK && error != HUBBUB_INVALID) { + hubbub_filter_destroy(stream->input); + alloc(stream->buffer, 0, pw); + alloc(stream, 0, pw); + return NULL; + } + + stream->encsrc = HUBBUB_CHARSET_DICTATED; + } + } else { + stream->mibenum = 0; + stream->encsrc = HUBBUB_CHARSET_UNKNOWN; + } + + stream->destroy = hubbub_utf8stream_destroy; + stream->append = hubbub_utf8stream_append; + stream->insert = hubbub_utf8stream_insert; + stream->peek = hubbub_utf8stream_peek; + stream->cur_pos = hubbub_utf8stream_cur_pos; + stream->lowercase = hubbub_utf8stream_lowercase; + stream->uppercase = hubbub_utf8stream_uppercase; + stream->advance = hubbub_utf8stream_advance; + stream->push_back = hubbub_utf8stream_push_back; + stream->cmp_range_ci = hubbub_utf8stream_compare_range_ci; + stream->cmp_range_cs = hubbub_utf8stream_compare_range_cs; + stream->cmp_range_ascii = hubbub_utf8stream_compare_range_ascii; + stream->replace_range = hubbub_utf8stream_replace_range; + + return stream; +} + +/** + * Destroy an input stream + * + * \param stream Input stream to destroy + */ +void hubbub_utf8stream_destroy(hubbub_inputstream *stream) +{ + if (stream->input != NULL) { + hubbub_filter_destroy(stream->input); + } + + if (stream->buffer != NULL) { + stream->alloc(stream->buffer, 0, stream->pw); + } + + stream->alloc(stream, 0, stream->pw); +} + +/** + * Append data to an input stream + * + * \param stream Input stream to append data to + * \param data Data to append (in document charset), or NULL to flag EOF + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_utf8stream_append(hubbub_inputstream *stream, + const uint8_t *data, size_t len) +{ + hubbub_error error; + uint8_t *base; + size_t space; + + if (data == NULL) { + /* EOF indicated */ + size_t dummy_len = 0; + uint8_t *dummy_data = (uint8_t *) &dummy_len; + + base = stream->buffer + stream->buffer_len; + space = stream->buffer_alloc - stream->buffer_len; + + /* Forcibly flush through any remaining buffered data */ + while ((error = hubbub_filter_process_chunk(stream->input, + (const uint8_t **) &dummy_data, &dummy_len, + &base, &space)) == HUBBUB_NOMEM) { + bool moved = false; + uint8_t *temp = stream->alloc(stream->buffer, + stream->buffer_alloc + BUFFER_CHUNK, + stream->pw); + + if (temp == NULL) { + return HUBBUB_NOMEM; + } + + moved = (temp != stream->buffer); + + stream->buffer = temp; + stream->buffer_len += stream->buffer_alloc - + stream->buffer_len - space; + stream->buffer_alloc += BUFFER_CHUNK; + + base = stream->buffer + stream->buffer_len; + space = stream->buffer_alloc - stream->buffer_len; + + if (moved) + hubbub_inputstream_buffer_moved(stream); + } + + /* And fix up buffer length */ + stream->buffer_len += stream->buffer_alloc - + stream->buffer_len - space; + + stream->had_eof = true; + } else { + /* Normal data chunk */ + + if (stream->mibenum == 0) { + /* Haven't found charset yet; detect it */ + error = hubbub_charset_extract(&data, &len, + &stream->mibenum, &stream->encsrc); + if (error) { + return error; + } + + /* We should always have a charset by now */ + if (stream->mibenum == 0) + abort(); + } + + base = stream->buffer + stream->buffer_len; + space = stream->buffer_alloc - stream->buffer_len; + + /* Convert chunk to UTF-8 */ + while ((error = hubbub_filter_process_chunk(stream->input, + &data, &len, + &base, &space)) == HUBBUB_NOMEM) { + bool moved = false; + uint8_t *temp = stream->alloc(stream->buffer, + stream->buffer_alloc + BUFFER_CHUNK, + stream->pw); + + if (temp == NULL) { + return HUBBUB_NOMEM; + } + + moved = (temp != stream->buffer); + + stream->buffer = temp; + stream->buffer_len += stream->buffer_alloc - + stream->buffer_len - space; + stream->buffer_alloc += BUFFER_CHUNK; + + base = stream->buffer + stream->buffer_len; + space = stream->buffer_alloc - stream->buffer_len - + space; + + if (moved) + hubbub_inputstream_buffer_moved(stream); + } + + /* And fix up buffer length */ + stream->buffer_len += stream->buffer_alloc - + stream->buffer_len - space; + } + + return HUBBUB_OK; +} + +/** + * Insert data into stream at current location + * + * \param stream Input stream to insert into + * \param data Data to insert (UTF-8 encoded) + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_utf8stream_insert(hubbub_inputstream *stream, + const uint8_t *data, size_t len) +{ + size_t space; + uint8_t *curpos; + + space = stream->buffer_alloc - stream->buffer_len; + + /* Need to grow buffer, if there's insufficient space */ + if (space <= len) { + bool moved = false; + uint8_t *temp = stream->alloc(stream->buffer, + stream->buffer_alloc + + ((len + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK) + + BUFFER_CHUNK, + stream->pw); + + if (temp == NULL) + return HUBBUB_NOMEM; + + moved = (temp != stream->buffer); + + stream->buffer = temp; + stream->buffer_alloc += + ((len + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK); + + if (moved) + hubbub_inputstream_buffer_moved(stream); + } + + /* Find the insertion point + * (just before the next character to be read) */ + curpos = stream->buffer + stream->cursor; + + /* Move data above this point up */ + memmove(curpos + len, curpos, stream->buffer_len - stream->cursor); + + /* Copy new data into gap created by memmove */ + memcpy(curpos, data, len); + + /* Fix up buffer length */ + stream->buffer_len += len; + + return HUBBUB_OK; +} + +/** + * Look at the next character in the stream + * + * \param stream Stream to look in + * \return UCS4 (host-endian) character code, or EOF or OOD. + */ +uint32_t hubbub_utf8stream_peek(hubbub_inputstream *stream) +{ + hubbub_error error; + size_t len; + uint32_t ret; + + if (stream->cursor == stream->buffer_len) { + return stream->had_eof ? HUBBUB_INPUTSTREAM_EOF + : HUBBUB_INPUTSTREAM_OOD; + } + + error = hubbub_utf8_to_ucs4(stream->buffer + stream->cursor, + stream->buffer_len - stream->cursor, + &ret, &len); + if (error != HUBBUB_OK && error != HUBBUB_NEEDDATA) + return HUBBUB_INPUTSTREAM_OOD; + + if (error == HUBBUB_NEEDDATA) { + if (stream->had_eof) + return HUBBUB_INPUTSTREAM_EOF; + else + return HUBBUB_INPUTSTREAM_OOD; + } + + return ret; +} + +/** + * Retrieve the byte index and length of the current character in the stream + * + * \param stream Stream to look in + * \param len Pointer to location to receive byte length of character + * \return Byte index of current character from start of stream, + * or (uint32_t) -1 on error + */ +uint32_t hubbub_utf8stream_cur_pos(hubbub_inputstream *stream, + size_t *len) +{ + hubbub_utf8_char_byte_length(stream->buffer + stream->cursor, len); + + return stream->cursor; +} + +/** + * Convert the current character to lower case + * + * \param stream Stream to look in + */ +void hubbub_utf8stream_lowercase(hubbub_inputstream *stream) +{ + if ('A' <= stream->buffer[stream->cursor] && + stream->buffer[stream->cursor] <= 'Z') + stream->buffer[stream->cursor] += 0x0020; +} + +/** + * Convert the current character to upper case + * + * \param stream Stream to look in + */ +void hubbub_utf8stream_uppercase(hubbub_inputstream *stream) +{ + if ('a' <= stream->buffer[stream->cursor] && + stream->buffer[stream->cursor] <= 'z') + stream->buffer[stream->cursor] -= 0x0020; +} + +/** + * Advance the stream's current position + * + * \param stream The stream whose position to advance + */ +void hubbub_utf8stream_advance(hubbub_inputstream *stream) +{ + hubbub_error error; + uint32_t next; + + error = hubbub_utf8_next(stream->buffer, stream->buffer_len, + stream->cursor, &next); + + if (error == HUBBUB_OK) + stream->cursor = next; +} + +/** + * Push a character back onto the stream + * + * \param stream Stream to push back to + * \param character UCS4 (host-endian) codepoint to push back + * \return HUBBUB_OK on success, appropriate error otherwise + * + * Note that this doesn't actually modify the data in the stream. + * It works by ensuring that the character located just before the + * current stream location is the same as ::character. If it is, + * then the stream pointer is moved back. If it is not, then an + * error is returned and the stream pointer remains unmodified. + */ +hubbub_error hubbub_utf8stream_push_back(hubbub_inputstream *stream, + uint32_t character) +{ + hubbub_error error; + uint32_t prev; + uint8_t buf[6]; + size_t len; + + error = hubbub_utf8_prev(stream->buffer, stream->cursor, &prev); + if (error != HUBBUB_OK) + return error; + + error = hubbub_utf8_from_ucs4(character, buf, &len); + if (error != HUBBUB_OK) + return error; + + if ((stream->cursor - prev) != len || + memcmp(stream->buffer + prev, buf, len) != 0) + return HUBBUB_INVALID; + + stream->cursor = prev; + + return HUBBUB_OK; +} + +/** + * Case insensitively compare a pair of ranges in the input stream + * + * \param stream Input stream to look in + * \param r1 Offset of start of first range + * \param r2 Offset of start of second range + * \param len Byte length of ranges + * \return 0 if ranges match, non-zero otherwise + */ +int hubbub_utf8stream_compare_range_ci(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len) +{ + return strncasecmp((const char *) (stream->buffer + r1), + (const char *) (stream->buffer + r2), len); +} + +/** + * Case sensitively compare a pair of ranges in the input stream + * + * \param stream Input stream to look in + * \param r1 Offset of start of first range + * \param r2 Offset of start of second range + * \param len Byte length of ranges + * \return 0 if ranges match, non-zero otherwise + */ +int hubbub_utf8stream_compare_range_cs(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len) +{ + return strncmp((const char *) (stream->buffer + r1), + (const char *) (stream->buffer + r2), len); +} + +/** + * Case sensitively compare a range of input stream against an ASCII string + * + * \param stream Input stream to look in + * \param off Offset of range start + * \param len Byte length of range + * \param data Comparison string + * \param dlen Byte length of comparison string + * \return 0 if match, non-zero otherwise + */ +int hubbub_utf8stream_compare_range_ascii(hubbub_inputstream *stream, + uint32_t off, size_t len, const char *data, size_t dlen) +{ + /* Lengths don't match, so strings don't */ + if (len != dlen) + return 1; /* arbitrary */ + + return strncmp((const char *) (stream->buffer + off), + data, len); +} + +/** + * Replace a range of bytes in the input stream with a single character + * + * \param stream Input stream containing data + * \param start Offset of start of range to replace + * \param len Length (in bytes) of range to replace + * \param ucs4 UCS4 (host endian) encoded replacement character + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_utf8stream_replace_range(hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4) +{ + uint8_t buf[6]; + size_t replen; + int32_t diff; + hubbub_error error; + + /* Get UTF8 version of replacement character */ + error = hubbub_utf8_from_ucs4(ucs4, buf, &replen); + if (error) + return error; + + diff = replen - len; + + if (stream->buffer_len + diff >= stream->buffer_alloc) { + /* Need more buffer space */ + bool moved = false; + uint8_t *temp = stream->alloc(stream->buffer, + stream->buffer_alloc + + ((diff + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK) + + BUFFER_CHUNK, + stream->pw); + + if (temp == NULL) + return HUBBUB_NOMEM; + + moved = (temp != stream->buffer); + + stream->buffer = temp; + stream->buffer_alloc += + ((diff + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK); + + if (moved) + hubbub_inputstream_buffer_moved(stream); + } + + /* Move subsequent input to correct location */ + memmove(stream->buffer + start + len + diff, + stream->buffer + start + len, + stream->buffer_len - (start + len)); + + /* And fill the gap with the replacement character */ + memcpy(stream->buffer + start, buf, replen); + + /* Finally, update length */ + stream->buffer_len += diff; + + return HUBBUB_OK; +} + +hubbub_streamhandler utf8stream = { + hubbub_utf8stream_uses_encoding, + hubbub_utf8stream_create +}; -- cgit v1.2.3