diff options
Diffstat (limited to 'src/input')
-rw-r--r-- | src/input/Makefile | 46 | ||||
-rw-r--r-- | src/input/filter.c | 380 | ||||
-rw-r--r-- | src/input/filter.h | 57 | ||||
-rw-r--r-- | src/input/inputstream.c | 481 | ||||
-rw-r--r-- | src/input/inputstream.h | 98 | ||||
-rw-r--r-- | src/input/streamimpl.h | 77 | ||||
-rw-r--r-- | src/input/utf16_stream.c | 605 | ||||
-rw-r--r-- | src/input/utf8_stream.c | 562 |
8 files changed, 0 insertions, 2306 deletions
diff --git a/src/input/Makefile b/src/input/Makefile deleted file mode 100644 index 3b9206f..0000000 --- a/src/input/Makefile +++ /dev/null @@ -1,46 +0,0 @@ -# Child makefile fragment for libhubbub -# -# Toolchain is provided by top-level makefile -# -# Variables provided by top-level makefile -# -# COMPONENT The name of the component -# EXPORT The location of the export directory -# TOP The location of the source tree root -# RELEASEDIR The place to put release objects -# DEBUGDIR The place to put debug objects -# -# do_include Canned command sequence to include a child makefile -# -# Variables provided by parent makefile: -# -# DIR The name of the directory we're in, relative to $(TOP) -# -# Variables we can manipulate: -# -# ITEMS_CLEAN The list of items to remove for "make clean" -# ITEMS_DISTCLEAN The list of items to remove for "make distclean" -# TARGET_TESTS The list of target names to run for "make test" -# -# SOURCES The list of sources to build for $(COMPONENT) -# -# Plus anything from the toolchain - -# Push parent directory onto the directory stack -sp := $(sp).x -dirstack_$(sp) := $(d) -d := $(DIR) - -# Sources -SRCS_$(d) := filter.c inputstream.c utf8_stream.c utf16_stream.c - -# Append to sources for component -SOURCES += $(addprefix $(d), $(SRCS_$(d))) - -# Now include any children we may have -MAKE_INCLUDES := $(wildcard $(d)*/Makefile) -$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) - -# Finally, pop off the directory stack -d := $(dirstack_$(sp)) -sp := $(basename $(sp)) diff --git a/src/input/filter.c b/src/input/filter.c deleted file mode 100644 index 7a97840..0000000 --- a/src/input/filter.c +++ /dev/null @@ -1,380 +0,0 @@ -/* - * This file is part of Hubbub. 
- * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> - */ - -#include <errno.h> -#include <stdbool.h> -#include <stdlib.h> -#include <string.h> - -#include "charset/aliases.h" -#include "charset/codec.h" -#include "utils/utils.h" - -#include "input/filter.h" - - -/** Input filter */ -struct hubbub_filter { - hubbub_charsetcodec *read_codec; /**< Read codec */ - hubbub_charsetcodec *write_codec; /**< Write codec */ - - uint32_t filter_output[2]; /**< Filter output buffer */ - uint32_t last_filter_char; /**< Last filtered character */ - - uint32_t pivot_buf[64]; /**< Conversion pivot buffer */ - - bool leftover; /**< Data remains from last call */ - uint8_t *pivot_left; /**< Remaining pivot to write */ - size_t pivot_len; /**< Length of pivot remaining */ - - struct { - uint16_t encoding; /**< Input encoding */ - } settings; /**< Filter settings */ - - hubbub_alloc alloc; /**< Memory (de)allocation function */ - void *pw; /**< Client private data */ -}; - -static hubbub_error hubbub_filter_set_defaults(hubbub_filter *input); -static hubbub_error hubbub_filter_set_encoding(hubbub_filter *input, - const char *enc); -static hubbub_error read_character_filter(uint32_t c, - uint32_t **output, size_t *outputlen, void *pw); - -/** - * Create an input filter - * - * \param int_enc Desired encoding of document - * \param alloc Function used to (de)allocate data - * \param pw Pointer to client-specific private data (may be NULL) - * \return Pointer to filter instance, or NULL on failure - */ -hubbub_filter *hubbub_filter_create(const char *int_enc, - hubbub_alloc alloc, void *pw) -{ - hubbub_filter *filter; - - if (alloc == NULL) - return NULL; - - filter = alloc(NULL, sizeof(*filter), pw); - if (!filter) - return NULL; - - filter->last_filter_char = 0; - - filter->leftover = false; - filter->pivot_left = NULL; - filter->pivot_len = 0; - - filter->alloc = alloc; - filter->pw = pw; 
- - if (hubbub_filter_set_defaults(filter) != HUBBUB_OK) { - filter->alloc(filter, 0, pw); - return NULL; - } - - filter->write_codec = hubbub_charsetcodec_create(int_enc, alloc, pw); - if (filter->write_codec == NULL) { - if (filter->read_codec != NULL) - hubbub_charsetcodec_destroy(filter->read_codec); - filter->alloc(filter, 0, pw); - return NULL; - } - - return filter; -} - -/** - * Destroy an input filter - * - * \param input Pointer to filter instance - */ -void hubbub_filter_destroy(hubbub_filter *input) -{ - if (input == NULL) - return; - - if (input->read_codec != NULL) - hubbub_charsetcodec_destroy(input->read_codec); - - if (input->write_codec != NULL) - hubbub_charsetcodec_destroy(input->write_codec); - - input->alloc(input, 0, input->pw); - - return; -} - -/** - * Configure an input filter - * - * \param input Pointer to filter instance - * \param type Input option type to configure - * \param params Option-specific parameters - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_filter_setopt(hubbub_filter *input, - hubbub_filter_opttype type, - hubbub_filter_optparams *params) -{ - hubbub_error error = HUBBUB_OK; - - if (input == NULL || params == NULL) - return HUBBUB_BADPARM; - - switch (type) { - case HUBBUB_FILTER_SET_ENCODING: - error = hubbub_filter_set_encoding(input, - params->encoding.name); - break; - } - - return error; -} - -/** - * Process a chunk of data - * - * \param input Pointer to filter instance - * \param data Pointer to pointer to input buffer - * \param len Pointer to length of input buffer - * \param output Pointer to pointer to output buffer - * \param outlen Pointer to length of output buffer - * \return HUBBUB_OK on success, appropriate error otherwise - * - * Call this with an input buffer length of 0 to flush any buffers. 
- */ -hubbub_error hubbub_filter_process_chunk(hubbub_filter *input, - const uint8_t **data, size_t *len, - uint8_t **output, size_t *outlen) -{ - hubbub_error read_error, write_error; - - if (input == NULL || data == NULL || *data == NULL || len == NULL || - output == NULL || *output == NULL || outlen == NULL) - return HUBBUB_BADPARM; - - if (input->leftover) { - /* Some data left to be written from last call */ - - /* Attempt to flush the remaining data. */ - write_error = hubbub_charsetcodec_encode(input->write_codec, - (const uint8_t **) &input->pivot_left, - &input->pivot_len, - output, outlen); - - if (write_error != HUBBUB_OK) { - return write_error; - } - - /* And clear leftover */ - input->pivot_left = NULL; - input->pivot_len = 0; - input->leftover = false; - } - - while (*len > 0) { - size_t pivot_len = sizeof(input->pivot_buf); - uint8_t *pivot = (uint8_t *) input->pivot_buf; - - read_error = hubbub_charsetcodec_decode(input->read_codec, - data, len, - (uint8_t **) &pivot, &pivot_len); - - pivot = (uint8_t *) input->pivot_buf; - pivot_len = sizeof(input->pivot_buf) - pivot_len; - - if (pivot_len > 0) { - write_error = hubbub_charsetcodec_encode( - input->write_codec, - (const uint8_t **) &pivot, - &pivot_len, - output, outlen); - - if (write_error != HUBBUB_OK) { - input->leftover = true; - input->pivot_left = pivot; - input->pivot_len = pivot_len; - - return write_error; - } - } - - if (read_error != HUBBUB_OK && read_error != HUBBUB_NOMEM) - return read_error; - } - - return HUBBUB_OK; -} - -/** - * Reset an input filter's state - * - * \param input The input filter to reset - * \param HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_filter_reset(hubbub_filter *input) -{ - hubbub_error error; - - if (input == NULL) - return HUBBUB_BADPARM; - - /* Clear pivot buffer leftovers */ - input->pivot_left = NULL; - input->pivot_len = 0; - input->leftover = false; - - /* Reset read codec */ - error = 
hubbub_charsetcodec_reset(input->read_codec); - if (error != HUBBUB_OK) - return error; - - /* Reset write codec */ - error = hubbub_charsetcodec_reset(input->write_codec); - if (error != HUBBUB_OK) - return error; - - return HUBBUB_OK; -} - -/** - * Set an input filter's default settings - * - * \param input Input filter to configure - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_filter_set_defaults(hubbub_filter *input) -{ - hubbub_error error; - - if (input == NULL) - return HUBBUB_BADPARM; - - input->read_codec = NULL; - input->write_codec = NULL; - input->settings.encoding = 0; - error = hubbub_filter_set_encoding(input, "ISO-8859-1"); - if (error != HUBBUB_OK) - return error; - - return HUBBUB_OK; -} - -/** - * Set an input filter's encoding - * - * \param input Input filter to configure - * \param enc Encoding name - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_filter_set_encoding(hubbub_filter *input, - const char *enc) -{ - const char *old_enc; - uint16_t mibenum; - hubbub_error error; - hubbub_charsetcodec_optparams params; - - if (input == NULL || enc == NULL) - return HUBBUB_BADPARM; - - mibenum = hubbub_mibenum_from_name(enc, strlen(enc)); - if (mibenum == 0) - return HUBBUB_INVALID; - - /* Exit early if we're already using this encoding */ - if (input->settings.encoding == mibenum) - return HUBBUB_OK; - - old_enc = hubbub_mibenum_to_name(input->settings.encoding); - if (old_enc == NULL) - old_enc = "ISO-8859-1"; - - if (input->read_codec != NULL) - hubbub_charsetcodec_destroy(input->read_codec); - - input->read_codec = hubbub_charsetcodec_create(enc, input->alloc, - input->pw); - if (input->read_codec == NULL) - return HUBBUB_NOMEM; - - /* Register filter function */ - params.filter_func.filter = read_character_filter; - params.filter_func.pw = (void *) input; - error = hubbub_charsetcodec_setopt(input->read_codec, - HUBBUB_CHARSETCODEC_FILTER_FUNC, - 
(hubbub_charsetcodec_optparams *) &params); - if (error != HUBBUB_OK) - return error; - - input->settings.encoding = mibenum; - - return HUBBUB_OK; -} - -/** - * Character filter function for read characters - * - * \param c The read character (UCS4 - host byte order) - * \param output Pointer to pointer to output buffer (filled on exit) - * \param outputlen Pointer to output buffer length (filled on exit) - * \param pw Pointer to client-specific private data. - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error read_character_filter(uint32_t c, uint32_t **output, - size_t *outputlen, void *pw) -{ - hubbub_filter *input = (hubbub_filter *) pw; - size_t len; - - if (output == NULL || outputlen == NULL || pw == NULL) - return HUBBUB_BADPARM; - - /* Line ending normalisation: - * CRLF -> LF (trap CR and let LF through unmodified) - * CR -> LF (trap CR and convert to LF if not CRLF) - * LF -> LF (leave LF alone) - */ - -#define NUL (0x00000000) -#define CR (0x0000000D) -#define LF (0x0000000A) -#define REP (0x0000FFFD) - - /* Replace NUL (U+0000) characters in input with U+FFFD */ - if (c == NUL) - c = REP; - - if (c == CR) { - /* Convert CRs to LFs straight away */ - input->filter_output[0] = LF; - len = 1; - } else if (input->last_filter_char == CR && c == LF) { - /* Trap this LF */ - len = 0; - } else { - /* Let character through unchanged */ - input->filter_output[0] = c; - len = 1; - } - - -#undef NUL -#undef CR -#undef LF -#undef REP - - input->last_filter_char = c; - - *output = input->filter_output; - *outputlen = len; - - return HUBBUB_OK; -} diff --git a/src/input/filter.h b/src/input/filter.h deleted file mode 100644 index 6650e09..0000000 --- a/src/input/filter.h +++ /dev/null @@ -1,57 +0,0 @@ -/* - * This file is part of Hubbub. 
- * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> - */ - -#ifndef hubbub_input_filter_h_ -#define hubbub_input_filter_h_ - -#include <inttypes.h> - -#include <hubbub/errors.h> -#include <hubbub/functypes.h> - -typedef struct hubbub_filter hubbub_filter; - -/** - * Input filter option types - */ -typedef enum hubbub_filter_opttype { - HUBBUB_FILTER_SET_ENCODING = 0, -} hubbub_filter_opttype; - -/** - * Input filter option parameters - */ -typedef union hubbub_filter_optparams { - /** Parameters for encoding setting */ - struct { - /** Encoding name */ - const char *name; - } encoding; -} hubbub_filter_optparams; - - -/* Create an input filter */ -hubbub_filter *hubbub_filter_create(const char *int_enc, - hubbub_alloc alloc, void *pw); -/* Destroy an input filter */ -void hubbub_filter_destroy(hubbub_filter *input); - -/* Configure an input filter */ -hubbub_error hubbub_filter_setopt(hubbub_filter *input, - hubbub_filter_opttype type, - hubbub_filter_optparams *params); - -/* Process a chunk of data */ -hubbub_error hubbub_filter_process_chunk(hubbub_filter *input, - const uint8_t **data, size_t *len, - uint8_t **output, size_t *outlen); - -/* Reset an input filter's state */ -hubbub_error hubbub_filter_reset(hubbub_filter *input); - -#endif - diff --git a/src/input/inputstream.c b/src/input/inputstream.c deleted file mode 100644 index 744aa23..0000000 --- a/src/input/inputstream.c +++ /dev/null @@ -1,481 +0,0 @@ -/* - * This file is part of Hubbub. 
- * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> - */ - -#include <stdlib.h> - -#include "charset/aliases.h" -#include "input/streamimpl.h" - -/** - * Buffer moving claimant context - */ -struct hubbub_inputstream_bm_handler { - hubbub_inputstream_buffermoved handler; /**< Handler function */ - void *pw; /**< Client private data */ - - struct hubbub_inputstream_bm_handler *next; - struct hubbub_inputstream_bm_handler *prev; -}; - -extern hubbub_streamhandler utf8stream; -extern hubbub_streamhandler utf16stream; - -static hubbub_streamhandler *handler_table[] = { - &utf8stream, - &utf16stream, - NULL -}; - -/** - * Create an input stream - * - * \param enc Document charset, or NULL to autodetect - * \param int_enc Desired encoding of document - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return Pointer to stream instance, or NULL on failure - */ -hubbub_inputstream *hubbub_inputstream_create(const char *enc, - const char *int_enc, hubbub_alloc alloc, void *pw) -{ - hubbub_inputstream *stream; - hubbub_streamhandler **handler; - - if (int_enc == NULL || alloc == NULL) - return NULL; - - /* Search for handler class */ - for (handler = handler_table; *handler != NULL; handler++) { - if ((*handler)->uses_encoding(int_enc)) - break; - } - - /* None found */ - if ((*handler) == NULL) - return NULL; - - stream = (*handler)->create(enc, int_enc, alloc, pw); - if (stream == NULL) - return NULL; - - stream->handlers = NULL; - - stream->alloc = alloc; - stream->pw = pw; - - return stream; -} - -/** - * Destroy an input stream - * - * \param stream Input stream to destroy - */ -void hubbub_inputstream_destroy(hubbub_inputstream *stream) -{ - hubbub_inputstream_bm_handler *h, *i; - - if (stream == NULL) - return; - - for (h = stream->handlers; h; h = i) { - i = h->next; - - stream->alloc(h, 0, 
stream->pw); - } - - stream->destroy(stream); -} - -/** - * Append data to an input stream - * - * \param stream Input stream to append data to - * \param data Data to append (in document charset), or NULL to flag EOF - * \param len Length, in bytes, of data - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream, - const uint8_t *data, size_t len) -{ - if (stream == NULL) - return HUBBUB_BADPARM; - - /* Calling this if we've disowned the buffer is foolish */ - if (stream->buffer == NULL) - return HUBBUB_INVALID; - - return stream->append(stream, data, len); -} - -/** - * Insert data into stream at current location - * - * \param stream Input stream to insert into - * \param data Data to insert (UTF-8 encoded) - * \param len Length, in bytes, of data - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream, - const uint8_t *data, size_t len) -{ - if (stream == NULL || data == NULL) - return HUBBUB_BADPARM; - - /* Calling this if we've disowned the buffer is foolish */ - if (stream->buffer == NULL) - return HUBBUB_INVALID; - - return stream->insert(stream, data, len); -} - -/** - * Look at the next character in the stream - * - * \param stream Stream to look in - * \return UCS4 (host-endian) character code, or EOF or OOD. 
- */ -uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream) -{ - /* It is illegal to call this after the buffer has been disowned */ - if (stream == NULL || stream->buffer == NULL) - return HUBBUB_INPUTSTREAM_OOD; - - return stream->peek(stream);; -} - -/** - * Retrieve the byte index and length of the current character in the stream - * - * \param stream Stream to look in - * \param len Pointer to location to receive byte length of character - * \return Byte index of current character from start of stream, - * or (uint32_t) -1 on error - */ -uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream, - size_t *len) -{ - /* It is illegal to call this after the buffer has been disowned */ - if (stream == NULL || len == NULL || stream->buffer == NULL) - return (uint32_t) -1; - - return stream->cur_pos(stream, len); -} - -/** - * Convert the current character to lower case - * - * \param stream Stream to look in - */ -void hubbub_inputstream_lowercase(hubbub_inputstream *stream) -{ - if (stream == NULL || stream->buffer == NULL) - return; - - stream->lowercase(stream); -} - -/** - * Convert the current character to upper case - * - * \param stream Stream to look in - */ -void hubbub_inputstream_uppercase(hubbub_inputstream *stream) -{ - if (stream == NULL || stream->buffer == NULL) - return; - - stream->uppercase(stream); -} - -/** - * Advance the stream's current position - * - * \param stream The stream whose position to advance - */ -void hubbub_inputstream_advance(hubbub_inputstream *stream) -{ - /* It is illegal to call this after the buffer has been disowned */ - if (stream == NULL || stream->buffer == NULL) - return; - - if (stream->cursor == stream->buffer_len) - return; - - stream->advance(stream); -} - -/** - * Push a character back onto the stream - * - * \param stream Stream to push back to - * \param character UCS4 (host-endian) codepoint to push back - * \return HUBBUB_OK on success, appropriate error otherwise - * - * Note that this doesn't 
actually modify the data in the stream. - * It works by ensuring that the character located just before the - * current stream location is the same as ::character. If it is, - * then the stream pointer is moved back. If it is not, then an - * error is returned and the stream pointer remains unmodified. - */ -hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream, - uint32_t character) -{ - /* It is illegal to call this after the buffer has been disowned */ - if (stream == NULL || stream->buffer == NULL) - return HUBBUB_BADPARM; - - if (stream->cursor == 0) - return HUBBUB_INVALID; - - return stream->push_back(stream, character); -} - -/** - * Rewind the input stream by a number of bytes - * - * \param stream Stream to rewind - * \param n Number of bytes to go back - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n) -{ - if (stream == NULL || stream->buffer == NULL) - return HUBBUB_BADPARM; - - if (stream->cursor < n) - return HUBBUB_INVALID; - - stream->cursor -= n; - - return HUBBUB_OK; -} - -/** - * Claim ownership of an input stream's buffer - * - * \param stream Input stream whose buffer to claim - * \param buffer Pointer to location to receive buffer pointer - * \param len Pointer to location to receive byte length of buffer - * \return HUBBUB_OK on success, appropriate error otherwise. - * - * Once the buffer has been claimed by a client, the input stream disclaims - * all ownership rights (and invalidates any internal references it may have - * to the buffer). Therefore, the only input stream call which may be made - * after calling this function is to destroy the input stream. Therefore, - * unless the stream pointer is located at EOF, this call will return an - * error. 
- */ -hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream, - uint8_t **buffer, size_t *len) -{ - if (stream == NULL || buffer == NULL || len == NULL) - return HUBBUB_BADPARM; - - if (stream->had_eof == false || - stream->cursor != stream->buffer_len) - return HUBBUB_INVALID; - - *buffer = stream->buffer; - *len = stream->buffer_len; - - stream->buffer = NULL; - - return HUBBUB_OK; -} - -/** - * Register interest in buffer moved events - * - * \param stream Input stream to register interest with - * \param handler Pointer to handler function - * \param pw Pointer to client-specific private data (may be NULL) - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_inputstream_register_movehandler( - hubbub_inputstream *stream, - hubbub_inputstream_buffermoved handler, void *pw) -{ - hubbub_inputstream_bm_handler *h; - - if (stream == NULL || handler == NULL) - return HUBBUB_BADPARM; - - h = stream->alloc(NULL, sizeof(hubbub_inputstream_bm_handler), - stream->pw); - if (h == NULL) - return HUBBUB_NOMEM; - - h->handler = handler; - h->pw = pw; - - h->prev = NULL; - h->next = stream->handlers; - - if (stream->handlers) - stream->handlers->prev = h; - stream->handlers = h; - - /* And notify claimant of current buffer location */ - handler(stream->buffer, stream->buffer_len, pw); - - return HUBBUB_OK; -} - -/** - * Deregister interest in buffer moved events - * - * \param stream Input stream to deregister from - * \param handler Pointer to handler function - * \param pw Pointer to client-specific private data (may be NULL) - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_inputstream_deregister_movehandler( - hubbub_inputstream *stream, - hubbub_inputstream_buffermoved handler, void *pw) -{ - hubbub_inputstream_bm_handler *h; - - if (stream == NULL || handler == NULL) - return HUBBUB_BADPARM; - - for (h = stream->handlers; h; h = h->next) { - if (h->handler == handler && h->pw == pw) 
- break; - } - - if (h == NULL) - return HUBBUB_INVALID; - - if (h->next) - h->next->prev = h->prev; - if (h->prev) - h->prev->next = h->next; - else - stream->handlers = h->next; - - stream->alloc(h, 0, stream->pw); - - return HUBBUB_OK; -} - -/** - * Case insensitively compare a pair of ranges in the input stream - * - * \param stream Input stream to look in - * \param r1 Offset of start of first range - * \param r2 Offset of start of second range - * \param len Byte length of ranges - * \return 0 if ranges match, non-zero otherwise - */ -int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream, - uint32_t r1, uint32_t r2, size_t len) -{ - if (stream == NULL || stream->buffer == NULL) - return 1; /* arbitrary */ - - return stream->cmp_range_ci(stream, r1, r2, len); -} - -/** - * Case sensitively compare a pair of ranges in the input stream - * - * \param stream Input stream to look in - * \param r1 Offset of start of first range - * \param r2 Offset of start of second range - * \param len Byte length of ranges - * \return 0 if ranges match, non-zero otherwise - */ -int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream, - uint32_t r1, uint32_t r2, size_t len) -{ - if (stream == NULL || stream->buffer == NULL) - return 1; /* arbitrary */ - - return stream->cmp_range_cs(stream, r1, r2, len); -} - -/** - * Case sensitively compare a range of input stream against an ASCII string - * - * \param stream Input stream to look in - * \param off Offset of range start - * \param len Byte length of range - * \param data Comparison string - * \param dlen Byte length of comparison string - * \return 0 if match, non-zero otherwise - */ -int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream, - uint32_t off, size_t len, const char *data, size_t dlen) -{ - if (stream == NULL || stream->buffer == NULL) - return 1; /* arbitrary */ - - return stream->cmp_range_ascii(stream, off, len, data, dlen); -} - -/** - * Replace a range of bytes in the input 
stream with a single character - * - * \param stream Input stream containing data - * \param start Offset of start of range to replace - * \param len Length (in bytes) of range to replace - * \param ucs4 UCS4 (host endian) encoded replacement character - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream, - uint32_t start, size_t len, uint32_t ucs4) -{ - if (stream == NULL || stream->buffer == NULL) - return HUBBUB_BADPARM; - - if (start >= stream->buffer_len) - return HUBBUB_INVALID; - - if (start < stream->cursor) - return HUBBUB_INVALID; - - return stream->replace_range(stream, start, len, ucs4); -} - -/** - * Read the document charset - * - * \param stream Input stream to query - * \param source Pointer to location to receive charset source - * \return Pointer to charset name (constant; do not free), or NULL if unknown - */ -const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream, - hubbub_charset_source *source) -{ - if (stream == NULL || source == NULL) - return NULL; - - *source = stream->encsrc; - - if (stream->encsrc == HUBBUB_CHARSET_UNKNOWN) - return NULL; - - return hubbub_mibenum_to_name(stream->mibenum); -} - -/** - * Inform interested parties that the buffer has moved - * - * \param stream Input stream - */ -void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream) -{ - hubbub_inputstream_bm_handler *h; - - if (stream == NULL) - return; - - for (h = stream->handlers; h; h = h->next) - h->handler(stream->buffer, stream->buffer_len, h->pw); -} - diff --git a/src/input/inputstream.h b/src/input/inputstream.h deleted file mode 100644 index 5325d14..0000000 --- a/src/input/inputstream.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * This file is part of Hubbub. 
- * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> - */ - -#ifndef hubbub_input_inputstream_h_ -#define hubbub_input_inputstream_h_ - -#include <inttypes.h> - -#include <hubbub/errors.h> -#include <hubbub/functypes.h> -#include <hubbub/types.h> - -typedef struct hubbub_inputstream hubbub_inputstream; - -/* EOF pseudo-character */ -#define HUBBUB_INPUTSTREAM_EOF (0xFFFFFFFFU) -/* Out-of-data indicator */ -#define HUBBUB_INPUTSTREAM_OOD (0xFFFFFFFEU) - -/* Type of input stream buffer moved handler function */ -typedef void (*hubbub_inputstream_buffermoved)(const uint8_t *buffer, - size_t len, void *pw); - -/* Create an input stream */ -hubbub_inputstream *hubbub_inputstream_create(const char *enc, - const char *int_enc, hubbub_alloc alloc, void *pw); -/* Destroy an input stream */ -void hubbub_inputstream_destroy(hubbub_inputstream *stream); - -/* Append data to an input stream */ -hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream, - const uint8_t *data, size_t len); -/* Insert data into stream at current location */ -hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream, - const uint8_t *data, size_t len); - -/* Look at the next character in the stream */ -uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream); - -/* Retrieve the byte index and length of the current character in the stream */ -uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream, size_t *len); - -/* Convert the current character to lowercase */ -void hubbub_inputstream_lowercase(hubbub_inputstream *stream); - -/* Convert the current character to uppercase */ -void hubbub_inputstream_uppercase(hubbub_inputstream *stream); - -/* Advance the stream's current position */ -void hubbub_inputstream_advance(hubbub_inputstream *stream); - -/* Push a character back onto the stream */ -hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream, - uint32_t 
character); - -/* Rewind the input stream by a number of bytes */ -hubbub_error hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n); - -/* Claim ownership of an input stream's buffer */ -hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream, - uint8_t **buffer, size_t *len); - -/* Register interest in buffer moved events */ -hubbub_error hubbub_inputstream_register_movehandler( - hubbub_inputstream *stream, - hubbub_inputstream_buffermoved handler, void *pw); - -/* Deregister interest in buffer moved events */ -hubbub_error hubbub_inputstream_deregister_movehandler( - hubbub_inputstream *stream, - hubbub_inputstream_buffermoved handler, void *pw); - -/* Case insensitively compare a pair of ranges in the input stream */ -int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream, - uint32_t r1, uint32_t r2, size_t len); - -/* Case sensitively compare a pair of ranges in the input stream */ -int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream, - uint32_t r1, uint32_t r2, size_t len); - -/* Case sensitively compare a range of input stream against an ASCII string */ -int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream, - uint32_t off, size_t len, const char *data, size_t dlen); - -/* Replace a range of bytes in the input stream with a single character */ -hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream, - uint32_t start, size_t len, uint32_t ucs4); - -/* Read the document charset */ -const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream, - hubbub_charset_source *source); - -#endif - diff --git a/src/input/streamimpl.h b/src/input/streamimpl.h deleted file mode 100644 index f44f6da..0000000 --- a/src/input/streamimpl.h +++ /dev/null @@ -1,77 +0,0 @@ -/* - * This file is part of Hubbub. 
- * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> - */ - -#ifndef hubbub_input_streamimpl_h_ -#define hubbub_input_streamimpl_h_ - -#include <stdbool.h> - -#include <hubbub/types.h> - -#include "input/filter.h" -#include "input/inputstream.h" - -typedef struct hubbub_inputstream_bm_handler hubbub_inputstream_bm_handler; - -/** - * Input stream definition: implementations extend this - */ -struct hubbub_inputstream { - uint8_t *buffer; /**< Document buffer */ - size_t buffer_len; /**< Amount of data in buffer */ - size_t buffer_alloc; /**< Allocated size of buffer */ - - uint32_t cursor; /**< Byte offset of current position */ - - bool had_eof; /**< Whether EOF has been reached */ - - uint16_t mibenum; /**< MIB enum for charset, or 0 */ - hubbub_charset_source encsrc; /**< Charset source */ - - hubbub_filter *input; /**< Charset conversion filter */ - - hubbub_inputstream_bm_handler *handlers; /**< List of buffer - * moved handlers */ - hubbub_alloc alloc; /**< Memory (de)allocation function */ - void *pw; /**< Client private data */ - - void (*destroy)(hubbub_inputstream *stream); - hubbub_error (*append)(hubbub_inputstream *stream, - const uint8_t *data, size_t len); - hubbub_error (*insert)(hubbub_inputstream *stream, - const uint8_t *data, size_t len); - uint32_t (*peek)(hubbub_inputstream *stream); - uint32_t (*cur_pos)(hubbub_inputstream *stream, size_t *len); - void (*lowercase)(hubbub_inputstream *stream); - void (*uppercase)(hubbub_inputstream *stream); - void (*advance)(hubbub_inputstream *stream); - hubbub_error (*push_back)(hubbub_inputstream *stream, - uint32_t character); - int (*cmp_range_ci)(hubbub_inputstream *stream, uint32_t r1, - uint32_t r2, size_t len); - int (*cmp_range_cs)(hubbub_inputstream *stream, uint32_t r1, - uint32_t r2, size_t len); - int (*cmp_range_ascii)(hubbub_inputstream *stream, - uint32_t off, size_t len, - const char *data, 
size_t dlen); - hubbub_error (*replace_range)(hubbub_inputstream *stream, - uint32_t start, size_t len, uint32_t ucs4); -}; - -/** - * Input stream factory component definition - */ -typedef struct hubbub_streamhandler { - bool (*uses_encoding)(const char *int_enc); - hubbub_inputstream *(*create)(const char *enc, const char *int_enc, - hubbub_alloc alloc, void *pw); -} hubbub_streamhandler; - -/* Notification of stream buffer moving */ -void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream); - -#endif diff --git a/src/input/utf16_stream.c b/src/input/utf16_stream.c deleted file mode 100644 index e69f124..0000000 --- a/src/input/utf16_stream.c +++ /dev/null @@ -1,605 +0,0 @@ -/* - * This file is part of Hubbub. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> - */ - -#include <stdbool.h> -#include <string.h> - -#include "charset/aliases.h" -#include "charset/detect.h" -#include "input/streamimpl.h" -#include "utils/utf16.h" -#include "utils/utils.h" - -#define BUFFER_CHUNK (4096) - -static bool hubbub_utf16stream_uses_encoding(const char *int_enc); -static hubbub_inputstream *hubbub_utf16stream_create(const char *enc, - const char *int_enc, hubbub_alloc alloc, void *pw); -static void hubbub_utf16stream_destroy(hubbub_inputstream *stream); -static hubbub_error hubbub_utf16stream_append(hubbub_inputstream *stream, - const uint8_t *data, size_t len); -static hubbub_error hubbub_utf16stream_insert(hubbub_inputstream *stream, - const uint8_t *data, size_t len); -static uint32_t hubbub_utf16stream_peek(hubbub_inputstream *stream); -static uint32_t hubbub_utf16stream_cur_pos(hubbub_inputstream *stream, - size_t *len); -static void hubbub_utf16stream_lowercase(hubbub_inputstream *stream); -static void hubbub_utf16stream_uppercase(hubbub_inputstream *stream); -static void hubbub_utf16stream_advance(hubbub_inputstream *stream); -static hubbub_error 
hubbub_utf16stream_push_back(hubbub_inputstream *stream, - uint32_t character); -static int hubbub_utf16stream_compare_range_ci(hubbub_inputstream *stream, - uint32_t r1, uint32_t r2, size_t len); -static int hubbub_utf16stream_compare_range_cs(hubbub_inputstream *stream, - uint32_t r1, uint32_t r2, size_t len); -static int hubbub_utf16stream_compare_range_ascii(hubbub_inputstream *stream, - uint32_t off, size_t len, const char *data, size_t dlen); -static hubbub_error hubbub_utf16stream_replace_range( - hubbub_inputstream *stream, - uint32_t start, size_t len, uint32_t ucs4); - -/** - * Determine whether a stream implementation uses an internal encoding - * - * \param int_enc The desired encoding - * \return true if handled, false otherwise - */ -bool hubbub_utf16stream_uses_encoding(const char *int_enc) -{ - return (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) == - hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16"))); -} - -/** - * Create an input stream - * - * \param enc Document charset, or NULL if unknown - * \param int_enc Desired encoding of document - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return Pointer to stream instance, or NULL on failure - */ -hubbub_inputstream *hubbub_utf16stream_create(const char *enc, - const char *int_enc, hubbub_alloc alloc, void *pw) -{ - hubbub_inputstream *stream; - - if (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) != - hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16"))) - return NULL; - - stream = alloc(NULL, sizeof(hubbub_inputstream), pw); - if (stream == NULL) - return NULL; - - stream->buffer = alloc(NULL, BUFFER_CHUNK, pw); - if (stream->buffer == NULL) { - alloc(stream, 0, pw); - return NULL; - } - - stream->buffer_len = 0; - stream->buffer_alloc = BUFFER_CHUNK; - - stream->cursor = 0; - - stream->had_eof = false; - - stream->input = hubbub_filter_create(int_enc, alloc, pw); - if (stream->input == NULL) { - alloc(stream->buffer, 
0, pw); - alloc(stream, 0, pw); - return NULL; - } - - if (enc != NULL) { - hubbub_error error; - hubbub_filter_optparams params; - - stream->mibenum = hubbub_mibenum_from_name(enc, strlen(enc)); - - if (stream->mibenum != 0) { - params.encoding.name = enc; - - error = hubbub_filter_setopt(stream->input, - HUBBUB_FILTER_SET_ENCODING, ¶ms); - if (error != HUBBUB_OK && error != HUBBUB_INVALID) { - hubbub_filter_destroy(stream->input); - alloc(stream->buffer, 0, pw); - alloc(stream, 0, pw); - return NULL; - } - - stream->encsrc = HUBBUB_CHARSET_DICTATED; - } - } else { - stream->mibenum = 0; - stream->encsrc = HUBBUB_CHARSET_UNKNOWN; - } - - stream->destroy = hubbub_utf16stream_destroy; - stream->append = hubbub_utf16stream_append; - stream->insert = hubbub_utf16stream_insert; - stream->peek = hubbub_utf16stream_peek; - stream->cur_pos = hubbub_utf16stream_cur_pos; - stream->lowercase = hubbub_utf16stream_lowercase; - stream->uppercase = hubbub_utf16stream_uppercase; - stream->advance = hubbub_utf16stream_advance; - stream->push_back = hubbub_utf16stream_push_back; - stream->cmp_range_ci = hubbub_utf16stream_compare_range_ci; - stream->cmp_range_cs = hubbub_utf16stream_compare_range_cs; - stream->cmp_range_ascii = hubbub_utf16stream_compare_range_ascii; - stream->replace_range = hubbub_utf16stream_replace_range; - - return stream; -} - -/** - * Destroy an input stream - * - * \param stream Input stream to destroy - */ -void hubbub_utf16stream_destroy(hubbub_inputstream *stream) -{ - if (stream->input != NULL) { - hubbub_filter_destroy(stream->input); - } - - if (stream->buffer != NULL) { - stream->alloc(stream->buffer, 0, stream->pw); - } - - stream->alloc(stream, 0, stream->pw); -} - -/** - * Append data to an input stream - * - * \param stream Input stream to append data to - * \param data Data to append (in document charset), or NULL to flag EOF - * \param len Length, in bytes, of data - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error 
hubbub_utf16stream_append(hubbub_inputstream *stream, - const uint8_t *data, size_t len) -{ - hubbub_error error; - uint8_t *base; - size_t space; - - if (data == NULL) { - /* EOF indicated */ - size_t dummy_len = 0; - uint8_t *dummy_data = (uint8_t *) &dummy_len; - - base = stream->buffer + stream->buffer_len; - space = stream->buffer_alloc - stream->buffer_len; - - /* Forcibly flush through any remaining buffered data */ - while ((error = hubbub_filter_process_chunk(stream->input, - (const uint8_t **) &dummy_data, &dummy_len, - &base, &space)) == HUBBUB_NOMEM) { - bool moved = false; - uint8_t *temp = stream->alloc(stream->buffer, - stream->buffer_alloc + BUFFER_CHUNK, - stream->pw); - - if (temp == NULL) { - return HUBBUB_NOMEM; - } - - moved = (temp != stream->buffer); - - stream->buffer = temp; - stream->buffer_len = stream->buffer_alloc - space; - stream->buffer_alloc += BUFFER_CHUNK; - - base = stream->buffer + stream->buffer_len; - space += BUFFER_CHUNK; - - if (moved) - hubbub_inputstream_buffer_moved(stream); - } - - /* And fix up buffer length */ - stream->buffer_len = stream->buffer_alloc - space; - - stream->had_eof = true; - } else { - /* Normal data chunk */ - - if (stream->mibenum == 0) { - /* Haven't found charset yet; detect it */ - error = hubbub_charset_extract(&data, &len, - &stream->mibenum, &stream->encsrc); - if (error) { - return error; - } - - /* We should always have a charset by now */ - if (stream->mibenum == 0) - abort(); - } - - base = stream->buffer + stream->buffer_len; - space = stream->buffer_alloc - stream->buffer_len; - - /* Convert chunk to UTF-16 */ - while ((error = hubbub_filter_process_chunk(stream->input, - &data, &len, - &base, &space)) == HUBBUB_NOMEM) { - bool moved = false; - uint8_t *temp = stream->alloc(stream->buffer, - stream->buffer_alloc + BUFFER_CHUNK, - stream->pw); - - if (temp == NULL) { - return HUBBUB_NOMEM; - } - - moved = (temp != stream->buffer); - - stream->buffer = temp; - stream->buffer_len = 
stream->buffer_alloc - space; - stream->buffer_alloc += BUFFER_CHUNK; - - base = stream->buffer + stream->buffer_len; - space += BUFFER_CHUNK; - - if (moved) - hubbub_inputstream_buffer_moved(stream); - } - - /* And fix up buffer length */ - stream->buffer_len = stream->buffer_alloc - space; - } - - return HUBBUB_OK; -} - -/** - * Insert data into stream at current location - * - * \param stream Input stream to insert into - * \param data Data to insert (UTF-16 encoded) - * \param len Length, in bytes, of data - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf16stream_insert(hubbub_inputstream *stream, - const uint8_t *data, size_t len) -{ - size_t space; - uint8_t *curpos; - - space = stream->buffer_alloc - stream->buffer_len; - - /* Need to grow buffer, if there's insufficient space */ - if (space <= len) { - bool moved = false; - uint8_t *temp = stream->alloc(stream->buffer, - stream->buffer_alloc + - ((len + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK) + - BUFFER_CHUNK, - stream->pw); - - if (temp == NULL) - return HUBBUB_NOMEM; - - moved = (temp != stream->buffer); - - stream->buffer = temp; - stream->buffer_alloc += - ((len + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK); - - if (moved) - hubbub_inputstream_buffer_moved(stream); - } - - /* Find the insertion point - * (just before the next character to be read) */ - curpos = stream->buffer + stream->cursor; - - /* Move data above this point up */ - memmove(curpos + len, curpos, stream->buffer_len - stream->cursor); - - /* Copy new data into gap created by memmove */ - memcpy(curpos, data, len); - - /* Fix up buffer length */ - stream->buffer_len += len; - - return HUBBUB_OK; -} - -/** - * Look at the next character in the stream - * - * \param stream Stream to look in - * \return UCS4 (host-endian) character code, or EOF or OOD. 
- */ -uint32_t hubbub_utf16stream_peek(hubbub_inputstream *stream) -{ - hubbub_error error; - size_t len; - uint32_t ret; - - if (stream->cursor == stream->buffer_len) { - return stream->had_eof ? HUBBUB_INPUTSTREAM_EOF - : HUBBUB_INPUTSTREAM_OOD; - } - - error = hubbub_utf16_to_ucs4(stream->buffer + stream->cursor, - stream->buffer_len - stream->cursor, - &ret, &len); - if (error != HUBBUB_OK && error != HUBBUB_NEEDDATA) - return HUBBUB_INPUTSTREAM_OOD; - - if (error == HUBBUB_NEEDDATA) { - if (stream->had_eof) - return HUBBUB_INPUTSTREAM_EOF; - else - return HUBBUB_INPUTSTREAM_OOD; - } - - return ret; -} - -/** - * Retrieve the byte index and length of the current character in the stream - * - * \param stream Stream to look in - * \param len Pointer to location to receive byte length of character - * \return Byte index of current character from start of stream, - * or (uint32_t) -1 on error - */ -uint32_t hubbub_utf16stream_cur_pos(hubbub_inputstream *stream, - size_t *len) -{ - hubbub_utf16_char_byte_length(stream->buffer + stream->cursor, len); - - return stream->cursor; -} - -/** - * Convert the current character to lower case - * - * \param stream Stream to look in - */ -void hubbub_utf16stream_lowercase(hubbub_inputstream *stream) -{ - uint16_t *buf = (uint16_t *) - ((void *) (stream->buffer + stream->cursor)); - - if (0x0041 <= buf[0] && buf[0] <= 0x005B) - buf[0] += 0x0020; -} - -/** - * Convert the current character to upper case - * - * \param stream Stream to look in - */ -void hubbub_utf16stream_uppercase(hubbub_inputstream *stream) -{ - uint16_t *buf = (uint16_t *) - ((void *) (stream->buffer + stream->cursor)); - - if (0x0061 <= buf[0] && buf[0] <= 0x007B) - buf[0] -= 0x0020; -} - -/** - * Advance the stream's current position - * - * \param stream The stream whose position to advance - */ -void hubbub_utf16stream_advance(hubbub_inputstream *stream) -{ - hubbub_error error; - uint32_t next; - - error = hubbub_utf16_next(stream->buffer, 
stream->buffer_len, - stream->cursor, &next); - - if (error == HUBBUB_OK) - stream->cursor = next; -} - -/** - * Push a character back onto the stream - * - * \param stream Stream to push back to - * \param character UCS4 (host-endian) codepoint to push back - * \return HUBBUB_OK on success, appropriate error otherwise - * - * Note that this doesn't actually modify the data in the stream. - * It works by ensuring that the character located just before the - * current stream location is the same as ::character. If it is, - * then the stream pointer is moved back. If it is not, then an - * error is returned and the stream pointer remains unmodified. - */ -hubbub_error hubbub_utf16stream_push_back(hubbub_inputstream *stream, - uint32_t character) -{ - hubbub_error error; - uint32_t prev; - uint8_t buf[4]; - size_t len; - - error = hubbub_utf16_prev(stream->buffer, stream->cursor, &prev); - if (error != HUBBUB_OK) - return error; - - error = hubbub_utf16_from_ucs4(character, buf, &len); - if (error != HUBBUB_OK) - return error; - - if ((stream->cursor - prev) != len || - memcmp(stream->buffer + prev, buf, len) != 0) - return HUBBUB_INVALID; - - stream->cursor = prev; - - return HUBBUB_OK; -} - -/** - * Case insensitively compare a pair of ranges in the input stream - * - * \param stream Input stream to look in - * \param r1 Offset of start of first range - * \param r2 Offset of start of second range - * \param len Byte length of ranges - * \return 0 if ranges match, non-zero otherwise - */ -int hubbub_utf16stream_compare_range_ci(hubbub_inputstream *stream, - uint32_t r1, uint32_t r2, size_t len) -{ - uint8_t *range1 = (stream->buffer + r1); - uint8_t *range2 = (stream->buffer + r2); - int c1, c2; - uint32_t r1next, r2next; - hubbub_error error; - - if (len == 0) - return 0; - - do { - c1 = *((uint16_t *) (void *) range1); - c2 = *((uint16_t *) (void *) range2); - - if ((0x0041 <= c1 && c1 <= 0x005B)) - c1 |= 0x0020; - - if ((0x0041 <= c2 && c2 <= 0x005B)) - c2 |= 
0x0020; - - error = hubbub_utf16_next(range1, len, 0, &r1next); - error = hubbub_utf16_next(range2, len, 0, &r2next); - - range1 += r1next; - range2 += r2next; - - len -= r1next; - } while(c1 != 0 && (c1 == c2) && len > 0); - - return (c1 - c2); -} - -/** - * Case sensitively compare a pair of ranges in the input stream - * - * \param stream Input stream to look in - * \param r1 Offset of start of first range - * \param r2 Offset of start of second range - * \param len Byte length of ranges - * \return 0 if ranges match, non-zero otherwise - */ -int hubbub_utf16stream_compare_range_cs(hubbub_inputstream *stream, - uint32_t r1, uint32_t r2, size_t len) -{ - return memcmp((const char *) (stream->buffer + r1), - (const char *) (stream->buffer + r2), len); -} - -/** - * Case sensitively compare a range of input stream against an ASCII string - * - * \param stream Input stream to look in - * \param off Offset of range start - * \param len Byte length of range - * \param data Comparison string - * \param dlen Byte length of comparison string - * \return 0 if match, non-zero otherwise - */ -int hubbub_utf16stream_compare_range_ascii(hubbub_inputstream *stream, - uint32_t off, size_t len, const char *data, size_t dlen) -{ - uint8_t *range = (stream->buffer + off); - int c1, c2; - - /* Lengths don't match, so strings don't */ - if (len != dlen * 2) - return 1; /* arbitrary */ - - do { - c1 = *((uint16_t *) (void *) range); - c2 = *data; - - range += 2; - data++; - - len -= 2; - } while (c1 != 0 && (c1 == c2) && len > 0); - - return (c1 - c2); -} - -/** - * Replace a range of bytes in the input stream with a single character - * - * \param stream Input stream containing data - * \param start Offset of start of range to replace - * \param len Length (in bytes) of range to replace - * \param ucs4 UCS4 (host endian) encoded replacement character - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf16stream_replace_range(hubbub_inputstream 
*stream, - uint32_t start, size_t len, uint32_t ucs4) -{ - uint8_t buf[4]; - size_t replen; - int32_t diff; - hubbub_error error; - - /* Get utf16 version of replacement character */ - error = hubbub_utf16_from_ucs4(ucs4, buf, &replen); - if (error) - return error; - - diff = replen - len; - - if (stream->buffer_len + diff >= stream->buffer_alloc) { - /* Need more buffer space */ - bool moved = false; - uint8_t *temp = stream->alloc(stream->buffer, - stream->buffer_alloc + - ((diff + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK) + - BUFFER_CHUNK, - stream->pw); - - if (temp == NULL) - return HUBBUB_NOMEM; - - moved = (temp != stream->buffer); - - stream->buffer = temp; - stream->buffer_alloc += - ((diff + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK); - - if (moved) - hubbub_inputstream_buffer_moved(stream); - } - - /* Move subsequent input to correct location */ - memmove(stream->buffer + start + len + diff, - stream->buffer + start + len, - stream->buffer_len - (start + len)); - - /* And fill the gap with the replacement character */ - memcpy(stream->buffer + start, buf, replen); - - /* Finally, update length */ - stream->buffer_len += diff; - - return HUBBUB_OK; -} - -hubbub_streamhandler utf16stream = { - hubbub_utf16stream_uses_encoding, - hubbub_utf16stream_create -}; diff --git a/src/input/utf8_stream.c b/src/input/utf8_stream.c deleted file mode 100644 index 3de142b..0000000 --- a/src/input/utf8_stream.c +++ /dev/null @@ -1,562 +0,0 @@ -/* - * This file is part of Hubbub. 
- * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> - */ - -#include <stdbool.h> -#include <string.h> - -#include "charset/aliases.h" -#include "charset/detect.h" -#include "input/streamimpl.h" -#include "utils/utf8.h" -#include "utils/utils.h" - -#define BUFFER_CHUNK (4096) - -static bool hubbub_utf8stream_uses_encoding(const char *int_enc); -static hubbub_inputstream *hubbub_utf8stream_create(const char *enc, - const char *int_enc, hubbub_alloc alloc, void *pw); -static void hubbub_utf8stream_destroy(hubbub_inputstream *stream); -static hubbub_error hubbub_utf8stream_append(hubbub_inputstream *stream, - const uint8_t *data, size_t len); -static hubbub_error hubbub_utf8stream_insert(hubbub_inputstream *stream, - const uint8_t *data, size_t len); -static uint32_t hubbub_utf8stream_peek(hubbub_inputstream *stream); -static uint32_t hubbub_utf8stream_cur_pos(hubbub_inputstream *stream, - size_t *len); -static void hubbub_utf8stream_lowercase(hubbub_inputstream *stream); -static void hubbub_utf8stream_uppercase(hubbub_inputstream *stream); -static void hubbub_utf8stream_advance(hubbub_inputstream *stream); -static hubbub_error hubbub_utf8stream_push_back(hubbub_inputstream *stream, - uint32_t character); -static int hubbub_utf8stream_compare_range_ci(hubbub_inputstream *stream, - uint32_t r1, uint32_t r2, size_t len); -static int hubbub_utf8stream_compare_range_cs(hubbub_inputstream *stream, - uint32_t r1, uint32_t r2, size_t len); -static int hubbub_utf8stream_compare_range_ascii(hubbub_inputstream *stream, - uint32_t off, size_t len, const char *data, size_t dlen); -static hubbub_error hubbub_utf8stream_replace_range( - hubbub_inputstream *stream, - uint32_t start, size_t len, uint32_t ucs4); - -/** - * Determine whether a stream implementation uses an internal encoding - * - * \param int_enc The desired encoding - * \return true if handled, false otherwise - */ -bool 
hubbub_utf8stream_uses_encoding(const char *int_enc) -{ - return (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) == - hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"))); -} - -/** - * Create an input stream - * - * \param enc Document charset, or NULL if unknown - * \param int_enc Desired encoding of document - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return Pointer to stream instance, or NULL on failure - */ -hubbub_inputstream *hubbub_utf8stream_create(const char *enc, - const char *int_enc, hubbub_alloc alloc, void *pw) -{ - hubbub_inputstream *stream; - - if (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) != - hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"))) - return NULL; - - stream = alloc(NULL, sizeof(hubbub_inputstream), pw); - if (stream == NULL) - return NULL; - - stream->buffer = alloc(NULL, BUFFER_CHUNK, pw); - if (stream->buffer == NULL) { - alloc(stream, 0, pw); - return NULL; - } - - stream->buffer_len = 0; - stream->buffer_alloc = BUFFER_CHUNK; - - stream->cursor = 0; - - stream->had_eof = false; - - stream->input = hubbub_filter_create(int_enc, alloc, pw); - if (stream->input == NULL) { - alloc(stream->buffer, 0, pw); - alloc(stream, 0, pw); - return NULL; - } - - if (enc != NULL) { - hubbub_error error; - hubbub_filter_optparams params; - - stream->mibenum = hubbub_mibenum_from_name(enc, strlen(enc)); - - if (stream->mibenum != 0) { - params.encoding.name = enc; - - error = hubbub_filter_setopt(stream->input, - HUBBUB_FILTER_SET_ENCODING, ¶ms); - if (error != HUBBUB_OK && error != HUBBUB_INVALID) { - hubbub_filter_destroy(stream->input); - alloc(stream->buffer, 0, pw); - alloc(stream, 0, pw); - return NULL; - } - - stream->encsrc = HUBBUB_CHARSET_DICTATED; - } - } else { - stream->mibenum = 0; - stream->encsrc = HUBBUB_CHARSET_UNKNOWN; - } - - stream->destroy = hubbub_utf8stream_destroy; - stream->append = hubbub_utf8stream_append; - stream->insert = 
hubbub_utf8stream_insert; - stream->peek = hubbub_utf8stream_peek; - stream->cur_pos = hubbub_utf8stream_cur_pos; - stream->lowercase = hubbub_utf8stream_lowercase; - stream->uppercase = hubbub_utf8stream_uppercase; - stream->advance = hubbub_utf8stream_advance; - stream->push_back = hubbub_utf8stream_push_back; - stream->cmp_range_ci = hubbub_utf8stream_compare_range_ci; - stream->cmp_range_cs = hubbub_utf8stream_compare_range_cs; - stream->cmp_range_ascii = hubbub_utf8stream_compare_range_ascii; - stream->replace_range = hubbub_utf8stream_replace_range; - - return stream; -} - -/** - * Destroy an input stream - * - * \param stream Input stream to destroy - */ -void hubbub_utf8stream_destroy(hubbub_inputstream *stream) -{ - if (stream->input != NULL) { - hubbub_filter_destroy(stream->input); - } - - if (stream->buffer != NULL) { - stream->alloc(stream->buffer, 0, stream->pw); - } - - stream->alloc(stream, 0, stream->pw); -} - -/** - * Append data to an input stream - * - * \param stream Input stream to append data to - * \param data Data to append (in document charset), or NULL to flag EOF - * \param len Length, in bytes, of data - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf8stream_append(hubbub_inputstream *stream, - const uint8_t *data, size_t len) -{ - hubbub_error error; - uint8_t *base; - size_t space; - - if (data == NULL) { - /* EOF indicated */ - size_t dummy_len = 0; - uint8_t *dummy_data = (uint8_t *) &dummy_len; - - base = stream->buffer + stream->buffer_len; - space = stream->buffer_alloc - stream->buffer_len; - - /* Forcibly flush through any remaining buffered data */ - while ((error = hubbub_filter_process_chunk(stream->input, - (const uint8_t **) &dummy_data, &dummy_len, - &base, &space)) == HUBBUB_NOMEM) { - bool moved = false; - uint8_t *temp = stream->alloc(stream->buffer, - stream->buffer_alloc + BUFFER_CHUNK, - stream->pw); - - if (temp == NULL) { - return HUBBUB_NOMEM; - } - - moved = (temp != 
stream->buffer); - - stream->buffer = temp; - stream->buffer_len = stream->buffer_alloc - space; - stream->buffer_alloc += BUFFER_CHUNK; - - base = stream->buffer + stream->buffer_len; - space += BUFFER_CHUNK; - - if (moved) - hubbub_inputstream_buffer_moved(stream); - } - - /* And fix up buffer length */ - stream->buffer_len = stream->buffer_alloc - space; - - stream->had_eof = true; - } else { - /* Normal data chunk */ - - if (stream->mibenum == 0) { - /* Haven't found charset yet; detect it */ - error = hubbub_charset_extract(&data, &len, - &stream->mibenum, &stream->encsrc); - if (error) { - return error; - } - - /* We should always have a charset by now */ - if (stream->mibenum == 0) - abort(); - } - - base = stream->buffer + stream->buffer_len; - space = stream->buffer_alloc - stream->buffer_len; - - /* Convert chunk to UTF-8 */ - while ((error = hubbub_filter_process_chunk(stream->input, - &data, &len, - &base, &space)) == HUBBUB_NOMEM) { - bool moved = false; - uint8_t *temp = stream->alloc(stream->buffer, - stream->buffer_alloc + BUFFER_CHUNK, - stream->pw); - - if (temp == NULL) { - return HUBBUB_NOMEM; - } - - moved = (temp != stream->buffer); - - stream->buffer = temp; - stream->buffer_len = stream->buffer_alloc - space; - stream->buffer_alloc += BUFFER_CHUNK; - - base = stream->buffer + stream->buffer_len; - space += BUFFER_CHUNK; - - if (moved) - hubbub_inputstream_buffer_moved(stream); - } - - /* And fix up buffer length */ - stream->buffer_len = stream->buffer_alloc - space; - } - - return HUBBUB_OK; -} - -/** - * Insert data into stream at current location - * - * \param stream Input stream to insert into - * \param data Data to insert (UTF-8 encoded) - * \param len Length, in bytes, of data - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf8stream_insert(hubbub_inputstream *stream, - const uint8_t *data, size_t len) -{ - size_t space; - uint8_t *curpos; - - space = stream->buffer_alloc - stream->buffer_len; 
- - /* Need to grow buffer, if there's insufficient space */ - if (space <= len) { - bool moved = false; - uint8_t *temp = stream->alloc(stream->buffer, - stream->buffer_alloc + - ((len + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK) + - BUFFER_CHUNK, - stream->pw); - - if (temp == NULL) - return HUBBUB_NOMEM; - - moved = (temp != stream->buffer); - - stream->buffer = temp; - stream->buffer_alloc += - ((len + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK); - - if (moved) - hubbub_inputstream_buffer_moved(stream); - } - - /* Find the insertion point - * (just before the next character to be read) */ - curpos = stream->buffer + stream->cursor; - - /* Move data above this point up */ - memmove(curpos + len, curpos, stream->buffer_len - stream->cursor); - - /* Copy new data into gap created by memmove */ - memcpy(curpos, data, len); - - /* Fix up buffer length */ - stream->buffer_len += len; - - return HUBBUB_OK; -} - -/** - * Look at the next character in the stream - * - * \param stream Stream to look in - * \return UCS4 (host-endian) character code, or EOF or OOD. - */ -uint32_t hubbub_utf8stream_peek(hubbub_inputstream *stream) -{ - hubbub_error error; - size_t len; - uint32_t ret; - - if (stream->cursor == stream->buffer_len) { - return stream->had_eof ? 
HUBBUB_INPUTSTREAM_EOF - : HUBBUB_INPUTSTREAM_OOD; - } - - error = hubbub_utf8_to_ucs4(stream->buffer + stream->cursor, - stream->buffer_len - stream->cursor, - &ret, &len); - if (error != HUBBUB_OK && error != HUBBUB_NEEDDATA) - return HUBBUB_INPUTSTREAM_OOD; - - if (error == HUBBUB_NEEDDATA) { - if (stream->had_eof) - return HUBBUB_INPUTSTREAM_EOF; - else - return HUBBUB_INPUTSTREAM_OOD; - } - - return ret; -} - -/** - * Retrieve the byte index and length of the current character in the stream - * - * \param stream Stream to look in - * \param len Pointer to location to receive byte length of character - * \return Byte index of current character from start of stream, - * or (uint32_t) -1 on error - */ -uint32_t hubbub_utf8stream_cur_pos(hubbub_inputstream *stream, - size_t *len) -{ - hubbub_utf8_char_byte_length(stream->buffer + stream->cursor, len); - - return stream->cursor; -} - -/** - * Convert the current character to lower case - * - * \param stream Stream to look in - */ -void hubbub_utf8stream_lowercase(hubbub_inputstream *stream) -{ - if ('A' <= stream->buffer[stream->cursor] && - stream->buffer[stream->cursor] <= 'Z') - stream->buffer[stream->cursor] += 0x0020; -} - -/** - * Convert the current character to upper case - * - * \param stream Stream to look in - */ -void hubbub_utf8stream_uppercase(hubbub_inputstream *stream) -{ - if ('a' <= stream->buffer[stream->cursor] && - stream->buffer[stream->cursor] <= 'z') - stream->buffer[stream->cursor] -= 0x0020; -} - -/** - * Advance the stream's current position - * - * \param stream The stream whose position to advance - */ -void hubbub_utf8stream_advance(hubbub_inputstream *stream) -{ - hubbub_error error; - uint32_t next; - - error = hubbub_utf8_next(stream->buffer, stream->buffer_len, - stream->cursor, &next); - - if (error == HUBBUB_OK) - stream->cursor = next; -} - -/** - * Push a character back onto the stream - * - * \param stream Stream to push back to - * \param character UCS4 (host-endian) 
codepoint to push back - * \return HUBBUB_OK on success, appropriate error otherwise - * - * Note that this doesn't actually modify the data in the stream. - * It works by ensuring that the character located just before the - * current stream location is the same as ::character. If it is, - * then the stream pointer is moved back. If it is not, then an - * error is returned and the stream pointer remains unmodified. - */ -hubbub_error hubbub_utf8stream_push_back(hubbub_inputstream *stream, - uint32_t character) -{ - hubbub_error error; - uint32_t prev; - uint8_t buf[6]; - size_t len; - - error = hubbub_utf8_prev(stream->buffer, stream->cursor, &prev); - if (error != HUBBUB_OK) - return error; - - error = hubbub_utf8_from_ucs4(character, buf, &len); - if (error != HUBBUB_OK) - return error; - - if ((stream->cursor - prev) != len || - memcmp(stream->buffer + prev, buf, len) != 0) - return HUBBUB_INVALID; - - stream->cursor = prev; - - return HUBBUB_OK; -} - -/** - * Case insensitively compare a pair of ranges in the input stream - * - * \param stream Input stream to look in - * \param r1 Offset of start of first range - * \param r2 Offset of start of second range - * \param len Byte length of ranges - * \return 0 if ranges match, non-zero otherwise - */ -int hubbub_utf8stream_compare_range_ci(hubbub_inputstream *stream, - uint32_t r1, uint32_t r2, size_t len) -{ - return strncasecmp((const char *) (stream->buffer + r1), - (const char *) (stream->buffer + r2), len); -} - -/** - * Case sensitively compare a pair of ranges in the input stream - * - * \param stream Input stream to look in - * \param r1 Offset of start of first range - * \param r2 Offset of start of second range - * \param len Byte length of ranges - * \return 0 if ranges match, non-zero otherwise - */ -int hubbub_utf8stream_compare_range_cs(hubbub_inputstream *stream, - uint32_t r1, uint32_t r2, size_t len) -{ - return strncmp((const char *) (stream->buffer + r1), - (const char *) (stream->buffer + r2), 
len); -} - -/** - * Case sensitively compare a range of input stream against an ASCII string - * - * \param stream Input stream to look in - * \param off Offset of range start - * \param len Byte length of range - * \param data Comparison string - * \param dlen Byte length of comparison string - * \return 0 if match, non-zero otherwise - */ -int hubbub_utf8stream_compare_range_ascii(hubbub_inputstream *stream, - uint32_t off, size_t len, const char *data, size_t dlen) -{ - /* Lengths don't match, so strings don't */ - if (len != dlen) - return 1; /* arbitrary */ - - return strncmp((const char *) (stream->buffer + off), - data, len); -} - -/** - * Replace a range of bytes in the input stream with a single character - * - * \param stream Input stream containing data - * \param start Offset of start of range to replace - * \param len Length (in bytes) of range to replace - * \param ucs4 UCS4 (host endian) encoded replacement character - * \return HUBBUB_OK on success, appropriate error otherwise - */ -hubbub_error hubbub_utf8stream_replace_range(hubbub_inputstream *stream, - uint32_t start, size_t len, uint32_t ucs4) -{ - uint8_t buf[6]; - size_t replen; - int32_t diff; - hubbub_error error; - - /* Get UTF8 version of replacement character */ - error = hubbub_utf8_from_ucs4(ucs4, buf, &replen); - if (error) - return error; - - diff = replen - len; - - if (stream->buffer_len + diff >= stream->buffer_alloc) { - /* Need more buffer space */ - bool moved = false; - uint8_t *temp = stream->alloc(stream->buffer, - stream->buffer_alloc + - ((diff + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK) + - BUFFER_CHUNK, - stream->pw); - - if (temp == NULL) - return HUBBUB_NOMEM; - - moved = (temp != stream->buffer); - - stream->buffer = temp; - stream->buffer_alloc += - ((diff + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK); - - if (moved) - hubbub_inputstream_buffer_moved(stream); - } - - /* Move subsequent input to correct location */ - memmove(stream->buffer + start + len + diff, - stream->buffer + 
start + len, - stream->buffer_len - (start + len)); - - /* And fill the gap with the replacement character */ - memcpy(stream->buffer + start, buf, replen); - - /* Finally, update length */ - stream->buffer_len += diff; - - return HUBBUB_OK; -} - -hubbub_streamhandler utf8stream = { - hubbub_utf8stream_uses_encoding, - hubbub_utf8stream_create -}; |