From 7b30a5520cfb56e651f0eb4da85a3e07747da7dc Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sat, 23 Jun 2007 22:40:25 +0000 Subject: Import hubbub -- an HTML parsing library. Plenty of work still to do (like tree generation ;) svn path=/trunk/hubbub/; revision=3359 --- src/input/filter.c | 380 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 380 insertions(+) create mode 100644 src/input/filter.c (limited to 'src/input/filter.c') diff --git a/src/input/filter.c b/src/input/filter.c new file mode 100644 index 0000000..5ac5391 --- /dev/null +++ b/src/input/filter.c @@ -0,0 +1,380 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include +#include +#include + +#include "charset/aliases.h" +#include "charset/codec.h" +#include "utils/utils.h" + +#include "input/filter.h" + + +/** Input filter */ +struct hubbub_filter { + hubbub_charsetcodec *read_codec; /**< Read codec */ + hubbub_charsetcodec *write_codec; /**< Write codec */ + + uint32_t filter_output[2]; /**< Filter output buffer */ + uint32_t last_filter_char; /**< Last filtered character */ + + uint32_t pivot_buf[64]; /**< Conversion pivot buffer */ + + bool leftover; /**< Data remains from last call */ + uint8_t *pivot_left; /**< Remaining pivot to write */ + size_t pivot_len; /**< Length of pivot remaining */ + + struct { + uint16_t encoding; /**< Input encoding */ + } settings; /**< Filter settings */ + + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Client private data */ +}; + +static hubbub_error hubbub_filter_set_defaults(hubbub_filter *input); +static hubbub_error hubbub_filter_set_encoding(hubbub_filter *input, + const char *enc); +static hubbub_error read_character_filter(uint32_t c, + uint32_t **output, size_t *outputlen, void *pw); + +/** + * Create an input filter + * + * \param int_enc Desired encoding of document + * \param alloc Function used to (de)allocate data + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to filter instance, or NULL on failure + */ +hubbub_filter *hubbub_filter_create(const char *int_enc, + hubbub_alloc alloc, void *pw) +{ + hubbub_filter *filter; + + if (alloc == NULL) + return NULL; + + filter = alloc(NULL, sizeof(*filter), pw); + if (!filter) + return NULL; + + filter->last_filter_char = 0; + + filter->leftover = false; + filter->pivot_left = NULL; + filter->pivot_len = 0; + + filter->alloc = alloc; + filter->pw = pw; + + if (hubbub_filter_set_defaults(filter) != HUBBUB_OK) { + filter->alloc(filter, 0, pw); + return NULL; + } + + filter->write_codec = hubbub_charsetcodec_create(int_enc, alloc, pw); + if (filter->write_codec == NULL) { + if (filter->read_codec != NULL) + hubbub_charsetcodec_destroy(filter->read_codec); + filter->alloc(filter, 0, pw); + return NULL; + } + + return filter; +} + +/** + * Destroy an input filter + * + * \param input Pointer to filter instance + */ +void hubbub_filter_destroy(hubbub_filter *input) +{ + if (input == NULL) + return; + + if (input->read_codec != NULL) + hubbub_charsetcodec_destroy(input->read_codec); + + if (input->write_codec != NULL) + hubbub_charsetcodec_destroy(input->write_codec); + + input->alloc(input, 0, input->pw); + + return; +} + +/** + * Configure an input filter + * + * \param input Pointer to filter instance + * \param type Input option type to configure + * \param params Option-specific parameters + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_setopt(hubbub_filter *input, + hubbub_filter_opttype type, + hubbub_filter_optparams *params) +{ + hubbub_error error = HUBBUB_OK; + + if (input == NULL || params == NULL) + return HUBBUB_BADPARM; + + switch (type) { + case HUBBUB_FILTER_SET_ENCODING: + error = hubbub_filter_set_encoding(input, + params->encoding.name); + break; + } + + return error; +} + +/** + * Process a chunk of data + * + * \param input Pointer to filter instance + * \param data Pointer to pointer to input buffer + * \param len Pointer to length of input buffer + * \param output Pointer to pointer to output buffer + * \param outlen Pointer to length of output buffer + * \return HUBBUB_OK on success, appropriate error otherwise + * + * Call this with an input buffer length of 0 to flush any buffers. + */ +hubbub_error hubbub_filter_process_chunk(hubbub_filter *input, + const uint8_t **data, size_t *len, + uint8_t **output, size_t *outlen) +{ + hubbub_error read_error, write_error; + + if (input == NULL || data == NULL || *data == NULL || len == NULL || + output == NULL || *output == NULL || outlen == NULL) + return HUBBUB_BADPARM; + + if (input->leftover) { + /* Some data left to be written from last call */ + + /* Attempt to flush the remaining data. */ + write_error = hubbub_charsetcodec_encode(input->write_codec, + (const uint8_t **) &input->pivot_left, + &input->pivot_len, + output, outlen); + + if (write_error != HUBBUB_OK) { + return write_error; + } + + /* And clear leftover */ + input->pivot_left = NULL; + input->pivot_len = 0; + input->leftover = false; + } + + while (*len > 0) { + size_t pivot_len = sizeof(input->pivot_buf); + uint8_t *pivot = (uint8_t *) input->pivot_buf; + + read_error = hubbub_charsetcodec_decode(input->read_codec, + data, len, + (uint8_t **) &pivot, &pivot_len); + + pivot = (uint8_t *) input->pivot_buf; + pivot_len = sizeof(input->pivot_buf) - pivot_len; + + if (pivot_len > 0) { + write_error = hubbub_charsetcodec_encode( + input->write_codec, + (const uint8_t **) &pivot, + &pivot_len, + output, outlen); + + if (write_error != HUBBUB_OK) { + input->leftover = true; + input->pivot_left = pivot; + input->pivot_len = pivot_len; + + return write_error; + } + } + + if (read_error != HUBBUB_OK && read_error != HUBBUB_NOMEM) + return read_error; + } + + return HUBBUB_OK; +} + +/** + * Reset an input filter's state + * + * \param input The input filter to reset + * \param HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_reset(hubbub_filter *input) +{ + hubbub_error error; + + if (input == NULL) + return HUBBUB_BADPARM; + + /* Clear pivot buffer leftovers */ + input->pivot_left = NULL; + input->pivot_len = 0; + input->leftover = false; + + /* Reset read codec */ + error = hubbub_charsetcodec_reset(input->read_codec); + if (error != HUBBUB_OK) + return error; + + /* Reset write codec */ + error = hubbub_charsetcodec_reset(input->write_codec); + if (error != HUBBUB_OK) + return error; + + return HUBBUB_OK; +} + +/** + * Set an input filter's default settings + * + * \param input Input filter to configure + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_set_defaults(hubbub_filter *input) +{ + hubbub_error error; + + if (input == NULL) + return HUBBUB_BADPARM; + + input->read_codec = NULL; + input->write_codec = NULL; + input->settings.encoding = 0; + error = hubbub_filter_set_encoding(input, "ISO-8859-1"); + if (error != HUBBUB_OK) + return error; + + return HUBBUB_OK; +} + +/** + * Set an input filter's encoding + * + * \param input Input filter to configure + * \param enc Encoding name + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_set_encoding(hubbub_filter *input, + const char *enc) +{ + const char *old_enc; + uint16_t mibenum; + hubbub_error error; + hubbub_charsetcodec_optparams params; + + if (input == NULL || enc == NULL) + return HUBBUB_BADPARM; + + mibenum = hubbub_mibenum_from_name(enc, strlen(enc)); + if (mibenum == 0) + return HUBBUB_INVALID; + + /* Exit early if we're already using this encoding */ + if (input->settings.encoding == mibenum) + return HUBBUB_OK; + + old_enc = hubbub_mibenum_to_name(input->settings.encoding); + if (old_enc == NULL) + old_enc = "ISO-8859-1"; + + if (input->read_codec != NULL) + hubbub_charsetcodec_destroy(input->read_codec); + + input->read_codec = hubbub_charsetcodec_create(enc, input->alloc, + input->pw); + if (input->read_codec == NULL) + return HUBBUB_NOMEM; + + /* Register filter function */ + params.filter_func.filter = read_character_filter; + params.filter_func.pw = (void *) input; + error = hubbub_charsetcodec_setopt(input->read_codec, + HUBBUB_CHARSETCODEC_FILTER_FUNC, + (hubbub_charsetcodec_optparams *) ¶ms); + if (error != HUBBUB_OK) + return error; + + input->settings.encoding = mibenum; + + return HUBBUB_OK; +} + +/** + * Character filter function for read characters + * + * \param c The read character (UCS4 - host byte order) + * \param output Pointer to pointer to output buffer (filled on exit) + * \param outputlen Pointer to output buffer length (filled on exit) + * \param pw Pointer to client-specific private data. + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error read_character_filter(uint32_t c, uint32_t **output, + size_t *outputlen, void *pw) +{ + hubbub_filter *input = (hubbub_filter *) pw; + size_t len; + + if (output == NULL || outputlen == NULL || pw == NULL) + return HUBBUB_BADPARM; + + /* Line ending normalisation: + * CRLF -> LF (trap CR and let LF through unmodified) + * CR -> LF (trap CR and convert to LF if not CRLF) + * LF -> LF (leave LF alone) + */ + +#define NUL (0x00000000) +#define CR (0x0000000D) +#define LF (0x0000000A) +#define REP (0x0000FFFD) + + if (c == NUL) { + /* Replace NUL (U+0000) characters in input with U+FFFD */ + input->filter_output[0] = REP; + len = 1; + } else if (c == CR) { + /* Trap CR characters */ + len = 0; + } else if (input->last_filter_char == CR && c != LF) { + /* Last char was CR and this isn't LF => CR -> LF */ + input->filter_output[0] = LF; + input->filter_output[1] = c; + len = 2; + } else { + /* Let character through unchanged */ + input->filter_output[0] = c; + len = 1; + } + +#undef NUL +#undef CR +#undef LF +#undef REP + + input->last_filter_char = c; + + *output = input->filter_output; + *outputlen = len; + + return HUBBUB_OK; +} -- cgit v1.2.3