From 7b30a5520cfb56e651f0eb4da85a3e07747da7dc Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sat, 23 Jun 2007 22:40:25 +0000 Subject: Import hubbub -- an HTML parsing library. Plenty of work still to do (like tree generation ;) svn path=/trunk/hubbub/; revision=3359 --- src/input/inputstream.c | 479 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 479 insertions(+) create mode 100644 src/input/inputstream.c (limited to 'src/input/inputstream.c') diff --git a/src/input/inputstream.c b/src/input/inputstream.c new file mode 100644 index 0000000..f82d279 --- /dev/null +++ b/src/input/inputstream.c @@ -0,0 +1,479 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include + +#include "charset/aliases.h" +#include "input/streamimpl.h" + +/** + * Buffer moving claimant context + */ +struct hubbub_inputstream_bm_handler { + hubbub_inputstream_buffermoved handler; /**< Handler function */ + void *pw; /**< Client private data */ + + struct hubbub_inputstream_bm_handler *next; + struct hubbub_inputstream_bm_handler *prev; +}; + +extern hubbub_streamhandler utf8stream; + +static hubbub_streamhandler *handler_table[] = { + &utf8stream, + NULL +}; + +/** + * Create an input stream + * + * \param enc Document charset, or NULL to autodetect + * \param int_enc Desired encoding of document + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to stream instance, or NULL on failure + */ +hubbub_inputstream *hubbub_inputstream_create(const char *enc, + const char *int_enc, hubbub_alloc alloc, void *pw) +{ + hubbub_inputstream *stream; + hubbub_streamhandler **handler; + + if (int_enc == NULL || alloc == NULL) + return NULL; + + /* Search for handler class */ + for (handler = handler_table; *handler != NULL; handler++) { + if ((*handler)->uses_encoding(int_enc)) + break; + } + + /* None found */ + if ((*handler) == NULL) + return NULL; + + stream = (*handler)->create(enc, int_enc, alloc, pw); + if (stream == NULL) + return NULL; + + stream->handlers = NULL; + + stream->alloc = alloc; + stream->pw = pw; + + return stream; +} + +/** + * Destroy an input stream + * + * \param stream Input stream to destroy + */ +void hubbub_inputstream_destroy(hubbub_inputstream *stream) +{ + hubbub_inputstream_bm_handler *h, *i; + + if (stream == NULL) + return; + + for (h = stream->handlers; h; h = i) { + i = h->next; + + stream->alloc(h, 0, stream->pw); + } + + stream->destroy(stream); +} + +/** + * Append data to an input stream + * + * \param stream Input stream to append data to + * \param data Data to append (in document charset), or NULL to flag EOF + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream, + const uint8_t *data, size_t len) +{ + if (stream == NULL) + return HUBBUB_BADPARM; + + /* Calling this if we've disowned the buffer is foolish */ + if (stream->buffer == NULL) + return HUBBUB_INVALID; + + return stream->append(stream, data, len); +} + +/** + * Insert data into stream at current location + * + * \param stream Input stream to insert into + * \param data Data to insert (UTF-8 encoded) + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream, + const uint8_t *data, size_t len) +{ + if (stream == NULL || data == NULL) + return HUBBUB_BADPARM; + + /* Calling this if we've disowned the buffer is foolish */ + if (stream->buffer == NULL) + return HUBBUB_INVALID; + + return stream->insert(stream, data, len); +} + +/** + * Look at the next character in the stream + * + * \param stream Stream to look in + * \return UCS4 (host-endian) character code, or EOF or OOD. + */ +uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_INPUTSTREAM_OOD; + + return stream->peek(stream);; +} + +/** + * Retrieve the byte index and length of the current character in the stream + * + * \param stream Stream to look in + * \param len Pointer to location to receive byte length of character + * \return Byte index of current character from start of stream, + * or (uint32_t) -1 on error + */ +uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream, + size_t *len) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || len == NULL || stream->buffer == NULL) + return (uint32_t) -1; + + return stream->cur_pos(stream, len); +} + +/** + * Convert the current character to lower case + * + * \param stream Stream to look in + */ +void hubbub_inputstream_lowercase(hubbub_inputstream *stream) +{ + if (stream == NULL || stream->buffer == NULL) + return; + + stream->lowercase(stream); +} + +/** + * Convert the current character to upper case + * + * \param stream Stream to look in + */ +void hubbub_inputstream_uppercase(hubbub_inputstream *stream) +{ + if (stream == NULL || stream->buffer == NULL) + return; + + stream->uppercase(stream); +} + +/** + * Advance the stream's current position + * + * \param stream The stream whose position to advance + */ +void hubbub_inputstream_advance(hubbub_inputstream *stream) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || stream->buffer == NULL) + return; + + if (stream->cursor == stream->buffer_len) + return; + + stream->advance(stream); +} + +/** + * Push a character back onto the stream + * + * \param stream Stream to push back to + * \param character UCS4 (host-endian) codepoint to push back + * \return HUBBUB_OK on success, appropriate error otherwise + * + * Note that this doesn't actually modify the data in the stream. + * It works by ensuring that the character located just before the + * current stream location is the same as ::character. If it is, + * then the stream pointer is moved back. If it is not, then an + * error is returned and the stream pointer remains unmodified. + */ +hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream, + uint32_t character) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_BADPARM; + + if (stream->cursor == 0) + return HUBBUB_INVALID; + + return stream->push_back(stream, character); +} + +/** + * Rewind the input stream by a number of bytes + * + * \param stream Stream to rewind + * \param n Number of bytes to go back + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n) +{ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_BADPARM; + + if (stream->cursor < n) + return HUBBUB_INVALID; + + stream->cursor -= n; + + return HUBBUB_OK; +} + +/** + * Claim ownership of an input stream's buffer + * + * \param stream Input stream whose buffer to claim + * \param buffer Pointer to location to receive buffer pointer + * \param len Pointer to location to receive byte length of buffer + * \return HUBBUB_OK on success, appropriate error otherwise. + * + * Once the buffer has been claimed by a client, the input stream disclaims + * all ownership rights (and invalidates any internal references it may have + * to the buffer). Therefore, the only input stream call which may be made + * after calling this function is to destroy the input stream. Therefore, + * unless the stream pointer is located at EOF, this call will return an + * error. + */ +hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream, + uint8_t **buffer, size_t *len) +{ + if (stream == NULL || buffer == NULL || len == NULL) + return HUBBUB_BADPARM; + + if (stream->had_eof == false || + stream->cursor != stream->buffer_len) + return HUBBUB_INVALID; + + *buffer = stream->buffer; + *len = stream->buffer_len; + + stream->buffer = NULL; + + return HUBBUB_OK; +} + +/** + * Register interest in buffer moved events + * + * \param stream Input stream to register interest with + * \param handler Pointer to handler function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_register_movehandler( + hubbub_inputstream *stream, + hubbub_inputstream_buffermoved handler, void *pw) +{ + hubbub_inputstream_bm_handler *h; + + if (stream == NULL || handler == NULL) + return HUBBUB_BADPARM; + + h = stream->alloc(NULL, sizeof(hubbub_inputstream_bm_handler), + stream->pw); + if (h == NULL) + return HUBBUB_NOMEM; + + h->handler = handler; + h->pw = pw; + + h->prev = NULL; + h->next = stream->handlers; + + if (stream->handlers) + stream->handlers->prev = h; + stream->handlers = h; + + /* And notify claimant of current buffer location */ + handler(stream->buffer, stream->buffer_len, pw); + + return HUBBUB_OK; +} + +/** + * Deregister interest in buffer moved events + * + * \param stream Input stream to deregister from + * \param handler Pointer to handler function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_deregister_movehandler( + hubbub_inputstream *stream, + hubbub_inputstream_buffermoved handler, void *pw) +{ + hubbub_inputstream_bm_handler *h; + + if (stream == NULL || handler == NULL) + return HUBBUB_BADPARM; + + for (h = stream->handlers; h; h = h->next) { + if (h->handler == handler && h->pw == pw) + break; + } + + if (h == NULL) + return HUBBUB_INVALID; + + if (h->next) + h->next->prev = h->prev; + if (h->prev) + h->prev->next = h->next; + else + stream->handlers = h->next; + + stream->alloc(h, 0, stream->pw); + + return HUBBUB_OK; +} + +/** + * Case insensitively compare a pair of ranges in the input stream + * + * \param stream Input stream to look in + * \param r1 Offset of start of first range + * \param r2 Offset of start of second range + * \param len Byte length of ranges + * \return 0 if ranges match, non-zero otherwise + */ +int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len) +{ + if (stream == NULL || stream->buffer == NULL) + return 1; /* arbitrary */ + + return stream->cmp_range_ci(stream, r1, r2, len); +} + +/** + * Case sensitively compare a pair of ranges in the input stream + * + * \param stream Input stream to look in + * \param r1 Offset of start of first range + * \param r2 Offset of start of second range + * \param len Byte length of ranges + * \return 0 if ranges match, non-zero otherwise + */ +int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len) +{ + if (stream == NULL || stream->buffer == NULL) + return 1; /* arbitrary */ + + return stream->cmp_range_cs(stream, r1, r2, len); +} + +/** + * Case sensitively compare a range of input stream against an ASCII string + * + * \param stream Input stream to look in + * \param off Offset of range start + * \param len Byte length of range + * \param data Comparison string + * \param dlen Byte length of comparison string + * \return 0 if match, non-zero otherwise + */ +int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream, + uint32_t off, size_t len, const char *data, size_t dlen) +{ + if (stream == NULL || stream->buffer == NULL) + return 1; /* arbitrary */ + + return stream->cmp_range_ascii(stream, off, len, data, dlen); +} + +/** + * Replace a range of bytes in the input stream with a single character + * + * \param stream Input stream containing data + * \param start Offset of start of range to replace + * \param len Length (in bytes) of range to replace + * \param ucs4 UCS4 (host endian) encoded replacement character + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4) +{ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_BADPARM; + + if (start >= stream->buffer_len) + return HUBBUB_INVALID; + + if (start < stream->cursor) + return HUBBUB_INVALID; + + return stream->replace_range(stream, start, len, ucs4); +} + +/** + * Read the document charset + * + * \param stream Input stream to query + * \param source Pointer to location to receive charset source + * \return Pointer to charset name (constant; do not free), or NULL if unknown + */ +const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream, + hubbub_charset_source *source) +{ + if (stream == NULL || source == NULL) + return NULL; + + *source = stream->encsrc; + + if (stream->encsrc == HUBBUB_CHARSET_UNKNOWN) + return NULL; + + return hubbub_mibenum_to_name(stream->mibenum); +} + +/** + * Inform interested parties that the buffer has moved + * + * \param stream Input stream + */ +void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream) +{ + hubbub_inputstream_bm_handler *h; + + if (stream == NULL) + return; + + for (h = stream->handlers; h; h = h->next) + h->handler(stream->buffer, stream->buffer_len, h->pw); +} + -- cgit v1.2.3