summary refs log tree commit diff
path: root/src/input
diff options
context:
space:
mode:
author John Mark Bell <jmb@netsurf-browser.org> 2008-05-01 16:34:46 +0000
committer John Mark Bell <jmb@netsurf-browser.org> 2008-05-01 16:34:46 +0000
commit 2777a04ed2ba4fd36138b991d66a32a283361f7e (patch)
tree b0c3730533c36ca41402b6d0c5b98413f0a57bee /src/input
download libparserutils-2777a04ed2ba4fd36138b991d66a32a283361f7e.tar.gz
libparserutils-2777a04ed2ba4fd36138b991d66a32a283361f7e.tar.bz2
Import parser construction utility library
svn path=/trunk/libparserutils/; revision=4111
Diffstat (limited to 'src/input')
-rw-r--r-- src/input/Makefile 46
-rw-r--r-- src/input/filter.c 384
-rw-r--r-- src/input/filter.h 57
-rw-r--r-- src/input/inputstream.c 477
4 files changed, 964 insertions, 0 deletions
diff --git a/src/input/Makefile b/src/input/Makefile
new file mode 100644
index 0000000..d62740e
--- /dev/null
+++ b/src/input/Makefile
@@ -0,0 +1,46 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Sources
+SRCS_$(d) := filter.c inputstream.c
+
+# Append to sources for component
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/input/filter.c b/src/input/filter.c
new file mode 100644
index 0000000..f40c98f
--- /dev/null
+++ b/src/input/filter.c
@@ -0,0 +1,384 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef WITH_ICONV_FILTER
+#include <iconv.h>
+#endif
+
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/codec.h>
+
+#include "input/filter.h"
+#include "utils/utils.h"
+
+/** Input filter */
+struct parserutils_filter {
+#ifdef WITH_ICONV_FILTER
+ iconv_t cd; /**< Iconv conversion descriptor */
+ uint16_t int_enc; /**< The internal encoding */
+#else
+ parserutils_charset_codec *read_codec; /**< Read codec */
+ parserutils_charset_codec *write_codec; /**< Write codec */
+
+ uint32_t pivot_buf[64]; /**< Conversion pivot buffer */
+
+ bool leftover; /**< Data remains from last call */
+ uint8_t *pivot_left; /**< Remaining pivot to write */
+ size_t pivot_len; /**< Length of pivot remaining */
+#endif
+
+ struct {
+ uint16_t encoding; /**< Input encoding */
+ } settings; /**< Filter settings */
+
+ parserutils_alloc alloc; /**< Memory (de)allocation function */
+ void *pw; /**< Client private data */
+};
+
+static parserutils_error filter_set_defaults(parserutils_filter *input);
+static parserutils_error filter_set_encoding(parserutils_filter *input,
+ const char *enc);
+
+/**
+ * Create an input filter
+ *
+ * \param int_enc Desired encoding of document
+ * \param alloc Function used to (de)allocate data
+ * \param pw Pointer to client-specific private data (may be NULL)
+ * \return Pointer to filter instance, or NULL on failure
+ */
+parserutils_filter *parserutils_filter_create(const char *int_enc,
+ parserutils_alloc alloc, void *pw)
+{
+ parserutils_filter *filter;
+
+ if (int_enc == NULL || alloc == NULL)
+ return NULL;
+
+ filter = alloc(NULL, sizeof(*filter), pw);
+ if (!filter)
+ return NULL;
+
+#ifdef WITH_ICONV_FILTER
+ filter->cd = (iconv_t) -1;
+ filter->int_enc = parserutils_charset_mibenum_from_name(
+ int_enc, strlen(int_enc));
+ if (filter->int_enc == 0) {
+ alloc(filter, 0, pw);
+ return NULL;
+ }
+#else
+ filter->leftover = false;
+ filter->pivot_left = NULL;
+ filter->pivot_len = 0;
+#endif
+
+ filter->alloc = alloc;
+ filter->pw = pw;
+
+ if (filter_set_defaults(filter) != PARSERUTILS_OK) {
+ filter->alloc(filter, 0, pw);
+ return NULL;
+ }
+
+#ifndef WITH_ICONV_FILTER
+ filter->write_codec =
+ parserutils_charset_codec_create(int_enc, alloc, pw);
+ if (filter->write_codec == NULL) {
+ if (filter->read_codec != NULL)
+ parserutils_charset_codec_destroy(filter->read_codec);
+ filter->alloc(filter, 0, pw);
+ return NULL;
+ }
+#endif
+
+ return filter;
+}
+
+/**
+ * Destroy an input filter
+ *
+ * \param input Pointer to filter instance
+ */
+void parserutils_filter_destroy(parserutils_filter *input)
+{
+ if (input == NULL)
+ return;
+
+#ifdef WITH_ICONV_FILTER
+ if (input->cd != (iconv_t) -1)
+ iconv_close(input->cd);
+#else
+ if (input->read_codec != NULL)
+ parserutils_charset_codec_destroy(input->read_codec);
+
+ if (input->write_codec != NULL)
+ parserutils_charset_codec_destroy(input->write_codec);
+#endif
+
+ input->alloc(input, 0, input->pw);
+
+ return;
+}
+
+/**
+ * Configure an input filter
+ *
+ * \param input Pointer to filter instance
+ * \param type Input option type to configure
+ * \param params Option-specific parameters
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_filter_setopt(parserutils_filter *input,
+ parserutils_filter_opttype type,
+ parserutils_filter_optparams *params)
+{
+ parserutils_error error = PARSERUTILS_OK;
+
+ if (input == NULL || params == NULL)
+ return PARSERUTILS_BADPARM;
+
+ switch (type) {
+ case PARSERUTILS_FILTER_SET_ENCODING:
+ error = filter_set_encoding(input, params->encoding.name);
+ break;
+ }
+
+ return error;
+}
+
+/**
+ * Process a chunk of data
+ *
+ * \param input Pointer to filter instance
+ * \param data Pointer to pointer to input buffer
+ * \param len Pointer to length of input buffer
+ * \param output Pointer to pointer to output buffer
+ * \param outlen Pointer to length of output buffer
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ *
+ * Call this with an input buffer length of 0 to flush any buffers.
+ */
+parserutils_error parserutils_filter_process_chunk(parserutils_filter *input,
+ const uint8_t **data, size_t *len,
+ uint8_t **output, size_t *outlen)
+{
+ if (input == NULL || data == NULL || *data == NULL || len == NULL ||
+ output == NULL || *output == NULL || outlen == NULL)
+ return PARSERUTILS_BADPARM;
+
+#ifdef WITH_ICONV_FILTER
+ if (iconv(input->cd, (char **) data, len,
+ (char **) output, outlen) == (size_t) -1) {
+ switch (errno) {
+ case E2BIG:
+ return PARSERUTILS_NOMEM;
+ case EILSEQ:
+ if (*outlen < 3)
+ return PARSERUTILS_NOMEM;
+
+ (*output)[0] = 0xef;
+ (*output)[1] = 0xbf;
+ (*output)[2] = 0xbd;
+
+ *output += 3;
+ *outlen -= 3;
+
+ (*data)++;
+ (*len)--;
+
+ while (*len > 0) {
+ size_t ret;
+
+ ret = iconv(input->cd, (char **) data, len,
+ (char **) output, outlen);
+ if (ret != (size_t) -1 || errno != EILSEQ)
+ break;
+
+ (*data)++;
+ (*len)--;
+ }
+
+ return errno == E2BIG ? PARSERUTILS_NOMEM
+ : PARSERUTILS_OK;
+ }
+ }
+
+ return PARSERUTILS_OK;
+#else
+ parserutils_error read_error, write_error;
+
+ if (input->leftover) {
+ /* Some data left to be written from last call */
+
+ /* Attempt to flush the remaining data. */
+ write_error = parserutils_charset_codec_encode(
+ input->write_codec,
+ (const uint8_t **) &input->pivot_left,
+ &input->pivot_len,
+ output, outlen);
+
+ if (write_error != PARSERUTILS_OK)
+ return write_error;
+
+
+ /* And clear leftover */
+ input->pivot_left = NULL;
+ input->pivot_len = 0;
+ input->leftover = false;
+ }
+
+ while (*len > 0) {
+ size_t pivot_len = sizeof(input->pivot_buf);
+ uint8_t *pivot = (uint8_t *) input->pivot_buf;
+
+ read_error = parserutils_charset_codec_decode(input->read_codec,
+ data, len,
+ (uint8_t **) &pivot, &pivot_len);
+
+ pivot = (uint8_t *) input->pivot_buf;
+ pivot_len = sizeof(input->pivot_buf) - pivot_len;
+
+ if (pivot_len > 0) {
+ write_error = parserutils_charset_codec_encode(
+ input->write_codec,
+ (const uint8_t **) &pivot,
+ &pivot_len,
+ output, outlen);
+
+ if (write_error != PARSERUTILS_OK) {
+ input->leftover = true;
+ input->pivot_left = pivot;
+ input->pivot_len = pivot_len;
+
+ return write_error;
+ }
+ }
+
+ if (read_error != PARSERUTILS_OK &&
+ read_error != PARSERUTILS_NOMEM)
+ return read_error;
+ }
+
+ return PARSERUTILS_OK;
+#endif
+}
+
+/**
+ * Reset an input filter's state
+ *
+ * \param input The input filter to reset
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_filter_reset(parserutils_filter *input)
+{
+ if (input == NULL)
+ return PARSERUTILS_BADPARM;
+
+#ifdef WITH_ICONV_FILTER
+ iconv(input->cd, NULL, 0, NULL, 0);
+#else
+ parserutils_error error;
+
+ /* Clear pivot buffer leftovers */
+ input->pivot_left = NULL;
+ input->pivot_len = 0;
+ input->leftover = false;
+
+ /* Reset read codec */
+ error = parserutils_charset_codec_reset(input->read_codec);
+ if (error != PARSERUTILS_OK)
+ return error;
+
+ /* Reset write codec */
+ error = parserutils_charset_codec_reset(input->write_codec);
+ if (error != PARSERUTILS_OK)
+ return error;
+#endif
+
+ return PARSERUTILS_OK;
+}
+
+/**
+ * Set an input filter's default settings
+ *
+ * \param input Input filter to configure
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error filter_set_defaults(parserutils_filter *input)
+{
+ parserutils_error error;
+
+ if (input == NULL)
+ return PARSERUTILS_BADPARM;
+
+#ifndef WITH_ICONV_FILTER
+ input->read_codec = NULL;
+ input->write_codec = NULL;
+#endif
+
+ input->settings.encoding = 0;
+ error = filter_set_encoding(input, "UTF-8");
+ if (error != PARSERUTILS_OK)
+ return error;
+
+ return PARSERUTILS_OK;
+}
+
+/**
+ * Set an input filter's encoding
+ *
+ * \param input  Input filter to configure
+ * \param enc    Encoding name
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ *
+ * On failure the filter keeps its previous converter and encoding.
+ */
+parserutils_error filter_set_encoding(parserutils_filter *input,
+		const char *enc)
+{
+	uint16_t mibenum;
+
+	if (input == NULL || enc == NULL)
+		return PARSERUTILS_BADPARM;
+
+	mibenum = parserutils_charset_mibenum_from_name(enc, strlen(enc));
+	if (mibenum == 0)
+		return PARSERUTILS_INVALID;
+
+	/* Exit early if we're already using this encoding */
+	if (input->settings.encoding == mibenum)
+		return PARSERUTILS_OK;
+
+#ifdef WITH_ICONV_FILTER
+	iconv_t cd = iconv_open(
+		parserutils_charset_mibenum_to_name(input->int_enc), enc);
+	if (cd == (iconv_t) -1)
+		return errno == EINVAL ? PARSERUTILS_INVALID
+				: PARSERUTILS_NOMEM;
+	if (input->cd != (iconv_t) -1)
+		iconv_close(input->cd);
+	input->cd = cd;
+#else
+	parserutils_charset_codec *codec = parserutils_charset_codec_create(
+			enc, input->alloc, input->pw);
+	if (codec == NULL)
+		return PARSERUTILS_NOMEM;
+	if (input->read_codec != NULL)
+		parserutils_charset_codec_destroy(input->read_codec);
+	input->read_codec = codec;
+#endif
+
+	input->settings.encoding = mibenum;
+
+	return PARSERUTILS_OK;
+}
diff --git a/src/input/filter.h b/src/input/filter.h
new file mode 100644
index 0000000..96941a6
--- /dev/null
+++ b/src/input/filter.h
@@ -0,0 +1,57 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_input_filter_h_
+#define parserutils_input_filter_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+
+typedef struct parserutils_filter parserutils_filter;
+
+/**
+ * Input filter option types
+ */
+typedef enum parserutils_filter_opttype {
+ PARSERUTILS_FILTER_SET_ENCODING = 0,
+} parserutils_filter_opttype;
+
+/**
+ * Input filter option parameters
+ */
+typedef union parserutils_filter_optparams {
+ /** Parameters for encoding setting */
+ struct {
+ /** Encoding name */
+ const char *name;
+ } encoding;
+} parserutils_filter_optparams;
+
+
+/* Create an input filter */
+parserutils_filter *parserutils_filter_create(const char *int_enc,
+ parserutils_alloc alloc, void *pw);
+/* Destroy an input filter */
+void parserutils_filter_destroy(parserutils_filter *input);
+
+/* Configure an input filter */
+parserutils_error parserutils_filter_setopt(parserutils_filter *input,
+ parserutils_filter_opttype type,
+ parserutils_filter_optparams *params);
+
+/* Process a chunk of data */
+parserutils_error parserutils_filter_process_chunk(parserutils_filter *input,
+ const uint8_t **data, size_t *len,
+ uint8_t **output, size_t *outlen);
+
+/* Reset an input filter's state */
+parserutils_error parserutils_filter_reset(parserutils_filter *input);
+
+#endif
+
diff --git a/src/input/inputstream.c b/src/input/inputstream.c
new file mode 100644
index 0000000..fd44995
--- /dev/null
+++ b/src/input/inputstream.c
@@ -0,0 +1,477 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <parserutils/charset/mibenum.h>
+#include <parserutils/charset/utf8.h>
+#include <parserutils/input/inputstream.h>
+
+#include "input/filter.h"
+#include "utils/utils.h"
+
+/**
+ * Private input stream definition
+ */
+typedef struct parserutils_inputstream_private {
+ parserutils_inputstream public; /**< Public part. Must be first */
+
+ parserutils_buffer *raw; /**< Buffer containing raw data */
+
+ bool done_first_chunk; /**< Whether the first chunk has
+ * been processed */
+
+ uint16_t mibenum; /**< MIB enum for charset, or 0 */
+ uint32_t encsrc; /**< Charset source */
+
+ parserutils_filter *input; /**< Charset conversion filter */
+
+ parserutils_charset_detect_func csdetect; /**< Charset detection func.*/
+
+ parserutils_alloc alloc; /**< Memory (de)allocation function */
+ void *pw; /**< Client private data */
+} parserutils_inputstream_private;
+
+static inline parserutils_error parserutils_inputstream_refill_buffer(
+ parserutils_inputstream_private *stream);
+static inline parserutils_error parserutils_inputstream_strip_bom(
+ uint16_t mibenum, parserutils_buffer *buffer);
+
+/**
+ * Create an input stream
+ *
+ * \param enc       Document charset, or NULL/unrecognised to autodetect
+ * \param encsrc    Value for encoding source, if specified, or 0
+ * \param csdetect  Charset detection function, or NULL
+ * \param alloc     Memory (de)allocation function
+ * \param pw        Pointer to client-specific private data (may be NULL)
+ * \return Pointer to stream instance, or NULL on failure
+ *
+ * The value 0 is defined as being the lowest priority encoding source
+ * (i.e. the default fallback encoding). Beyond this, no further
+ * interpretation is made upon the encoding source.
+ */
+parserutils_inputstream *parserutils_inputstream_create(const char *enc,
+		uint32_t encsrc, parserutils_charset_detect_func csdetect,
+		parserutils_alloc alloc, void *pw)
+{
+	parserutils_inputstream_private *stream;
+
+	if (alloc == NULL)
+		return NULL;
+
+	stream = alloc(NULL, sizeof(parserutils_inputstream_private), pw);
+	if (stream == NULL)
+		return NULL;
+
+	stream->raw = parserutils_buffer_create(alloc, pw);
+	if (stream->raw == NULL) {
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	stream->public.utf8 = parserutils_buffer_create(alloc, pw);
+	if (stream->public.utf8 == NULL) {
+		parserutils_buffer_destroy(stream->raw);
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	stream->public.cursor = 0;
+	stream->public.had_eof = false;
+	stream->done_first_chunk = false;
+	/* No charset is known until enc below names one we recognise */
+	stream->mibenum = 0;
+	stream->encsrc = 0;
+
+	stream->input = parserutils_filter_create("UTF-8", alloc, pw);
+	if (stream->input == NULL) {
+		parserutils_buffer_destroy(stream->public.utf8);
+		parserutils_buffer_destroy(stream->raw);
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	if (enc != NULL) {
+		parserutils_error error;
+		parserutils_filter_optparams params;
+
+		stream->mibenum =
+			parserutils_charset_mibenum_from_name(enc, strlen(enc));
+
+		if (stream->mibenum != 0) {
+			params.encoding.name = enc;
+
+			error = parserutils_filter_setopt(stream->input,
+					PARSERUTILS_FILTER_SET_ENCODING,
+					&params);
+			if (error != PARSERUTILS_OK &&
+					error != PARSERUTILS_INVALID) {
+				parserutils_filter_destroy(stream->input);
+				parserutils_buffer_destroy(stream->public.utf8);
+				parserutils_buffer_destroy(stream->raw);
+				alloc(stream, 0, pw);
+				return NULL;
+			}
+
+			stream->encsrc = encsrc;
+		}
+	}
+
+	stream->csdetect = csdetect;
+
+	stream->alloc = alloc;
+	stream->pw = pw;
+
+	return (parserutils_inputstream *) stream;
+}
+
+/**
+ * Destroy an input stream
+ *
+ * \param stream Input stream to destroy
+ */
+void parserutils_inputstream_destroy(parserutils_inputstream *stream)
+{
+ parserutils_inputstream_private *s =
+ (parserutils_inputstream_private *) stream;
+
+ if (stream == NULL)
+ return;
+
+ parserutils_filter_destroy(s->input);
+ parserutils_buffer_destroy(s->public.utf8);
+ parserutils_buffer_destroy(s->raw);
+ s->alloc(s, 0, s->pw);
+}
+
+/**
+ * Append data to an input stream
+ *
+ * \param stream Input stream to append data to
+ * \param data Data to append (in document charset), or NULL to flag EOF
+ * \param len Length, in bytes, of data
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_inputstream_append(
+ parserutils_inputstream *stream,
+ const uint8_t *data, size_t len)
+{
+ parserutils_inputstream_private *s =
+ (parserutils_inputstream_private *) stream;
+
+ if (stream == NULL)
+ return PARSERUTILS_BADPARM;
+
+ if (data == NULL) {
+ s->public.had_eof = true;
+ return PARSERUTILS_OK;
+ }
+
+ return parserutils_buffer_append(s->raw, data, len);
+}
+
+/**
+ * Insert data into stream at current location
+ *
+ * \param stream Input stream to insert into
+ * \param data Data to insert (UTF-8 encoded)
+ * \param len Length, in bytes, of data
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_inputstream_insert(
+ parserutils_inputstream *stream,
+ const uint8_t *data, size_t len)
+{
+ parserutils_inputstream_private *s =
+ (parserutils_inputstream_private *) stream;
+
+ if (stream == NULL || data == NULL)
+ return PARSERUTILS_BADPARM;
+
+ return parserutils_buffer_insert(s->public.utf8, s->public.cursor,
+ data, len);
+}
+
+#define IS_ASCII(x) (((x) & 0x80) == 0)
+
+/**
+ * Look at the character offset bytes from the cursor (slow version)
+ *
+ * \param stream  Stream to look in
+ * \param offset  Byte offset of start of character
+ * \param length  Pointer to location to receive character length (in bytes)
+ * \return Pointer to character data, or EOF or OOD.
+ *
+ * Once the character pointed to by the result of this call has been advanced
+ * past (i.e. parserutils_inputstream_advance has caused the stream cursor to
+ * pass over the character), then no guarantee is made as to the validity of
+ * the data pointed to. Thus, any attempt to dereference the pointer after
+ * advancing past the data it points to is a bug.
+ */
+uintptr_t parserutils_inputstream_peek_slow(parserutils_inputstream *stream,
+		size_t offset, size_t *length)
+{
+	parserutils_inputstream_private *s =
+			(parserutils_inputstream_private *) stream;
+	parserutils_error error = PARSERUTILS_OK;
+	size_t len;
+
+	if (stream == NULL || length == NULL)
+		return PARSERUTILS_INPUTSTREAM_OOD;
+
+	/* No raw data to convert, so nothing more can be offered right now */
+	if (s->raw->length == 0)
+		return s->public.had_eof ? PARSERUTILS_INPUTSTREAM_EOF
+				: PARSERUTILS_INPUTSTREAM_OOD;
+
+	/* Refill utf8 buffer from raw buffer */
+	error = parserutils_inputstream_refill_buffer(s);
+	if (error != PARSERUTILS_OK)
+		return PARSERUTILS_INPUTSTREAM_OOD;
+
+	/* The refill may not have produced data at the requested offset;
+	 * report out-of-data rather than reading past the converted bytes */
+	if (s->public.cursor + offset >= s->public.utf8->length)
+		return PARSERUTILS_INPUTSTREAM_OOD;
+
+	/* Now try the read */
+	if (IS_ASCII(s->public.utf8->data[s->public.cursor + offset])) {
+		len = 1;
+	} else {
+		error = parserutils_charset_utf8_char_byte_length(
+			s->public.utf8->data + s->public.cursor + offset,
+			&len);
+
+		if (error == PARSERUTILS_NEEDDATA && s->public.had_eof)
+			return PARSERUTILS_INPUTSTREAM_EOF;
+		if (error != PARSERUTILS_OK)
+			return PARSERUTILS_INPUTSTREAM_OOD;
+	}
+
+	*length = len;
+
+	return (uintptr_t) (s->public.utf8->data + s->public.cursor + offset);
+}
+
+#undef IS_ASCII
+
+/**
+ * Read the source charset of the input stream
+ *
+ * \param stream Input stream to query
+ * \param source Pointer to location to receive charset source identifier
+ * \return Pointer to charset name (constant; do not free)
+ */
+const char *parserutils_inputstream_read_charset(
+ parserutils_inputstream *stream, uint32_t *source)
+{
+ parserutils_inputstream_private *s =
+ (parserutils_inputstream_private *) stream;
+
+ if (stream == NULL || source == NULL)
+ return NULL;
+
+ *source = s->encsrc;
+
+ if (s->encsrc == 0)
+ return "UTF-8";
+
+ return parserutils_charset_mibenum_to_name(s->mibenum);
+}
+
+/******************************************************************************
+ ******************************************************************************/
+
+/**
+ * Refill the UTF-8 buffer from the raw buffer
+ *
+ * \param stream  The inputstream to operate on
+ * \return PARSERUTILS_OK on success, appropriate error otherwise
+ */
+parserutils_error parserutils_inputstream_refill_buffer(
+		parserutils_inputstream_private *stream)
+{
+	parserutils_buffer *buf = stream->public.utf8;
+	parserutils_filter_optparams params;
+	const uint8_t *raw;
+	uint8_t *utf8;
+	size_t raw_length, utf8_space;
+	parserutils_error error;
+
+	/* First chunk of data: settle the charset and strip any BOM */
+	if (!stream->done_first_chunk) {
+		if (stream->csdetect != NULL) {
+			error = stream->csdetect(stream->raw->data,
+					stream->raw->length,
+					&stream->mibenum, &stream->encsrc);
+			if (error != PARSERUTILS_OK)
+				return error;
+		}
+
+		/* Fall back to UTF-8 only if no charset is known by now */
+		if (stream->mibenum == 0) {
+			stream->mibenum = parserutils_charset_mibenum_from_name(
+					"UTF-8", SLEN("UTF-8"));
+			stream->encsrc = 0;
+		}
+
+		/* Make the filter decode with the charset just settled on */
+		params.encoding.name =
+			parserutils_charset_mibenum_to_name(stream->mibenum);
+		error = parserutils_filter_setopt(stream->input,
+				PARSERUTILS_FILTER_SET_ENCODING, &params);
+		if (error != PARSERUTILS_OK && error != PARSERUTILS_INVALID)
+			return error;
+
+		error = parserutils_inputstream_strip_bom(stream->mibenum,
+				stream->raw);
+		if (error != PARSERUTILS_OK)
+			return error;
+
+		stream->done_first_chunk = true;
+	}
+
+	/* Work out how to perform the buffer fill */
+	if (stream->public.cursor == buf->length) {
+		/* Cursor's at the end, so simply reuse the entire buffer */
+		utf8 = buf->data;
+		utf8_space = buf->allocated;
+	} else {
+		/* Cursor's not at the end, so shift the unread data to the bottom
+		 * of the buffer. If it's still over half full, extend it. */
+		memmove(buf->data, buf->data + stream->public.cursor,
+				buf->length - stream->public.cursor);
+		/* Reset the cursor now so early error returns stay consistent */
+		buf->length -= stream->public.cursor;
+		stream->public.cursor = 0;
+
+		if (buf->length > buf->allocated / 2) {
+			error = parserutils_buffer_grow(buf);
+			if (error != PARSERUTILS_OK)
+				return error;
+		}
+
+		utf8 = buf->data + buf->length;
+		utf8_space = buf->allocated - buf->length;
+	}
+
+	raw = stream->raw->data;
+	raw_length = stream->raw->length;
+
+	/* Try to fill utf8 buffer from the raw data */
+	error = parserutils_filter_process_chunk(stream->input,
+			&raw, &raw_length, &utf8, &utf8_space);
+	/* _NOMEM only means the utf8 buffer filled up first; that's fine */
+	if (error != PARSERUTILS_OK && error != PARSERUTILS_NOMEM)
+		return error;
+
+	/* Remove the raw data we've processed from the raw buffer */
+	error = parserutils_buffer_discard(stream->raw, 0,
+			stream->raw->length - raw_length);
+	if (error != PARSERUTILS_OK)
+		return error;
+
+	/* Fix up the utf8 buffer information and the cursor */
+	buf->length = buf->allocated - utf8_space;
+	stream->public.cursor = 0;
+
+	return PARSERUTILS_OK;
+}
+
+/**
+ * Strip a BOM from a buffer in the given encoding
+ *
+ * \param mibenum The character set of the buffer
+ * \param buffer The buffer to process
+ */
+parserutils_error parserutils_inputstream_strip_bom(uint16_t mibenum,
+ parserutils_buffer *buffer)
+{
+ static uint16_t utf8;
+ static uint16_t utf16;
+ static uint16_t utf16be;
+ static uint16_t utf16le;
+ static uint16_t utf32;
+ static uint16_t utf32be;
+ static uint16_t utf32le;
+
+ if (utf8 == 0) {
+ utf8 = parserutils_charset_mibenum_from_name("UTF-8",
+ SLEN("UTF-8"));
+ utf16 = parserutils_charset_mibenum_from_name("UTF-16",
+ SLEN("UTF-16"));
+ utf16be = parserutils_charset_mibenum_from_name("UTF-16BE",
+ SLEN("UTF-16BE"));
+ utf16le = parserutils_charset_mibenum_from_name("UTF-16LE",
+ SLEN("UTF-16LE"));
+ utf32 = parserutils_charset_mibenum_from_name("UTF-32",
+ SLEN("UTF-32"));
+ utf32be = parserutils_charset_mibenum_from_name("UTF-32BE",
+ SLEN("UTF-32BE"));
+ utf32le = parserutils_charset_mibenum_from_name("UTF-32LE",
+ SLEN("UTF-32LE"));
+ }
+
+ /** \todo Handle unmarked UTF-16 and UTF-32. Endianness is specified
+ * by the BOM, if present, or is assumed to be big endian. */
+
+#define UTF32_BOM_LEN (4)
+#define UTF16_BOM_LEN (2)
+#define UTF8_BOM_LEN (3)
+
+ if (mibenum == utf8) {
+ if (buffer->length >= UTF8_BOM_LEN &&
+ buffer->data[0] == 0xEF &&
+ buffer->data[1] == 0xBB &&
+ buffer->data[2] == 0xBF) {
+ return parserutils_buffer_discard(
+ buffer, 0, UTF8_BOM_LEN);
+ }
+ } else if (mibenum == utf16be) {
+ if (buffer->length >= UTF16_BOM_LEN &&
+ buffer->data[0] == 0xFE &&
+ buffer->data[1] == 0xFF) {
+ return parserutils_buffer_discard(
+ buffer, 0, UTF16_BOM_LEN);
+ }
+ } else if (mibenum == utf16le) {
+ if (buffer->length >= UTF16_BOM_LEN &&
+ buffer->data[0] == 0xFF &&
+ buffer->data[1] == 0xFE) {
+ return parserutils_buffer_discard(
+ buffer, 0, UTF16_BOM_LEN);
+ }
+ } else if (mibenum == utf32be) {
+ if (buffer->length >= UTF32_BOM_LEN &&
+ buffer->data[0] == 0x00 &&
+ buffer->data[1] == 0x00 &&
+ buffer->data[2] == 0xFE &&
+ buffer->data[3] == 0xFF) {
+ return parserutils_buffer_discard(
+ buffer, 0, UTF32_BOM_LEN);
+ }
+ } else if (mibenum == utf32le) {
+ if (buffer->length >= UTF32_BOM_LEN &&
+ buffer->data[0] == 0xFF &&
+ buffer->data[1] == 0xFE &&
+ buffer->data[2] == 0x00 &&
+ buffer->data[3] == 0x00) {
+ return parserutils_buffer_discard(
+ buffer, 0, UTF32_BOM_LEN);
+ }
+ }
+
+#undef UTF8_BOM_LEN
+#undef UTF16_BOM_LEN
+#undef UTF32_BOM_LEN
+
+ return PARSERUTILS_OK;
+}
+