summaryrefslogtreecommitdiff
path: root/include/parserutils
diff options
context:
space:
mode:
authorJohn Mark Bell <jmb@netsurf-browser.org>2008-05-01 16:34:46 +0000
committerJohn Mark Bell <jmb@netsurf-browser.org>2008-05-01 16:34:46 +0000
commit2777a04ed2ba4fd36138b991d66a32a283361f7e (patch)
treeb0c3730533c36ca41402b6d0c5b98413f0a57bee /include/parserutils
downloadlibparserutils-2777a04ed2ba4fd36138b991d66a32a283361f7e.tar.gz
libparserutils-2777a04ed2ba4fd36138b991d66a32a283361f7e.tar.bz2
Import parser construction utility library
svn path=/trunk/libparserutils/; revision=4111
Diffstat (limited to 'include/parserutils')
-rw-r--r--include/parserutils/charset/codec.h114
-rw-r--r--include/parserutils/charset/mibenum.h24
-rw-r--r--include/parserutils/charset/utf16.h38
-rw-r--r--include/parserutils/charset/utf8.h38
-rw-r--r--include/parserutils/errors.h29
-rw-r--r--include/parserutils/functypes.h21
-rw-r--r--include/parserutils/input/inputstream.h143
-rw-r--r--include/parserutils/parserutils.h23
-rw-r--r--include/parserutils/types.h15
-rw-r--r--include/parserutils/utils/buffer.h39
10 files changed, 484 insertions, 0 deletions
diff --git a/include/parserutils/charset/codec.h b/include/parserutils/charset/codec.h
new file mode 100644
index 0000000..ca98db5
--- /dev/null
+++ b/include/parserutils/charset/codec.h
@@ -0,0 +1,114 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_codec_h_
+#define parserutils_charset_codec_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+
+typedef struct parserutils_charset_codec parserutils_charset_codec;
+
+#define PARSERUTILS_CHARSET_CODEC_NULL (0xffffffffU)
+
+/**
+ * Charset codec error mode
+ *
+ * A codec's error mode determines its behaviour in the face of:
+ *
+ * + characters which are unrepresentable in the destination charset (if
+ * encoding data) or which cannot be converted to UCS4 (if decoding data).
+ * + invalid byte sequences (both encoding and decoding)
+ *
+ * The options provide a choice between the following approaches:
+ *
+ * + draconian, "stop processing" ("strict")
+ * + "replace the unrepresentable character with something else" ("loose")
+ * + "attempt to transliterate, or replace if unable" ("translit")
+ *
+ * The default error mode is "loose".
+ *
+ *
+ * In the "loose" case, the replacement character will depend upon:
+ *
+ * + Whether the operation was encoding or decoding
+ * + If encoding, what the destination charset is.
+ *
+ * If decoding, the replacement character will be:
+ *
+ * U+FFFD (REPLACEMENT CHARACTER)
+ *
+ * If encoding, the replacement character will be:
+ *
+ * U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32)
+ * U+FFFD (REPLACEMENT CHARACTER) otherwise.
+ *
+ *
+ * In the "translit" case, the codec will attempt to transliterate into
+ * the destination charset, if encoding. If decoding, or if transliteration
+ * fails, this option is identical to "loose".
+ */
+typedef enum parserutils_charset_codec_errormode {
+ /** Abort processing if unrepresentable character encountered */
+ PARSERUTILS_CHARSET_CODEC_ERROR_STRICT = 0,
+ /** Replace unrepresentable characters with single alternate */
+ PARSERUTILS_CHARSET_CODEC_ERROR_LOOSE = 1,
+ /** Transliterate unrepresentable characters, if possible */
+ PARSERUTILS_CHARSET_CODEC_ERROR_TRANSLIT = 2,
+} parserutils_charset_codec_errormode;
+
+/**
+ * Charset codec option types
+ */
+typedef enum parserutils_charset_codec_opttype {
+ /** Set codec error mode */
+ PARSERUTILS_CHARSET_CODEC_ERROR_MODE = 1,
+} parserutils_charset_codec_opttype;
+
+/**
+ * Charset codec option parameters
+ */
+typedef union parserutils_charset_codec_optparams {
+ /** Parameters for error mode setting */
+ struct {
+ /** The desired error handling mode */
+ parserutils_charset_codec_errormode mode;
+ } error_mode;
+} parserutils_charset_codec_optparams;
+
+
+/* Create a charset codec */
+parserutils_charset_codec *parserutils_charset_codec_create(const char *charset,
+ parserutils_alloc alloc, void *pw);
+/* Destroy a charset codec */
+void parserutils_charset_codec_destroy(parserutils_charset_codec *codec);
+
+/* Configure a charset codec */
+parserutils_error parserutils_charset_codec_setopt(
+ parserutils_charset_codec *codec,
+ parserutils_charset_codec_opttype type,
+ parserutils_charset_codec_optparams *params);
+
+/* Encode a chunk of UCS4 data into a codec's charset */
+parserutils_error parserutils_charset_codec_encode(
+ parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+
+/* Decode a chunk of data in a codec's charset into UCS4 */
+parserutils_error parserutils_charset_codec_decode(
+ parserutils_charset_codec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+
+/* Reset a charset codec */
+parserutils_error parserutils_charset_codec_reset(
+ parserutils_charset_codec *codec);
+
+#endif
diff --git a/include/parserutils/charset/mibenum.h b/include/parserutils/charset/mibenum.h
new file mode 100644
index 0000000..8b3ac9d
--- /dev/null
+++ b/include/parserutils/charset/mibenum.h
@@ -0,0 +1,24 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_charset_mibenum_h_
+#define parserutils_charset_mibenum_h_
+
+#include <inttypes.h>
+#include <stdbool.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+
+/* Convert an encoding alias to a MIB enum value */
+uint16_t parserutils_charset_mibenum_from_name(const char *alias, size_t len);
+/* Convert a MIB enum value into an encoding alias */
+const char *parserutils_charset_mibenum_to_name(uint16_t mibenum);
+/* Determine if a MIB enum value represents a Unicode variant */
+bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum);
+
+#endif
diff --git a/include/parserutils/charset/utf16.h b/include/parserutils/charset/utf16.h
new file mode 100644
index 0000000..6569d6e
--- /dev/null
+++ b/include/parserutils/charset/utf16.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-16 manipulation functions (interface).
+ */
+
+#ifndef parserutils_charset_utf16_h_
+#define parserutils_charset_utf16_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+
+parserutils_error parserutils_charset_utf16_to_ucs4(const uint8_t *s,
+ size_t len, uint32_t *ucs4, size_t *clen);
+parserutils_error parserutils_charset_utf16_from_ucs4(uint32_t ucs4,
+ uint8_t *s, size_t *len);
+
+parserutils_error parserutils_charset_utf16_length(const uint8_t *s,
+ size_t max, size_t *len);
+parserutils_error parserutils_charset_utf16_char_byte_length(const uint8_t *s,
+ size_t *len);
+
+parserutils_error parserutils_charset_utf16_prev(const uint8_t *s,
+ uint32_t off, uint32_t *prevoff);
+parserutils_error parserutils_charset_utf16_next(const uint8_t *s,
+ uint32_t len, uint32_t off, uint32_t *nextoff);
+
+parserutils_error parserutils_charset_utf16_next_paranoid(const uint8_t *s,
+ uint32_t len, uint32_t off, uint32_t *nextoff);
+
+#endif
+
diff --git a/include/parserutils/charset/utf8.h b/include/parserutils/charset/utf8.h
new file mode 100644
index 0000000..16e012e
--- /dev/null
+++ b/include/parserutils/charset/utf8.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-8 manipulation functions (interface).
+ */
+
+#ifndef parserutils_charset_utf8_h_
+#define parserutils_charset_utf8_h_
+
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+
+parserutils_error parserutils_charset_utf8_to_ucs4(const uint8_t *s, size_t len,
+ uint32_t *ucs4, size_t *clen);
+parserutils_error parserutils_charset_utf8_from_ucs4(uint32_t ucs4, uint8_t **s,
+ size_t *len);
+
+parserutils_error parserutils_charset_utf8_length(const uint8_t *s, size_t max,
+ size_t *len);
+parserutils_error parserutils_charset_utf8_char_byte_length(const uint8_t *s,
+ size_t *len);
+
+parserutils_error parserutils_charset_utf8_prev(const uint8_t *s, uint32_t off,
+ uint32_t *prevoff);
+parserutils_error parserutils_charset_utf8_next(const uint8_t *s, uint32_t len,
+ uint32_t off, uint32_t *nextoff);
+
+parserutils_error parserutils_charset_utf8_next_paranoid(const uint8_t *s,
+ uint32_t len, uint32_t off, uint32_t *nextoff);
+
+#endif
+
diff --git a/include/parserutils/errors.h b/include/parserutils/errors.h
new file mode 100644
index 0000000..09c715c
--- /dev/null
+++ b/include/parserutils/errors.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_errors_h_
+#define parserutils_errors_h_
+
+#include <stddef.h>
+
+typedef enum parserutils_error {
+ PARSERUTILS_OK = 0, /**< No error; parserutils_inputstream_peek treats
+ * any other code as a reason to leave its fast path */
+
+ PARSERUTILS_NOMEM = 1, /**< NOTE(review): presumably an allocation
+ * failure -- confirm against the implementation */
+ PARSERUTILS_BADPARM = 2, /**< NOTE(review): presumably an invalid
+ * argument -- confirm against the implementation */
+ PARSERUTILS_INVALID = 3, /**< NOTE(review): presumably malformed input
+ * data -- confirm against the implementation */
+ PARSERUTILS_FILENOTFOUND = 4, /**< NOTE(review): presumably a missing
+ * file (e.g. the aliases file) -- confirm */
+ PARSERUTILS_NEEDDATA = 5, /**< Not fatal to parserutils_inputstream_peek:
+ * when the UTF-8 length lookup reports it, the
+ * call is handed to the slow path instead */
+} parserutils_error;
+
+/* Convert a parserutils error value to a string */
+const char *parserutils_error_to_string(parserutils_error error);
+/* Convert a string to a parserutils error value */
+parserutils_error parserutils_error_from_string(const char *str, size_t len);
+
+#endif
+
diff --git a/include/parserutils/functypes.h b/include/parserutils/functypes.h
new file mode 100644
index 0000000..703a329
--- /dev/null
+++ b/include/parserutils/functypes.h
@@ -0,0 +1,21 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007-8 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_functypes_h_
+#define parserutils_functypes_h_
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdlib.h>
+
+#include <parserutils/types.h>
+
+/* Type of allocation function for parserutils */
+typedef void *(*parserutils_alloc)(void *ptr, size_t size, void *pw);
+
+#endif
+
diff --git a/include/parserutils/input/inputstream.h b/include/parserutils/input/inputstream.h
new file mode 100644
index 0000000..2b0c407
--- /dev/null
+++ b/include/parserutils/input/inputstream.h
@@ -0,0 +1,143 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_input_inputstream_h_
+#define parserutils_input_inputstream_h_
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+#include <parserutils/types.h>
+#include <parserutils/charset/utf8.h>
+#include <parserutils/utils/buffer.h>
+
+/**
+ * Type of charset detection function
+ */
+typedef parserutils_error (*parserutils_charset_detect_func)(
+ const uint8_t *data, size_t len,
+ uint16_t *mibenum, uint32_t *source);
+
+/**
+ * Input stream object
+ *
+ * The inline functions in this header (parserutils_inputstream_peek and
+ * parserutils_inputstream_advance) access these fields directly:
+ * characters are located at utf8->data + cursor + offset, and cursor is
+ * compared against utf8->length to find the end of the buffered data.
+ */
+typedef struct parserutils_inputstream
+{
+ parserutils_buffer *utf8; /**< Buffer containing utf8 data */
+
+ uint32_t cursor; /**< Byte offset of current position */
+
+ bool had_eof; /**< Whether EOF has been reached */
+} parserutils_inputstream;
+
+/* EOF pseudo-character: listed by the parserutils_inputstream_peek
+ * documentation as a possible result in place of a character pointer */
+#define PARSERUTILS_INPUTSTREAM_EOF (0xFFFFFFFFU)
+/* Out-of-data indicator: returned by parserutils_inputstream_peek in place
+ * of a character pointer when the stream is NULL, or when the UTF-8 length
+ * lookup reports an error other than PARSERUTILS_NEEDDATA */
+#define PARSERUTILS_INPUTSTREAM_OOD (0xFFFFFFFEU)
+
+/* Create an input stream */
+parserutils_inputstream *parserutils_inputstream_create(const char *enc,
+ uint32_t encsrc, parserutils_charset_detect_func csdetect,
+ parserutils_alloc alloc, void *pw);
+/* Destroy an input stream */
+void parserutils_inputstream_destroy(parserutils_inputstream *stream);
+
+/* Append data to an input stream */
+parserutils_error parserutils_inputstream_append(
+ parserutils_inputstream *stream,
+ const uint8_t *data, size_t len);
+/* Insert data into stream at current location */
+parserutils_error parserutils_inputstream_insert(
+ parserutils_inputstream *stream,
+ const uint8_t *data, size_t len);
+
+/* Slow form of parserutils_inputstream_peek. */
+uintptr_t parserutils_inputstream_peek_slow(parserutils_inputstream *stream,
+ size_t offset, size_t *length);
+
+/**
+ * Look at the character in the stream that starts at
+ * offset bytes from the cursor
+ *
+ * \param stream  Stream to look in
+ * \param offset  Byte offset of start of character
+ * \param length  Pointer to location to receive character length (in bytes)
+ * \return Pointer to character data, or EOF or OOD.
+ *
+ * The fast path is taken only when the requested position lies inside the
+ * data already held in the stream's utf8 buffer. Every other case (the
+ * position is at or beyond the end of the buffered data, or the character
+ * length lookup reports PARSERUTILS_NEEDDATA) is handed to
+ * parserutils_inputstream_peek_slow. Previously a position strictly beyond
+ * the end of the buffer fell through both tests, so an uninitialised length
+ * was stored and a pointer past the buffered data was returned.
+ *
+ * OOD is returned directly if stream is NULL or if the character length
+ * lookup fails with any error other than PARSERUTILS_NEEDDATA.
+ *
+ * Once the character pointed to by the result of this call has been advanced
+ * past (i.e. parserutils_inputstream_advance has caused the stream cursor to
+ * pass over the character), then no guarantee is made as to the validity of
+ * the data pointed to. Thus, any attempt to dereference the pointer after
+ * advancing past the data it points to is a bug.
+ */
+static inline uintptr_t parserutils_inputstream_peek(
+		parserutils_inputstream *stream, size_t offset, size_t *length)
+{
+	parserutils_error error;
+	const uint8_t *start;
+	size_t pos;
+	size_t len;
+
+	if (stream == NULL)
+		return PARSERUTILS_INPUTSTREAM_OOD;
+
+	pos = stream->cursor + offset;
+
+	/* Nothing buffered at this position (at or past the end): let the
+	 * slow path decide, rather than reading beyond the buffer */
+	if (pos >= stream->utf8->length) {
+		return parserutils_inputstream_peek_slow(stream,
+				offset, length);
+	}
+
+	start = stream->utf8->data + pos;
+
+	if ((*start & 0x80) == 0) {
+		/* High bit clear: single byte (ASCII) character */
+		len = 1;
+	} else {
+		error = parserutils_charset_utf8_char_byte_length(start, &len);
+
+		if (error == PARSERUTILS_NEEDDATA) {
+			return parserutils_inputstream_peek_slow(stream,
+					offset, length);
+		}
+
+		if (error != PARSERUTILS_OK)
+			return PARSERUTILS_INPUTSTREAM_OOD;
+	}
+
+	*length = len;
+
+	return (uintptr_t) start;
+}
+
+/**
+ * Move the stream's cursor forwards
+ *
+ * \param stream  The stream whose position to advance (NULL is a no-op)
+ * \param bytes   The number of bytes to advance
+ *
+ * Calls abort() if bytes is larger than the amount of data between the
+ * cursor and the end of the stream's utf8 buffer. The cursor is left
+ * untouched when it already sits at the end of the buffered data.
+ */
+static inline void parserutils_inputstream_advance(
+		parserutils_inputstream *stream, size_t bytes)
+{
+	size_t remaining;
+
+	if (stream == NULL)
+		return;
+
+	/* Bytes left between the cursor and the end of the buffer */
+	remaining = stream->utf8->length - stream->cursor;
+
+	if (bytes > remaining)
+		abort();
+
+	if (remaining != 0)
+		stream->cursor += bytes;
+}
+
+/* Read the document charset */
+const char *parserutils_inputstream_read_charset(
+ parserutils_inputstream *stream, uint32_t *source);
+
+#endif
+
diff --git a/include/parserutils/parserutils.h b/include/parserutils/parserutils.h
new file mode 100644
index 0000000..460e80c
--- /dev/null
+++ b/include/parserutils/parserutils.h
@@ -0,0 +1,23 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_parserutils_h_
+#define parserutils_parserutils_h_
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+#include <parserutils/types.h>
+
+/* Initialise the ParserUtils library for use */
+parserutils_error parserutils_initialise(const char *aliases_file,
+ parserutils_alloc alloc, void *pw);
+
+/* Clean up after ParserUtils */
+parserutils_error parserutils_finalise(parserutils_alloc alloc, void *pw);
+
+#endif
+
diff --git a/include/parserutils/types.h b/include/parserutils/types.h
new file mode 100644
index 0000000..b36e4aa
--- /dev/null
+++ b/include/parserutils/types.h
@@ -0,0 +1,15 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_types_h_
+#define parserutils_types_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#endif
+
diff --git a/include/parserutils/utils/buffer.h b/include/parserutils/utils/buffer.h
new file mode 100644
index 0000000..f3a1883
--- /dev/null
+++ b/include/parserutils/utils/buffer.h
@@ -0,0 +1,39 @@
+/*
+ * This file is part of LibParserUtils.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef parserutils_utils_buffer_h_
+#define parserutils_utils_buffer_h_
+
+#include <parserutils/errors.h>
+#include <parserutils/functypes.h>
+
+struct parserutils_buffer
+{
+ uint8_t *data; /**< Byte storage; the input stream addresses it by
+ * byte offset (data + cursor + offset) */
+ size_t length; /**< End of the data in use: parserutils_inputstream_peek
+ * reads data[i] only after checking i < length */
+ size_t allocated; /**< NOTE(review): presumably the capacity of data
+ * in bytes -- confirm against buffer.c */
+
+ parserutils_alloc alloc; /**< Allocation function (see parserutils_alloc) */
+ void *pw; /**< NOTE(review): presumably the pw argument handed to
+ * alloc -- confirm against buffer.c */
+};
+typedef struct parserutils_buffer parserutils_buffer;
+
+parserutils_buffer *parserutils_buffer_create(parserutils_alloc alloc,
+ void *pw);
+void parserutils_buffer_destroy(parserutils_buffer *buffer);
+
+parserutils_error parserutils_buffer_append(parserutils_buffer *buffer,
+ const uint8_t *data, size_t len);
+parserutils_error parserutils_buffer_insert(parserutils_buffer *buffer,
+ size_t offset, const uint8_t *data, size_t len);
+parserutils_error parserutils_buffer_discard(parserutils_buffer *buffer,
+ size_t offset, size_t len);
+
+parserutils_error parserutils_buffer_grow(parserutils_buffer *buffer);
+
+#endif
+