summary | refs | log | tree | commit | diff
path: root/include
diff options
context:
space:
mode:
author John Mark Bell <jmb@netsurf-browser.org> 2007-06-23 22:40:25 +0000
committer John Mark Bell <jmb@netsurf-browser.org> 2007-06-23 22:40:25 +0000
commit 7b30a5520cfb56e651f0eb4da85a3e07747da7dc (patch)
tree 5d6281c071c089e1e7a8ae6f8044cecaf6a7db16 /include
download libhubbub-7b30a5520cfb56e651f0eb4da85a3e07747da7dc.tar.gz
libhubbub-7b30a5520cfb56e651f0eb4da85a3e07747da7dc.tar.bz2
Import hubbub -- an HTML parsing library.
Plenty of work still to do (like tree generation ;)
svn path=/trunk/hubbub/; revision=3359
Diffstat (limited to 'include')
-rw-r--r-- include/hubbub/errors.h 29
-rw-r--r-- include/hubbub/functypes.h 37
-rw-r--r-- include/hubbub/hubbub.h 23
-rw-r--r-- include/hubbub/parser.h 84
-rw-r--r-- include/hubbub/types.h 97
5 files changed, 270 insertions, 0 deletions
diff --git a/include/hubbub/errors.h b/include/hubbub/errors.h
new file mode 100644
index 0000000..c3b1f5d
--- /dev/null
+++ b/include/hubbub/errors.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_errors_h_
+#define hubbub_errors_h_
+
+#include <stddef.h>
+
+typedef enum hubbub_error {
+ HUBBUB_OK = 0, /**< Zero value; name suggests success (confirm in implementation) */
+ /* Non-zero codes follow; the meanings below are inferred from the names only */
+ HUBBUB_NOMEM = 1, /**< presumably: memory allocation failed */
+ HUBBUB_BADPARM = 2, /**< presumably: bad parameter passed by caller */
+ HUBBUB_INVALID = 3, /**< presumably: invalid input or state */
+ HUBBUB_FILENOTFOUND = 4, /**< presumably: a file could not be found */
+ HUBBUB_NEEDDATA = 5, /**< presumably: more input data is needed */
+} hubbub_error;
+
+/* Convert a hubbub error value to a string; the result is returned as a read-only (const char *) string */
+const char *hubbub_error_to_string(hubbub_error error);
+/* Convert a string (str, with its length given separately in len) to a hubbub error value */
+hubbub_error hubbub_error_from_string(const char *str, size_t len);
+
+#endif
+
diff --git a/include/hubbub/functypes.h b/include/hubbub/functypes.h
new file mode 100644
index 0000000..aa3e649
--- /dev/null
+++ b/include/hubbub/functypes.h
@@ -0,0 +1,37 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_functypes_h_
+#define hubbub_functypes_h_
+
+#include <stdlib.h>
+
+#include <hubbub/types.h>
+
+/* Type of allocation function for hubbub: takes (ptr, size, pw), returns void *. Looks realloc-like -- TODO confirm exact contract */
+typedef void *(*hubbub_alloc)(void *ptr, size_t size, void *pw);
+
+/**
+ * Type of token handling function: receives a read-only token and an untyped pointer (pw); returns nothing
+ */
+typedef void (*hubbub_token_handler)(const hubbub_token *token, void *pw);
+
+/**
+ * Type of document buffer handling function: receives a read-only byte pointer, a length and an untyped pointer (pw)
+ */
+typedef void (*hubbub_buffer_handler)(const uint8_t *data,
+ size_t len, void *pw);
+
+/**
+ * Type of parse error handling function: receives a line and column (numbering base not stated here), a message and pw
+ */
+typedef void (*hubbub_error_handler)(uint32_t line, uint32_t col,
+ const char *message, void *pw);
+
+
+#endif
+
diff --git a/include/hubbub/hubbub.h b/include/hubbub/hubbub.h
new file mode 100644
index 0000000..8a15eca
--- /dev/null
+++ b/include/hubbub/hubbub.h
@@ -0,0 +1,23 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_h_
+#define hubbub_h_
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+/* Initialise the Hubbub library for use, given an aliases file (presumably a path -- confirm), an allocator and its pw */
+hubbub_error hubbub_initialise(const char *aliases_file,
+ hubbub_alloc alloc, void *pw);
+
+/* Clean up after Hubbub; takes an allocator and pw (presumably the pair given to hubbub_initialise -- confirm) */
+hubbub_error hubbub_finalise(hubbub_alloc alloc, void *pw);
+
+#endif
+
diff --git a/include/hubbub/parser.h b/include/hubbub/parser.h
new file mode 100644
index 0000000..cdf8664
--- /dev/null
+++ b/include/hubbub/parser.h
@@ -0,0 +1,84 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_parser_h_
+#define hubbub_parser_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+typedef struct hubbub_parser hubbub_parser; /* opaque handle: the struct body is not defined in this header */
+
+/**
+ * Hubbub parser option types; the names correspond to the members of union hubbub_parser_optparams
+ */
+typedef enum hubbub_parser_opttype {
+ HUBBUB_PARSER_TOKEN_HANDLER, /**< cf. optparams member token_handler */
+ HUBBUB_PARSER_BUFFER_HANDLER, /**< cf. optparams member buffer_handler */
+ HUBBUB_PARSER_ERROR_HANDLER, /**< cf. optparams member error_handler */
+ HUBBUB_PARSER_CONTENT_MODEL, /**< cf. optparams member content_model */
+} hubbub_parser_opttype;
+
+/**
+ * Hubbub parser option parameters: a union with one member per option; only one member holds a value at a time
+ */
+typedef union hubbub_parser_optparams {
+ struct {
+ hubbub_token_handler handler; /**< Token callback */
+ void *pw; /**< Untyped pointer stored with the callback */
+ } token_handler;
+
+ struct {
+ hubbub_buffer_handler handler; /**< Document buffer callback */
+ void *pw; /**< Untyped pointer stored with the callback */
+ } buffer_handler;
+
+ struct {
+ hubbub_error_handler handler; /**< Parse error callback */
+ void *pw; /**< Untyped pointer stored with the callback */
+ } error_handler;
+
+ struct {
+ hubbub_content_model model; /**< Content model flag value */
+ } content_model;
+} hubbub_parser_optparams;
+
+/* Create a hubbub parser from two encoding names (enc, int_enc), an allocator and its pw; returns a parser pointer */
+hubbub_parser *hubbub_parser_create(const char *enc, const char *int_enc,
+ hubbub_alloc alloc, void *pw);
+/* Destroy a hubbub parser; returns nothing, so no failure is reported to the caller */
+void hubbub_parser_destroy(hubbub_parser *parser);
+
+/* Configure a hubbub parser: type names the option, params carries its value (presumably the like-named union member) */
+hubbub_error hubbub_parser_setopt(hubbub_parser *parser,
+ hubbub_parser_opttype type,
+ hubbub_parser_optparams *params);
+
+/* Pass a chunk of data (data, with length len) to a hubbub parser for parsing */
+/* This data is encoded in the input charset. Note: data is not const-qualified in this prototype */
+hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
+ uint8_t *data, size_t len);
+/* Pass a chunk of extraneous data (data, with length len) to a hubbub parser for parsing */
+/* This data is UTF-8 encoded. Note: data is not const-qualified in this prototype */
+hubbub_error hubbub_parser_parse_extraneous_chunk(hubbub_parser *parser,
+ uint8_t *data, size_t len);
+/* Inform the parser that the last chunk of data has been parsed; returns a hubbub_error status */
+hubbub_error hubbub_parser_completed(hubbub_parser *parser);
+
+/* Read the document charset as a read-only string; source is a pointer, presumably written with the charset's origin */
+const char *hubbub_parser_read_charset(hubbub_parser *parser,
+ hubbub_charset_source *source);
+
+/* Claim ownership of the document buffer; buffer and len are pointers, presumably outputs for its address and length */
+hubbub_error hubbub_parser_claim_buffer(hubbub_parser *parser,
+ uint8_t **buffer, size_t *len);
+
+#endif
+
diff --git a/include/hubbub/types.h b/include/hubbub/types.h
new file mode 100644
index 0000000..57518ae
--- /dev/null
+++ b/include/hubbub/types.h
@@ -0,0 +1,97 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_types_h_
+#define hubbub_types_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+/** Source of charset information, in ascending order of importance (larger value wins).
+ * A client-dictated charset will override all others.
+ * A document-specified charset will override autodetection or the default. */
+typedef enum hubbub_charset_source {
+ HUBBUB_CHARSET_UNKNOWN = 0, /**< Unknown */
+ HUBBUB_CHARSET_DEFAULT = 1, /**< Default setting */
+ HUBBUB_CHARSET_DETECTED = 2, /**< Autodetected */
+ HUBBUB_CHARSET_DOCUMENT = 3, /**< Defined in document */
+ HUBBUB_CHARSET_DICTATED = 4, /**< Dictated by client */
+} hubbub_charset_source;
+
+/**
+ * Content model flag; no explicit values are given, so the enumerators are numbered 0..3 in declaration order
+ */
+typedef enum hubbub_content_model {
+ HUBBUB_CONTENT_MODEL_PCDATA,
+ HUBBUB_CONTENT_MODEL_RCDATA,
+ HUBBUB_CONTENT_MODEL_CDATA,
+ HUBBUB_CONTENT_MODEL_PLAINTEXT
+} hubbub_content_model;
+
+/**
+ * Type of an emitted token; stored in the type field of hubbub_token
+ */
+typedef enum hubbub_token_type {
+ HUBBUB_TOKEN_DOCTYPE,
+ HUBBUB_TOKEN_START_TAG,
+ HUBBUB_TOKEN_END_TAG,
+ HUBBUB_TOKEN_COMMENT,
+ HUBBUB_TOKEN_CHARACTER,
+ HUBBUB_TOKEN_EOF
+} hubbub_token_type;
+
+/**
+ * Tokeniser string type: an (offset, length) pair, not a pointer; the buffer the offset refers to is not named here
+ * NOTE(review): size_t is used, but this header includes only <stdbool.h> and <inttypes.h> -- confirm it is declared */
+typedef struct hubbub_string {
+ uint32_t data_off; /**< Byte offset of string start */
+ size_t len; /**< Byte length of string */
+} hubbub_string;
+
+/**
+ * Tag attribute data: a name/value pair, each held as a hubbub_string (offset and length)
+ */
+typedef struct hubbub_attribute {
+ hubbub_string name; /**< Attribute name */
+ hubbub_string value; /**< Attribute value */
+} hubbub_attribute;
+
+/**
+ * Data for doctype token: the doctype name plus a boolean validity flag
+ */
+typedef struct hubbub_doctype {
+ hubbub_string name; /**< Doctype name */
+ bool correct; /**< Doctype validity flag */
+} hubbub_doctype;
+
+/**
+ * Data for a tag: its name and a counted array of attributes (n_attributes entries at attributes)
+ */
+typedef struct hubbub_tag {
+ hubbub_string name; /**< Tag name */
+ uint32_t n_attributes; /**< Count of attributes */
+ hubbub_attribute *attributes; /**< Array of attribute data */
+} hubbub_tag;
+
+/**
+ * Token data: a type tag plus a union; presumably type tells which union member is valid (mapping not spelled out here)
+ */
+typedef struct hubbub_token {
+ hubbub_token_type type; /**< Kind of token */
+
+ union {
+ hubbub_doctype doctype; /**< Doctype data */
+
+ hubbub_tag tag; /**< Tag data (presumably for both start and end tags -- confirm) */
+
+ hubbub_string comment; /**< Comment text, as offset and length */
+
+ hubbub_string character; /**< Character data, as offset and length */
+ } data;
+} hubbub_token;
+
+#endif