Rework html parser bindings to have a common API and reside in separate files for ease of reading.

Add error handling to hubbub binding.

svn path=/trunk/netsurf/; revision=5404
author: John Mark Bell <jmb@netsurf-browser.org> 2008-09-23 02:19:50 +0000
committer: John Mark Bell <jmb@netsurf-browser.org> 2008-09-23 02:19:50 +0000
commit: 163ad56fce1b8d83d43034620a8d5c847785edf7 (patch)
tree: 5e52e438bc5d90af5fd6977d7367fec15e25ba2e /render
parent: 4fad8726a4ae82849f38ffc3ef087181d7f37e14 (diff)
download: netsurf-163ad56fce1b8d83d43034620a8d5c847785edf7.tar.gz
netsurf-163ad56fce1b8d83d43034620a8d5c847785edf7.tar.bz2
6 files changed, 1054 insertions, 744 deletions
diff --git a/render/directory.c b/render/directory.c
index 1363ad251..c2b343fbe 100644
--- a/render/directory.c
+++ b/render/directory.c
@@ -48,12 +48,10 @@ bool directory_create(struct content *c, const char *params[]) {
 		/* html_create() must have broadcast MSG_ERROR already, so we
 		* don't need to. */
 		return false;
-#ifndef WITH_HUBBUB
-	htmlParseChunk(c->data.html.parser, header, sizeof(header) - 1, 0);
-#else
-	hubbub_parser_parse_chunk(c->data.html.parser, 
-				(uint8_t *) header, sizeof(header) - 1);
-#endif
+
+	binding_parse_chunk(c->data.html.parser_binding,
+			(uint8_t *) header, sizeof(header) - 1);
+
 	return true;
 }
 
@@ -100,12 +98,9 @@ bool directory_convert(struct content *c, int width, int height) {
 			"<body>\n<h1>\nIndex of %s</h1>\n<hr><pre>",
 			nice_path, nice_path);
 	free(nice_path);
-#ifndef WITH_HUBBUB
-	htmlParseChunk(c->data.html.parser, buffer, strlen(buffer), 0);
-#else
-	hubbub_parser_parse_chunk(c->data.html.parser, 
+
+	binding_parse_chunk(c->data.html.parser_binding, 
 			(uint8_t *) buffer, strlen(buffer));
-#endif
 
 	res = url_parent(c->url, &up);
 	if (res == URL_FUNC_OK) {
@@ -113,14 +108,9 @@ bool directory_convert(struct content *c, int width, int height) {
 		if ((res == URL_FUNC_OK) && !compare) {
 			snprintf(buffer, sizeof(buffer),
 				"<a href=\"..\">[..]</a>\n");
-#ifndef WITH_HUBBUB
-			htmlParseChunk(c->data.html.parser, buffer,
-					strlen(buffer), 0);
-#else
-			hubbub_parser_parse_chunk(c->data.html.parser, 
-					(uint8_t *) buffer, 
-					strlen(buffer));
-#endif
+
+			binding_parse_chunk(c->data.html.parser_binding,
+					(uint8_t *) buffer, strlen(buffer));
 		}
 		free(up);
 	}
@@ -137,21 +127,15 @@ bool directory_convert(struct content *c, int width, int height) {
 
 		snprintf(buffer, sizeof(buffer), "<a href=\"%s/%s\">%s</a>\n",
 				c->url, entry->d_name, entry->d_name);
-#ifndef WITH_HUBBUB
-		htmlParseChunk(c->data.html.parser, buffer, strlen(buffer), 0);
-#else
-		hubbub_parser_parse_chunk(c->data.html.parser,
+
+		binding_parse_chunk(c->data.html.parser_binding,
 				(uint8_t *) buffer, strlen(buffer));
-#endif
 	}
 	closedir(parent);
 
-#ifndef WITH_HUBBUB
-	htmlParseChunk(c->data.html.parser, footer, sizeof(footer) - 1, 0);
-#else
-	hubbub_parser_parse_chunk(c->data.html.parser, 
+	binding_parse_chunk(c->data.html.parser_binding,
 			(uint8_t *) footer, sizeof(footer) - 1);
-#endif
+
 	c->type = CONTENT_HTML;
 	return html_convert(c, width, height);
 }
diff --git a/render/html.c b/render/html.c
index 7f9eaf44f..14563f9f4 100644
--- a/render/html.c
+++ b/render/html.c
@@ -28,14 +28,6 @@
 #include <string.h>
 #include <strings.h>
 #include <stdlib.h>
-#ifdef WITH_HUBBUB
-#include <hubbub/hubbub.h>
-#include <hubbub/parser.h>
-#include <hubbub/tree.h>
-#endif
-#include <libxml/tree.h>
-#include <libxml/parser.h>
-#include <libxml/parserInternals.h>
 #include "utils/config.h"
 #include "content/content.h"
 #include "content/fetch.h"
@@ -57,10 +49,6 @@
 #define CHUNK 4096
 
 
-#ifndef WITH_HUBBUB
-static bool html_set_parser_encoding(struct content *c, const char *encoding);
-static const char *html_detect_encoding(const char **data, unsigned int *size);
-#endif
 static void html_convert_css_callback(content_msg msg, struct content *css,
 		intptr_t p1, intptr_t p2, union content_msg_data data);
 static bool html_meta_refresh(struct content *c, xmlNode *head);
@@ -98,380 +86,6 @@ static const char empty_document[] =
 	"</html>";
 
 
-#ifdef WITH_HUBBUB
-
-const char const *ns_prefixes[NUM_NAMESPACES] =
-		{ NULL, NULL, "math", "svg", "xlink", "xml", "xmlns" };
-
-const char const *ns_urls[NUM_NAMESPACES] = {
-	NULL,
-	"http://www.w3.org/1999/xhtml",
-	"http://www.w3.org/1998/Math/MathML",
-	"http://www.w3.org/2000/svg",
-	"http://www.w3.org/1999/xlink",
-	"http://www.w3.org/XML/1998/namespace",
-	"http://www.w3.org/2000/xmlns/"
-};
-
-
-static int create_comment(void *ctx, const hubbub_string *data, void **result);
-static int create_doctype(void *ctx, const hubbub_doctype *doctype,
-		void **result);
-static int create_element(void *ctx, const hubbub_tag *tag, void **result);
-static int create_text(void *ctx, const hubbub_string *data, void **result);
-static int ref_node(void *ctx, void *node);
-static int unref_node(void *ctx, void *node);
-static int append_child(void *ctx, void *parent, void *child, void **result);
-static int insert_before(void *ctx, void *parent, void *child, void *ref_child,
-		void **result);
-static int remove_child(void *ctx, void *parent, void *child, void **result);
-static int clone_node(void *ctx, void *node, bool deep, void **result);
-static int reparent_children(void *ctx, void *node, void *new_parent);
-static int get_parent(void *ctx, void *node, bool element_only, void **result);
-static int has_children(void *ctx, void *node, bool *result);
-static int form_associate(void *ctx, void *form, void *node);
-static int add_attributes(void *ctx, void *node,
-		const hubbub_attribute *attributes, uint32_t n_attributes);
-static int set_quirks_mode(void *ctx, hubbub_quirks_mode mode);
-static int change_encoding(void *ctx, const char *mibenum);
-
-static hubbub_tree_handler tree_handler = {
-	create_comment,
-	create_doctype,
-	create_element,
-	create_text,
-	ref_node,
-	unref_node,
-	append_child,
-	insert_before,
-	remove_child,
-	clone_node,
-	reparent_children,
-	get_parent,
-	has_children,
-	form_associate,
-	add_attributes,
-	set_quirks_mode,
-	change_encoding,
-	NULL
-};
-
-
-
-/*** Tree construction functions ***/
-
-int create_comment(void *ctx, const hubbub_string *data, void **result)
-{
-	xmlNode *node = xmlNewComment(NULL);
-
-	node->content = xmlStrndup(data->ptr, data->len);
-	node->_private = (void *)1;
-	*result = node;
-
-	return 0;
-}
-
-int create_doctype(void *ctx, const hubbub_doctype *doctype, void **result)
-{
-	/* Make a node that doesn't really exist, then don't append it
-	 * later. */
-	xmlNode *node = xmlNewComment(NULL);
-
-	node->_private = (void *)1;
-	*result = node;
-
-	return 0;
-}
-
-int create_element(void *ctx, const hubbub_tag *tag, void **result)
-{
-	struct content *c = ctx;
-	struct content_html_data *html = &c->data.html;
-
-	char *name = strndup((const char *) tag->name.ptr,
-			tag->name.len);
-
-	xmlNode *node = xmlNewNode(NULL, BAD_CAST name);
-	node->_private = (void *)1;
-	*result = node;
-
-	if (html->has_ns == false) {
-		for (size_t i = 1; i < NUM_NAMESPACES; i++) {
-			html->ns[i] = xmlNewNs(node,
-					BAD_CAST ns_urls[i],
-					BAD_CAST ns_prefixes[i]);
-		}
-		html->has_ns = true;
-	}
-
-	xmlSetNs(node, html->ns[tag->ns]);
-
-	free(name);
-
-	for (size_t i = 0; i < tag->n_attributes; i++) {
-		hubbub_attribute *attr = &tag->attributes[i];
-
-		char *name = strndup((const char *) attr->name.ptr,
-				attr->name.len);
-		char *value = strndup((const char *) attr->value.ptr,
-				attr->value.len);
-
-		if (attr->ns == HUBBUB_NS_NULL) {
-			xmlNewProp(node, BAD_CAST name, BAD_CAST value);
-		} else {
-			xmlNewNsProp(node, html->ns[attr->ns], BAD_CAST name,
-					BAD_CAST value);
-		}
-
-		free(name);
-		free(value);
-	}
-
-	return 0;
-}
-
-int create_text(void *ctx, const hubbub_string *data, void **result)
-{
-	xmlNode *node = xmlNewTextLen(BAD_CAST data->ptr, data->len);
-	node->_private = (void *)1;
-	*result = node;
-
-	return 0;
-}
-
-int ref_node(void *ctx, void *node)
-{
-	xmlNode *n = node;
-	n->_private = (void *)((uintptr_t)n->_private + 1);
-
-	return 0;
-}
-
-int unref_node(void *ctx, void *node)
-{
-	xmlNode *n = node;
-	n->_private = (void *)((uintptr_t)n->_private - 1);
-
-	if (n->_private == (void *)0 && n->parent == NULL) {
-		xmlFreeNode(n);
-	}
-
-	return 0;
-}
-
-int append_child(void *ctx, void *parent, void *child, void **result)
-{
-	xmlNode *nparent = parent;
-	xmlNode *nchild = child;
-
-	if (nchild->type == XML_TEXT_NODE &&
-			nparent->last != NULL &&
-			nparent->last->type == XML_TEXT_NODE) {
-		xmlNode *clone;
-		clone_node(ctx, nchild, false, (void **) &clone);
-		*result = xmlAddChild(parent, clone);
-		/* node referenced by clone_node */
-	} else {
-		*result = xmlAddChild(parent, child);
-		ref_node(ctx, *result);
-	}
-
-	return 0;
-}
-
-/* insert 'child' before 'ref_child', under 'parent' */
-int insert_before(void *ctx, void *parent, void *child, void *ref_child,
-		void **result)
-{
-	*result = xmlAddPrevSibling(ref_child, child);
-	ref_node(ctx, *result);
-
-	return 0;
-}
-
-int remove_child(void *ctx, void *parent, void *child, void **result)
-{
-	xmlUnlinkNode(child);
-	*result = child;
-
-	ref_node(ctx, *result);
-
-	return 0;
-}
-
-int clone_node(void *ctx, void *node, bool deep, void **result)
-{
-	xmlNode *n = xmlCopyNode(node, deep ? 1 : 2);
-	n->_private = (void *)1;
-	*result = n;
-
-	return 0;
-}
-
-/* Take all of the child nodes of "node" and append them to "new_parent" */
-int reparent_children(void *ctx, void *node, void *new_parent)
-{
-	xmlNode *n = (xmlNode *) node;
-	xmlNode *p = (xmlNode *) new_parent;
-
-	for (xmlNode *child = n->children; child != NULL; ) {
-		xmlNode *next = child->next;
-
-		xmlUnlinkNode(child);
-
-		if (xmlAddChild(p, child) == NULL)
-			return 1;
-
-		child = next;
-	}
-
-	return 0;
-}
-
-int get_parent(void *ctx, void *node, bool element_only, void **result)
-{
-	*result = ((xmlNode *)node)->parent;
-
-	if (*result != NULL && element_only &&
-			((xmlNode *) *result)->type != XML_ELEMENT_NODE)
-		*result = NULL;
-
-	if (*result != NULL)
-		ref_node(ctx, *result);
-
-	return 0;
-}
-
-int has_children(void *ctx, void *node, bool *result)
-{
-	*result = ((xmlNode *)node)->children ? true : false;
-
-	return 0;
-}
-
-int form_associate(void *ctx, void *form, void *node)
-{
-	return 0;
-}
-
-int add_attributes(void *ctx, void *node,
-		const hubbub_attribute *attributes, uint32_t n_attributes)
-{
-	struct content *c = ctx;
-	struct content_html_data *html = &c->data.html;
-
-	for (size_t i = 0; i < n_attributes; i++) {
-		const hubbub_attribute *attr = &attributes[i];
-
-		char *name = strndup((const char *) attr->name.ptr,
-				attr->name.len);
-		char *value = strndup((const char *) attr->value.ptr,
-				attr->value.len);
-
-		if (attr->ns == HUBBUB_NS_NULL) {
-			xmlNewProp(node, BAD_CAST name, BAD_CAST value);
-		} else {
-			xmlNewNsProp(node, html->ns[attr->ns], BAD_CAST name,
-					BAD_CAST value);
-		}
-
-		free(name);
-		free(value);
-	}
-
-	return 0;
-}
-
-int set_quirks_mode(void *ctx, hubbub_quirks_mode mode)
-{
-	return 0;
-}
-
-int change_encoding(void *ctx, const char *name)
-{
-	struct content *c = ctx;
-	struct content_html_data *html = &c->data.html;
-
-	/* If we have an encoding here, it means we are *certain* */
-	if (html->encoding) {
-		return 0;
-	}
-
-	/* Find the confidence otherwise (can only be from a BOM) */
-	uint32_t source;
-	const char *charset = hubbub_parser_read_charset(html->parser, &source);
-
-	if (source == HUBBUB_CHARSET_CONFIDENT) {
-		html->encoding_source = ENCODING_SOURCE_DETECTED;
-		html->encoding = (char *) charset;
-		return 0;
-	}
-
-	/* So here we have something of confidence tentative... */
-	/* http://www.whatwg.org/specs/web-apps/current-work/#change */
-
-	/* 2. "If the new encoding is identical or equivalent to the encoding
-	 * that is already being used to interpret the input stream, then set
-	 * the confidence to confident and abort these steps." */
-
-	/* Whatever happens, the encoding should be set here; either for
-	 * reprocessing with a different charset, or for confirming that the
-	 * charset is in fact correct */
-	html->encoding = (char *) name;
-	html->encoding_source = ENCODING_SOURCE_META;
-
-	/* Equal encodings will have the same string pointers */
-	return (charset == name) ? 0 : 1;
-}
-
-
-/**
- * Talloc'd-up allocation hook for Hubbub.
- */
-static void *html_hubbub_realloc(void *ptr, size_t len, void *pw)
-{
-	return talloc_realloc_size(pw, ptr, len);
-}
-
-
-
-/**
- * Create, set up, and whatnot, a Hubbub parser instance, along with the
- * relevant libxml2 bits.
- */
-static int html_create_parser(struct content *c)
-{
-	struct content_html_data *html = &c->data.html;
-	hubbub_parser_optparams param;
-
-	html->parser = hubbub_parser_create(html->encoding,
-			html_hubbub_realloc,
-			c);
-	if (!html->parser)
-		return 1;
-
-	html->document = xmlNewDoc(BAD_CAST "1.0");
-	if (!html->document)
-		return 1;
-
-	html->tree_handler = tree_handler;
-	html->tree_handler.ctx = c;
-	param.tree_handler = &html->tree_handler;
-	hubbub_parser_setopt(html->parser, HUBBUB_PARSER_TREE_HANDLER, &param);
-
-	param.document_node = html->document;
-	hubbub_parser_setopt(html->parser, HUBBUB_PARSER_DOCUMENT_NODE, &param);
-
-	return 0;
-}
-
-
-
-#endif
-
-
-
-
 /**
  * Create a CONTENT_HTML.
  *
@@ -485,15 +99,9 @@ bool html_create(struct content *c, const char *params[])
 	struct content_html_data *html = &c->data.html;
 	union content_msg_data msg_data;
 
-	html->parser = 0;
+	html->parser_binding = NULL;
 	html->document = 0;
-#ifdef WITH_HUBBUB
-	html->has_ns = false;
-	memset(html->ns, 0, sizeof(html->ns));
-#endif
-	html->encoding_handler = 0;
 	html->encoding = 0;
-	html->getenc = true;
 	html->base_url = c->url;
 	html->base_target = NULL;
 	html->layout = 0;
@@ -520,31 +128,14 @@ bool html_create(struct content *c, const char *params[])
 			if (!html->encoding)
 				goto no_memory;
 			html->encoding_source = ENCODING_SOURCE_HEADER;
-			html->getenc = false;
 			break;
 		}
 	}
 
-#ifndef WITH_HUBBUB
-	html->parser = htmlCreatePushParserCtxt(0, 0, "", 0, 0,
-			XML_CHAR_ENCODING_NONE);
-	if (!html->parser)
+	/* Create the parser binding */
+	html->parser_binding = binding_create_tree(c, html->encoding);
+	if (!html->parser_binding)
 		goto no_memory;
-#else
-
-	/* Set up the parser, libxml2 document, and that */
-	if (html_create_parser(c) != 0)
-		goto no_memory;
-
-#endif
-
-#ifndef WITH_HUBBUB
-	if (html->encoding) {
-		/* an encoding was specified in the Content-Type header */
-		if (!html_set_parser_encoding(c, html->encoding))
-			return false;
-	}
-#endif
 
 	return true;
 
@@ -564,150 +155,39 @@ no_memory:
 bool html_process_data(struct content *c, char *data, unsigned int size)
 {
 	unsigned long x;
-
-#ifndef WITH_HUBBUB
-	if (c->data.html.getenc) {
-		/* No encoding was specified in the Content-Type header.
-		 * Attempt to detect if the encoding is not 8-bit. If the
-		 * encoding is 8-bit, leave the parser unchanged, so that it
-		 * searches for a <meta http-equiv="content-type"
-		 * content="text/html; charset=...">. */
-		const char *encoding;
-		encoding = html_detect_encoding((const char **) &data, &size);
-		if (encoding) {
-			if (!html_set_parser_encoding(c, encoding))
-				return false;
-			c->data.html.encoding = talloc_strdup(c, encoding);
-			if (!c->data.html.encoding)
-				return false;
-			c->data.html.encoding_source =
-					ENCODING_SOURCE_DETECTED;
-		}
-		c->data.html.getenc = false;
-
-		/* The data we received may have solely consisted of a BOM.
-		 * If so, it will have been stripped by html_detect_encoding.
-		 * Therefore, we'll have nothing to do in that case. */
-		if (size == 0)
-			return true;
-	}
-#endif
-
-#ifdef WITH_HUBBUB
-	hubbub_error err;
-#endif
+	binding_error err;
 
 	for (x = 0; x + CHUNK <= size; x += CHUNK) {
-#ifdef WITH_HUBBUB
 		LOG(("Parsing %d bytes", CHUNK));
-		err = hubbub_parser_parse_chunk(
-				c->data.html.parser, 
+		err = binding_parse_chunk(c->data.html.parser_binding,
 				(uint8_t *) data + x, CHUNK);
-		if (err == HUBBUB_ENCODINGCHANGE) {
+		if (err == BINDING_ENCODINGCHANGE) {
 			goto encoding_change;
 		}
-#else
-		htmlParseChunk(c->data.html.parser, data + x, CHUNK, 0);
-#endif
+
 		gui_multitask();
 	}
 
-#ifdef WITH_HUBBUB
 	LOG(("Parsing %lu bytes", (size - x)));
-	err = hubbub_parser_parse_chunk(
-			c->data.html.parser, 
+	err = binding_parse_chunk(c->data.html.parser_binding, 
 			(uint8_t *) data + x, (size - x));
-	if (err == HUBBUB_ENCODINGCHANGE) {
+	if (err == BINDING_ENCODINGCHANGE) {
 		goto encoding_change;
 	}
-#else
-	htmlParseChunk(c->data.html.parser, data + x, (int) (size - x), 0);
-#endif
-
-#ifndef WITH_HUBBUB
-	if (!c->data.html.encoding && c->data.html.parser->input->encoding) {
-		/* The encoding was not in headers or detected,
-		 * and the parser found a <meta http-equiv="content-type"
-		 * content="text/html; charset=...">. */
-
-		/* However, if that encoding is non-ASCII-compatible,
-		 * ignore it, as it can't possibly be correct */
-		if (strncasecmp((const char *) c->data.html.parser->
-					input->encoding,
-				"UTF-16", 6) == 0 || /* UTF-16(LE|BE)? */
-			strncasecmp((const char *) c->data.html.parser->
-					input->encoding,
-				"UTF-32", 6) == 0) { /* UTF-32(LE|BE)? */
-			c->data.html.encoding = talloc_strdup(c, "ISO-8859-1");
-			c->data.html.encoding_source =
-					ENCODING_SOURCE_DETECTED;
-		} else {
-			c->data.html.encoding = talloc_strdup(c,
-				(const char *) c->data.html.parser->
-						input->encoding);
-			c->data.html.encoding_source = ENCODING_SOURCE_META;
-		}
-
-		if (!c->data.html.encoding) {
-			union content_msg_data msg_data;
-
-			msg_data.error = messages_get("NoMemory");
-			content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
-			return false;
-		}
-
-		/* have the encoding; don't attempt to detect it */
-		c->data.html.getenc = false;
-
-		/* now, we must reset the parser such that it reparses
-		 * using the correct charset, and then reparse any document
-		 * source we've got. we achieve this by recreating the
-		 * parser in its entirety as this is simpler than resetting
-		 * the existing one and ensuring it's still set up correctly.
-		 */
-		if (c->data.html.parser->myDoc)
-			xmlFreeDoc(c->data.html.parser->myDoc);
-		htmlFreeParserCtxt(c->data.html.parser);
-
-		c->data.html.parser = htmlCreatePushParserCtxt(0, 0, "", 0,
-				0, XML_CHAR_ENCODING_NONE);
-		if (!c->data.html.parser) {
-			union content_msg_data msg_data;
-
-			msg_data.error = messages_get("NoMemory");
-			content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
-			return false;
-		}
-		if (!html_set_parser_encoding(c, c->data.html.encoding))
-			return false;
-
-		/* and reparse received document source - the recursion
-		 * is safe as we've just set c->data.html.encoding so
-		 * we'll never get back in here. */
-		if (!html_process_data(c, c->source_data, c->source_size))
-			return false;
-	}
-#endif
 
 	return true;
 
-#ifdef WITH_HUBBUB
-
 encoding_change:
 
 	LOG(("Changing encoding"));
 
-	/* Free up hubbub, libxml2 etc */
-	hubbub_parser_destroy(c->data.html.parser);
-	if (c->data.html.document) {
-		xmlFreeDoc(c->data.html.document);
-		c->data.html.document = NULL;
-	}
-	c->data.html.has_ns = false;
-	memset(c->data.html.ns, 0, sizeof(c->data.html.ns));
+	/* Retrieve new encoding */
+	const char *encoding = binding_get_encoding(
+			c->data.html.parser_binding, 
+			&c->data.html.encoding_source);
 
-	/* Set up the parser, libxml2 document, and that */
-	if (html_create_parser(c) != 0) {
+	c->data.html.encoding = strdup(encoding);
+	if (!c->data.html.encoding) {
 		union content_msg_data msg_data;
 
 		msg_data.error = messages_get("NoMemory");
@@ -715,144 +195,26 @@ encoding_change:
 		return false;
 	}
 
-	/* Recurse to reprocess all that data.  This is safe because
-	 * the encoding is now specified at parser-start which means
-	 * it cannot be changed again. */
-	return html_process_data(c, c->source_data, c->source_size);
-
-#endif
-
-}
-
-
-#ifndef WITH_HUBBUB
+	/* Destroy binding */
+	binding_destroy_tree(c->data.html.parser_binding);
 
-/**
- * Set the HTML parser character encoding.
- *
- * \param  c         content of type CONTENT_HTML
- * \param  encoding  name of encoding
- * \return  true on success, false on error and error reported
- */
-bool html_set_parser_encoding(struct content *c, const char *encoding)
-{
-	struct content_html_data *html = &c->data.html;
-	xmlError *error;
-	char error_message[500];
-	union content_msg_data msg_data;
-
-	html->encoding_handler = xmlFindCharEncodingHandler(encoding);
-	if (!html->encoding_handler) {
-		/* either out of memory, or no handler available */
-		/* assume no handler available, which is not a fatal error */
-		LOG(("no encoding handler for \"%s\"", encoding));
-		/* \todo  warn user and ask them to install iconv? */
-		return true;
-	}
+	/* Create new binding, using the new encoding */
+	c->data.html.parser_binding = binding_create_tree(c,
+			c->data.html.encoding);
+	if (!c->data.html.parser_binding) {
+		union content_msg_data msg_data;
 
-	xmlCtxtResetLastError(html->parser);
-	if (xmlSwitchToEncoding(html->parser, html->encoding_handler)) {
-		error = xmlCtxtGetLastError(html->parser);
-		snprintf(error_message, sizeof error_message,
-				"%s xmlSwitchToEncoding(): %s",
-				messages_get("MiscError"),
-				error ? error->message : "failed");
-		msg_data.error = error_message;
+		msg_data.error = messages_get("NoMemory");
 		content_broadcast(c, CONTENT_MSG_ERROR, msg_data);
 		return false;
 	}
 
-	/* Dirty hack to get around libxml oddness:
-	 * 1) When creating a push parser context, the input flow's encoding
-	 *    string is not set (whether an encoding is specified or not)
-	 * 2) When switching encoding (as above), the input flow's encoding
-	 *    string is never changed
-	 * 3) When handling a meta charset, the input flow's encoding string
-	 *    is checked to determine if an encoding has already been set.
-	 *    If it has been set, then the meta charset is ignored.
-	 *
-	 * The upshot of this is that, if we don't explicitly set the input
-	 * flow's encoding string here, any meta charset in the document
-	 * will override our setting, which is incorrect behaviour.
-	 *
-	 * Ideally, this would be fixed in libxml, but that requires rather
-	 * more knowledge than I currently have of what libxml is doing.
-	 */
-	if (!html->parser->input->encoding)
-		html->parser->input->encoding =
-				xmlStrdup((const xmlChar *) encoding);
-
-	/* Ensure noone else attempts to reset the encoding */
-	html->getenc = false;
-
-	return true;
-}
-
-
-/**
- * Attempt to detect the encoding of some HTML data.
- *
- * \param  data  Pointer to HTML source data
- * \param  size  Pointer to length of data
- * \return  a constant string giving the encoding, or 0 if the encoding
- *          appears to be some 8-bit encoding
- *
- * If a BOM is encountered, *data and *size will be modified to skip over it
- */
-
-const char *html_detect_encoding(const char **data, unsigned int *size)
-{
-	const unsigned char *d = (const unsigned char *) *data;
-
-	/* this detection assumes that the first two characters are <= 0xff */
-	if (*size < 4)
-		return 0;
-
-	if (d[0] == 0x00 && d[1] == 0x00 &&
-			d[2] == 0xfe && d[3] == 0xff) { /* BOM 00 00 fe ff */
-		*data += 4;
-		*size -= 4;
-		return "UTF-32BE";
-	} else if (d[0] == 0xff && d[1] == 0xfe &&
-			d[2] == 0x00 && d[3] == 0x00) { /* BOM ff fe 00 00 */
-		*data += 4;
-		*size -= 4;
-		return "UTF-32LE";
-	}
-	else if (d[0] == 0x00 && d[1] != 0x00 &&
-			d[2] == 0x00 && d[3] != 0x00)   /* 00 xx 00 xx */
-		return "UTF-16BE";
-	else if (d[0] != 0x00 && d[1] == 0x00 &&
-			d[2] != 0x00 && d[3] == 0x00)   /* xx 00 xx 00 */
-		return "UTF-16LE";
-	else if (d[0] == 0x00 && d[1] == 0x00 &&
-			d[2] == 0x00 && d[3] != 0x00)   /* 00 00 00 xx */
-		return "ISO-10646-UCS-4";
-	else if (d[0] != 0x00 && d[1] == 0x00 &&
-			d[2] == 0x00 && d[3] == 0x00)   /* xx 00 00 00 */
-		return "ISO-10646-UCS-4";
-	else if (d[0] == 0xfe && d[1] == 0xff) {        /* BOM fe ff */
-		*data += 2;
-		*size -= 2;
-		return "UTF-16BE";
-	} else if (d[0] == 0xff && d[1] == 0xfe) {      /* BOM ff fe */
-		*data += 2;
-		*size -= 2;
-		return "UTF-16LE";
-	} else if (d[0] == 0xef && d[1] == 0xbb &&
-			d[2] == 0xbf) {                 /* BOM ef bb bf */
-		*data += 3;
-		*size -= 3;
-		return "UTF-8";
-	}
-
-	return 0;
+	/* Recurse to reprocess all that data.  This is safe because
+	 * the encoding is now specified at parser-start which means
+	 * it cannot be changed again. */
+	return html_process_data(c, c->source_data, c->source_size);
 }
 
-
-#endif
-
-
 /**
  * Convert a CONTENT_HTML for display.
  *
@@ -875,29 +237,19 @@ bool html_convert(struct content *c, int width, int height)
 	unsigned int time_before, time_taken;
 
 	/* finish parsing */
-	if (c->source_size == 0)
-#ifndef WITH_HUBBUB
-		htmlParseChunk(c->data.html.parser, empty_document,
-				sizeof empty_document, 0);
-#else
-		hubbub_parser_parse_chunk(c->data.html.parser,
+	if (c->source_size == 0) {
+		binding_parse_chunk(c->data.html.parser_binding,
 				(uint8_t *) empty_document,
 				sizeof empty_document);
-#endif
+	}
 
-#ifndef WITH_HUBBUB
-	htmlParseChunk(c->data.html.parser, "", 0, 1);
-	c->data.html.document = c->data.html.parser->myDoc;
+	binding_parse_completed(c->data.html.parser_binding);
+	c->data.html.document = 
+			binding_get_document(c->data.html.parser_binding);
 	/*xmlDebugDumpDocument(stderr, c->data.html.document);*/
-	htmlFreeParserCtxt(c->data.html.parser);
-	c->data.html.parser = 0;
-#else
-	hubbub_parser_completed(c->data.html.parser);
-	hubbub_parser_destroy(c->data.html.parser);
-	c->data.html.parser = 0;
-	c->data.html.document = c->data.html.document;
-	/*xmlDebugDumpDocument(stderr, document);*/
-#endif
+	binding_destroy_tree(c->data.html.parser_binding);
+	c->data.html.parser_binding = NULL;
+
 	if (!c->data.html.document) {
 		LOG(("Parsing failed"));
 		msg_data.error = messages_get("ParsingFail");
@@ -2206,12 +1558,8 @@ void html_destroy(struct content *c)
 	  	c->bitmap = NULL;
 	}
 
-	if (c->data.html.parser)
-#ifndef WITH_HUBBUB
-		htmlFreeParserCtxt(c->data.html.parser);
-#else
-		hubbub_parser_destroy(c->data.html.parser);
-#endif
+	if (c->data.html.parser_binding)
+		binding_destroy_tree(c->data.html.parser_binding);
 
 	if (c->data.html.document)
 		xmlFreeDoc(c->data.html.document);
diff --git a/render/html.h b/render/html.h
index a67900f29..574205f32 100644
--- a/render/html.h
+++ b/render/html.h
@@ -26,13 +26,9 @@
 #define _NETSURF_RENDER_HTML_H_
 
 #include <stdbool.h>
-#ifdef WITH_HUBBUB
-#include <hubbub/parser.h>
-#include <hubbub/tree.h>
-#endif
-#include <libxml/HTMLparser.h>
 #include "content/content_type.h"
 #include "css/css.h"
+#include "render/parser_binding.h"
 
 struct box;
 struct rect;
@@ -43,9 +39,6 @@ struct imagemap;
 struct object_params;
 struct plotters;
 
-/* Number of namespaces we support */
-#define NUM_NAMESPACES		7
-
 /* entries in stylesheet_content */
 #define STYLESHEET_BASE		0	/* base style sheet */
 #define STYLESHEET_ADBLOCK	1	/* adblocking stylesheet */
@@ -121,26 +114,12 @@ struct content_html_iframe {
 
 /** Data specific to CONTENT_HTML. */
 struct content_html_data {
-#ifndef WITH_HUBBUB
-	htmlParserCtxt *parser;  /**< HTML parser context. */
-#else
-	hubbub_parser *parser; /**< HTML parser context. */
-	hubbub_tree_handler tree_handler;
-
-	bool has_ns;
-	xmlNs *ns[NUM_NAMESPACES];
-#endif
+	void *parser_binding;
 	xmlDoc *document;
 
-	/** HTML parser encoding handler. */
-	xmlCharEncodingHandler *encoding_handler;
-
 	char *encoding;	/**< Encoding of source, 0 if unknown. */
-	enum { ENCODING_SOURCE_HEADER, ENCODING_SOURCE_DETECTED,
-			ENCODING_SOURCE_META } encoding_source;
+	binding_encoding_source encoding_source;
 				/**< Source of encoding information. */
-	bool getenc; /**< Need to get the encoding from the document, as it
-	              * wasn't specified in the Content-Type header. */
 
 	char *base_url;	/**< Base URL (may be a copy of content->url). */
 	char *base_target;	/**< Base target */
diff --git a/render/hubbub_binding.c b/render/hubbub_binding.c
new file mode 100644
index 000000000..b3ed259b2
--- /dev/null
+++ b/render/hubbub_binding.c
@@ -0,0 +1,643 @@
+/*
+ * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org> 
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifdef WITH_HUBBUB
+
+#define _GNU_SOURCE /* for strndup */
+#include <assert.h>
+#include <stdbool.h>
+#include <string.h>
+
+#include <libxml/HTMLparser.h>
+#include <libxml/HTMLtree.h>
+
+#include <hubbub/parser.h>
+#include <hubbub/tree.h>
+
+#include "render/parser_binding.h"
+
+#include "utils/log.h"
+#include "utils/talloc.h"
+
+/** State for one parse using the hubbub binding. */
+typedef struct hubbub_ctx {
+	hubbub_parser *parser;		/**< Hubbub parser instance */
+	htmlDocPtr document;		/**< libxml document being built */
+	bool owns_doc;	/**< Cleared once binding_get_document() is called */
+
+	const char *encoding;		/**< Encoding name, or NULL */
+	binding_encoding_source encoding_source; /**< Origin of encoding */
+
+#define NUM_NAMESPACES (6)
+	xmlNsPtr namespaces[NUM_NAMESPACES]; /**< Set by create_namespaces() */
+#undef NUM_NAMESPACES
+
+	hubbub_tree_handler tree_handler; /**< This context's handler table */
+} hubbub_ctx;
+
+static struct {
+	const char *prefix;
+	const char *url;
+} namespaces[] = {
+	{ NULL, NULL },	/* skipped: create_namespaces() starts at index 1 */
+	{ NULL, "http://www.w3.org/1999/xhtml" },
+	{ "math", "http://www.w3.org/1998/Math/MathML" },
+	{ "svg", "http://www.w3.org/2000/svg" },
+	{ "xlink", "http://www.w3.org/1999/xlink" },
+	/** \todo Oh dear. LibXML2 refuses to create any namespace with a 
+	 * prefix of "xml". That sucks, royally. */
+	{ "xml", "http://www.w3.org/XML/1998/namespace" },
+	{ "xmlns", "http://www.w3.org/2000/xmlns/" }
+};
+
+static inline char *c_string_from_hubbub_string(hubbub_ctx *ctx, 
+		const hubbub_string *str);
+static void create_namespaces(hubbub_ctx *ctx, xmlNode *root);
+static int create_comment(void *ctx, const hubbub_string *data, void **result);
+static int create_doctype(void *ctx, const hubbub_doctype *doctype,
+		void **result);
+static int create_element(void *ctx, const hubbub_tag *tag, void **result);
+static int create_text(void *ctx, const hubbub_string *data, void **result);
+static int ref_node(void *ctx, void *node);
+static int unref_node(void *ctx, void *node);
+static int append_child(void *ctx, void *parent, void *child, void **result);
+static int insert_before(void *ctx, void *parent, void *child, void *ref_child,
+		void **result);
+static int remove_child(void *ctx, void *parent, void *child, void **result);
+static int clone_node(void *ctx, void *node, bool deep, void **result);
+static int reparent_children(void *ctx, void *node, void *new_parent);
+static int get_parent(void *ctx, void *node, bool element_only, void **result);
+static int has_children(void *ctx, void *node, bool *result);
+static int form_associate(void *ctx, void *form, void *node);
+static int add_attributes(void *ctx, void *node,
+		const hubbub_attribute *attributes, uint32_t n_attributes);
+static int set_quirks_mode(void *ctx, hubbub_quirks_mode mode);
+static int change_encoding(void *ctx, const char *charset);
+
+static hubbub_tree_handler tree_handler = {
+	create_comment,
+	create_doctype,
+	create_element,
+	create_text,
+	ref_node,
+	unref_node,
+	append_child,
+	insert_before,
+	remove_child,
+	clone_node,
+	reparent_children,
+	get_parent,
+	has_children,
+	form_associate,
+	add_attributes,
+	set_quirks_mode,
+	change_encoding,
+	NULL	/* presumably ctx, which binding_create_tree() sets per copy */
+};
+
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	return talloc_realloc_size(pw, ptr, len); /* pw is the arena */
+}
+
+/* Create a hubbub parser that builds a new, empty libxml HTML document.
+ * arena is handed to the parser's allocation callback; charset is the
+ * encoding already known to the caller, or NULL. Returns an opaque
+ * context for the other binding_* calls, or NULL on failure. */
+void *binding_create_tree(void *arena, const char *charset)
+{
+	hubbub_parser_optparams params;
+	hubbub_ctx *ctx = malloc(sizeof(hubbub_ctx));
+	if (ctx == NULL)
+		return NULL;
+
+	ctx->encoding = charset;
+	ctx->encoding_source = ENCODING_SOURCE_HEADER;
+	ctx->document = NULL;
+	ctx->owns_doc = true;
+	for (uint32_t i = 0; 
+		i < sizeof(ctx->namespaces) / sizeof(ctx->namespaces[0]); i++)
+		ctx->namespaces[i] = NULL;
+
+	ctx->parser = hubbub_parser_create(charset, myrealloc, arena);
+	if (ctx->parser == NULL)
+		goto fail;
+
+	ctx->document = htmlNewDocNoDtD(NULL, NULL);
+	if (ctx->document == NULL)
+		goto fail;
+	ctx->document->_private = (void *) 0;
+
+	/* Per-context copy of the handler table, so ctx can be filled in */
+	ctx->tree_handler = tree_handler;
+	ctx->tree_handler.ctx = (void *) ctx;
+	params.tree_handler = &ctx->tree_handler;
+	if (hubbub_parser_setopt(ctx->parser, HUBBUB_PARSER_TREE_HANDLER, 
+			&params) != HUBBUB_OK)
+		goto fail;
+	ref_node(ctx, ctx->document);	/* the parser's reference */
+	params.document_node = ctx->document;
+	if (hubbub_parser_setopt(ctx->parser, HUBBUB_PARSER_DOCUMENT_NODE, 
+			&params) != HUBBUB_OK)
+		goto fail;
+
+	return (void *) ctx;
+fail:	/* binding_destroy_tree() copes with a partly built context */
+	binding_destroy_tree(ctx);
+	return NULL;
+}
+
+/* Tear down a context made by binding_create_tree(); NULL is a no-op.
+ * The document is freed only while the context still owns it, i.e.
+ * binding_get_document() has not been called. */
+void binding_destroy_tree(void *ctx)
+{
+	hubbub_ctx *binding = (hubbub_ctx *) ctx;
+
+	if (binding != NULL) {
+		/* Parser first, then the tree it was building */
+		if (binding->parser != NULL)
+			hubbub_parser_destroy(binding->parser);
+		if (binding->owns_doc)
+			xmlFreeDoc(binding->document);
+		binding->parser = NULL;
+		binding->document = NULL;
+		binding->encoding = NULL;
+		free(binding);
+	}
+}
+
+/* Parse a chunk of data. Reports hubbub's charset-change request and
+ * out-of-memory condition to the caller; other errors are not reported. */
+binding_error binding_parse_chunk(void *ctx, const uint8_t *data, size_t len)
+{
+	hubbub_ctx *c = (hubbub_ctx *) ctx;
+	hubbub_error err;
+	err = hubbub_parser_parse_chunk(c->parser, (uint8_t *) data, len);
+	if (err == HUBBUB_ENCODINGCHANGE)
+		return BINDING_ENCODINGCHANGE;
+	return err == HUBBUB_NOMEM ? BINDING_NOMEM : BINDING_OK;
+}
+
+/* Signal the end of input to the parser. Out-of-memory is reported as
+ * BINDING_NOMEM; any other hubbub result is treated as success. */
+binding_error binding_parse_completed(void *ctx)
+{
+	hubbub_ctx *c = (hubbub_ctx *) ctx;
+	hubbub_error error = hubbub_parser_completed(c->parser);
+
+	/** \todo report hubbub errors other than HUBBUB_NOMEM */
+	return error == HUBBUB_NOMEM ? BINDING_NOMEM : BINDING_OK;
+}
+
+/* Report the current encoding name (may be NULL) and, through *source,
+ * where it came from. The string is returned as stored, not copied. */
+const char *binding_get_encoding(void *ctx, binding_encoding_source *source)
+{
+	hubbub_ctx *c = (hubbub_ctx *) ctx;
+	*source = c->encoding_source;
+	return c->encoding;
+}
+
+/* Hand the document over to the caller. From then on the context does not
+ * own it, so binding_destroy_tree() will no longer free it. */
+xmlDocPtr binding_get_document(void *ctx)
+{
+	hubbub_ctx *c = (hubbub_ctx *) ctx;
+	xmlDocPtr doc = c->document;
+	c->owns_doc = false;
+	return doc;
+}
+
+/*****************************************************************************/
+
+char *c_string_from_hubbub_string(hubbub_ctx *ctx, const hubbub_string *str)
+{
+	return strndup((const char *) str->ptr, str->len); /* caller frees */
+}
+
+/* Create table entries 1.. on root, cached as ctx->namespaces[i - 1] */
+void create_namespaces(hubbub_ctx *ctx, xmlNode *root)
+{
+	for (uint32_t i = 1; 
+			i < sizeof(namespaces) / sizeof(namespaces[0]); i++) {
+		const char *prefix = namespaces[i].prefix;
+		ctx->namespaces[i - 1] = xmlNewNs(root, 
+				BAD_CAST namespaces[i].url, BAD_CAST prefix);
+		/* The XHTML entry has no prefix: don't hand NULL to %s */
+		if (ctx->namespaces[i - 1] == NULL)
+			LOG(("Failed creating namespace %s\n", 
+				prefix != NULL ? prefix : "(default)"));
+	}
+}
+
+/* Tree handler callback: create a comment node holding data.
+ * On success *result is the new node, with a reference count of one.
+ * Returns 0 on success, 1 on failure. */
+int create_comment(void *ctx, const hubbub_string *data, void **result)
+{
+	hubbub_ctx *binding = (hubbub_ctx *) ctx;
+	xmlNodePtr comment;
+	char *text = c_string_from_hubbub_string(binding, data);
+
+	if (text == NULL)
+		return 1;
+
+	/* Our copy of the text is finished with once the node is made */
+	comment = xmlNewDocComment(binding->document, BAD_CAST text);
+	free(text);
+	if (comment == NULL)
+		return 1;
+
+	comment->_private = (void *) (uintptr_t) 1;
+	*result = (void *) comment;
+
+	return 0;
+}
+
+/* Tree handler callback: create a doctype node from doctype.
+ * Missing public/system identifiers are passed to libxml as "".
+ * On success *result is the new node, with a reference count of one.
+ * Returns 0 on success, 1 on failure. */
+int create_doctype(void *ctx, const hubbub_doctype *doctype, void **result)
+{
+	hubbub_ctx *binding = (hubbub_ctx *) ctx;
+	char *name = NULL, *public = NULL, *system = NULL;
+	xmlDtdPtr dtd;
+	int ret = 1;
+
+	name = c_string_from_hubbub_string(binding, &doctype->name);
+	if (name == NULL)
+		goto cleanup;
+
+	if (!doctype->public_missing) {
+		public = c_string_from_hubbub_string(binding, 
+				&doctype->public_id);
+		if (public == NULL)
+			goto cleanup;
+	}
+
+	if (!doctype->system_missing) {
+		system = c_string_from_hubbub_string(binding, 
+				&doctype->system_id);
+		if (system == NULL)
+			goto cleanup;
+	}
+
+	dtd = xmlNewDtd(binding->document, BAD_CAST name, 
+			BAD_CAST (public != NULL ? public : ""),
+			BAD_CAST (system != NULL ? system : ""));
+	if (dtd != NULL) {
+		dtd->_private = (void *) (uintptr_t) 1;
+		*result = (void *) dtd;
+		ret = 0;
+	}
+
+cleanup:
+	/* free(NULL) is harmless, so one exit path serves every case */
+	free(system);
+	free(public);
+	free(name);
+
+	return ret;
+}
+
+/* Tree handler callback: create an element node for tag, with attributes.
+ * The first element created has the namespace definitions attached to it.
+ * On success *result is the new node (reference count one) and 0 is
+ * returned; on failure 1 is returned. */
+int create_element(void *ctx, const hubbub_tag *tag, void **result)
+{
+	hubbub_ctx *c = (hubbub_ctx *) ctx;
+	xmlNodePtr n;
+	char *name = c_string_from_hubbub_string(c, &tag->name);
+
+	if (name == NULL)
+		return 1;
+
+	if (c->namespaces[0] != NULL) {
+		n = xmlNewDocNode(c->document, c->namespaces[tag->ns - 1], 
+				BAD_CAST name, NULL);
+	} else {
+		/* No namespaces yet: define them on this node, then use them */
+		n = xmlNewDocNode(c->document, NULL, BAD_CAST name, NULL);
+		if (n != NULL) {
+			create_namespaces(c, (void *) n);
+			xmlSetNs(n, c->namespaces[tag->ns - 1]);
+		}
+	}
+	free(name);
+	if (n == NULL)
+		return 1;
+	n->_private = (void *) (uintptr_t) 1;
+
+	if (tag->n_attributes > 0 && add_attributes(ctx, (void *) n, 
+			tag->attributes, tag->n_attributes) != 0) {
+		/* Freeing n frees any namespace definitions it carries, so
+		 * drop the cached pointers instead of leaving them dangling */
+		for (uint32_t i = 0; n->nsDef != NULL && 
+		    i < sizeof(c->namespaces) / sizeof(c->namespaces[0]); i++)
+			c->namespaces[i] = NULL;
+		xmlFreeNode(n);
+		return 1;
+	}
+
+	*result = (void *) n;
+	return 0;
+}
+
+/* Tree handler callback: create a text node holding data.
+ * On success *result is the new node, with a reference count of one.
+ * Returns 0 on success, 1 on failure. */
+int create_text(void *ctx, const hubbub_string *data, void **result)
+{
+	hubbub_ctx *binding = (hubbub_ctx *) ctx;
+	xmlNodePtr text = xmlNewDocTextLen(binding->document, 
+			BAD_CAST data->ptr, (int) data->len);
+
+	if (text == NULL)
+		return 1;
+	text->_private = (void *) (uintptr_t) 1;
+	*result = (void *) text;
+	return 0;
+}
+
+/* Tree handler callback: take a reference on node.
+ * The count is kept in the _private field of the libxml document or
+ * node. Always returns 0. */
+int ref_node(void *ctx, void *node)
+{
+	hubbub_ctx *binding = (hubbub_ctx *) ctx;
+	void **slot;
+
+	/* The document is an xmlDoc, everything else an xmlNode */
+	if (node == binding->document)
+		slot = &((xmlDoc *) node)->_private;
+	else
+		slot = &((xmlNode *) node)->_private;
+
+	*slot = (void *) ((uintptr_t) *slot + 1);
+
+	return 0;
+}
+
+/* Tree handler callback: drop a reference on node. A node other than
+ * the document is freed once its count reaches zero while it has no
+ * parent; the document itself is never freed here. Always returns 0. */
+int unref_node(void *ctx, void *node)
+{
+	hubbub_ctx *binding = (hubbub_ctx *) ctx;
+	bool is_document = (node == binding->document);
+	uintptr_t count;
+
+	if (is_document)
+		count = (uintptr_t) ((xmlDoc *) node)->_private;
+	else
+		count = (uintptr_t) ((xmlNode *) node)->_private;
+	assert(count != 0 && "Node has refcount of zero");
+	count--;
+	if (is_document) {
+		((xmlDoc *) node)->_private = (void *) count;
+	} else {
+		xmlNode *n = (xmlNode *) node;
+		n->_private = (void *) count;
+		/* No references left and not attached to any parent */
+		if (count == 0 && n->parent == NULL)
+			xmlFreeNode(n);
+	}
+	return 0;
+}
+
+/* Tree handler callback: append child to parent. *result is the node now
+ * in the tree (an existing text node, if libxml merged the text into it)
+ * with a reference taken. Returns 0 on success, 1 on failure. */
+int append_child(void *ctx, void *parent, void *child, void **result)
+{
+	xmlNode *p = (xmlNode *) parent;
+	xmlNode *chld = (xmlNode *) child;
+	bool merges = chld->type == XML_TEXT_NODE && p->last != NULL && 
+			p->last->type == XML_TEXT_NODE;
+
+	if (merges) {
+		/* libxml frees a text node once it has merged its content
+		 * into a pre-existing one, so give it a copy to consume. */
+		chld = xmlCopyNode(chld, 0);
+		if (chld == NULL)
+			return 1;
+	}
+
+	*result = xmlAddChild(p, chld);
+	if (merges)
+		assert(*result != (void *) chld);
+	if (*result == NULL)
+		return 1;
+
+	ref_node(ctx, *result);
+	return 0;
+}
+
+/* Tree handler callback: insert child immediately before ref_child (the
+ * parent argument is unused). *result is the node now in the tree, with a
+ * reference taken on it. Returns 0 on success, 1 on failure. */
+int insert_before(void *ctx, void *parent, void *child, void *ref_child,
+		void **result)
+{
+	xmlNode *ref = (xmlNode *) ref_child;
+	xmlNode *chld = (xmlNode *) child;
+
+	if (chld->type != XML_TEXT_NODE || ref->prev == NULL || 
+			ref->prev->type != XML_TEXT_NODE) {
+		*result = xmlAddPrevSibling(ref, chld);
+	} else {
+		/* libxml will free the text node it merges, so pass a copy */
+		chld = xmlCopyNode(chld, 0);
+		if (chld == NULL)
+			return 1;
+		*result = xmlAddNextSibling(ref->prev, chld);
+		assert(*result != (void *) chld);
+	}
+
+	if (*result == NULL)
+		return 1;
+
+	ref_node(ctx, *result);
+	return 0;
+}
+
+/* Tree handler callback: detach child from the tree. *result is child,
+ * with a reference taken on it. The parent argument is not used. */
+int remove_child(void *ctx, void *parent, void *child, void **result)
+{
+	xmlNode *chld = (xmlNode *) child;
+
+	xmlUnlinkNode(chld);
+	*result = child;
+	ref_node(ctx, *result);
+
+	return 0;
+}
+
+/* Tree handler callback: copy node (and its descendants, when deep) into
+ * *result with a reference count of one. Returns 0 on success, else 1. */
+int clone_node(void *ctx, void *node, bool deep, void **result)
+{
+	/* xmlCopyNode: 1 = recursive, 2 = properties and namespaces only */
+	xmlNode *copy = xmlCopyNode((xmlNode *) node, deep ? 1 : 2);
+
+	*result = copy;
+	if (copy == NULL)
+		return 1;
+	copy->_private = (void *) (uintptr_t) 1;
+	return 0;
+}
+
+/* Tree handler callback: move node's children, in order, to the end of
+ * new_parent. Returns 0 on success, 1 if a child could not be added. */
+int reparent_children(void *ctx, void *node, void *new_parent)
+{
+	xmlNode *dest = (xmlNode *) new_parent;
+	xmlNode *child = ((xmlNode *) node)->children;
+
+	while (child != NULL) {
+		/* Note the successor before the child is unlinked */
+		xmlNode *following = child->next;
+		xmlUnlinkNode(child);
+		if (xmlAddChild(dest, child) == NULL)
+			return 1;
+		child = following;
+	}
+
+	return 0;
+}
+
+/* Tree handler callback: find node's parent. With element_only set, a
+ * parent that is not an element is reported as NULL. A non-NULL *result
+ * has a reference taken on it. Always returns 0. */
+int get_parent(void *ctx, void *node, bool element_only, void **result)
+{
+	xmlNode *up = ((xmlNode *) node)->parent;
+
+	if (up != NULL && element_only && up->type != XML_ELEMENT_NODE)
+		up = NULL;
+
+	*result = (void *) up;
+	if (up != NULL)
+		ref_node(ctx, up);
+
+	return 0;
+}
+
+/* Tree handler callback: set *result to whether node has any children.
+ * Always returns 0. */
+int has_children(void *ctx, void *node, bool *result)
+{
+	xmlNode *n = (xmlNode *) node;
+	*result = n->children != NULL;
+	return 0;
+}
+
+int form_associate(void *ctx, void *form, void *node)
+{
+	return 0; /* the association is not recorded by this binding */
+}
+
+/* Tree handler callback: add n_attributes attributes to node. Namespaced
+ * attributes use the cached namespaces once those exist. Returns 0 on
+ * success, 1 on failure; attributes already added are left in place. */
+int add_attributes(void *ctx, void *node, 
+		const hubbub_attribute *attributes, uint32_t n_attributes)
+{
+	hubbub_ctx *binding = (hubbub_ctx *) ctx;
+	xmlNode *element = (xmlNode *) node;
+	const hubbub_attribute *a = attributes;
+
+	for (uint32_t left = n_attributes; left > 0; left--, a++) {
+		xmlAttr *prop;
+		char *name, *value;
+
+		name = c_string_from_hubbub_string(binding, &a->name);
+		if (name == NULL)
+			return 1;
+		value = c_string_from_hubbub_string(binding, &a->value);
+		if (value == NULL) {
+			free(name);
+			return 1;
+		}
+
+		if (a->ns == HUBBUB_NS_NULL || 
+				binding->namespaces[0] == NULL) {
+			prop = xmlNewProp(element, BAD_CAST name, 
+					BAD_CAST value);
+		} else {
+			prop = xmlNewNsProp(element, 
+					binding->namespaces[a->ns - 1],
+					BAD_CAST name, BAD_CAST value);
+		}
+		/* Our copies are finished with, whether or not that worked */
+		free(value);
+		free(name);
+		if (prop == NULL)
+			return 1;
+	}
+	return 0;
+}
+
+int set_quirks_mode(void *ctx, hubbub_quirks_mode mode)
+{
+	return 0; /* the quirks mode is not recorded by this binding */
+}
+
+/* Tree handler callback: asked to change the document encoding to charset.
+ * Records charset and where it came from, unless an encoding is already
+ * set. Returns 1 only when the parser's charset was not confident and is
+ * not the same string (by pointer) as charset; otherwise returns 0. */
+int change_encoding(void *ctx, const char *charset)
+{
+	hubbub_ctx *binding = (hubbub_ctx *) ctx;
+	uint32_t source;
+	const char *name;
+
+	/* If we have an encoding here, it means we are *certain* */
+	if (binding->encoding != NULL)
+		return 0;
+
+	/* Find the confidence otherwise (can only be from a BOM) */
+	name = hubbub_parser_read_charset(binding->parser, &source);
+
+	/* Whatever happens, the encoding should be set here; either for
+	 * reprocessing with a different charset, or for confirming that the
+	 * charset is in fact correct */
+	binding->encoding = charset;
+
+	if (source == HUBBUB_CHARSET_CONFIDENT) {
+		binding->encoding_source = ENCODING_SOURCE_DETECTED;
+		return 0;
+	}
+
+	/* So here we have something of confidence tentative...
+	 * http://www.whatwg.org/specs/web-apps/current-work/#change
+	 * Step 2 there: an encoding identical to the one already in use
+	 * just confirms it. Equal encodings share the same string pointer. */
+	binding->encoding_source = ENCODING_SOURCE_META;
+
+	return (charset == name) ? 0 : 1;
+}
+
+#endif
+
diff --git a/render/libxml_binding.c b/render/libxml_binding.c
new file mode 100644
index 000000000..51cf0a6be
--- /dev/null
+++ b/render/libxml_binding.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright 2007 James Bursa <bursa@users.sourceforge.net>
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef WITH_HUBBUB
+
+#include <stdbool.h>
+#include <string.h>
+
+#include <libxml/HTMLparser.h>
+#include <libxml/HTMLtree.h>
+#include <libxml/parser.h>
+#include <libxml/parserInternals.h>
+
+#include "render/parser_binding.h"
+
+#include "utils/log.h"
+#include "utils/talloc.h"
+
+typedef struct libxml_ctx {
+	htmlParserCtxt *parser;	/**< libxml push parser context */
+
+	/** HTML parser encoding handler. */
+	xmlCharEncodingHandler *encoding_handler;
+
+	const char *encoding;	/**< Encoding name, or NULL */
+	binding_encoding_source encoding_source; /**< Origin of encoding */
+
+	bool getenc;	/**< Encoding detection has yet to be attempted */
+} libxml_ctx;
+
+static bool set_parser_encoding(libxml_ctx *c, const char *encoding);
+static const char *detect_encoding(const char **data, size_t *size);
+
+/* Create a libxml push parser. arena is not used by this binding; charset
+ * is the encoding already known to the caller, or NULL. Returns an opaque
+ * context for the other binding_* calls, or NULL on failure. */
+void *binding_create_tree(void *arena, const char *charset)
+{
+	libxml_ctx *binding = malloc(sizeof(libxml_ctx));
+	if (binding == NULL)
+		return NULL;
+
+	binding->encoding_handler = NULL;
+	binding->encoding = charset;
+	binding->encoding_source = ENCODING_SOURCE_HEADER;
+	binding->getenc = true;
+
+	binding->parser = htmlCreatePushParserCtxt(0, 0, "", 0, 0, 
+			XML_CHAR_ENCODING_NONE);
+	if (binding->parser == NULL) {
+		free(binding);
+		return NULL;
+	}
+
+	/* Nothing more to do unless an encoding was supplied */
+	if (charset == NULL || set_parser_encoding(binding, charset))
+		return (void *) binding;
+
+	if (binding->parser->myDoc != NULL)
+		xmlFreeDoc(binding->parser->myDoc);
+	htmlFreeParserCtxt(binding->parser);
+	free(binding);
+	return NULL;
+}
+
+/* Destroy a context made by binding_create_tree(); NULL is a no-op.
+ * Frees the document unless binding_get_document() already took it. */
+void binding_destroy_tree(void *ctx)
+{
+	libxml_ctx *c = (libxml_ctx *) ctx;
+
+	if (ctx == NULL)
+		return;
+
+	/* Check the parser before looking inside it for the document */
+	if (c->parser != NULL) {
+		if (c->parser->myDoc != NULL)
+			xmlFreeDoc(c->parser->myDoc);
+		htmlFreeParserCtxt(c->parser);
+	}
+
+	free(c);
+}
+
+/* Parse a chunk of data, first trying to detect the encoding if that has
+ * not been attempted yet. Returns BINDING_ENCODINGCHANGE once the parser
+ * has taken an encoding from the document, BINDING_NOMEM if switching to a
+ * detected encoding fails, and BINDING_OK otherwise. */
+binding_error binding_parse_chunk(void *ctx, const uint8_t *data, size_t len)
+{
+	libxml_ctx *c = (libxml_ctx *) ctx;
+
+	if (c->getenc) {
+		/* No encoding was specified in the Content-Type header.
+		 * Attempt to detect if the encoding is not 8-bit. If the
+		 * encoding is 8-bit, leave the parser unchanged, so that it
+		 * searches for a <meta http-equiv="content-type"
+		 * content="text/html; charset=...">. */
+		const char *encoding;
+		encoding = detect_encoding((const char **) (void *) &data, 
+				&len);
+		if (encoding) {
+			if (!set_parser_encoding(c, encoding))
+				return BINDING_NOMEM;
+			c->encoding = encoding;
+			c->encoding_source = ENCODING_SOURCE_DETECTED;
+		}
+		c->getenc = false;
+
+		/* The data we received may have solely consisted of a BOM.
+		 * If so, it will have been stripped by detect_encoding.
+		 * Therefore, we'll have nothing to do in that case. */
+		if (len == 0)
+			return BINDING_OK;
+	}
+
+	htmlParseChunk(c->parser, (const char *) data, len, 0);
+	/** \todo error handling */
+
+	if (!c->encoding && c->parser->input->encoding) {
+		/* The encoding was not in headers or detected,
+		 * and the parser found a <meta http-equiv="content-type"
+		 * content="text/html; charset=...">. */
+
+		/* However, if that encoding is non-ASCII-compatible,
+		 * ignore it, as it can't possibly be correct */
+		if (strncasecmp((const char *) c->parser->input->encoding,
+				"UTF-16", 6) == 0 || /* UTF-16(LE|BE)? */
+			strncasecmp((const char *) c->parser->input->encoding,
+				"UTF-32", 6) == 0) { /* UTF-32(LE|BE)? */
+			c->encoding = "ISO-8859-1";
+			c->encoding_source = ENCODING_SOURCE_DETECTED;
+		} else {
+			c->encoding = (const char *) c->parser->input->encoding;
+			c->encoding_source = ENCODING_SOURCE_META;
+		}
+
+		/* c->encoding is non-NULL on both paths above, and getenc
+		 * was already cleared, so there is nothing left to check */
+		return BINDING_ENCODINGCHANGE;
+	}
+
+	return BINDING_OK;
+}
+
+/* Tell the parser that all data has been supplied. Always BINDING_OK. */
+binding_error binding_parse_completed(void *ctx)
+{
+	libxml_ctx *c = (libxml_ctx *) ctx;
+
+	htmlParseChunk(c->parser, "", 0, 1);
+	/** \todo error handling */
+	return BINDING_OK;
+}
+
+/* Report the current encoding name (may be NULL) and, through *source,
+ * where it came from. The string is returned as stored, not copied. */
+const char *binding_get_encoding(void *ctx, binding_encoding_source *source)
+{
+	libxml_ctx *c = (libxml_ctx *) ctx;
+	*source = c->encoding_source;
+	return c->encoding;
+}
+
+/* Hand the document over to the caller. The parser's own pointer to it is
+ * cleared, so binding_destroy_tree() will not free it. */
+xmlDocPtr binding_get_document(void *ctx)
+{
+	libxml_ctx *c = (libxml_ctx *) ctx;
+	xmlDocPtr doc = c->parser->myDoc;
+	c->parser->myDoc = NULL;
+	return doc;
+}
+
+/******************************************************************************/
+
+/**
+ * Switch the HTML parser to a named character encoding.
+ *
+ * \param  c         context
+ * \param  encoding  name of encoding
+ * \return  true on success or if no handler exists, false on error (logged)
+ */
+bool set_parser_encoding(libxml_ctx *c, const char *encoding)
+{
+	xmlCharEncodingHandler *handler = xmlFindCharEncodingHandler(encoding);
+
+	c->encoding_handler = handler;
+	if (handler == NULL) {
+		/* either out of memory, or no handler available */
+		/* assume no handler available, which is not a fatal error */
+		LOG(("no encoding handler for \"%s\"", encoding));
+		/* \todo  warn user and ask them to install iconv? */
+		return true;
+	}
+
+	xmlCtxtResetLastError(c->parser);
+	if (xmlSwitchToEncoding(c->parser, handler) != 0) {
+		xmlError *error = xmlCtxtGetLastError(c->parser);
+		LOG(("xmlSwitchToEncoding(): %s",
+				error != NULL ? error->message : "failed"));
+		return false;
+	}
+
+	/* Dirty hack to work around some libxml behaviour:
+	 * 1) Creating a push parser context never sets the input flow's
+	 *    encoding string, whether or not an encoding was specified.
+	 * 2) Switching encoding, as done above, never changes the input
+	 *    flow's encoding string either.
+	 * 3) When a meta charset is handled, the input flow's encoding
+	 *    string is what decides whether an encoding is already set; if
+	 *    it is set, the meta charset is ignored.
+	 *
+	 * So unless the input flow's encoding string is set explicitly
+	 * here, any meta charset in the document would override our
+	 * setting, which is incorrect behaviour.
+	 *
+	 * Ideally, this would be fixed in libxml, but that requires rather
+	 * more knowledge than I currently have of what libxml is doing.
+	 */
+	if (c->parser->input->encoding == NULL)
+		c->parser->input->encoding =
+				xmlStrdup((const xmlChar *) encoding);
+
+	/* Ensure no one else attempts to reset the encoding */
+	c->getenc = false;
+
+	return true;
+}
+
+/**
+ * Attempt to detect the encoding of some HTML data.
+ *
+ * \param  data  Pointer to HTML source data
+ * \param  size  Pointer to length of data
+ * \return  a constant string giving the encoding, or 0 if the encoding
+ *          appears to be some 8-bit encoding
+ *
+ * If a BOM is encountered, *data and *size will be modified to skip over it
+ */
+
+const char *detect_encoding(const char **data, size_t *size)
+{
+	const unsigned char *b = (const unsigned char *) *data;
+	const char *name = 0;
+	size_t bom = 0;		/* bytes of byte order mark to skip */
+	bool z0, z1, z2, z3;	/* which of the first four bytes are 00 */
+
+	/* this detection assumes that the first two characters are <= 0xff */
+	if (*size < 4)
+		return 0;
+
+	z0 = (b[0] == 0x00);
+	z1 = (b[1] == 0x00);
+	z2 = (b[2] == 0x00);
+	z3 = (b[3] == 0x00);
+
+	/* The order matters: the four-byte marks must be tried before the
+	 * two-byte ones, since ff fe 00 00 also begins with ff fe. */
+	if (z0 && z1 && b[2] == 0xfe && b[3] == 0xff) {
+		name = "UTF-32BE";		/* BOM 00 00 fe ff */
+		bom = 4;
+	} else if (b[0] == 0xff && b[1] == 0xfe && z2 && z3) {
+		name = "UTF-32LE";		/* BOM ff fe 00 00 */
+		bom = 4;
+	} else if (z0 && !z1 && z2 && !z3) {
+		name = "UTF-16BE";		/* 00 xx 00 xx */
+	} else if (!z0 && z1 && !z2 && z3) {
+		name = "UTF-16LE";		/* xx 00 xx 00 */
+	} else if (z0 && z1 && z2 && !z3) {
+		name = "ISO-10646-UCS-4";	/* 00 00 00 xx */
+	} else if (!z0 && z1 && z2 && z3) {
+		name = "ISO-10646-UCS-4";	/* xx 00 00 00 */
+	} else if (b[0] == 0xfe && b[1] == 0xff) {
+		name = "UTF-16BE";		/* BOM fe ff */
+		bom = 2;
+	} else if (b[0] == 0xff && b[1] == 0xfe) {
+		name = "UTF-16LE";		/* BOM ff fe */
+		bom = 2;
+	} else if (b[0] == 0xef && b[1] == 0xbb && b[2] == 0xbf) {
+		name = "UTF-8";			/* BOM ef bb bf */
+		bom = 3;
+	}
+
+	*data += bom;
+	*size -= bom;
+
+	return name;
+}
+
+#endif
+
diff --git a/render/parser_binding.h b/render/parser_binding.h
new file mode 100644
index 000000000..73e6e9708
--- /dev/null
+++ b/render/parser_binding.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org>
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _NETSURF_RENDER_PARSER_BINDING_H_
+#define _NETSURF_RENDER_PARSER_BINDING_H_
+
+#include <stdint.h>
+
+#include <libxml/tree.h>
+
+typedef enum binding_error {
+	BINDING_OK,		/**< No error to report */
+	BINDING_NOMEM,		/**< Failure, nominally memory exhaustion */
+	BINDING_ENCODINGCHANGE	/**< Parser picked up a document encoding */
+} binding_error;
+
+typedef enum binding_encoding_source { 
+	ENCODING_SOURCE_HEADER,		/**< Initial value set at creation */
+	ENCODING_SOURCE_DETECTED,	/**< Detected from the data itself */
+	ENCODING_SOURCE_META		/**< Taken from a meta declaration */
+} binding_encoding_source;
+
+void *binding_create_tree(void *arena, const char *charset);
+void binding_destroy_tree(void *ctx);
+
+binding_error binding_parse_chunk(void *ctx, const uint8_t *data, size_t len);
+binding_error binding_parse_completed(void *ctx);
+
+const char *binding_get_encoding(void *ctx, binding_encoding_source *source);
+xmlDocPtr binding_get_document(void *ctx);
+
+#endif
+
author	John Mark Bell <jmb@netsurf-browser.org>	2008-09-23 02:19:50 +0000
committer	John Mark Bell <jmb@netsurf-browser.org>	2008-09-23 02:19:50 +0000
commit	163ad56fce1b8d83d43034620a8d5c847785edf7 (patch)
tree	5e52e438bc5d90af5fd6977d7367fec15e25ba2e /render
parent	4fad8726a4ae82849f38ffc3ef087181d7f37e14 (diff)
download	netsurf-163ad56fce1b8d83d43034620a8d5c847785edf7.tar.gz netsurf-163ad56fce1b8d83d43034620a8d5c847785edf7.tar.bz2