diff options
author | John Mark Bell <jmb@netsurf-browser.org> | 2008-09-23 02:19:50 +0000 |
---|---|---|
committer | John Mark Bell <jmb@netsurf-browser.org> | 2008-09-23 02:19:50 +0000 |
commit | 163ad56fce1b8d83d43034620a8d5c847785edf7 (patch) | |
tree | 5e52e438bc5d90af5fd6977d7367fec15e25ba2e /render | |
parent | 4fad8726a4ae82849f38ffc3ef087181d7f37e14 (diff) | |
download | netsurf-163ad56fce1b8d83d43034620a8d5c847785edf7.tar.gz netsurf-163ad56fce1b8d83d43034620a8d5c847785edf7.tar.bz2 |
Rework html parser bindings to have a common API and reside in separate files for ease of reading.
Add error handling to hubbub binding.
svn path=/trunk/netsurf/; revision=5404
Diffstat (limited to 'render')
-rw-r--r-- | render/directory.c | 42 | ||||
-rw-r--r-- | render/html.c | 730 | ||||
-rw-r--r-- | render/html.h | 27 | ||||
-rw-r--r-- | render/hubbub_binding.c | 643 | ||||
-rw-r--r-- | render/libxml_binding.c | 308 | ||||
-rw-r--r-- | render/parser_binding.h | 48 |
6 files changed, 1054 insertions, 744 deletions
diff --git a/render/directory.c b/render/directory.c index 1363ad251..c2b343fbe 100644 --- a/render/directory.c +++ b/render/directory.c @@ -48,12 +48,10 @@ bool directory_create(struct content *c, const char *params[]) { /* html_create() must have broadcast MSG_ERROR already, so we * don't need to. */ return false; -#ifndef WITH_HUBBUB - htmlParseChunk(c->data.html.parser, header, sizeof(header) - 1, 0); -#else - hubbub_parser_parse_chunk(c->data.html.parser, - (uint8_t *) header, sizeof(header) - 1); -#endif + + binding_parse_chunk(c->data.html.parser_binding, + (uint8_t *) header, sizeof(header) - 1); + return true; } @@ -100,12 +98,9 @@ bool directory_convert(struct content *c, int width, int height) { "<body>\n<h1>\nIndex of %s</h1>\n<hr><pre>", nice_path, nice_path); free(nice_path); -#ifndef WITH_HUBBUB - htmlParseChunk(c->data.html.parser, buffer, strlen(buffer), 0); -#else - hubbub_parser_parse_chunk(c->data.html.parser, + + binding_parse_chunk(c->data.html.parser_binding, (uint8_t *) buffer, strlen(buffer)); -#endif res = url_parent(c->url, &up); if (res == URL_FUNC_OK) { @@ -113,14 +108,9 @@ bool directory_convert(struct content *c, int width, int height) { if ((res == URL_FUNC_OK) && !compare) { snprintf(buffer, sizeof(buffer), "<a href=\"..\">[..]</a>\n"); -#ifndef WITH_HUBBUB - htmlParseChunk(c->data.html.parser, buffer, - strlen(buffer), 0); -#else - hubbub_parser_parse_chunk(c->data.html.parser, - (uint8_t *) buffer, - strlen(buffer)); -#endif + + binding_parse_chunk(c->data.html.parser_binding, + (uint8_t *) buffer, strlen(buffer)); } free(up); } @@ -137,21 +127,15 @@ bool directory_convert(struct content *c, int width, int height) { snprintf(buffer, sizeof(buffer), "<a href=\"%s/%s\">%s</a>\n", c->url, entry->d_name, entry->d_name); -#ifndef WITH_HUBBUB - htmlParseChunk(c->data.html.parser, buffer, strlen(buffer), 0); -#else - hubbub_parser_parse_chunk(c->data.html.parser, + + binding_parse_chunk(c->data.html.parser_binding, (uint8_t *) buffer, 
strlen(buffer)); -#endif } closedir(parent); -#ifndef WITH_HUBBUB - htmlParseChunk(c->data.html.parser, footer, sizeof(footer) - 1, 0); -#else - hubbub_parser_parse_chunk(c->data.html.parser, + binding_parse_chunk(c->data.html.parser_binding, (uint8_t *) footer, sizeof(footer) - 1); -#endif + c->type = CONTENT_HTML; return html_convert(c, width, height); } diff --git a/render/html.c b/render/html.c index 7f9eaf44f..14563f9f4 100644 --- a/render/html.c +++ b/render/html.c @@ -28,14 +28,6 @@ #include <string.h> #include <strings.h> #include <stdlib.h> -#ifdef WITH_HUBBUB -#include <hubbub/hubbub.h> -#include <hubbub/parser.h> -#include <hubbub/tree.h> -#endif -#include <libxml/tree.h> -#include <libxml/parser.h> -#include <libxml/parserInternals.h> #include "utils/config.h" #include "content/content.h" #include "content/fetch.h" @@ -57,10 +49,6 @@ #define CHUNK 4096 -#ifndef WITH_HUBBUB -static bool html_set_parser_encoding(struct content *c, const char *encoding); -static const char *html_detect_encoding(const char **data, unsigned int *size); -#endif static void html_convert_css_callback(content_msg msg, struct content *css, intptr_t p1, intptr_t p2, union content_msg_data data); static bool html_meta_refresh(struct content *c, xmlNode *head); @@ -98,380 +86,6 @@ static const char empty_document[] = "</html>"; -#ifdef WITH_HUBBUB - -const char const *ns_prefixes[NUM_NAMESPACES] = - { NULL, NULL, "math", "svg", "xlink", "xml", "xmlns" }; - -const char const *ns_urls[NUM_NAMESPACES] = { - NULL, - "http://www.w3.org/1999/xhtml", - "http://www.w3.org/1998/Math/MathML", - "http://www.w3.org/2000/svg", - "http://www.w3.org/1999/xlink", - "http://www.w3.org/XML/1998/namespace", - "http://www.w3.org/2000/xmlns/" -}; - - -static int create_comment(void *ctx, const hubbub_string *data, void **result); -static int create_doctype(void *ctx, const hubbub_doctype *doctype, - void **result); -static int create_element(void *ctx, const hubbub_tag *tag, void **result); -static int 
create_text(void *ctx, const hubbub_string *data, void **result); -static int ref_node(void *ctx, void *node); -static int unref_node(void *ctx, void *node); -static int append_child(void *ctx, void *parent, void *child, void **result); -static int insert_before(void *ctx, void *parent, void *child, void *ref_child, - void **result); -static int remove_child(void *ctx, void *parent, void *child, void **result); -static int clone_node(void *ctx, void *node, bool deep, void **result); -static int reparent_children(void *ctx, void *node, void *new_parent); -static int get_parent(void *ctx, void *node, bool element_only, void **result); -static int has_children(void *ctx, void *node, bool *result); -static int form_associate(void *ctx, void *form, void *node); -static int add_attributes(void *ctx, void *node, - const hubbub_attribute *attributes, uint32_t n_attributes); -static int set_quirks_mode(void *ctx, hubbub_quirks_mode mode); -static int change_encoding(void *ctx, const char *mibenum); - -static hubbub_tree_handler tree_handler = { - create_comment, - create_doctype, - create_element, - create_text, - ref_node, - unref_node, - append_child, - insert_before, - remove_child, - clone_node, - reparent_children, - get_parent, - has_children, - form_associate, - add_attributes, - set_quirks_mode, - change_encoding, - NULL -}; - - - -/*** Tree construction functions ***/ - -int create_comment(void *ctx, const hubbub_string *data, void **result) -{ - xmlNode *node = xmlNewComment(NULL); - - node->content = xmlStrndup(data->ptr, data->len); - node->_private = (void *)1; - *result = node; - - return 0; -} - -int create_doctype(void *ctx, const hubbub_doctype *doctype, void **result) -{ - /* Make a node that doesn't really exist, then don't append it - * later. 
*/ - xmlNode *node = xmlNewComment(NULL); - - node->_private = (void *)1; - *result = node; - - return 0; -} - -int create_element(void *ctx, const hubbub_tag *tag, void **result) -{ - struct content *c = ctx; - struct content_html_data *html = &c->data.html; - - char *name = strndup((const char *) tag->name.ptr, - tag->name.len); - - xmlNode *node = xmlNewNode(NULL, BAD_CAST name); - node->_private = (void *)1; - *result = node; - - if (html->has_ns == false) { - for (size_t i = 1; i < NUM_NAMESPACES; i++) { - html->ns[i] = xmlNewNs(node, - BAD_CAST ns_urls[i], - BAD_CAST ns_prefixes[i]); - } - html->has_ns = true; - } - - xmlSetNs(node, html->ns[tag->ns]); - - free(name); - - for (size_t i = 0; i < tag->n_attributes; i++) { - hubbub_attribute *attr = &tag->attributes[i]; - - char *name = strndup((const char *) attr->name.ptr, - attr->name.len); - char *value = strndup((const char *) attr->value.ptr, - attr->value.len); - - if (attr->ns == HUBBUB_NS_NULL) { - xmlNewProp(node, BAD_CAST name, BAD_CAST value); - } else { - xmlNewNsProp(node, html->ns[attr->ns], BAD_CAST name, - BAD_CAST value); - } - - free(name); - free(value); - } - - return 0; -} - -int create_text(void *ctx, const hubbub_string *data, void **result) -{ - xmlNode *node = xmlNewTextLen(BAD_CAST data->ptr, data->len); - node->_private = (void *)1; - *result = node; - - return 0; -} - -int ref_node(void *ctx, void *node) -{ - xmlNode *n = node; - n->_private = (void *)((uintptr_t)n->_private + 1); - - return 0; -} - -int unref_node(void *ctx, void *node) -{ - xmlNode *n = node; - n->_private = (void *)((uintptr_t)n->_private - 1); - - if (n->_private == (void *)0 && n->parent == NULL) { - xmlFreeNode(n); - } - - return 0; -} - -int append_child(void *ctx, void *parent, void *child, void **result) -{ - xmlNode *nparent = parent; - xmlNode *nchild = child; - - if (nchild->type == XML_TEXT_NODE && - nparent->last != NULL && - nparent->last->type == XML_TEXT_NODE) { - xmlNode *clone; - clone_node(ctx, 
nchild, false, (void **) &clone); - *result = xmlAddChild(parent, clone); - /* node referenced by clone_node */ - } else { - *result = xmlAddChild(parent, child); - ref_node(ctx, *result); - } - - return 0; -} - -/* insert 'child' before 'ref_child', under 'parent' */ -int insert_before(void *ctx, void *parent, void *child, void *ref_child, - void **result) -{ - *result = xmlAddPrevSibling(ref_child, child); - ref_node(ctx, *result); - - return 0; -} - -int remove_child(void *ctx, void *parent, void *child, void **result) -{ - xmlUnlinkNode(child); - *result = child; - - ref_node(ctx, *result); - - return 0; -} - -int clone_node(void *ctx, void *node, bool deep, void **result) -{ - xmlNode *n = xmlCopyNode(node, deep ? 1 : 2); - n->_private = (void *)1; - *result = n; - - return 0; -} - -/* Take all of the child nodes of "node" and append them to "new_parent" */ -int reparent_children(void *ctx, void *node, void *new_parent) -{ - xmlNode *n = (xmlNode *) node; - xmlNode *p = (xmlNode *) new_parent; - - for (xmlNode *child = n->children; child != NULL; ) { - xmlNode *next = child->next; - - xmlUnlinkNode(child); - - if (xmlAddChild(p, child) == NULL) - return 1; - - child = next; - } - - return 0; -} - -int get_parent(void *ctx, void *node, bool element_only, void **result) -{ - *result = ((xmlNode *)node)->parent; - - if (*result != NULL && element_only && - ((xmlNode *) *result)->type != XML_ELEMENT_NODE) - *result = NULL; - - if (*result != NULL) - ref_node(ctx, *result); - - return 0; -} - -int has_children(void *ctx, void *node, bool *result) -{ - *result = ((xmlNode *)node)->children ? 
true : false; - - return 0; -} - -int form_associate(void *ctx, void *form, void *node) -{ - return 0; -} - -int add_attributes(void *ctx, void *node, - const hubbub_attribute *attributes, uint32_t n_attributes) -{ - struct content *c = ctx; - struct content_html_data *html = &c->data.html; - - for (size_t i = 0; i < n_attributes; i++) { - const hubbub_attribute *attr = &attributes[i]; - - char *name = strndup((const char *) attr->name.ptr, - attr->name.len); - char *value = strndup((const char *) attr->value.ptr, - attr->value.len); - - if (attr->ns == HUBBUB_NS_NULL) { - xmlNewProp(node, BAD_CAST name, BAD_CAST value); - } else { - xmlNewNsProp(node, html->ns[attr->ns], BAD_CAST name, - BAD_CAST value); - } - - free(name); - free(value); - } - - return 0; -} - -int set_quirks_mode(void *ctx, hubbub_quirks_mode mode) -{ - return 0; -} - -int change_encoding(void *ctx, const char *name) -{ - struct content *c = ctx; - struct content_html_data *html = &c->data.html; - - /* If we have an encoding here, it means we are *certain* */ - if (html->encoding) { - return 0; - } - - /* Find the confidence otherwise (can only be from a BOM) */ - uint32_t source; - const char *charset = hubbub_parser_read_charset(html->parser, &source); - - if (source == HUBBUB_CHARSET_CONFIDENT) { - html->encoding_source = ENCODING_SOURCE_DETECTED; - html->encoding = (char *) charset; - return 0; - } - - /* So here we have something of confidence tentative... */ - /* http://www.whatwg.org/specs/web-apps/current-work/#change */ - - /* 2. "If the new encoding is identical or equivalent to the encoding - * that is already being used to interpret the input stream, then set - * the confidence to confident and abort these steps." 
*/ - - /* Whatever happens, the encoding should be set here; either for - * reprocessing with a different charset, or for confirming that the - * charset is in fact correct */ - html->encoding = (char *) name; - html->encoding_source = ENCODING_SOURCE_META; - - /* Equal encodings will have the same string pointers */ - return (charset == name) ? 0 : 1; -} - - -/** - * Talloc'd-up allocation hook for Hubbub. - */ -static void *html_hubbub_realloc(void *ptr, size_t len, void *pw) -{ - return talloc_realloc_size(pw, ptr, len); -} - - - -/** - * Create, set up, and whatnot, a Hubbub parser instance, along with the - * relevant libxml2 bits. - */ -static int html_create_parser(struct content *c) -{ - struct content_html_data *html = &c->data.html; - hubbub_parser_optparams param; - - html->parser = hubbub_parser_create(html->encoding, - html_hubbub_realloc, - c); - if (!html->parser) - return 1; - - html->document = xmlNewDoc(BAD_CAST "1.0"); - if (!html->document) - return 1; - - html->tree_handler = tree_handler; - html->tree_handler.ctx = c; - param.tree_handler = &html->tree_handler; - hubbub_parser_setopt(html->parser, HUBBUB_PARSER_TREE_HANDLER, &param); - - param.document_node = html->document; - hubbub_parser_setopt(html->parser, HUBBUB_PARSER_DOCUMENT_NODE, &param); - - return 0; -} - - - -#endif - - - - /** * Create a CONTENT_HTML.
* @@ -485,15 +99,9 @@ bool html_create(struct content *c, const char *params[]) struct content_html_data *html = &c->data.html; union content_msg_data msg_data; - html->parser = 0; + html->parser_binding = NULL; html->document = 0; -#ifdef WITH_HUBBUB - html->has_ns = false; - memset(html->ns, 0, sizeof(html->ns)); -#endif - html->encoding_handler = 0; html->encoding = 0; - html->getenc = true; html->base_url = c->url; html->base_target = NULL; html->layout = 0; @@ -520,31 +128,14 @@ bool html_create(struct content *c, const char *params[]) if (!html->encoding) goto no_memory; html->encoding_source = ENCODING_SOURCE_HEADER; - html->getenc = false; break; } } -#ifndef WITH_HUBBUB - html->parser = htmlCreatePushParserCtxt(0, 0, "", 0, 0, - XML_CHAR_ENCODING_NONE); - if (!html->parser) + /* Create the parser binding */ + html->parser_binding = binding_create_tree(c, html->encoding); + if (!html->parser_binding) goto no_memory; -#else - - /* Set up the parser, libxml2 document, and that */ - if (html_create_parser(c) != 0) - goto no_memory; - -#endif - -#ifndef WITH_HUBBUB - if (html->encoding) { - /* an encoding was specified in the Content-Type header */ - if (!html_set_parser_encoding(c, html->encoding)) - return false; - } -#endif return true; @@ -564,150 +155,39 @@ no_memory: bool html_process_data(struct content *c, char *data, unsigned int size) { unsigned long x; - -#ifndef WITH_HUBBUB - if (c->data.html.getenc) { - /* No encoding was specified in the Content-Type header. - * Attempt to detect if the encoding is not 8-bit. If the - * encoding is 8-bit, leave the parser unchanged, so that it - * searches for a <meta http-equiv="content-type" - * content="text/html; charset=...">. 
*/ - const char *encoding; - encoding = html_detect_encoding((const char **) &data, &size); - if (encoding) { - if (!html_set_parser_encoding(c, encoding)) - return false; - c->data.html.encoding = talloc_strdup(c, encoding); - if (!c->data.html.encoding) - return false; - c->data.html.encoding_source = - ENCODING_SOURCE_DETECTED; - } - c->data.html.getenc = false; - - /* The data we received may have solely consisted of a BOM. - * If so, it will have been stripped by html_detect_encoding. - * Therefore, we'll have nothing to do in that case. */ - if (size == 0) - return true; - } -#endif - -#ifdef WITH_HUBBUB - hubbub_error err; -#endif + binding_error err; for (x = 0; x + CHUNK <= size; x += CHUNK) { -#ifdef WITH_HUBBUB LOG(("Parsing %d bytes", CHUNK)); - err = hubbub_parser_parse_chunk( - c->data.html.parser, + err = binding_parse_chunk(c->data.html.parser_binding, (uint8_t *) data + x, CHUNK); - if (err == HUBBUB_ENCODINGCHANGE) { + if (err == BINDING_ENCODINGCHANGE) { goto encoding_change; } -#else - htmlParseChunk(c->data.html.parser, data + x, CHUNK, 0); -#endif + gui_multitask(); } -#ifdef WITH_HUBBUB LOG(("Parsing %lu bytes", (size - x))); - err = hubbub_parser_parse_chunk( - c->data.html.parser, + err = binding_parse_chunk(c->data.html.parser_binding, (uint8_t *) data + x, (size - x)); - if (err == HUBBUB_ENCODINGCHANGE) { + if (err == BINDING_ENCODINGCHANGE) { goto encoding_change; } -#else - htmlParseChunk(c->data.html.parser, data + x, (int) (size - x), 0); -#endif - -#ifndef WITH_HUBBUB - if (!c->data.html.encoding && c->data.html.parser->input->encoding) { - /* The encoding was not in headers or detected, - * and the parser found a <meta http-equiv="content-type" - * content="text/html; charset=...">. */ - - /* However, if that encoding is non-ASCII-compatible, - * ignore it, as it can't possibly be correct */ - if (strncasecmp((const char *) c->data.html.parser-> - input->encoding, - "UTF-16", 6) == 0 || /* UTF-16(LE|BE)? 
*/ - strncasecmp((const char *) c->data.html.parser-> - input->encoding, - "UTF-32", 6) == 0) { /* UTF-32(LE|BE)? */ - c->data.html.encoding = talloc_strdup(c, "ISO-8859-1"); - c->data.html.encoding_source = - ENCODING_SOURCE_DETECTED; - } else { - c->data.html.encoding = talloc_strdup(c, - (const char *) c->data.html.parser-> - input->encoding); - c->data.html.encoding_source = ENCODING_SOURCE_META; - } - - if (!c->data.html.encoding) { - union content_msg_data msg_data; - - msg_data.error = messages_get("NoMemory"); - content_broadcast(c, CONTENT_MSG_ERROR, msg_data); - return false; - } - - /* have the encoding; don't attempt to detect it */ - c->data.html.getenc = false; - - /* now, we must reset the parser such that it reparses - * using the correct charset, and then reparse any document - * source we've got. we achieve this by recreating the - * parser in its entirety as this is simpler than resetting - * the existing one and ensuring it's still set up correctly. - */ - if (c->data.html.parser->myDoc) - xmlFreeDoc(c->data.html.parser->myDoc); - htmlFreeParserCtxt(c->data.html.parser); - - c->data.html.parser = htmlCreatePushParserCtxt(0, 0, "", 0, - 0, XML_CHAR_ENCODING_NONE); - if (!c->data.html.parser) { - union content_msg_data msg_data; - - msg_data.error = messages_get("NoMemory"); - content_broadcast(c, CONTENT_MSG_ERROR, msg_data); - return false; - } - if (!html_set_parser_encoding(c, c->data.html.encoding)) - return false; - - /* and reparse received document source - the recursion - * is safe as we've just set c->data.html.encoding so - * we'll never get back in here. 
*/ - if (!html_process_data(c, c->source_data, c->source_size)) - return false; - } -#endif return true; -#ifdef WITH_HUBBUB - encoding_change: LOG(("Changing encoding")); - /* Free up hubbub, libxml2 etc */ - hubbub_parser_destroy(c->data.html.parser); - if (c->data.html.document) { - xmlFreeDoc(c->data.html.document); - c->data.html.document = NULL; - } - c->data.html.has_ns = false; - memset(c->data.html.ns, 0, sizeof(c->data.html.ns)); + /* Retrieve new encoding */ + const char *encoding = binding_get_encoding( + c->data.html.parser_binding, + &c->data.html.encoding_source); - /* Set up the parser, libxml2 document, and that */ - if (html_create_parser(c) != 0) { + c->data.html.encoding = strdup(encoding); + if (!c->data.html.encoding) { union content_msg_data msg_data; msg_data.error = messages_get("NoMemory"); @@ -715,144 +195,26 @@ encoding_change: return false; } - /* Recurse to reprocess all that data. This is safe because - * the encoding is now specified at parser-start which means - * it cannot be changed again. */ - return html_process_data(c, c->source_data, c->source_size); - -#endif - -} - - -#ifndef WITH_HUBBUB + /* Destroy binding */ + binding_destroy_tree(c->data.html.parser_binding); -/** - * Set the HTML parser character encoding. - * - * \param c content of type CONTENT_HTML - * \param encoding name of encoding - * \return true on success, false on error and error reported - */ -bool html_set_parser_encoding(struct content *c, const char *encoding) -{ - struct content_html_data *html = &c->data.html; - xmlError *error; - char error_message[500]; - union content_msg_data msg_data; - - html->encoding_handler = xmlFindCharEncodingHandler(encoding); - if (!html->encoding_handler) { - /* either out of memory, or no handler available */ - /* assume no handler available, which is not a fatal error */ - LOG(("no encoding handler for \"%s\"", encoding)); - /* \todo warn user and ask them to install iconv? 
*/ - return true; - } + /* Create new binding, using the new encoding */ + c->data.html.parser_binding = binding_create_tree(c, + c->data.html.encoding); + if (!c->data.html.parser_binding) { + union content_msg_data msg_data; - xmlCtxtResetLastError(html->parser); - if (xmlSwitchToEncoding(html->parser, html->encoding_handler)) { - error = xmlCtxtGetLastError(html->parser); - snprintf(error_message, sizeof error_message, - "%s xmlSwitchToEncoding(): %s", - messages_get("MiscError"), - error ? error->message : "failed"); - msg_data.error = error_message; + msg_data.error = messages_get("NoMemory"); content_broadcast(c, CONTENT_MSG_ERROR, msg_data); return false; } - /* Dirty hack to get around libxml oddness: - * 1) When creating a push parser context, the input flow's encoding - * string is not set (whether an encoding is specified or not) - * 2) When switching encoding (as above), the input flow's encoding - * string is never changed - * 3) When handling a meta charset, the input flow's encoding string - * is checked to determine if an encoding has already been set. - * If it has been set, then the meta charset is ignored. - * - * The upshot of this is that, if we don't explicitly set the input - * flow's encoding string here, any meta charset in the document - * will override our setting, which is incorrect behaviour. - * - * Ideally, this would be fixed in libxml, but that requires rather - * more knowledge than I currently have of what libxml is doing. - */ - if (!html->parser->input->encoding) - html->parser->input->encoding = - xmlStrdup((const xmlChar *) encoding); - - /* Ensure noone else attempts to reset the encoding */ - html->getenc = false; - - return true; -} - - -/** - * Attempt to detect the encoding of some HTML data. 
- * - * \param data Pointer to HTML source data - * \param size Pointer to length of data - * \return a constant string giving the encoding, or 0 if the encoding - * appears to be some 8-bit encoding - * - * If a BOM is encountered, *data and *size will be modified to skip over it - */ - -const char *html_detect_encoding(const char **data, unsigned int *size) -{ - const unsigned char *d = (const unsigned char *) *data; - - /* this detection assumes that the first two characters are <= 0xff */ - if (*size < 4) - return 0; - - if (d[0] == 0x00 && d[1] == 0x00 && - d[2] == 0xfe && d[3] == 0xff) { /* BOM 00 00 fe ff */ - *data += 4; - *size -= 4; - return "UTF-32BE"; - } else if (d[0] == 0xff && d[1] == 0xfe && - d[2] == 0x00 && d[3] == 0x00) { /* BOM ff fe 00 00 */ - *data += 4; - *size -= 4; - return "UTF-32LE"; - } - else if (d[0] == 0x00 && d[1] != 0x00 && - d[2] == 0x00 && d[3] != 0x00) /* 00 xx 00 xx */ - return "UTF-16BE"; - else if (d[0] != 0x00 && d[1] == 0x00 && - d[2] != 0x00 && d[3] == 0x00) /* xx 00 xx 00 */ - return "UTF-16LE"; - else if (d[0] == 0x00 && d[1] == 0x00 && - d[2] == 0x00 && d[3] != 0x00) /* 00 00 00 xx */ - return "ISO-10646-UCS-4"; - else if (d[0] != 0x00 && d[1] == 0x00 && - d[2] == 0x00 && d[3] == 0x00) /* xx 00 00 00 */ - return "ISO-10646-UCS-4"; - else if (d[0] == 0xfe && d[1] == 0xff) { /* BOM fe ff */ - *data += 2; - *size -= 2; - return "UTF-16BE"; - } else if (d[0] == 0xff && d[1] == 0xfe) { /* BOM ff fe */ - *data += 2; - *size -= 2; - return "UTF-16LE"; - } else if (d[0] == 0xef && d[1] == 0xbb && - d[2] == 0xbf) { /* BOM ef bb bf */ - *data += 3; - *size -= 3; - return "UTF-8"; - } - - return 0; + /* Recurse to reprocess all that data. This is safe because + * the encoding is now specified at parser-start which means + * it cannot be changed again. */ + return html_process_data(c, c->source_data, c->source_size); } - -#endif - - /** * Convert a CONTENT_HTML for display. 
* @@ -875,29 +237,19 @@ bool html_convert(struct content *c, int width, int height) unsigned int time_before, time_taken; /* finish parsing */ - if (c->source_size == 0) -#ifndef WITH_HUBBUB - htmlParseChunk(c->data.html.parser, empty_document, - sizeof empty_document, 0); -#else - hubbub_parser_parse_chunk(c->data.html.parser, + if (c->source_size == 0) { + binding_parse_chunk(c->data.html.parser_binding, (uint8_t *) empty_document, sizeof empty_document); -#endif + } -#ifndef WITH_HUBBUB - htmlParseChunk(c->data.html.parser, "", 0, 1); - c->data.html.document = c->data.html.parser->myDoc; + binding_parse_completed(c->data.html.parser_binding); + c->data.html.document = + binding_get_document(c->data.html.parser_binding); /*xmlDebugDumpDocument(stderr, c->data.html.document);*/ - htmlFreeParserCtxt(c->data.html.parser); - c->data.html.parser = 0; -#else - hubbub_parser_completed(c->data.html.parser); - hubbub_parser_destroy(c->data.html.parser); - c->data.html.parser = 0; - c->data.html.document = c->data.html.document; - /*xmlDebugDumpDocument(stderr, document);*/ -#endif + binding_destroy_tree(c->data.html.parser_binding); + c->data.html.parser_binding = NULL; + if (!c->data.html.document) { LOG(("Parsing failed")); msg_data.error = messages_get("ParsingFail"); @@ -2206,12 +1558,8 @@ void html_destroy(struct content *c) c->bitmap = NULL; } - if (c->data.html.parser) -#ifndef WITH_HUBBUB - htmlFreeParserCtxt(c->data.html.parser); -#else - hubbub_parser_destroy(c->data.html.parser); -#endif + if (c->data.html.parser_binding) + binding_destroy_tree(c->data.html.parser_binding); if (c->data.html.document) xmlFreeDoc(c->data.html.document); diff --git a/render/html.h b/render/html.h index a67900f29..574205f32 100644 --- a/render/html.h +++ b/render/html.h @@ -26,13 +26,9 @@ #define _NETSURF_RENDER_HTML_H_ #include <stdbool.h> -#ifdef WITH_HUBBUB -#include <hubbub/parser.h> -#include <hubbub/tree.h> -#endif -#include <libxml/HTMLparser.h> #include 
"content/content_type.h" #include "css/css.h" +#include "render/parser_binding.h" struct box; struct rect; @@ -43,9 +39,6 @@ struct imagemap; struct object_params; struct plotters; -/* Number of namespaces we support */ -#define NUM_NAMESPACES 7 - /* entries in stylesheet_content */ #define STYLESHEET_BASE 0 /* base style sheet */ #define STYLESHEET_ADBLOCK 1 /* adblocking stylesheet */ @@ -121,26 +114,12 @@ struct content_html_iframe { /** Data specific to CONTENT_HTML. */ struct content_html_data { -#ifndef WITH_HUBBUB - htmlParserCtxt *parser; /**< HTML parser context. */ -#else - hubbub_parser *parser; /**< HTML parser context. */ - hubbub_tree_handler tree_handler; - - bool has_ns; - xmlNs *ns[NUM_NAMESPACES]; -#endif + void *parser_binding; xmlDoc *document; - /** HTML parser encoding handler. */ - xmlCharEncodingHandler *encoding_handler; - char *encoding; /**< Encoding of source, 0 if unknown. */ - enum { ENCODING_SOURCE_HEADER, ENCODING_SOURCE_DETECTED, - ENCODING_SOURCE_META } encoding_source; + binding_encoding_source encoding_source; /**< Source of encoding information. */ - bool getenc; /**< Need to get the encoding from the document, as it - * wasn't specified in the Content-Type header. */ char *base_url; /**< Base URL (may be a copy of content->url). */ char *base_target; /**< Base target */ diff --git a/render/hubbub_binding.c b/render/hubbub_binding.c new file mode 100644 index 000000000..b3ed259b2 --- /dev/null +++ b/render/hubbub_binding.c @@ -0,0 +1,643 @@ +/* + * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org> + * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org> + * + * This file is part of NetSurf, http://www.netsurf-browser.org/ + * + * NetSurf is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. 
+ * + * NetSurf is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifdef WITH_HUBBUB + +#define _GNU_SOURCE /* for strndup */ +#include <assert.h> +#include <stdbool.h> +#include <string.h> + +#include <libxml/HTMLparser.h> +#include <libxml/HTMLtree.h> + +#include <hubbub/parser.h> +#include <hubbub/tree.h> + +#include "render/parser_binding.h" + +#include "utils/log.h" +#include "utils/talloc.h" + +typedef struct hubbub_ctx { + hubbub_parser *parser; + + htmlDocPtr document; + bool owns_doc; + + const char *encoding; + binding_encoding_source encoding_source; + +#define NUM_NAMESPACES (6) + xmlNsPtr namespaces[NUM_NAMESPACES]; +#undef NUM_NAMESPACES + + hubbub_tree_handler tree_handler; +} hubbub_ctx; + +static struct { + const char *prefix; + const char *url; +} namespaces[] = { + { NULL, NULL }, + { NULL, "http://www.w3.org/1999/xhtml" }, + { "math", "http://www.w3.org/1998/Math/MathML" }, + { "svg", "http://www.w3.org/2000/svg" }, + { "xlink", "http://www.w3.org/1999/xlink" }, + /** \todo Oh dear. LibXML2 refuses to create any namespace with a + * prefix of "xml". That sucks, royally. 
*/ + { "xml", "http://www.w3.org/XML/1998/namespace" }, + { "xmlns", "http://www.w3.org/2000/xmlns/" } +}; + +static inline char *c_string_from_hubbub_string(hubbub_ctx *ctx, + const hubbub_string *str); +static void create_namespaces(hubbub_ctx *ctx, xmlNode *root); +static int create_comment(void *ctx, const hubbub_string *data, void **result); +static int create_doctype(void *ctx, const hubbub_doctype *doctype, + void **result); +static int create_element(void *ctx, const hubbub_tag *tag, void **result); +static int create_text(void *ctx, const hubbub_string *data, void **result); +static int ref_node(void *ctx, void *node); +static int unref_node(void *ctx, void *node); +static int append_child(void *ctx, void *parent, void *child, void **result); +static int insert_before(void *ctx, void *parent, void *child, void *ref_child, + void **result); +static int remove_child(void *ctx, void *parent, void *child, void **result); +static int clone_node(void *ctx, void *node, bool deep, void **result); +static int reparent_children(void *ctx, void *node, void *new_parent); +static int get_parent(void *ctx, void *node, bool element_only, void **result); +static int has_children(void *ctx, void *node, bool *result); +static int form_associate(void *ctx, void *form, void *node); +static int add_attributes(void *ctx, void *node, + const hubbub_attribute *attributes, uint32_t n_attributes); +static int set_quirks_mode(void *ctx, hubbub_quirks_mode mode); +static int change_encoding(void *ctx, const char *charset); + +static hubbub_tree_handler tree_handler = { + create_comment, + create_doctype, + create_element, + create_text, + ref_node, + unref_node, + append_child, + insert_before, + remove_child, + clone_node, + reparent_children, + get_parent, + has_children, + form_associate, + add_attributes, + set_quirks_mode, + change_encoding, + NULL +}; + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + return talloc_realloc_size(pw, ptr, len); +} + +void 
*binding_create_tree(void *arena, const char *charset) +{ + hubbub_ctx *ctx; + hubbub_parser_optparams params; + + ctx = malloc(sizeof(hubbub_ctx)); + if (ctx == NULL) + return NULL; + + ctx->parser = NULL; + ctx->encoding = charset; + ctx->encoding_source = ENCODING_SOURCE_HEADER; + ctx->document = NULL; + ctx->owns_doc = true; + + ctx->parser = hubbub_parser_create(charset, myrealloc, arena); + if (ctx->parser == NULL) { + free(ctx); + return NULL; + } + + ctx->document = htmlNewDocNoDtD(NULL, NULL); + if (ctx->document == NULL) { + hubbub_parser_destroy(ctx->parser); + free(ctx); + return NULL; + } + ctx->document->_private = (void *) 0; + + for (uint32_t i = 0; + i < sizeof(ctx->namespaces) / sizeof(ctx->namespaces[0]); i++) { + ctx->namespaces[i] = NULL; + } + + ctx->tree_handler = tree_handler; + ctx->tree_handler.ctx = (void *) ctx; + + params.tree_handler = &ctx->tree_handler; + hubbub_parser_setopt(ctx->parser, HUBBUB_PARSER_TREE_HANDLER, &params); + + ref_node(ctx, ctx->document); + params.document_node = ctx->document; + hubbub_parser_setopt(ctx->parser, HUBBUB_PARSER_DOCUMENT_NODE, &params); + + return (void *) ctx; +} + +void binding_destroy_tree(void *ctx) +{ + hubbub_ctx *c = (hubbub_ctx *) ctx; + + if (ctx == NULL) + return; + + if (c->parser != NULL) + hubbub_parser_destroy(c->parser); + + if (c->owns_doc) + xmlFreeDoc(c->document); + + c->parser = NULL; + c->encoding = NULL; + c->document = NULL; + + free(c); +} + +binding_error binding_parse_chunk(void *ctx, const uint8_t *data, size_t len) +{ + hubbub_ctx *c = (hubbub_ctx *) ctx; + hubbub_error err; + + err = hubbub_parser_parse_chunk(c->parser, (uint8_t *) data, len); + if (err == HUBBUB_ENCODINGCHANGE) + return BINDING_ENCODINGCHANGE; + + return BINDING_OK; +} + +binding_error binding_parse_completed(void *ctx) +{ + hubbub_ctx *c = (hubbub_ctx *) ctx; + hubbub_error error; + + error = hubbub_parser_completed(c->parser); + /** \todo error handling */ + + return BINDING_OK; +} + +const char
*binding_get_encoding(void *ctx, binding_encoding_source *source) +{ + hubbub_ctx *c = (hubbub_ctx *) ctx; + + *source = c->encoding_source; + + return c->encoding; +} + +xmlDocPtr binding_get_document(void *ctx) +{ + hubbub_ctx *c = (hubbub_ctx *) ctx; + xmlDocPtr doc = c->document; + + c->owns_doc = false; + + return doc; +} + +/*****************************************************************************/ + +char *c_string_from_hubbub_string(hubbub_ctx *ctx, const hubbub_string *str) +{ + return strndup((const char *) str->ptr, (int) str->len); +} + +void create_namespaces(hubbub_ctx *ctx, xmlNode *root) +{ + for (uint32_t i = 1; + i < sizeof(namespaces) / sizeof(namespaces[0]); i++) { + ctx->namespaces[i - 1] = xmlNewNs(root, + BAD_CAST namespaces[i].url, + BAD_CAST namespaces[i].prefix); + + if (ctx->namespaces[i - 1] == NULL) { + LOG(("Failed creating namespace %s\n", + namespaces[i].prefix)); + } + } +} + +int create_comment(void *ctx, const hubbub_string *data, void **result) +{ + hubbub_ctx *c = (hubbub_ctx *) ctx; + char *content; + xmlNodePtr n; + + content = c_string_from_hubbub_string(c, data); + if (content == NULL) + return 1; + + n = xmlNewDocComment(c->document, BAD_CAST content); + if (n == NULL) { + free(content); + return 1; + } + n->_private = (void *) (uintptr_t) 1; + + free(content); + + *result = (void *) n; + + return 0; +} + +int create_doctype(void *ctx, const hubbub_doctype *doctype, void **result) +{ + hubbub_ctx *c = (hubbub_ctx *) ctx; + char *name, *public = NULL, *system = NULL; + xmlDtdPtr n; + + name = c_string_from_hubbub_string(c, &doctype->name); + if (name == NULL) + return 1; + + if (!doctype->public_missing) { + public = c_string_from_hubbub_string(c, &doctype->public_id); + if (public == NULL) { + free(name); + return 1; + } + } + + if (!doctype->system_missing) { + system = c_string_from_hubbub_string(c, &doctype->system_id); + if (system == NULL) { + free(public); + free(name); + return 1; + } + } + + n = 
xmlNewDtd(c->document, BAD_CAST name, + BAD_CAST (public ? public : ""), + BAD_CAST (system ? system : "")); + if (n == NULL) { + free(system); + free(public); + free(name); + return 1; + } + n->_private = (void *) (uintptr_t) 1; + + *result = (void *) n; + + free(system); + free(public); + free(name); + + return 0; +} + +int create_element(void *ctx, const hubbub_tag *tag, void **result) +{ + hubbub_ctx *c = (hubbub_ctx *) ctx; + char *name; + xmlNodePtr n; + + name = c_string_from_hubbub_string(c, &tag->name); + if (name == NULL) + return 1; + + if (c->namespaces[0] != NULL) { + n = xmlNewDocNode(c->document, c->namespaces[tag->ns - 1], + BAD_CAST name, NULL); + } else { + n = xmlNewDocNode(c->document, NULL, BAD_CAST name, NULL); + + /* We're creating the root node of the document. Therefore, + * create the namespaces and set this node's namespace */ + if (n != NULL && c->namespaces[0] == NULL) { + create_namespaces(c, (void *) n); + + xmlSetNs(n, c->namespaces[tag->ns - 1]); + } + } + if (n == NULL) { + free(name); + return 1; + } + n->_private = (void *) (uintptr_t) 1; + + if (tag->n_attributes > 0 && add_attributes(ctx, (void *) n, + tag->attributes, tag->n_attributes) != 0) { + xmlFreeNode(n); + free(name); + return 1; + } + + *result = (void *) n; + + free(name); + + return 0; +} + +int create_text(void *ctx, const hubbub_string *data, void **result) +{ + hubbub_ctx *c = (hubbub_ctx *) ctx; + xmlNodePtr n; + + n = xmlNewDocTextLen(c->document, BAD_CAST data->ptr, (int) data->len); + if (n == NULL) { + return 1; + } + n->_private = (void *) (uintptr_t) 1; + + *result = (void *) n; + + return 0; +} + +int ref_node(void *ctx, void *node) +{ + hubbub_ctx *c = (hubbub_ctx *) ctx; + + if (node == c->document) { + xmlDoc *n = (xmlDoc *) node; + uintptr_t count = (uintptr_t) n->_private; + + n->_private = (void *) ++count; + } else { + xmlNode *n = (xmlNode *) node; + uintptr_t count = (uintptr_t) n->_private; + + n->_private = (void *) ++count; + } + + return 0; 
+} + +int unref_node(void *ctx, void *node) +{ + hubbub_ctx *c = (hubbub_ctx *) ctx; + + if (node == c->document) { + xmlDoc *n = (xmlDoc *) node; + uintptr_t count = (uintptr_t) n->_private; + + assert(count != 0 && "Node has refcount of zero"); + + n->_private = (void *) --count; + } else { + xmlNode *n = (xmlNode *) node; + uintptr_t count = (uintptr_t) n->_private; + + assert(count != 0 && "Node has refcount of zero"); + + n->_private = (void *) --count; + + if (count == 0 && n->parent == NULL) { + xmlFreeNode(n); + } + } + + return 0; +} + +int append_child(void *ctx, void *parent, void *child, void **result) +{ + xmlNode *chld = (xmlNode *) child; + xmlNode *p = (xmlNode *) parent; + + if (chld->type == XML_TEXT_NODE && p->last != NULL && + p->last->type == XML_TEXT_NODE) { + /* Need to clone the child, as libxml will free it if it + * merges the content with a pre-existing text node. */ + chld = xmlCopyNode(chld, 0); + if (chld == NULL) + return 1; + + *result = xmlAddChild(p, chld); + + assert(*result != (void *) chld); + } else { + *result = xmlAddChild(p, chld); + } + + if (*result == NULL) + return 1; + + ref_node(ctx, *result); + + return 0; +} + +int insert_before(void *ctx, void *parent, void *child, void *ref_child, + void **result) +{ + xmlNode *chld = (xmlNode *) child; + xmlNode *ref = (xmlNode *) ref_child; + + if (chld->type == XML_TEXT_NODE && ref->prev != NULL && + ref->prev->type == XML_TEXT_NODE) { + /* Clone text node, as it'll be freed by libxml */ + chld = xmlCopyNode(chld, 0); + if (chld == NULL) + return 1; + + *result = xmlAddNextSibling(ref->prev, chld); + + assert(*result != (void *) chld); + } else { + *result = xmlAddPrevSibling(ref, chld); + } + + if (*result == NULL) + return 1; + + ref_node(ctx, *result); + + return 0; +} + +int remove_child(void *ctx, void *parent, void *child, void **result) +{ + xmlNode *chld = (xmlNode *) child; + + xmlUnlinkNode(chld); + + *result = child; + + ref_node(ctx, *result); + + return 0; +} + +int 
clone_node(void *ctx, void *node, bool deep, void **result) +{ + xmlNode *n = (xmlNode *) node; + + *result = xmlCopyNode(n, deep ? 1 : 2); + + if (*result == NULL) + return 1; + + ((xmlNode *)(*result))->_private = (void *) (uintptr_t) 1; + + return 0; +} + +int reparent_children(void *ctx, void *node, void *new_parent) +{ + xmlNode *n = (xmlNode *) node; + xmlNode *p = (xmlNode *) new_parent; + + for (xmlNode *child = n->children; child != NULL; ) { + xmlNode *next = child->next; + + xmlUnlinkNode(child); + + if (xmlAddChild(p, child) == NULL) + return 1; + + child = next; + } + + return 0; +} + +int get_parent(void *ctx, void *node, bool element_only, void **result) +{ + xmlNode *n = (xmlNode *) node; + + *result = (void *) n->parent; + + if (*result != NULL && element_only && + ((xmlNode *) *result)->type != XML_ELEMENT_NODE) { + *result = NULL; + } + + if (*result != NULL) + ref_node(ctx, *result); + + return 0; +} + +int has_children(void *ctx, void *node, bool *result) +{ + xmlNode *n = (xmlNode *) node; + + *result = n->children != NULL; + + return 0; +} + +int form_associate(void *ctx, void *form, void *node) +{ + return 0; +} + +int add_attributes(void *ctx, void *node, + const hubbub_attribute *attributes, uint32_t n_attributes) +{ + hubbub_ctx *c = (hubbub_ctx *) ctx; + xmlNode *n = (xmlNode *) node; + + for (uint32_t attr = 0; attr < n_attributes; attr++) { + xmlAttr *prop; + char *name, *value; + + name = c_string_from_hubbub_string(c, &attributes[attr].name); + if (name == NULL) + return 1; + + value = c_string_from_hubbub_string(c, &attributes[attr].value); + if (value == NULL) { + free(name); + return 1; + } + + if (attributes[attr].ns != HUBBUB_NS_NULL && + c->namespaces[0] != NULL) { + prop = xmlNewNsProp(n, + c->namespaces[attributes[attr].ns - 1], + BAD_CAST name, BAD_CAST value); + } else { + prop = xmlNewProp(n, BAD_CAST name, BAD_CAST value); + } + if (prop == NULL) { + free(value); + free(name); + return 1; + } + + free(value); + 
free(name); + } + + return 0; +} + +int set_quirks_mode(void *ctx, hubbub_quirks_mode mode) +{ + return 0; +} + +int change_encoding(void *ctx, const char *charset) +{ + hubbub_ctx *c = (hubbub_ctx *) ctx; + + /* If we have an encoding here, it means we are *certain* */ + if (c->encoding != NULL) { + return 0; + } + + /* Find the confidence otherwise (can only be from a BOM) */ + uint32_t source; + const char *name = hubbub_parser_read_charset(c->parser, &source); + + if (source == HUBBUB_CHARSET_CONFIDENT) { + c->encoding_source = ENCODING_SOURCE_DETECTED; + c->encoding = (char *) charset; + return 0; + } + + /* So here we have something of confidence tentative... */ + /* http://www.whatwg.org/specs/web-apps/current-work/#change */ + + /* 2. "If the new encoding is identical or equivalent to the encoding + * that is already being used to interpret the input stream, then set + * the confidence to confident and abort these steps." */ + + /* Whatever happens, the encoding should be set here; either for + * reprocessing with a different charset, or for confirming that the + * charset is in fact correct */ + c->encoding = charset; + c->encoding_source = ENCODING_SOURCE_META; + + /* Equal encodings will have the same string pointers */ + return (charset == name) ? 0 : 1; +} + +#endif + diff --git a/render/libxml_binding.c b/render/libxml_binding.c new file mode 100644 index 000000000..51cf0a6be --- /dev/null +++ b/render/libxml_binding.c @@ -0,0 +1,308 @@ +/* + * Copyright 2007 James Bursa <bursa@users.sourceforge.net> + * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org> + * + * This file is part of NetSurf, http://www.netsurf-browser.org/ + * + * NetSurf is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. 
+ * + * NetSurf is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef WITH_HUBBUB + +#include <stdbool.h> +#include <string.h> + +#include <libxml/HTMLparser.h> +#include <libxml/HTMLtree.h> +#include <libxml/parser.h> +#include <libxml/parserInternals.h> + +#include "render/parser_binding.h" + +#include "utils/log.h" +#include "utils/talloc.h" + +typedef struct libxml_ctx { + htmlParserCtxt *parser; + + /** HTML parser encoding handler. */ + xmlCharEncodingHandler *encoding_handler; + + const char *encoding; + binding_encoding_source encoding_source; + + bool getenc; +} libxml_ctx; + +static bool set_parser_encoding(libxml_ctx *c, const char *encoding); +static const char *detect_encoding(const char **data, size_t *size); + +void *binding_create_tree(void *arena, const char *charset) +{ + libxml_ctx *ctx; + + ctx = malloc(sizeof(libxml_ctx)); + if (ctx == NULL) + return NULL; + + ctx->parser = NULL; + ctx->encoding_handler = NULL; + ctx->encoding = charset; + ctx->encoding_source = ENCODING_SOURCE_HEADER; + ctx->getenc = true; + + ctx->parser = htmlCreatePushParserCtxt(0, 0, "", 0, 0, + XML_CHAR_ENCODING_NONE); + if (ctx->parser == NULL) { + free(ctx); + return NULL; + } + + if (ctx->encoding != NULL && !set_parser_encoding(ctx, charset)) { + if (ctx->parser->myDoc != NULL) + xmlFreeDoc(ctx->parser->myDoc); + htmlFreeParserCtxt(ctx->parser); + free(ctx); + return NULL; + } + + return (void *) ctx; +} + +void binding_destroy_tree(void *ctx) +{ + libxml_ctx *c = (libxml_ctx *) ctx; + + if (ctx == NULL) + return; + + if (c->parser->myDoc != NULL) + xmlFreeDoc(c->parser->myDoc); + + if (c->parser != NULL) + 
htmlFreeParserCtxt(c->parser); + + c->parser = NULL; + c->encoding = NULL; + + free(c); +} + +binding_error binding_parse_chunk(void *ctx, const uint8_t *data, size_t len) +{ + libxml_ctx *c = (libxml_ctx *) ctx; + + if (c->getenc) { + /* No encoding was specified in the Content-Type header. + * Attempt to detect if the encoding is not 8-bit. If the + * encoding is 8-bit, leave the parser unchanged, so that it + * searches for a <meta http-equiv="content-type" + * content="text/html; charset=...">. */ + const char *encoding; + encoding = detect_encoding((const char **) (void *) &data, + &len); + if (encoding) { + if (!set_parser_encoding(c, encoding)) + return BINDING_NOMEM; + c->encoding = encoding; + c->encoding_source = ENCODING_SOURCE_DETECTED; + } + c->getenc = false; + + /* The data we received may have solely consisted of a BOM. + * If so, it will have been stripped by html_detect_encoding. + * Therefore, we'll have nothing to do in that case. */ + if (len == 0) + return BINDING_OK; + } + + htmlParseChunk(c->parser, (const char *) data, len, 0); + /** \todo error handling */ + + if (!c->encoding && c->parser->input->encoding) { + /* The encoding was not in headers or detected, + * and the parser found a <meta http-equiv="content-type" + * content="text/html; charset=...">. */ + + /* However, if that encoding is non-ASCII-compatible, + * ignore it, as it can't possibly be correct */ + if (strncasecmp((const char *) c->parser->input->encoding, + "UTF-16", 6) == 0 || /* UTF-16(LE|BE)? */ + strncasecmp((const char *) c->parser->input->encoding, + "UTF-32", 6) == 0) { /* UTF-32(LE|BE)? 
*/ + c->encoding = "ISO-8859-1"; + c->encoding_source = ENCODING_SOURCE_DETECTED; + } else { + c->encoding = (const char *) c->parser->input->encoding; + c->encoding_source = ENCODING_SOURCE_META; + } + + if (!c->encoding) + return BINDING_NOMEM; + + /* have the encoding; don't attempt to detect it */ + c->getenc = false; + + return BINDING_ENCODINGCHANGE; + } + + return BINDING_OK; +} + +binding_error binding_parse_completed(void *ctx) +{ + libxml_ctx *c = (libxml_ctx *) ctx; + + htmlParseChunk(c->parser, "", 0, 1); + /** \todo error handling */ + + return BINDING_OK; +} + +const char *binding_get_encoding(void *ctx, binding_encoding_source *source) +{ + libxml_ctx *c = (libxml_ctx *) ctx; + + *source = c->encoding_source; + + return c->encoding; +} + +xmlDocPtr binding_get_document(void *ctx) +{ + libxml_ctx *c = (libxml_ctx *) ctx; + xmlDocPtr doc = c->parser->myDoc; + + c->parser->myDoc = NULL; + + return doc; +} + +/******************************************************************************/ + +/** + * Set the HTML parser character encoding. + * + * \param c context + * \param encoding name of encoding + * \return true on success, false on error and error reported + */ +bool set_parser_encoding(libxml_ctx *c, const char *encoding) +{ + xmlError *error; + + c->encoding_handler = xmlFindCharEncodingHandler(encoding); + if (!c->encoding_handler) { + /* either out of memory, or no handler available */ + /* assume no handler available, which is not a fatal error */ + LOG(("no encoding handler for \"%s\"", encoding)); + /* \todo warn user and ask them to install iconv? */ + return true; + } + + xmlCtxtResetLastError(c->parser); + if (xmlSwitchToEncoding(c->parser, c->encoding_handler)) { + error = xmlCtxtGetLastError(c->parser); + LOG(("xmlSwitchToEncoding(): %s", + error ? 
error->message : "failed")); + return false; + } + + /* Dirty hack to get around libxml oddness: + * 1) When creating a push parser context, the input flow's encoding + * string is not set (whether an encoding is specified or not) + * 2) When switching encoding (as above), the input flow's encoding + * string is never changed + * 3) When handling a meta charset, the input flow's encoding string + * is checked to determine if an encoding has already been set. + * If it has been set, then the meta charset is ignored. + * + * The upshot of this is that, if we don't explicitly set the input + * flow's encoding string here, any meta charset in the document + * will override our setting, which is incorrect behaviour. + * + * Ideally, this would be fixed in libxml, but that requires rather + * more knowledge than I currently have of what libxml is doing. + */ + if (!c->parser->input->encoding) + c->parser->input->encoding = + xmlStrdup((const xmlChar *) encoding); + + /* Ensure noone else attempts to reset the encoding */ + c->getenc = false; + + return true; +} + +/** + * Attempt to detect the encoding of some HTML data. 
+ * + * \param data Pointer to HTML source data + * \param size Pointer to length of data + * \return a constant string giving the encoding, or 0 if the encoding + * appears to be some 8-bit encoding + * + * If a BOM is encountered, *data and *size will be modified to skip over it + */ + +const char *detect_encoding(const char **data, size_t *size) +{ + const unsigned char *d = (const unsigned char *) *data; + + /* this detection assumes that the first two characters are <= 0xff */ + if (*size < 4) + return 0; + + if (d[0] == 0x00 && d[1] == 0x00 && + d[2] == 0xfe && d[3] == 0xff) { /* BOM 00 00 fe ff */ + *data += 4; + *size -= 4; + return "UTF-32BE"; + } else if (d[0] == 0xff && d[1] == 0xfe && + d[2] == 0x00 && d[3] == 0x00) { /* BOM ff fe 00 00 */ + *data += 4; + *size -= 4; + return "UTF-32LE"; + } + else if (d[0] == 0x00 && d[1] != 0x00 && + d[2] == 0x00 && d[3] != 0x00) /* 00 xx 00 xx */ + return "UTF-16BE"; + else if (d[0] != 0x00 && d[1] == 0x00 && + d[2] != 0x00 && d[3] == 0x00) /* xx 00 xx 00 */ + return "UTF-16LE"; + else if (d[0] == 0x00 && d[1] == 0x00 && + d[2] == 0x00 && d[3] != 0x00) /* 00 00 00 xx */ + return "ISO-10646-UCS-4"; + else if (d[0] != 0x00 && d[1] == 0x00 && + d[2] == 0x00 && d[3] == 0x00) /* xx 00 00 00 */ + return "ISO-10646-UCS-4"; + else if (d[0] == 0xfe && d[1] == 0xff) { /* BOM fe ff */ + *data += 2; + *size -= 2; + return "UTF-16BE"; + } else if (d[0] == 0xff && d[1] == 0xfe) { /* BOM ff fe */ + *data += 2; + *size -= 2; + return "UTF-16LE"; + } else if (d[0] == 0xef && d[1] == 0xbb && + d[2] == 0xbf) { /* BOM ef bb bf */ + *data += 3; + *size -= 3; + return "UTF-8"; + } + + return 0; +} + +#endif + diff --git a/render/parser_binding.h b/render/parser_binding.h new file mode 100644 index 000000000..73e6e9708 --- /dev/null +++ b/render/parser_binding.h @@ -0,0 +1,48 @@ +/* + * Copyright 2008 John-Mark Bell <jmb@netsurf-browser.org> + * + * This file is part of NetSurf, http://www.netsurf-browser.org/ + * + * NetSurf is free 
software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * NetSurf is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +#ifndef _NETSURF_RENDER_PARSER_BINDING_H_ +#define _NETSURF_RENDER_PARSER_BINDING_H_ + +#include <stdint.h> + +#include <libxml/tree.h> + +typedef enum binding_error { + BINDING_OK, + BINDING_NOMEM, + BINDING_ENCODINGCHANGE +} binding_error; + +typedef enum binding_encoding_source { + ENCODING_SOURCE_HEADER, + ENCODING_SOURCE_DETECTED, + ENCODING_SOURCE_META +} binding_encoding_source; + +void *binding_create_tree(void *arena, const char *charset); +void binding_destroy_tree(void *ctx); + +binding_error binding_parse_chunk(void *ctx, const uint8_t *data, size_t len); +binding_error binding_parse_completed(void *ctx); + +const char *binding_get_encoding(void *ctx, binding_encoding_source *source); +xmlDocPtr binding_get_document(void *ctx); + +#endif + |