diff options
author | John Mark Bell <jmb@netsurf-browser.org> | 2004-03-08 18:21:21 +0000 |
---|---|---|
committer | John Mark Bell <jmb@netsurf-browser.org> | 2004-03-08 18:21:21 +0000 |
commit | 217eae922b588ce33c4d6a6590df2f8f96865112 (patch) | |
tree | 503f481585aceb23c392a946b024147e6db39510 /render | |
parent | 7d9bf053b4ca97fd25359d7ea063d9233ed5c63a (diff) | |
download | netsurf-217eae922b588ce33c4d6a6590df2f8f96865112.tar.gz netsurf-217eae922b588ce33c4d6a6590df2f8f96865112.tar.bz2 |
[project @ 2004-03-08 18:21:21 by jmb]
Attempt to detect document charset encoding if the server doesn't send it.
svn path=/import/netsurf/; revision=592
Diffstat (limited to 'render')
-rw-r--r-- | render/html.c | 21 | ||||
-rw-r--r-- | render/html.h | 1 |
2 files changed, 21 insertions, 1 deletion
diff --git a/render/html.c b/render/html.c index d1c2cafa3..aaf0c4bf6 100644 --- a/render/html.c +++ b/render/html.c @@ -13,6 +13,7 @@ #include <string.h> #include <strings.h> #include <stdlib.h> +#include "libxml/parserInternals.h" #include "netsurf/utils/config.h" #include "netsurf/content/content.h" #include "netsurf/content/fetch.h" @@ -53,12 +54,16 @@ void html_create(struct content *c, const char *params[]) struct content_html_data *html = &c->data.html; html->encoding = XML_CHAR_ENCODING_8859_1; + html->getenc = true; for (i = 0; params[i]; i += 2) { if (strcasecmp(params[i], "charset") == 0) { html->encoding = xmlParseCharEncoding(params[i + 1]); - if (html->encoding == XML_CHAR_ENCODING_ERROR) + html->getenc = false; /* encoding specified - trust the server... */ + if (html->encoding == XML_CHAR_ENCODING_ERROR) { html->encoding = XML_CHAR_ENCODING_8859_1; + html->getenc = true; + } break; } } @@ -97,6 +102,20 @@ void html_process_data(struct content *c, char *data, unsigned long size) memcpy(c->data.html.source + c->data.html.length, data, size); c->data.html.length += size; c->size += size; + /* First time through, check if we need to get the encoding + * if so, get it and reset the parser instance with it. 
+ * if it fails, assume Latin1 + */ + if (c->data.html.getenc) { + c->data.html.encoding = xmlDetectCharEncoding(c->data.html.source, c->data.html.length); + if (c->data.html.encoding == XML_CHAR_ENCODING_ERROR || + c->data.html.encoding == XML_CHAR_ENCODING_NONE) { + c->data.html.encoding = XML_CHAR_ENCODING_8859_1; + } + xmlSwitchEncoding((xmlParserCtxtPtr)c->data.html.parser, c->data.html.encoding); + c->data.html.getenc = false; + LOG(("Encoding: %s", xmlGetCharEncodingName(c->data.html.encoding))); + } for (x = 0; x + CHUNK <= size; x += CHUNK) { htmlParseChunk(c->data.html.parser, data + x, CHUNK, 0); gui_multitask(); diff --git a/render/html.h b/render/html.h index 6eaa651df..b20040a2e 100644 --- a/render/html.h +++ b/render/html.h @@ -40,6 +40,7 @@ struct content_html_data { char *source; /**< Source data. */ int length; /**< Length of source. */ xmlCharEncoding encoding; /**< Encoding of source. */ + bool getenc; /**< Need to get the encoding from the document, as server is broken. */ char *base_url; /**< Base URL (may be a copy of content->url). */ |