From 01316168fc813f22ea15eda250f4d925c1fa9061 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Tue, 29 May 2007 18:03:07 +0000 Subject: Fix bugs in charset detection. Strip BOM from parser input, as it confuses libxml. Ignore non-ASCII-compatible charsets declared in meta tag (the parser defaults to 8 bit, so if it's managed to extract a meta charset, then it must be ASCII-compatible, so a non-ASCII-compatible meta charset is lies). Fixes WightLink timetable and 1726341. svn path=/trunk/netsurf/; revision=3304 --- render/html.c | 86 +++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 66 insertions(+), 20 deletions(-) diff --git a/render/html.c b/render/html.c index 0cf080381..9fe76455a 100644 --- a/render/html.c +++ b/render/html.c @@ -38,7 +38,7 @@ static bool html_set_parser_encoding(struct content *c, const char *encoding); -static const char *html_detect_encoding(const char *data, unsigned int size); +static const char *html_detect_encoding(const char **data, unsigned int *size); static void html_convert_css_callback(content_msg msg, struct content *css, intptr_t p1, intptr_t p2, union content_msg_data data); static bool html_meta_refresh(struct content *c, xmlNode *head); @@ -157,7 +157,7 @@ bool html_process_data(struct content *c, char *data, unsigned int size) * searches for a . */ const char *encoding; - encoding = html_detect_encoding(data, size); + encoding = html_detect_encoding((const char **) &data, &size); if (encoding) { if (!html_set_parser_encoding(c, encoding)) return false; @@ -168,6 +168,12 @@ bool html_process_data(struct content *c, char *data, unsigned int size) ENCODING_SOURCE_DETECTED; } c->data.html.getenc = false; + + /* The data we received may have solely consisted of a BOM. + * If so, it will have been stripped by html_detect_encoding. + * Therefore, we'll have nothing to do in that case. 
*/ + if (size == 0) + return true; } for (x = 0; x + CHUNK <= size; x += CHUNK) { @@ -180,8 +186,22 @@ bool html_process_data(struct content *c, char *data, unsigned int size) /* The encoding was not in headers or detected, * and the parser found a . */ - c->data.html.encoding = talloc_strdup(c, + + /* However, if that encoding is non-ASCII-compatible, + * ignore it, as it can't possibly be correct */ + if (strncasecmp(c->data.html.parser->input->encoding, + "UTF-16", 6) == 0 || /* UTF-16(LE|BE)? */ + strncasecmp(c->data.html.parser->input->encoding, + "UTF-32", 6) == 0) { /* UTF-32(LE|BE)? */ + c->data.html.encoding = talloc_strdup(c, "ISO-8859-1"); + c->data.html.encoding_source = + ENCODING_SOURCE_DETECTED; + } else { + c->data.html.encoding = talloc_strdup(c, c->data.html.parser->input->encoding); + c->data.html.encoding_source = ENCODING_SOURCE_META; + } + if (!c->data.html.encoding) { union content_msg_data msg_data; @@ -189,7 +209,6 @@ bool html_process_data(struct content *c, char *data, unsigned int size) content_broadcast(c, CONTENT_MSG_ERROR, msg_data); return false; } - c->data.html.encoding_source = ENCODING_SOURCE_META; /* have the encoding; don't attempt to detect it */ c->data.html.getenc = false; @@ -293,33 +312,60 @@ bool html_set_parser_encoding(struct content *c, const char *encoding) /** * Attempt to detect the encoding of some HTML data. 
/**
 * Attempt to detect the encoding of some HTML data.
 *
 * Detection looks only at the first four bytes of the data: first for a
 * Unicode byte order mark (BOM), then for the zero-byte patterns that
 * BOM-less UTF-16 / UCS-4 text produces when its first two characters
 * are <= 0xff.
 *
 * \param data  Pointer to HTML source data
 * \param size  Pointer to length of data
 * \return a constant string giving the encoding, or 0 if the encoding
 *         appears to be some 8-bit encoding (or fewer than 4 bytes of
 *         data are available)
 *
 * If a BOM is encountered, *data and *size will be modified to skip over it.
 * When no BOM is found, *data and *size are left untouched.
 */
const char *html_detect_encoding(const char **data, unsigned int *size)
{
	const unsigned char *d = (const unsigned char *) *data;

	/* All tests below inspect up to four bytes; with less input than
	 * that, no attempt at detection is made. */
	if (*size < 4) {
		return 0;
	}

	/* Byte order marks.  The 4-byte UTF-32 marks must be tested before
	 * the 2-byte UTF-16 ones, because ff fe is a prefix of ff fe 00 00. */
	if (d[0] == 0x00 && d[1] == 0x00 &&
			d[2] == 0xfe && d[3] == 0xff) { /* BOM 00 00 fe ff */
		*data += 4;
		*size -= 4;
		return "UTF-32BE";
	}
	if (d[0] == 0xff && d[1] == 0xfe &&
			d[2] == 0x00 && d[3] == 0x00) { /* BOM ff fe 00 00 */
		*data += 4;
		*size -= 4;
		return "UTF-32LE";
	}
	if (d[0] == 0xfe && d[1] == 0xff) { /* BOM fe ff */
		*data += 2;
		*size -= 2;
		return "UTF-16BE";
	}
	if (d[0] == 0xff && d[1] == 0xfe) { /* BOM ff fe */
		*data += 2;
		*size -= 2;
		return "UTF-16LE";
	}
	if (d[0] == 0xef && d[1] == 0xbb && d[2] == 0xbf) { /* BOM ef bb bf */
		*data += 3;
		*size -= 3;
		return "UTF-8";
	}

	/* No BOM: guess from where the zero bytes fall.  This detection
	 * assumes that the first two characters are <= 0xff. */
	if (d[0] == 0x00 && d[1] != 0x00 &&
			d[2] == 0x00 && d[3] != 0x00) { /* 00 xx 00 xx */
		return "UTF-16BE";
	}
	if (d[0] != 0x00 && d[1] == 0x00 &&
			d[2] != 0x00 && d[3] == 0x00) { /* xx 00 xx 00 */
		return "UTF-16LE";
	}
	if (d[0] == 0x00 && d[1] == 0x00 &&
			d[2] == 0x00 && d[3] != 0x00) { /* 00 00 00 xx */
		return "ISO-10646-UCS-4";
	}
	if (d[0] != 0x00 && d[1] == 0x00 &&
			d[2] == 0x00 && d[3] == 0x00) { /* xx 00 00 00 */
		return "ISO-10646-UCS-4";
	}

	/* Nothing recognised: treat as some 8-bit encoding. */
	return 0;
}