summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author John Mark Bell <jmb@netsurf-browser.org> 2008-10-14 15:44:05 +0000
committer John Mark Bell <jmb@netsurf-browser.org> 2008-10-14 15:44:05 +0000
commit 6df8f99a707326655b4f285920f19fef6d9eb90a (patch)
tree 893e8d1ff525027eb482957c259d8885c3436ae2 /src
parent 58837fe7fb2196d39f09425329087b6b48aace46 (diff)
download libhubbub-6df8f99a707326655b4f285920f19fef6d9eb90a.tar.gz
libhubbub-6df8f99a707326655b4f285920f19fef6d9eb90a.tar.bz2
Fixup dubious charsets
svn path=/trunk/hubbub/; revision=5575
Diffstat (limited to 'src')
-rw-r--r-- src/charset/detect.c 1
-rw-r--r-- src/charset/detect.h 3
-rw-r--r-- src/parser.c 21
-rw-r--r-- src/treebuilder/in_head.c 4
4 files changed, 26 insertions, 3 deletions
diff --git a/src/charset/detect.c b/src/charset/detect.c
index 755d9fd..562c12d 100644
--- a/src/charset/detect.c
+++ b/src/charset/detect.c
@@ -25,7 +25,6 @@ static bool hubbub_charset_get_attribute(const uint8_t **data,
const uint8_t *end,
const uint8_t **name, uint32_t *namelen,
const uint8_t **value, uint32_t *valuelen);
-static void hubbub_charset_fix_charset(uint16_t *charset);
/**
* Extract a charset from a chunk of data
diff --git a/src/charset/detect.h b/src/charset/detect.h
index cb837d0..ec97267 100644
--- a/src/charset/detect.h
+++ b/src/charset/detect.h
@@ -20,5 +20,8 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
uint16_t hubbub_charset_parse_content(const uint8_t *value,
uint32_t valuelen);
+/* Fix up frequently misused character sets */
+void hubbub_charset_fix_charset(uint16_t *charset);
+
#endif
diff --git a/src/parser.c b/src/parser.c
index e43a309..075a0e2 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -5,6 +5,9 @@
* Copyright 2007-8 John-Mark Bell <jmb@netsurf-browser.org>
*/
+#include <string.h>
+
+#include <parserutils/charset/mibenum.h>
#include <parserutils/input/inputstream.h>
#include <hubbub/parser.h>
@@ -29,11 +32,12 @@ struct hubbub_parser {
* Create a hubbub parser
*
* \param enc Source document encoding, or NULL to autodetect
+ * \param fix_enc Permit fixing up of encoding if it's frequently misused
* \param alloc Memory (de)allocation function
* \param pw Pointer to client-specific private data (may be NULL)
* \return Pointer to parser instance, or NULL on error
*/
-hubbub_parser *hubbub_parser_create(const char *enc,
+hubbub_parser *hubbub_parser_create(const char *enc, bool fix_enc,
hubbub_alloc alloc, void *pw)
{
hubbub_parser *parser;
@@ -45,6 +49,19 @@ hubbub_parser *hubbub_parser_create(const char *enc,
if (parser == NULL)
return NULL;
+ /* If we have an encoding and we're permitted to fix up likely broken
+ * ones, then attempt to do so. */
+ if (enc != NULL && fix_enc == true) {
+ uint16_t mibenum = parserutils_charset_mibenum_from_name(enc,
+ strlen(enc));
+
+ if (mibenum != 0) {
+ hubbub_charset_fix_charset(&mibenum);
+
+ enc = parserutils_charset_mibenum_to_name(mibenum);
+ }
+ }
+
parser->stream = parserutils_inputstream_create(enc,
enc != NULL ? HUBBUB_CHARSET_CONFIDENT : HUBBUB_CHARSET_UNKNOWN,
hubbub_charset_extract, alloc, pw);
@@ -201,7 +218,7 @@ hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
* Pass a chunk of extraneous data to a hubbub parser for parsing
*
* \param parser Parser instance to use
- * \param data Data to parse (encoded in internal charset)
+ * \param data Data to parse (encoded in UTF-8)
* \param len Length, in bytes, of data
* \return HUBBUB_OK on success, appropriate error otherwise
*/
diff --git a/src/treebuilder/in_head.c b/src/treebuilder/in_head.c
index e6cba81..ba4daf5 100644
--- a/src/treebuilder/in_head.c
+++ b/src/treebuilder/in_head.c
@@ -61,6 +61,8 @@ static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder,
}
if (charset_enc != 0) {
+ hubbub_charset_fix_charset(&charset_enc);
+
if (treebuilder->tree_handler->encoding_change) {
const char *name = parserutils_charset_mibenum_to_name(
charset_enc);
@@ -73,6 +75,8 @@ static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder,
}
}
} else if (content_type_enc != 0) {
+ hubbub_charset_fix_charset(&content_type_enc);
+
if (treebuilder->tree_handler->encoding_change) {
const char *name = parserutils_charset_mibenum_to_name(
content_type_enc);