summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Andrew Sidwell <andy@entai.co.uk> 2008-08-11 01:29:00 +0000
committer: Andrew Sidwell <andy@entai.co.uk> 2008-08-11 01:29:00 +0000
commit: a6c3624c75a547e142fc732898f9a3890fa9e2f5 (patch)
tree: 6e1b44c51a706843d2057e3e1126ffa5714becf1 /src
parent: 4aef3c775e93d5add0277941fe0ecccc91d4ccb3 (diff)
download: libhubbub-a6c3624c75a547e142fc732898f9a3890fa9e2f5.tar.gz
          libhubbub-a6c3624c75a547e142fc732898f9a3890fa9e2f5.tar.bz2
Move one step closer to getting encoding changes working.
svn path=/trunk/hubbub/; revision=5000
Diffstat (limited to 'src')
-rw-r--r-- src/charset/detect.c      | 21
-rw-r--r-- src/parser.c              |  4
-rw-r--r-- src/tokeniser/tokeniser.c |  2
-rw-r--r-- src/treebuilder/in_head.c | 18
4 files changed, 24 insertions, 21 deletions
diff --git a/src/charset/detect.c b/src/charset/detect.c
index 7d3459f..f3f2e4f 100644
--- a/src/charset/detect.c
+++ b/src/charset/detect.c
@@ -49,19 +49,18 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
if (data == NULL || mibenum == NULL || source == NULL)
return PARSERUTILS_BADPARM;
- /* 1 */
+ /* 1. */
/* If the source is dictated, there's nothing for us to do */
- if (*source == HUBBUB_CHARSET_DICTATED) {
- /* confidence = certain; */
+ if (*source == HUBBUB_CHARSET_CONFIDENT) {
return PARSERUTILS_OK;
}
- /* 2 */
+ /* 2. */
/** \todo We probably want to wait for ~512 bytes of data / 500ms here */
- /* 3 */
+ /* 3. */
/* We need at least 3 bytes of data */
if (len < 3)
@@ -71,13 +70,12 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
charset = hubbub_charset_read_bom(data, len);
if (charset != 0) {
*mibenum = charset;
- *source = HUBBUB_CHARSET_DOCUMENT;
- /* confidence = certain; */
+ *source = HUBBUB_CHARSET_CONFIDENT;
return PARSERUTILS_OK;
}
- /* 4 */
+ /* 4. */
/* No BOM was found, so we must look for a meta charset within
* the document itself. */
@@ -111,8 +109,7 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
"UTF-32BE", SLEN("UTF-32BE"))) {
*mibenum = charset;
- *source = HUBBUB_CHARSET_DOCUMENT;
- /* confidence = tentative; */
+ *source = HUBBUB_CHARSET_TENTATIVE;
return PARSERUTILS_OK;
}
@@ -126,7 +123,7 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
/* We failed to autodetect a charset, so use the default fallback */
default_encoding:
- /* 7 */
+ /* 7. */
charset = parserutils_charset_mibenum_from_name("Windows-1252",
SLEN("Windows-1252"));
@@ -135,7 +132,7 @@ default_encoding:
SLEN("ISO-8859-1"));
*mibenum = charset;
- *source = HUBBUB_CHARSET_DEFAULT;
+ *source = HUBBUB_CHARSET_TENTATIVE;
return PARSERUTILS_OK;
}
diff --git a/src/parser.c b/src/parser.c
index 7f187a6..e43a309 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -46,7 +46,7 @@ hubbub_parser *hubbub_parser_create(const char *enc,
return NULL;
parser->stream = parserutils_inputstream_create(enc,
- enc != NULL ? HUBBUB_CHARSET_DICTATED : HUBBUB_CHARSET_UNKNOWN,
+ enc != NULL ? HUBBUB_CHARSET_CONFIDENT : HUBBUB_CHARSET_UNKNOWN,
hubbub_charset_extract, alloc, pw);
if (parser->stream == NULL) {
alloc(parser, 0, pw);
@@ -105,7 +105,7 @@ hubbub_error hubbub_parser_setopt(hubbub_parser *parser,
hubbub_parser_opttype type,
hubbub_parser_optparams *params)
{
- hubbub_error result = HUBBUB_OK;;
+ hubbub_error result = HUBBUB_OK;
if (parser == NULL || params == NULL)
return HUBBUB_BADPARM;
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index 26fc1fb..0bf72ef 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -558,7 +558,7 @@ hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
}
}
- return HUBBUB_OK;
+ return (cont == HUBBUB_OOD) ? HUBBUB_OK : cont;
}
diff --git a/src/treebuilder/in_head.c b/src/treebuilder/in_head.c
index 88fcff5..897610b 100644
--- a/src/treebuilder/in_head.c
+++ b/src/treebuilder/in_head.c
@@ -64,20 +64,26 @@ static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder,
if (treebuilder->tree_handler->encoding_change) {
const char *name = parserutils_charset_mibenum_to_name(
charset_enc);
- treebuilder->tree_handler->encoding_change(
+
+ /* 1 indicates the encoding should actually change */
+ if (treebuilder->tree_handler->encoding_change(
treebuilder->tree_handler->ctx,
- name);
+ name) == 1) {
+ return HUBBUB_ENCODINGCHANGE;
+ }
}
- return HUBBUB_ENCODINGCHANGE;
} else if (content_type_enc != 0) {
if (treebuilder->tree_handler->encoding_change) {
const char *name = parserutils_charset_mibenum_to_name(
content_type_enc);
- treebuilder->tree_handler->encoding_change(
+
+ /* 1 indicates the encoding should actually change */
+ if (treebuilder->tree_handler->encoding_change(
treebuilder->tree_handler->ctx,
- name);
+ name) == 1) {
+ return HUBBUB_ENCODINGCHANGE;
+ }
}
- return HUBBUB_ENCODINGCHANGE;
}
return HUBBUB_OK;