summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--docs/Treebuilder14
-rw-r--r--include/hubbub/types.h9
-rw-r--r--src/charset/detect.c21
-rw-r--r--src/parser.c4
-rw-r--r--src/tokeniser/tokeniser.c2
-rw-r--r--src/treebuilder/in_head.c18
6 files changed, 41 insertions, 27 deletions
diff --git a/docs/Treebuilder b/docs/Treebuilder
index 9c7dce4..d8924f5 100644
--- a/docs/Treebuilder
+++ b/docs/Treebuilder
@@ -187,4 +187,16 @@ Callback behaviour
This function must set the quirks mode flag of the document to "mode".
-
+ | int hubbub_tree_encoding_change(void *ctx,
+ | const char *name);
+
+ This function is called when the treebuilder sees a meta tag which
+ specifies a charset. [1] The client is responsible for deciding whether
+ the encoding used to process the document should actually be changed; if
+ so, this function should return 1. In that case, the parser instance
+ will return the error code HUBBUB_ENCODINGCHANGE when it returns from
+ parsing the chunk that triggered the encoding change. The client should
+ then destroy the parser instance and create a new one with that encoding
+ specified.
+
+ [1] http://www.whatwg.org/specs/web-apps/current-work/#in-head
diff --git a/include/hubbub/types.h b/include/hubbub/types.h
index 42d1460..07ef2ab 100644
--- a/include/hubbub/types.h
+++ b/include/hubbub/types.h
@@ -15,11 +15,10 @@
* A client-dictated charset will override all others.
* A document-specified charset will override autodetection or the default */
typedef enum hubbub_charset_source {
- HUBBUB_CHARSET_UNKNOWN = 0, /**< Unknown */
- HUBBUB_CHARSET_DEFAULT = 1, /**< Default setting */
- HUBBUB_CHARSET_DETECTED = 2, /**< Autodetected */
- HUBBUB_CHARSET_DOCUMENT = 3, /**< Defined in document */
- HUBBUB_CHARSET_DICTATED = 4, /**< Dictated by client */
+ HUBBUB_CHARSET_UNKNOWN = 0, /**< Unknown */
+ HUBBUB_CHARSET_TENTATIVE = 1, /**< Charset may be changed
+ * with further data */
+ HUBBUB_CHARSET_CONFIDENT = 2, /**< Charset definite */
} hubbub_charset_source;
/**
diff --git a/src/charset/detect.c b/src/charset/detect.c
index 7d3459f..f3f2e4f 100644
--- a/src/charset/detect.c
+++ b/src/charset/detect.c
@@ -49,19 +49,18 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
if (data == NULL || mibenum == NULL || source == NULL)
return PARSERUTILS_BADPARM;
- /* 1 */
+ /* 1. */
/* If the charset is already confident, there's nothing for us to do */
- if (*source == HUBBUB_CHARSET_DICTATED) {
- /* confidence = certain; */
+ if (*source == HUBBUB_CHARSET_CONFIDENT) {
return PARSERUTILS_OK;
}
- /* 2 */
+ /* 2. */
/** \todo We probably want to wait for ~512 bytes of data / 500ms here */
- /* 3 */
+ /* 3. */
/* We need at least 3 bytes of data */
if (len < 3)
@@ -71,13 +70,12 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
charset = hubbub_charset_read_bom(data, len);
if (charset != 0) {
*mibenum = charset;
- *source = HUBBUB_CHARSET_DOCUMENT;
- /* confidence = certain; */
+ *source = HUBBUB_CHARSET_CONFIDENT;
return PARSERUTILS_OK;
}
- /* 4 */
+ /* 4. */
/* No BOM was found, so we must look for a meta charset within
* the document itself. */
@@ -111,8 +109,7 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
"UTF-32BE", SLEN("UTF-32BE"))) {
*mibenum = charset;
- *source = HUBBUB_CHARSET_DOCUMENT;
- /* confidence = tentative; */
+ *source = HUBBUB_CHARSET_TENTATIVE;
return PARSERUTILS_OK;
}
@@ -126,7 +123,7 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
/* We failed to autodetect a charset, so use the default fallback */
default_encoding:
- /* 7 */
+ /* 7. */
charset = parserutils_charset_mibenum_from_name("Windows-1252",
SLEN("Windows-1252"));
@@ -135,7 +132,7 @@ default_encoding:
SLEN("ISO-8859-1"));
*mibenum = charset;
- *source = HUBBUB_CHARSET_DEFAULT;
+ *source = HUBBUB_CHARSET_TENTATIVE;
return PARSERUTILS_OK;
}
diff --git a/src/parser.c b/src/parser.c
index 7f187a6..e43a309 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -46,7 +46,7 @@ hubbub_parser *hubbub_parser_create(const char *enc,
return NULL;
parser->stream = parserutils_inputstream_create(enc,
- enc != NULL ? HUBBUB_CHARSET_DICTATED : HUBBUB_CHARSET_UNKNOWN,
+ enc != NULL ? HUBBUB_CHARSET_CONFIDENT : HUBBUB_CHARSET_UNKNOWN,
hubbub_charset_extract, alloc, pw);
if (parser->stream == NULL) {
alloc(parser, 0, pw);
@@ -105,7 +105,7 @@ hubbub_error hubbub_parser_setopt(hubbub_parser *parser,
hubbub_parser_opttype type,
hubbub_parser_optparams *params)
{
- hubbub_error result = HUBBUB_OK;;
+ hubbub_error result = HUBBUB_OK;
if (parser == NULL || params == NULL)
return HUBBUB_BADPARM;
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index 26fc1fb..0bf72ef 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -558,7 +558,7 @@ hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
}
}
- return HUBBUB_OK;
+ return (cont == HUBBUB_OOD) ? HUBBUB_OK : cont;
}
diff --git a/src/treebuilder/in_head.c b/src/treebuilder/in_head.c
index 88fcff5..897610b 100644
--- a/src/treebuilder/in_head.c
+++ b/src/treebuilder/in_head.c
@@ -64,20 +64,26 @@ static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder,
if (treebuilder->tree_handler->encoding_change) {
const char *name = parserutils_charset_mibenum_to_name(
charset_enc);
- treebuilder->tree_handler->encoding_change(
+
+ /* 1 indicates the encoding should actually change */
+ if (treebuilder->tree_handler->encoding_change(
treebuilder->tree_handler->ctx,
- name);
+ name) == 1) {
+ return HUBBUB_ENCODINGCHANGE;
+ }
}
- return HUBBUB_ENCODINGCHANGE;
} else if (content_type_enc != 0) {
if (treebuilder->tree_handler->encoding_change) {
const char *name = parserutils_charset_mibenum_to_name(
content_type_enc);
- treebuilder->tree_handler->encoding_change(
+
+ /* 1 indicates the encoding should actually change */
+ if (treebuilder->tree_handler->encoding_change(
treebuilder->tree_handler->ctx,
- name);
+ name) == 1) {
+ return HUBBUB_ENCODINGCHANGE;
+ }
}
- return HUBBUB_ENCODINGCHANGE;
}
return HUBBUB_OK;