From a6c3624c75a547e142fc732898f9a3890fa9e2f5 Mon Sep 17 00:00:00 2001 From: Andrew Sidwell Date: Mon, 11 Aug 2008 01:29:00 +0000 Subject: Move one step closer to getting encoding changes working. svn path=/trunk/hubbub/; revision=5000 --- docs/Treebuilder | 14 +++++++++++++- include/hubbub/types.h | 9 ++++----- src/charset/detect.c | 21 +++++++++------------ src/parser.c | 4 ++-- src/tokeniser/tokeniser.c | 2 +- src/treebuilder/in_head.c | 18 ++++++++++++------ 6 files changed, 41 insertions(+), 27 deletions(-) diff --git a/docs/Treebuilder b/docs/Treebuilder index 9c7dce4..d8924f5 100644 --- a/docs/Treebuilder +++ b/docs/Treebuilder @@ -187,4 +187,16 @@ Callback behaviour This function must set the quirks mode flag of the document to "mode". - + | int hubbub_tree_encoding_change(void *ctx, + | const char *name); + + This function is called when a meta tag which specifies a charset is seen + in the treebuilder. [1] The client is responsible for checking whether the + encoding in which the document is being processed should actually be changed; + if it should, this function should return 1. In this case, the parser + instance will return the error code HUBBUB_ENCODINGCHANGE when it returns + from parsing the chunk that triggered the encoding change. The parser + instance should then be destroyed and a new one created with that encoding + specified. + + [1] http://www.whatwg.org/specs/web-apps/current-work/#in-head diff --git a/include/hubbub/types.h b/include/hubbub/types.h index 42d1460..07ef2ab 100644 --- a/include/hubbub/types.h +++ b/include/hubbub/types.h @@ -15,11 +15,10 @@ * A client-dictated charset will override all others. 
* A document-specified charset will override autodetection or the default */ typedef enum hubbub_charset_source { - HUBBUB_CHARSET_UNKNOWN = 0, /**< Unknown */ - HUBBUB_CHARSET_DEFAULT = 1, /**< Default setting */ - HUBBUB_CHARSET_DETECTED = 2, /**< Autodetected */ - HUBBUB_CHARSET_DOCUMENT = 3, /**< Defined in document */ - HUBBUB_CHARSET_DICTATED = 4, /**< Dictated by client */ + HUBBUB_CHARSET_UNKNOWN = 0, /**< Unknown */ + HUBBUB_CHARSET_TENTATIVE = 1, /**< Charset may be changed + * with further data */ + HUBBUB_CHARSET_CONFIDENT = 2, /**< Charset definite */ } hubbub_charset_source; /** diff --git a/src/charset/detect.c b/src/charset/detect.c index 7d3459f..f3f2e4f 100644 --- a/src/charset/detect.c +++ b/src/charset/detect.c @@ -49,19 +49,18 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len, if (data == NULL || mibenum == NULL || source == NULL) return PARSERUTILS_BADPARM; - /* 1 */ + /* 1. */ /* If the source is dictated, there's nothing for us to do */ - if (*source == HUBBUB_CHARSET_DICTATED) { - /* confidence = certain; */ + if (*source == HUBBUB_CHARSET_CONFIDENT) { return PARSERUTILS_OK; } - /* 2 */ + /* 2. */ /** \todo We probably want to wait for ~512 bytes of data / 500ms here */ - /* 3 */ + /* 3. */ /* We need at least 3 bytes of data */ if (len < 3) @@ -71,13 +70,12 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len, charset = hubbub_charset_read_bom(data, len); if (charset != 0) { *mibenum = charset; - *source = HUBBUB_CHARSET_DOCUMENT; - /* confidence = certain; */ + *source = HUBBUB_CHARSET_CONFIDENT; return PARSERUTILS_OK; } - /* 4 */ + /* 4. */ /* No BOM was found, so we must look for a meta charset within * the document itself. 
*/ @@ -111,8 +109,7 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len, "UTF-32BE", SLEN("UTF-32BE"))) { *mibenum = charset; - *source = HUBBUB_CHARSET_DOCUMENT; - /* confidence = tentative; */ + *source = HUBBUB_CHARSET_TENTATIVE; return PARSERUTILS_OK; } @@ -126,7 +123,7 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len, /* We failed to autodetect a charset, so use the default fallback */ default_encoding: - /* 7 */ + /* 7. */ charset = parserutils_charset_mibenum_from_name("Windows-1252", SLEN("Windows-1252")); @@ -135,7 +132,7 @@ default_encoding: SLEN("ISO-8859-1")); *mibenum = charset; - *source = HUBBUB_CHARSET_DEFAULT; + *source = HUBBUB_CHARSET_TENTATIVE; return PARSERUTILS_OK; } diff --git a/src/parser.c b/src/parser.c index 7f187a6..e43a309 100644 --- a/src/parser.c +++ b/src/parser.c @@ -46,7 +46,7 @@ hubbub_parser *hubbub_parser_create(const char *enc, return NULL; parser->stream = parserutils_inputstream_create(enc, - enc != NULL ? HUBBUB_CHARSET_DICTATED : HUBBUB_CHARSET_UNKNOWN, + enc != NULL ? HUBBUB_CHARSET_CONFIDENT : HUBBUB_CHARSET_UNKNOWN, hubbub_charset_extract, alloc, pw); if (parser->stream == NULL) { alloc(parser, 0, pw); @@ -105,7 +105,7 @@ hubbub_error hubbub_parser_setopt(hubbub_parser *parser, hubbub_parser_opttype type, hubbub_parser_optparams *params) { - hubbub_error result = HUBBUB_OK;; + hubbub_error result = HUBBUB_OK; if (parser == NULL || params == NULL) return HUBBUB_BADPARM; diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c index 26fc1fb..0bf72ef 100644 --- a/src/tokeniser/tokeniser.c +++ b/src/tokeniser/tokeniser.c @@ -558,7 +558,7 @@ hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser) } } - return HUBBUB_OK; + return (cont == HUBBUB_OOD) ? 
HUBBUB_OK : cont; } diff --git a/src/treebuilder/in_head.c b/src/treebuilder/in_head.c index 88fcff5..897610b 100644 --- a/src/treebuilder/in_head.c +++ b/src/treebuilder/in_head.c @@ -64,20 +64,26 @@ static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder, if (treebuilder->tree_handler->encoding_change) { const char *name = parserutils_charset_mibenum_to_name( charset_enc); - treebuilder->tree_handler->encoding_change( + + /* 1 indicates the encoding should actually change */ + if (treebuilder->tree_handler->encoding_change( treebuilder->tree_handler->ctx, - name); + name) == 1) { + return HUBBUB_ENCODINGCHANGE; + } } - return HUBBUB_ENCODINGCHANGE; } else if (content_type_enc != 0) { if (treebuilder->tree_handler->encoding_change) { const char *name = parserutils_charset_mibenum_to_name( content_type_enc); - treebuilder->tree_handler->encoding_change( + + /* 1 indicates the encoding should actually change */ + if (treebuilder->tree_handler->encoding_change( treebuilder->tree_handler->ctx, - name); + name) == 1) { + return HUBBUB_ENCODINGCHANGE; + } } - return HUBBUB_ENCODINGCHANGE; } return HUBBUB_OK; -- cgit v1.2.3