From 5eb5ec3682ba75cec9616039c60987eeee26e5ee Mon Sep 17 00:00:00 2001 From: Andrew Sidwell Date: Sun, 10 Aug 2008 16:37:52 +0000 Subject: Add support in the treebuilder. svn path=/trunk/hubbub/; revision=4991 --- include/hubbub/errors.h | 1 + include/hubbub/functypes.h | 5 ++++ include/hubbub/tree.h | 1 + src/charset/detect.c | 2 -- src/charset/detect.h | 4 +++ src/treebuilder/in_head.c | 72 ++++++++++++++++++++++++++++++++++++++++++---- src/utils/errors.c | 3 ++ 7 files changed, 81 insertions(+), 7 deletions(-) diff --git a/include/hubbub/errors.h b/include/hubbub/errors.h index fe369dc..9199d09 100644 --- a/include/hubbub/errors.h +++ b/include/hubbub/errors.h @@ -14,6 +14,7 @@ typedef enum hubbub_error { HUBBUB_OK = 0, HUBBUB_OOD = 1, /**< Out of data */ HUBBUB_REPROCESS = 2, + HUBBUB_ENCODINGCHANGE = 3, HUBBUB_NOMEM = 5, HUBBUB_BADPARM = 6, diff --git a/include/hubbub/functypes.h b/include/hubbub/functypes.h index d41f6bd..8d26926 100644 --- a/include/hubbub/functypes.h +++ b/include/hubbub/functypes.h @@ -121,5 +121,10 @@ typedef int (*hubbub_tree_add_attributes)(void *ctx, void *node, */ typedef int (*hubbub_tree_set_quirks_mode)(void *ctx, hubbub_quirks_mode mode); +/** + * Type of encoding change notification function + */ +typedef int (*hubbub_tree_encoding_change)(void *ctx, uint16_t mibenum); + #endif diff --git a/include/hubbub/tree.h b/include/hubbub/tree.h index 7e2e11f..0a286a0 100644 --- a/include/hubbub/tree.h +++ b/include/hubbub/tree.h @@ -30,6 +30,7 @@ typedef struct hubbub_tree_handler { hubbub_tree_form_associate form_associate; hubbub_tree_add_attributes add_attributes; hubbub_tree_set_quirks_mode set_quirks_mode; + hubbub_tree_encoding_change encoding_change; void *ctx; } hubbub_tree_handler; diff --git a/src/charset/detect.c b/src/charset/detect.c index 3809770..7d3459f 100644 --- a/src/charset/detect.c +++ b/src/charset/detect.c @@ -21,8 +21,6 @@ static uint16_t hubbub_charset_read_bom(const uint8_t *data, size_t len); static uint16_t 
hubbub_charset_scan_meta(const uint8_t *data, size_t len); static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, const uint8_t *end); -static uint16_t hubbub_charset_parse_content(const uint8_t *value, - uint32_t valuelen); static bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, const uint8_t **name, uint32_t *namelen, diff --git a/src/charset/detect.h b/src/charset/detect.h index 807f374..cb837d0 100644 --- a/src/charset/detect.h +++ b/src/charset/detect.h @@ -16,5 +16,9 @@ parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len, uint16_t *mibenum, uint32_t *source); +/* Parse a Content-Type string for an encoding */ +uint16_t hubbub_charset_parse_content(const uint8_t *value, + uint32_t valuelen); + #endif diff --git a/src/treebuilder/in_head.c b/src/treebuilder/in_head.c index 034be21..ddb7453 100644 --- a/src/treebuilder/in_head.c +++ b/src/treebuilder/in_head.c @@ -8,10 +8,76 @@ #include #include +#include + #include "treebuilder/modes.h" #include "treebuilder/internal.h" #include "treebuilder/treebuilder.h" + +#include "charset/detect.h" + #include "utils/utils.h" +#include "utils/string.h" + + +/** + * Process a meta tag as if "in head". 
+ * + * \param treebuilder The treebuilder instance + * \param token The token to process + */ +static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder, + const hubbub_token *token) +{ + insert_element_no_push(treebuilder, &token->data.tag); + + /** \todo ack sc flag */ + +#if 0 + if (confidence == certain) + return HUBBUB_OK; +#endif + + uint16_t charset_enc = 0; + uint16_t content_type_enc = 0; + + for (size_t i = 0; i < token->data.tag.n_attributes; i++) { + hubbub_attribute *attr = &token->data.tag.attributes[i]; + + if (hubbub_string_match(attr->name.ptr, attr->name.len, + (const uint8_t *) "charset", + SLEN("charset")) == true) { + /* Extract charset */ + charset_enc = parserutils_charset_mibenum_from_name( + (const char *) attr->value.ptr, + attr->value.len); + } else if (hubbub_string_match(attr->name.ptr, attr->name.len, + (const uint8_t *) "content", + SLEN("content")) == true) { + /* Extract charset from Content-Type */ + content_type_enc = hubbub_charset_parse_content( + attr->value.ptr, attr->value.len); + } + } + + if (charset_enc != 0) { + if (treebuilder->tree_handler->encoding_change) { + treebuilder->tree_handler->encoding_change( + treebuilder->tree_handler->ctx, + charset_enc); + } + return HUBBUB_ENCODINGCHANGE; + } else if (content_type_enc != 0) { + if (treebuilder->tree_handler->encoding_change) { + treebuilder->tree_handler->encoding_change( + treebuilder->tree_handler->ctx, + content_type_enc); + } + return HUBBUB_ENCODINGCHANGE; + } + + return HUBBUB_OK; +} @@ -101,11 +167,7 @@ hubbub_error handle_in_head(hubbub_treebuilder *treebuilder, /** \todo ack sc flag */ } else if (type == META) { - insert_element_no_push(treebuilder, &token->data.tag); - - /** \todo ack sc flag */ - - /** \todo detect charset */ + err = process_meta_in_head(treebuilder, token); } else if (type == TITLE) { parse_generic_rcdata(treebuilder, token, true); } else if (type == NOFRAMES || type == STYLE) { diff --git a/src/utils/errors.c 
b/src/utils/errors.c index 9b9dfdb..7c6b5ea 100644 --- a/src/utils/errors.c +++ b/src/utils/errors.c @@ -29,6 +29,9 @@ const char *hubbub_error_to_string(hubbub_error error) case HUBBUB_REPROCESS: result = "Internal (reprocess token)"; break; + case HUBBUB_ENCODINGCHANGE: + result = "Encoding of document has changed"; + break; case HUBBUB_NOMEM: result = "Insufficient memory"; break; -- cgit v1.2.3