summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- include/hubbub/errors.h 1
-rw-r--r-- include/hubbub/functypes.h 5
-rw-r--r-- include/hubbub/tree.h 1
-rw-r--r-- src/charset/detect.c 2
-rw-r--r-- src/charset/detect.h 4
-rw-r--r-- src/treebuilder/in_head.c 72
-rw-r--r-- src/utils/errors.c 3
7 files changed, 81 insertions, 7 deletions
diff --git a/include/hubbub/errors.h b/include/hubbub/errors.h
index fe369dc..9199d09 100644
--- a/include/hubbub/errors.h
+++ b/include/hubbub/errors.h
@@ -14,6 +14,7 @@ typedef enum hubbub_error {
HUBBUB_OK = 0,
HUBBUB_OOD = 1, /**< Out of data */
HUBBUB_REPROCESS = 2,
+ HUBBUB_ENCODINGCHANGE = 3,
HUBBUB_NOMEM = 5,
HUBBUB_BADPARM = 6,
diff --git a/include/hubbub/functypes.h b/include/hubbub/functypes.h
index d41f6bd..8d26926 100644
--- a/include/hubbub/functypes.h
+++ b/include/hubbub/functypes.h
@@ -121,5 +121,10 @@ typedef int (*hubbub_tree_add_attributes)(void *ctx, void *node,
*/
typedef int (*hubbub_tree_set_quirks_mode)(void *ctx, hubbub_quirks_mode mode);
+/**
+ * Type of encoding change notification function
+ *
+ * \param ctx      Client context pointer; the treebuilder passes the ctx
+ *                 member of its tree handler
+ * \param mibenum  Encoding value being reported -- presumably a charset
+ *                 MIB enum, as the name suggests (TODO confirm)
+ * \return Integer status. NOTE(review): its meaning is not defined here;
+ *         confirm against the callers before relying on it
+ */
+typedef int (*hubbub_tree_encoding_change)(void *ctx, uint16_t mibenum);
+
#endif
diff --git a/include/hubbub/tree.h b/include/hubbub/tree.h
index 7e2e11f..0a286a0 100644
--- a/include/hubbub/tree.h
+++ b/include/hubbub/tree.h
@@ -30,6 +30,7 @@ typedef struct hubbub_tree_handler {
hubbub_tree_form_associate form_associate;
hubbub_tree_add_attributes add_attributes;
hubbub_tree_set_quirks_mode set_quirks_mode;
+ hubbub_tree_encoding_change encoding_change;
void *ctx;
} hubbub_tree_handler;
diff --git a/src/charset/detect.c b/src/charset/detect.c
index 3809770..7d3459f 100644
--- a/src/charset/detect.c
+++ b/src/charset/detect.c
@@ -21,8 +21,6 @@ static uint16_t hubbub_charset_read_bom(const uint8_t *data, size_t len);
static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len);
static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
const uint8_t *end);
-static uint16_t hubbub_charset_parse_content(const uint8_t *value,
- uint32_t valuelen);
static bool hubbub_charset_get_attribute(const uint8_t **data,
const uint8_t *end,
const uint8_t **name, uint32_t *namelen,
diff --git a/src/charset/detect.h b/src/charset/detect.h
index 807f374..cb837d0 100644
--- a/src/charset/detect.h
+++ b/src/charset/detect.h
@@ -16,5 +16,9 @@
parserutils_error hubbub_charset_extract(const uint8_t *data, size_t len,
uint16_t *mibenum, uint32_t *source);
+/**
+ * Parse a Content-Type string for an encoding
+ *
+ * \param value     Pointer to the string data to parse
+ * \param valuelen  Length of the data at value -- presumably in bytes
+ *                  (TODO confirm)
+ * \return The encoding found. NOTE(review): the treebuilder treats 0 as
+ *         "no encoding found" and forwards non-zero results as a mibenum;
+ *         confirm against the implementation
+ */
+uint16_t hubbub_charset_parse_content(const uint8_t *value,
+ uint32_t valuelen);
+
#endif
diff --git a/src/treebuilder/in_head.c b/src/treebuilder/in_head.c
index 034be21..ddb7453 100644
--- a/src/treebuilder/in_head.c
+++ b/src/treebuilder/in_head.c
@@ -8,10 +8,76 @@
#include <assert.h>
#include <string.h>
+#include <parserutils/charset/mibenum.h>
+
#include "treebuilder/modes.h"
#include "treebuilder/internal.h"
#include "treebuilder/treebuilder.h"
+
+#include "charset/detect.h"
+
#include "utils/utils.h"
+#include "utils/string.h"
+
+
+/**
+ * Process a <meta> tag as if "in head".
+ *
+ * Inserts the element via insert_element_no_push(), then walks the tag's
+ * attributes looking for names that hubbub_string_match() accepts as
+ * "charset" or "content". A "charset" value is converted with
+ * parserutils_charset_mibenum_from_name(); a "content" value is handed to
+ * hubbub_charset_parse_content(). If either produced a non-zero result,
+ * the tree handler's encoding_change callback (when one is set) is called
+ * with it, the "charset" result taking precedence over the "content" one.
+ *
+ * \param treebuilder The treebuilder instance
+ * \param token The token to process
+ * \return HUBBUB_ENCODINGCHANGE if a non-zero encoding was extracted
+ *         (regardless of whether a callback is registered),
+ *         HUBBUB_OK otherwise
+ */
+static hubbub_error process_meta_in_head(hubbub_treebuilder *treebuilder,
+ const hubbub_token *token)
+{
+ insert_element_no_push(treebuilder, &token->data.tag);
+
+ /** \todo ack sc flag */
+
+#if 0 /* disabled placeholder: no confidence value is available in this function */
+ if (confidence == certain)
+ return HUBBUB_OK;
+#endif
+
+ uint16_t charset_enc = 0; /* result from a "charset" attribute; 0 means none */
+ uint16_t content_type_enc = 0; /* result from a "content" attribute; 0 means none */
+
+ for (size_t i = 0; i < token->data.tag.n_attributes; i++) { /* a later matching attribute overwrites an earlier result */
+ hubbub_attribute *attr = &token->data.tag.attributes[i];
+
+ if (hubbub_string_match(attr->name.ptr, attr->name.len,
+ (const uint8_t *) "charset",
+ SLEN("charset")) == true) {
+ /* Extract charset */
+ charset_enc = parserutils_charset_mibenum_from_name(
+ (const char *) attr->value.ptr,
+ attr->value.len);
+ } else if (hubbub_string_match(attr->name.ptr, attr->name.len,
+ (const uint8_t *) "content",
+ SLEN("content")) == true) {
+ /* Extract charset from Content-Type */
+ content_type_enc = hubbub_charset_parse_content(
+ attr->value.ptr, attr->value.len);
+ }
+ }
+
+ if (charset_enc != 0) { /* "charset" result is checked first, so it wins over "content" */
+ if (treebuilder->tree_handler->encoding_change) {
+ treebuilder->tree_handler->encoding_change(
+ treebuilder->tree_handler->ctx,
+ charset_enc); /* callback's return value is discarded */
+ }
+ return HUBBUB_ENCODINGCHANGE; /* returned even if no callback is set */
+ } else if (content_type_enc != 0) { /* fall back to the "content" result */
+ if (treebuilder->tree_handler->encoding_change) {
+ treebuilder->tree_handler->encoding_change(
+ treebuilder->tree_handler->ctx,
+ content_type_enc); /* callback's return value is discarded */
+ }
+ return HUBBUB_ENCODINGCHANGE; /* returned even if no callback is set */
+ }
+
+ return HUBBUB_OK; /* no usable encoding declared by this <meta> */
+}
@@ -101,11 +167,7 @@ hubbub_error handle_in_head(hubbub_treebuilder *treebuilder,
/** \todo ack sc flag */
} else if (type == META) {
- insert_element_no_push(treebuilder, &token->data.tag);
-
- /** \todo ack sc flag */
-
- /** \todo detect charset */
+ err = process_meta_in_head(treebuilder, token);
} else if (type == TITLE) {
parse_generic_rcdata(treebuilder, token, true);
} else if (type == NOFRAMES || type == STYLE) {
diff --git a/src/utils/errors.c b/src/utils/errors.c
index 9b9dfdb..7c6b5ea 100644
--- a/src/utils/errors.c
+++ b/src/utils/errors.c
@@ -29,6 +29,9 @@ const char *hubbub_error_to_string(hubbub_error error)
case HUBBUB_REPROCESS:
result = "Internal (reprocess token)";
break;
+ case HUBBUB_ENCODINGCHANGE:
+ result = "Encoding of document has changed";
+ break;
case HUBBUB_NOMEM:
result = "Insufficient memory";
break;