diff options
Diffstat (limited to 'src/treebuilder/treebuilder.c')
-rw-r--r-- | src/treebuilder/treebuilder.c | 838 |
1 files changed, 2 insertions, 836 deletions
diff --git a/src/treebuilder/treebuilder.c b/src/treebuilder/treebuilder.c index 68f82d8..2b256b4 100644 --- a/src/treebuilder/treebuilder.c +++ b/src/treebuilder/treebuilder.c @@ -8,11 +8,12 @@ #include <assert.h> #include <string.h> -#include "treebuilder/in_body.h" +#include "treebuilder/modes.h" #include "treebuilder/internal.h" #include "treebuilder/treebuilder.h" #include "utils/utils.h" + static const struct { const char *name; element_type type; @@ -68,23 +69,6 @@ static void hubbub_treebuilder_buffer_handler(const uint8_t *data, static void hubbub_treebuilder_token_handler(const hubbub_token *token, void *pw); -static bool handle_initial(hubbub_treebuilder *treebuilder, - const hubbub_token *token); -static bool handle_before_html(hubbub_treebuilder *treebuilder, - const hubbub_token *token); -static bool handle_before_head(hubbub_treebuilder *treebuilder, - const hubbub_token *token); -static bool handle_in_head(hubbub_treebuilder *treebuilder, - const hubbub_token *token); -static bool handle_in_head_noscript(hubbub_treebuilder *treebuilder, - const hubbub_token *token); -static bool handle_after_head(hubbub_treebuilder *treebuilder, - const hubbub_token *token); -static bool handle_generic_rcdata(hubbub_treebuilder *treebuilder, - const hubbub_token *token); -static bool handle_script_collect_characters(hubbub_treebuilder *treebuilder, - const hubbub_token *token); - /** * Create a hubbub treebuilder @@ -371,824 +355,6 @@ void hubbub_treebuilder_token_handler(const hubbub_token *token, } } -/** - * Handle token in initial insertion mode - * - * \param treebuilder The treebuilder instance - * \param token The token to handle - * \return True to reprocess token, false otherwise - */ -bool handle_initial(hubbub_treebuilder *treebuilder, const hubbub_token *token) -{ - bool reprocess = false; - - switch (token->type) { - case HUBBUB_TOKEN_CHARACTER: - if (process_characters_expect_whitespace(treebuilder, token, - false)) { - /** \todo parse error */ - - treebuilder->tree_handler->set_quirks_mode( - treebuilder->tree_handler->ctx, - HUBBUB_QUIRKS_MODE_FULL); - treebuilder->context.mode = BEFORE_HTML; - reprocess = true; - } - break; - case HUBBUB_TOKEN_COMMENT: - process_comment_append(treebuilder, token, - treebuilder->context.document); - break; - case HUBBUB_TOKEN_DOCTYPE: - { - int success; - void *doctype, *appended; - - /** \todo parse error */ - - /** \todo need public and system ids from tokeniser */ - success = treebuilder->tree_handler->create_doctype( - treebuilder->tree_handler->ctx, - &token->data.doctype.name, - &token->data.doctype.public_id, - &token->data.doctype.system_id, &doctype); - if (success != 0) { - /** \todo errors */ - } - - /* Append to Document node */ - success = treebuilder->tree_handler->append_child( - treebuilder->tree_handler->ctx, - treebuilder->context.document, - doctype, &appended); - if (success != 0) { - /** \todo errors */ - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, - doctype); - } - - /* \todo look up the doctype in a catalog */ - - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, appended); - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, doctype); - - treebuilder->context.mode = BEFORE_HTML; - } - break; - case HUBBUB_TOKEN_START_TAG: - case HUBBUB_TOKEN_END_TAG: - case HUBBUB_TOKEN_EOF: - /** \todo parse error */ - treebuilder->tree_handler->set_quirks_mode( - treebuilder->tree_handler->ctx, - HUBBUB_QUIRKS_MODE_FULL); - reprocess = true; - break; - } - - if (reprocess) { - treebuilder->context.mode = BEFORE_HTML; - } - - return reprocess; -} - -/** - * Handle token in "before html" insertion mode - * - * \param treebuilder The treebuilder instance - * \param token The token to handle - * \return True to reprocess token, false otherwise - */ -bool handle_before_html(hubbub_treebuilder *treebuilder, - const hubbub_token *token) -{ - bool reprocess = false; - bool handled = false; - - switch (token->type) { - case HUBBUB_TOKEN_DOCTYPE: - /** \todo parse error */ - break; - case HUBBUB_TOKEN_COMMENT: - process_comment_append(treebuilder, token, - treebuilder->context.document); - break; - case HUBBUB_TOKEN_CHARACTER: - reprocess = process_characters_expect_whitespace(treebuilder, - token, false); - break; - case HUBBUB_TOKEN_START_TAG: - { - element_type type = element_type_from_name(treebuilder, - &token->data.tag.name); - - if (type == HTML) { - handled = true; - } else { - reprocess = true; - } - } - break; - case HUBBUB_TOKEN_END_TAG: - case HUBBUB_TOKEN_EOF: - reprocess = true; - break; - } - - - if (handled || reprocess) { - int success; - void *html, *appended; - - /* We can't use insert_element() here, as it assumes - * that we're inserting into current_node. There is - * no current_node to insert into at this point so - * we get to do it manually. */ - - if (reprocess) { - /* Need to manufacture html element */ - hubbub_tag tag; - - /** \todo UTF-16 */ - tag.name.type = HUBBUB_STRING_PTR; - tag.name.data.ptr = (const uint8_t *) "html"; - tag.name.len = SLEN("html"); - - tag.n_attributes = 0; - tag.attributes = NULL; - - success = treebuilder->tree_handler->create_element( - treebuilder->tree_handler->ctx, - &tag, &html); - } else { - success = treebuilder->tree_handler->create_element( - treebuilder->tree_handler->ctx, - &token->data.tag, &html); - } - - if (success != 0) { - /** \todo errors */ - } - - success = treebuilder->tree_handler->append_child( - treebuilder->tree_handler->ctx, - treebuilder->context.document, - html, &appended); - if (success != 0) { - /** \todo errors */ - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, - html); - } - - /* We can't use element_stack_push() here, as it - * assumes that current_node is pointing at the index - * before the one to insert at. For the first entry in - * the stack, this does not hold so we must insert - * manually. */ - treebuilder->context.element_stack[0].type = HTML; - treebuilder->context.element_stack[0].node = html; - treebuilder->context.current_node = 0; - - /** \todo cache selection algorithm */ - - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, - appended); - - treebuilder->context.mode = BEFORE_HEAD; - } - - return reprocess; -} - -/** - * Handle token in "before head" insertion mode - * - * \param treebuilder The treebuilder instance - * \param token The token to handle - * \return True to reprocess token, false otherwise - */ -bool handle_before_head(hubbub_treebuilder *treebuilder, - const hubbub_token *token) -{ - bool reprocess = false; - bool handled = false; - - switch (token->type) { - case HUBBUB_TOKEN_CHARACTER: - reprocess = process_characters_expect_whitespace(treebuilder, - token, false); - break; - case HUBBUB_TOKEN_COMMENT: - process_comment_append(treebuilder, token, - treebuilder->context.element_stack[ - treebuilder->context.current_node].node); - break; - case HUBBUB_TOKEN_DOCTYPE: - /** \todo parse error */ - break; - case HUBBUB_TOKEN_START_TAG: - { - element_type type = element_type_from_name(treebuilder, - &token->data.tag.name); - - if (type == HTML) { - /* Process as if "in body" */ - process_tag_in_body(treebuilder, token); - } else if (type == HEAD) { - handled = true; - } else { - reprocess = true; - } - } - break; - case HUBBUB_TOKEN_END_TAG: - { - element_type type = element_type_from_name(treebuilder, - &token->data.tag.name); - - if (type == HEAD || type == BR) { - reprocess = true; - } else { - /** \todo parse error */ - } - } - break; - case HUBBUB_TOKEN_EOF: - reprocess = true; - break; - } - - if (handled || reprocess) { - hubbub_tag tag; - - if (reprocess) { - /* Manufacture head tag */ - tag.name.type = HUBBUB_STRING_PTR; - tag.name.data.ptr = (const uint8_t *) "head"; - tag.name.len = SLEN("head"); - - tag.n_attributes = 0; - tag.attributes = NULL; - } else { - tag = token->data.tag; - } - - insert_element(treebuilder, &tag); - - treebuilder->tree_handler->ref_node( - treebuilder->tree_handler->ctx, - treebuilder->context.element_stack[ - treebuilder->context.current_node].node); - - treebuilder->context.head_element = - treebuilder->context.element_stack[ - treebuilder->context.current_node].node; - - treebuilder->context.mode = IN_HEAD; - } - - return reprocess; -} - -/** - * Handle token in "in head" insertion mode - * - * \param treebuilder The treebuilder instance - * \param token The token to handle - * \return True to reprocess token, false otherwise - */ -bool handle_in_head(hubbub_treebuilder *treebuilder, - const hubbub_token *token) -{ - bool reprocess = false; - bool handled = false; - - switch (token->type) { - case HUBBUB_TOKEN_CHARACTER: - reprocess = process_characters_expect_whitespace(treebuilder, - token, true); - break; - case HUBBUB_TOKEN_COMMENT: - process_comment_append(treebuilder, token, - treebuilder->context.element_stack[ - treebuilder->context.current_node].node); - break; - case HUBBUB_TOKEN_DOCTYPE: - /** \todo parse error */ - break; - case HUBBUB_TOKEN_START_TAG: - { - element_type type = element_type_from_name(treebuilder, - &token->data.tag.name); - - if (type == HTML) { - /* Process as if "in body" */ - process_tag_in_body(treebuilder, token); - } else if (type == BASE || type == COMMAND || - type == EVENT_SOURCE || type == LINK) { - process_base_link_meta_in_head(treebuilder, - token, type); - - /** \todo ack sc flag */ - } else if (type == META) { - process_base_link_meta_in_head(treebuilder, - token, type); - - /** \todo ack sc flag */ - - /** \todo detect charset */ - } else if (type == TITLE) { - parse_generic_rcdata(treebuilder, token, true); - } else if (type == NOFRAMES || type == STYLE) { - parse_generic_rcdata(treebuilder, token, false); - } else if (type == NOSCRIPT) { - /** \todo determine if scripting is enabled */ - if (false /*scripting_is_enabled*/) { - parse_generic_rcdata(treebuilder, token, false); - } else { - insert_element(treebuilder, &token->data.tag); - treebuilder->context.mode = IN_HEAD_NOSCRIPT; - } - } else if (type == SCRIPT) { - process_script_in_head(treebuilder, token); - } else if (type == HEAD) { - /** \todo parse error */ - } else { - reprocess = true; - } - } - break; - case HUBBUB_TOKEN_END_TAG: - { - element_type type = element_type_from_name(treebuilder, - &token->data.tag.name); - - if (type == HEAD) { - handled = true; - } else if (type == BR) { - reprocess = true; - } /** \todo parse error */ - } - break; - case HUBBUB_TOKEN_EOF: - reprocess = true; - break; - } - - if (handled || reprocess) { - element_type otype; - void *node; - - if (!element_stack_pop(treebuilder, &otype, &node)) { - /** \todo errors */ - } - - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, - node); - - treebuilder->context.mode = AFTER_HEAD; - } - - return reprocess; -} - -/** - * Handle tokens in "in head noscript" insertion mode - * - * \param treebuilder The treebuilder instance - * \param token The token to process - * \return True to reprocess the token, false otherwise - */ -bool handle_in_head_noscript(hubbub_treebuilder *treebuilder, - const hubbub_token *token) -{ - bool reprocess = false; - bool handled = false; - - switch (token->type) { - case HUBBUB_TOKEN_CHARACTER: - /* This should be equivalent to "in head" processing */ - reprocess = process_characters_expect_whitespace(treebuilder, - token, true); - break; - case HUBBUB_TOKEN_COMMENT: - /* This should be equivalent to "in head" processing */ - process_comment_append(treebuilder, token, - treebuilder->context.element_stack[ - treebuilder->context.current_node].node); - break; - case HUBBUB_TOKEN_DOCTYPE: - /** \todo parse error */ - break; - case HUBBUB_TOKEN_START_TAG: - { - element_type type = element_type_from_name(treebuilder, - &token->data.tag.name); - - if (type == HTML) { - /* Process as "in body" */ - process_tag_in_body(treebuilder, token); - } else if (type == NOSCRIPT) { - handled = true; - } else if (type == LINK) { - /* This should be equivalent to "in head" processing */ - process_base_link_meta_in_head(treebuilder, - token, type); - - /** \todo ack sc flag */ - } else if (type == META) { - /* This should be equivalent to "in head" processing */ - process_base_link_meta_in_head(treebuilder, - token, type); - - /** \todo ack sc flag */ - - /** \todo detect charset */ - } else if (type == NOFRAMES) { - /* This should be equivalent to "in head" processing */ - parse_generic_rcdata(treebuilder, token, true); - } else if (type == STYLE) { - /* This should be equivalent to "in head" processing */ - parse_generic_rcdata(treebuilder, token, false); - } else if (type == HEAD || type == NOSCRIPT) { - /** \todo parse error */ - } else { - /** \todo parse error */ - reprocess = true; - } - } - break; - case HUBBUB_TOKEN_END_TAG: - { - element_type type = element_type_from_name(treebuilder, - &token->data.tag.name); - - if (type == NOSCRIPT) { - handled = true; - } else if (type == BR) { - /** \todo parse error */ - reprocess = true; - } else { - /** \todo parse error */ - } - } - break; - case HUBBUB_TOKEN_EOF: - /** \todo parse error */ - reprocess = true; - break; - } - - if (handled || reprocess) { - element_type otype; - void *node; - - if (!element_stack_pop(treebuilder, &otype, &node)) { - /** \todo errors */ - } - - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, - node); - - treebuilder->context.mode = IN_HEAD; - } - - return reprocess; -} - -/** - * Handle tokens in "after head" insertion mode - * - * \param treebuilder The treebuilder instance - * \param token The token to process - * \return True to reprocess the token, false otherwise - */ -bool handle_after_head(hubbub_treebuilder *treebuilder, - const hubbub_token *token) -{ - bool reprocess = false; - bool handled = false; - - switch (token->type) { - case HUBBUB_TOKEN_CHARACTER: - append_text(treebuilder, &token->data.character); - break; - case HUBBUB_TOKEN_COMMENT: - process_comment_append(treebuilder, token, - treebuilder->context.element_stack[ - treebuilder->context.current_node].node); - break; - case HUBBUB_TOKEN_DOCTYPE: - /** \todo parse error */ - break; - case HUBBUB_TOKEN_START_TAG: - { - element_type type = element_type_from_name(treebuilder, - &token->data.tag.name); - - if (type == HTML) { - /* Process as if "in body" */ - process_tag_in_body(treebuilder, token); - } else if (type == BODY) { - handled = true; - } else if (type == FRAMESET) { - insert_element(treebuilder, &token->data.tag); - treebuilder->context.mode = IN_FRAMESET; - } else if (type == BASE || type == LINK || type == META || - type == NOFRAMES || type == SCRIPT || - type == STYLE || type == TITLE) { - element_type otype; - void *node; - - /** \todo parse error */ - - if (!element_stack_push(treebuilder, - HEAD, - treebuilder->context.head_element)) { - /** \todo errors */ - } - - - /* This should be identical to handling "in head" */ - if (type == BASE || type == LINK || type == META) { - /** \todo ack sc flag */ - - process_base_link_meta_in_head(treebuilder, - token, type); - } else if (type == SCRIPT) { - process_script_in_head(treebuilder, token); - } else if (type == STYLE || type == NOFRAMES) { - parse_generic_rcdata(treebuilder, token, false); - } else if (type == TITLE) { - parse_generic_rcdata(treebuilder, token, true); - } - - if (!element_stack_pop(treebuilder, &otype, &node)) { - /** \todo errors */ - } - - /* No need to unref node as we never increased - * its reference count when pushing it on the stack */ - } else if (type == HEAD) { - /** \todo parse error */ - } else { - reprocess = true; - } - } - break; - case HUBBUB_TOKEN_END_TAG: - /** \parse error */ - break; - case HUBBUB_TOKEN_EOF: - reprocess = true; - break; - } - - if (handled || reprocess) { - hubbub_tag tag; - - if (reprocess) { - /* Manufacture body */ - tag.name.type = HUBBUB_STRING_PTR; - tag.name.data.ptr = (const uint8_t *) "body"; - tag.name.len = SLEN("body"); - - tag.n_attributes = 0; - tag.attributes = NULL; - } else { - tag = token->data.tag; - } - - insert_element(treebuilder, &tag); - - treebuilder->context.mode = IN_BODY; - } - - return reprocess; -} - -/** - * Handle tokens in "generic rcdata" insertion mode - * - * \param treebuilder The treebuilder instance - * \param token The token to process - * \return True to reprocess the token, false otherwise - */ -bool handle_generic_rcdata(hubbub_treebuilder *treebuilder, - const hubbub_token *token) -{ - bool reprocess = false; - bool done = false; - - if (treebuilder->context.strip_leading_lr && - token->type != HUBBUB_TOKEN_CHARACTER) { - /* Reset the LR stripping flag */ - treebuilder->context.strip_leading_lr = false; - } - - switch (token->type) { - case HUBBUB_TOKEN_CHARACTER: - if (treebuilder->context.collect.string.len == 0) { - treebuilder->context.collect.string.data.off = - token->data.character.data.off; - } - treebuilder->context.collect.string.len += - token->data.character.len; - - if (treebuilder->context.strip_leading_lr) { - const uint8_t *str = treebuilder->input_buffer + - treebuilder->context.collect.string.data.off; - - /** \todo UTF-16 */ - if (*str == '\n') { - treebuilder->context.collect.string.data.off++; - treebuilder->context.collect.string.len--; - } - - treebuilder->context.strip_leading_lr = false; - } - break; - case HUBBUB_TOKEN_END_TAG: - { - element_type type = element_type_from_name(treebuilder, - &token->data.tag.name); - - if (type != treebuilder->context.collect.type) { - /** \todo parse error */ - } - - done = true; - } - break; - case HUBBUB_TOKEN_EOF: - /** \todo parse error */ - done = reprocess = true; - break; - case HUBBUB_TOKEN_COMMENT: - case HUBBUB_TOKEN_DOCTYPE: - case HUBBUB_TOKEN_START_TAG: - /* Should never happen */ - assert(0); - break; - } - - if (done) { - int success; - void *text, *appended; - - success = treebuilder->tree_handler->create_text( - treebuilder->tree_handler->ctx, - &treebuilder->context.collect.string, - &text); - if (success != 0) { - /** \todo errors */ - } - - success = treebuilder->tree_handler->append_child( - treebuilder->tree_handler->ctx, - treebuilder->context.collect.node, - text, &appended); - if (success != 0) { - /** \todo errors */ - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, - text); - } - - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, appended); - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, text); - - /* Clean up context */ - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, - treebuilder->context.collect.node); - treebuilder->context.collect.node = NULL; - - /* Return to previous insertion mode */ - treebuilder->context.mode = - treebuilder->context.collect.mode; - } - - return reprocess; -} - -/** - * Handle tokens in "script collect characters" insertion mode - * - * \param treebuilder The treebuilder instance - * \param token The token to process - * \return True to reprocess the token, false otherwise - */ -bool handle_script_collect_characters(hubbub_treebuilder *treebuilder, - const hubbub_token *token) -{ - bool reprocess = false; - bool done = false; - - switch (token->type) { - case HUBBUB_TOKEN_CHARACTER: - if (treebuilder->context.collect.string.len == 0) { - treebuilder->context.collect.string.data.off = - token->data.character.data.off; - } - treebuilder->context.collect.string.len += - token->data.character.len; - break; - case HUBBUB_TOKEN_END_TAG: - { - element_type type = element_type_from_name(treebuilder, - &token->data.tag.name); - - if (type != treebuilder->context.collect.type) { - /** \todo parse error */ - /** \todo Mark script as "already executed" */ - } - - done = true; - } - break; - case HUBBUB_TOKEN_EOF: - case HUBBUB_TOKEN_COMMENT: - case HUBBUB_TOKEN_DOCTYPE: - case HUBBUB_TOKEN_START_TAG: - /** \todo parse error */ - /** \todo Mark script as "already executed" */ - done = reprocess = true; - break; - } - - if (done) { - int success; - void *text, *appended; - - success = treebuilder->tree_handler->create_text( - treebuilder->tree_handler->ctx, - &treebuilder->context.collect.string, - &text); - if (success != 0) { - /** \todo errors */ - } - - /** \todo fragment case -- skip this lot entirely */ - - success = treebuilder->tree_handler->append_child( - treebuilder->tree_handler->ctx, - treebuilder->context.collect.node, - text, &appended); - if (success != 0) { - /** \todo errors */ - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, - text); - } - - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, appended); - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, text); - - /** \todo insertion point manipulation */ - - /* Append script node to current node */ - success = treebuilder->tree_handler->append_child( - treebuilder->tree_handler->ctx, - treebuilder->context.element_stack[ - treebuilder->context.current_node].node, - treebuilder->context.collect.node, &appended); - if (success != 0) { - /** \todo errors */ - } - - /** \todo restore insertion point */ - - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, - appended); - treebuilder->tree_handler->unref_node( - treebuilder->tree_handler->ctx, - treebuilder->context.collect.node); - treebuilder->context.collect.node = NULL; - - /** \todo process any pending script */ - - /* Return to previous insertion mode */ - treebuilder->context.mode = - treebuilder->context.collect.mode; - } - - return reprocess; -} - /** * Process a character token in cases where we expect only whitespace |