From 012d3f6431ee2c5a7f0214d93dbc60813bc79f5b Mon Sep 17 00:00:00 2001
From: Andrew Sidwell
Date: Tue, 17 Jun 2008 01:47:02 +0000
Subject: Fix entity consumption. This gets us to the second set of html5lib
 tests.

svn path=/trunk/hubbub/; revision=4366
---
 src/tokeniser/tokeniser.c | 87 ++++++++++++++++++++++++++++-------------------
 src/utils/dict.c          | 21 +++++++----
 2 files changed, 66 insertions(+), 42 deletions(-)

diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index a5a35a0..5c268b6 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -103,6 +103,7 @@ typedef struct hubbub_tokeniser_context {
 	struct {
 		hubbub_string str;	/**< Pending string */
+		uint32_t poss_len;
 		uint8_t base;		/**< Base for numeric
 					 * entities */
 		uint32_t codepoint;	/**< UCS4 codepoint */
@@ -266,6 +267,9 @@ hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
 	tok->state = HUBBUB_TOKENISER_STATE_DATA;
 	tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
 
+	tok->escape_flag = false;
+	tok->process_cdata_section = false;
+
 	tok->input = input;
 	tok->input_buffer = NULL;
 	tok->input_buffer_len = 0;
@@ -580,7 +584,8 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
 				tokeniser->escape_flag == false) {
 			tokeniser->state =
 				HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_DATA;
-			hubbub_inputstream_advance(tokeniser->input);
+			/* Don't eat the '&'; it'll be handled by entity
+			 * consumption */
 			break;
 		} else if (c == '-') {
 			size_t len;
@@ -1474,7 +1479,7 @@ bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser)
 		tokeniser->state =
 			HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
 		tokeniser->context.allowed_char = '"';
-		hubbub_inputstream_advance(tokeniser->input);
+		/* Don't eat the '&'; it'll be handled by entity consumption */
 
 	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
 		hubbub_token token;
@@ -1521,7 +1526,7 @@ bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser)
 		tokeniser->state =
 			HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
 		tokeniser->context.allowed_char = '\'';
-		hubbub_inputstream_advance(tokeniser->input);
+		/* Don't eat the '&'; it'll be handled by entity consumption */
 
 	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
 		hubbub_token token;
@@ -1567,7 +1572,7 @@ bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser)
 		tokeniser->context.prev_state = tokeniser->state;
 		tokeniser->state =
 			HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
-		hubbub_inputstream_advance(tokeniser->input);
+		/* Don't eat the '&'; it'll be handled by entity consumption */
 
 	} else if (c == '>') {
 		hubbub_token token;
@@ -2922,23 +2927,22 @@ bool hubbub_tokeniser_consume_character_reference(hubbub_tokeniser *tokeniser)
 	uint32_t pos;
 	size_t len;
 
-	if (tokeniser->context.match_entity.done_setup == false) {
-		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+	pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
-		tokeniser->context.match_entity.str.data.off = pos;
-		tokeniser->context.match_entity.str.len = len;
-		tokeniser->context.match_entity.base = 0;
-		tokeniser->context.match_entity.codepoint = 0;
-		tokeniser->context.match_entity.had_data = false;
-		tokeniser->context.match_entity.return_state =
-				tokeniser->state;
-		tokeniser->context.match_entity.complete = false;
-		tokeniser->context.match_entity.done_setup = true;
-		tokeniser->context.match_entity.context = NULL;
-		tokeniser->context.match_entity.prev_len = len;
+	/* Set things up */
+	tokeniser->context.match_entity.str.data.off = pos;
+	tokeniser->context.match_entity.str.len = len;
+	tokeniser->context.match_entity.poss_len = 0;
+	tokeniser->context.match_entity.base = 0;
+	tokeniser->context.match_entity.codepoint = 0;
+	tokeniser->context.match_entity.had_data = false;
+	tokeniser->context.match_entity.return_state = tokeniser->state;
+	tokeniser->context.match_entity.complete = false;
+	tokeniser->context.match_entity.done_setup = true;
+	tokeniser->context.match_entity.context = NULL;
+	tokeniser->context.match_entity.prev_len = len;
 
-		hubbub_inputstream_advance(tokeniser->input);
-	}
+	hubbub_inputstream_advance(tokeniser->input);
 
 	c = hubbub_inputstream_peek(tokeniser->input);
@@ -3048,25 +3052,29 @@ bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser)
 	hubbub_inputstream_rewind(tokeniser->input,
 			ctx->match_entity.str.len);
 
+	/* Had data, so calculate final codepoint */
 	if (ctx->match_entity.had_data) {
-		/* Had data, so calculate final codepoint */
-		if (0x80 <= ctx->match_entity.codepoint &&
-				ctx->match_entity.codepoint <= 0x9F) {
-			ctx->match_entity.codepoint =
-				cp1252Table[ctx->match_entity.codepoint -
-						0x80];
-		} else if (ctx->match_entity.codepoint == 0 ||
-				ctx->match_entity.codepoint > 0x10FFFF) {
-			ctx->match_entity.codepoint = 0xFFFD;
-		} else if (ctx->match_entity.codepoint == 0x0D) {
-			ctx->match_entity.codepoint = 0x000A;
+		uint32_t cp = ctx->match_entity.codepoint;
+
+		if (0x80 <= cp && cp <= 0x9F) {
+			cp = cp1252Table[cp - 0x80];
+		} else if (cp == 0x0D) {
+			cp = 0x000A;
+		} else if (cp <= 0x0008 ||
+				(0x000E <= cp && cp <= 0x001F) ||
+				(0x007F <= cp && cp <= 0x009F) ||
+				(0xD800 <= cp && cp <= 0xDFFF) ||
+				(0xFDD0 <= cp && cp <= 0xFDDF) ||
+				(cp & 0xFFFE) == 0xFFFE ||
+				cp > 0x10FFFF) {
+			cp = 0xFFFD;
 		}
 
+		/* And replace the matched range with it */
 		error = hubbub_inputstream_replace_range(tokeniser->input,
 				ctx->match_entity.str.data.off,
 				ctx->match_entity.str.len,
-				ctx->match_entity.codepoint);
+				cp);
 		if (error != HUBBUB_OK) {
 			/** \todo handle memory exhaustion */
 		}
@@ -3112,6 +3120,8 @@ bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
 			pos = hubbub_inputstream_cur_pos(tokeniser->input,
 					&len);
 			ctx->match_entity.str.len += len;
+			ctx->match_entity.str.len += ctx->match_entity.poss_len;
+			ctx->match_entity.poss_len = 0;
 
 			/* And cache length, for replacement */
 			ctx->match_entity.prev_len =
@@ -3122,28 +3132,35 @@ bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
 		} else {
 			pos = hubbub_inputstream_cur_pos(tokeniser->input,
 					&len);
-			ctx->match_entity.str.len += len;
+			ctx->match_entity.poss_len += len;
 		}
 
 		hubbub_inputstream_advance(tokeniser->input);
 	}
 
+	/* Rewind back possible matches, if any */
+	hubbub_inputstream_rewind(tokeniser->input,
+			ctx->match_entity.poss_len);
+
 	if (c == HUBBUB_INPUTSTREAM_OOD)
 		return false;
 
 	c = hubbub_inputstream_peek(tokeniser->input);
 
-	if (ctx->match_entity.codepoint != 0 && c != ';' &&
+	if ((tokeniser->context.match_entity.return_state ==
+			HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) &&
+			(c != ';') &&
 			((0x0030 <= c && c <= 0x0039) ||
 			(0x0041 <= c && c <= 0x005A) ||
 			(0x0061 <= c && c <= 0x007A))) {
 		ctx->match_entity.codepoint = 0;
 	}
 
-	/* Rewind the inputstream to start of processed sequence */
 	hubbub_inputstream_rewind(tokeniser->input,
-			ctx->match_entity.str.len);
+		ctx->match_entity.str.len);
+
+	pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
 	/* Now, replace range, if we found a named entity */
 	if (ctx->match_entity.codepoint != 0) {

diff --git a/src/utils/dict.c b/src/utils/dict.c
index f50ffab..59eb4be 100644
--- a/src/utils/dict.c
+++ b/src/utils/dict.c
@@ -14,12 +14,11 @@ typedef struct hubbub_dict_node {
 	uint8_t split;			/**< Data to split on */
 	struct hubbub_dict_node *lt;	/**< Subtree for data less than
 					 * split */
-	struct hubbub_dict_node *eq;	/**< Subtree for data equal to split
-					 * If split == '\0', this stores the
-					 * pointer to the actual data, not a
-					 * subtree */
+	struct hubbub_dict_node *eq;	/**< Subtree for data equal to split */
 	struct hubbub_dict_node *gt;	/**< Subtree for data greater than
 					 * split */
+
+	const void *value;		/**< Data for this node */
 } hubbub_dict_node;
 
 /** Dictionary object */
@@ -143,10 +142,14 @@ hubbub_dict_node *hubbub_dict_insert_internal(hubbub_dict *dict,
 				parent->lt, key, value);
 	} else if ((uint8_t) key[0] == parent->split) {
 		if (key[0] == '\0') {
-			parent->eq = (hubbub_dict_node *) value;
+			parent->value = value;
+		} else if (key[1] == '\0') {
+			parent->value = value;
+			parent->eq = hubbub_dict_insert_internal(dict,
+					parent->eq, key + 1, value);
 		} else {
 			parent->eq = hubbub_dict_insert_internal(dict,
-					parent->eq, ++key, value);
+					parent->eq, key + 1, value);
 		}
 	} else {
 		parent->gt = hubbub_dict_insert_internal(dict,
@@ -200,7 +203,11 @@ hubbub_error hubbub_dict_search_step(hubbub_dict *dict, uint8_t c,
 		p = NULL;
 	} else if (p->eq != NULL && p->eq->split == '\0') {
 		match = true;
-		*result = (const void *) p->eq->eq;
+		*result = p->eq->value;
+		p = p->eq;
+	} else if (p->value) {
+		match = true;
+		*result = p->value;
+		p = p->eq;
 	} else {
 		p = p->eq;
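
For reference, the codepoint sanitisation this patch introduces in
hubbub_tokeniser_handle_numbered_entity() can be exercised in isolation.
The following is a minimal standalone sketch, not hubbub's code:
sanitise_codepoint() is a hypothetical name, and cp1252_table is assumed
to follow the standard Windows-1252 mapping for 0x80..0x9F (hubbub's own
cp1252Table may treat the unmapped slots 0x81, 0x8D, 0x8F, 0x90 and 0x9D
differently).

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Standard Windows-1252 mapping for 0x80..0x9F; unmapped slots
	 * pass through unchanged here. */
	static const uint32_t cp1252_table[32] = {
		0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
		0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
	};

	static uint32_t sanitise_codepoint(uint32_t cp)
	{
		if (0x80 <= cp && cp <= 0x9F) {
			/* Windows-1252 compatibility remapping */
			cp = cp1252_table[cp - 0x80];
		} else if (cp == 0x0D) {
			/* CR becomes LF */
			cp = 0x000A;
		} else if (cp <= 0x0008 ||
				(0x000E <= cp && cp <= 0x001F) ||
				(0x007F <= cp && cp <= 0x009F) ||
				(0xD800 <= cp && cp <= 0xDFFF) ||	/* surrogates */
				(0xFDD0 <= cp && cp <= 0xFDDF) ||	/* noncharacters */
				(cp & 0xFFFE) == 0xFFFE ||	/* U+xFFFE / U+xFFFF */
				cp > 0x10FFFF) {
			/* Invalid; substitute the replacement character */
			cp = 0xFFFD;
		}

		return cp;
	}

	int main(void)
	{
		/* "&#150;" remaps to U+2013; "&#xD800;" becomes U+FFFD */
		printf("%04" PRIX32 " %04" PRIX32 "\n",
				sanitise_codepoint(150),
				sanitise_codepoint(0xD800));
		return 0;
	}

Note that, as in the patch, the 0x007F..0x009F term of the invalid-range
test can only ever catch 0x7F, since raw 0x80..0x9F input has already
been remapped by the first branch.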
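The dict.c change moves matched data into a per-node value field instead
of smuggling it through the eq pointer of a '\0' terminal node. That is
what lets an entity name that is a prefix of a longer one (for example
"not" and "notin") report a match mid-search while the tokeniser keeps
scanning, with poss_len above tracking the tentative characters to
rewind if no longer match completes. Below is a simplified sketch of the
idea, not hubbub's exact code: the node fields follow the diff, but
insert() and step() are reduced stand-ins for
hubbub_dict_insert_internal() and hubbub_dict_search_step().

	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>

	typedef struct node {
		char split;		/* character this node splits on */
		struct node *lt, *eq, *gt;
		const void *value;	/* data for the key ending here, if any */
	} node;

	static node *insert(node *n, const char *key, const void *value)
	{
		if (n == NULL) {
			n = calloc(1, sizeof *n);
			n->split = key[0];
		}

		if (key[0] < n->split) {
			n->lt = insert(n->lt, key, value);
		} else if (key[0] > n->split) {
			n->gt = insert(n->gt, key, value);
		} else if (key[1] == '\0') {
			n->value = value;	/* key ends at this node */
		} else {
			n->eq = insert(n->eq, key + 1, value);
		}

		return n;
	}

	/* Advance one character; reports a match whenever the node
	 * reached carries a value, even if longer keys continue below. */
	static bool step(node **pos, char c, const void **result)
	{
		node *p = *pos;

		while (p != NULL && c != p->split)
			p = (c < p->split) ? p->lt : p->gt;

		if (p == NULL) {
			*pos = NULL;
			return false;
		}

		*pos = p->eq;

		if (p->value != NULL) {
			*result = p->value;
			return true;
		}

		return false;
	}

	int main(void)
	{
		node *root = NULL, *pos;
		const void *result = NULL;

		root = insert(root, "not", "U+00AC");
		root = insert(root, "notin", "U+2209");

		pos = root;
		for (const char *s = "notin"; *s != '\0'; s++) {
			if (step(&pos, *s, &result))
				printf("match after '%c': %s\n",
						*s, (const char *)result);
		}

		return 0;
	}

Feeding "notin" one character at a time reports a match at "not" and
again at "notin"; the caller keeps the longest one and rewinds the rest,
which is exactly the role poss_len plays in the tokeniser above.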