From 4da6a038c15a5fa3d1c754b7278ae47627a44718 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Fri, 21 Mar 2008 13:20:22 +0000 Subject: hubbub_strings may now be either an offset into the data buffer or a pointer to constant data. Fix up tokeniser and treebuilder to deal with this. Fix up testcases, too. The tokeniser will only ever emit strings of type HUBBUB_STRING_OFF. Anything else is a bug which should be fixed. The treebuilder may emit strings of either type. svn path=/trunk/hubbub/; revision=4014 --- include/hubbub/types.h | 11 +++- src/tokeniser/tokeniser.c | 113 ++++++++++++++++++++++++------------------ src/treebuilder/treebuilder.c | 22 ++++---- test/parser-utf16.c | 18 +++---- test/parser.c | 18 +++---- test/tokeniser.c | 18 +++---- test/tokeniser2.c | 16 +++--- test/tree.c | 26 ++++++++-- 8 files changed, 142 insertions(+), 100 deletions(-) diff --git a/include/hubbub/types.h b/include/hubbub/types.h index 922bdbb..e58a88b 100644 --- a/include/hubbub/types.h +++ b/include/hubbub/types.h @@ -57,7 +57,16 @@ typedef enum hubbub_token_type { * Tokeniser string type */ typedef struct hubbub_string { - uint32_t data_off; /**< Byte offset of string start */ + enum { + HUBBUB_STRING_OFF, + HUBBUB_STRING_PTR + } type; + + union { + const uint8_t *ptr; /**< Pointer to data */ + uint32_t off; /**< Byte offset of string start */ + } data; + size_t len; /**< Byte length of string */ } hubbub_string; diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c index f8b6bb3..3d69797 100644 --- a/src/tokeniser/tokeniser.c +++ b/src/tokeniser/tokeniser.c @@ -236,6 +236,12 @@ hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input, } memset(&tok->context, 0, sizeof(hubbub_tokeniser_context)); + tok->context.current_tag.name.type = HUBBUB_STRING_OFF; + tok->context.current_comment.type = HUBBUB_STRING_OFF; + tok->context.current_doctype.name.type = HUBBUB_STRING_OFF; + tok->context.current_chars.type = HUBBUB_STRING_OFF; + tok->context.close_tag_match.tag.type = HUBBUB_STRING_OFF; + tok->context.match_entity.str.type = HUBBUB_STRING_OFF; return tok; } @@ -434,7 +440,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) uint32_t c; /* Clear current characters */ - tokeniser->context.current_chars.data_off = 0; + tokeniser->context.current_chars.data.off = 0; tokeniser->context.current_chars.len = 0; while ((c = hubbub_inputstream_peek(tokeniser->input)) != @@ -462,7 +468,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) } /* Buffer '<' */ - tokeniser->context.current_chars.data_off = + tokeniser->context.current_chars.data.off = hubbub_inputstream_cur_pos(tokeniser->input, &tokeniser->context.current_chars.len); @@ -478,7 +484,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) &len); if (tokeniser->context.current_chars.len == 0) { - tokeniser->context.current_chars.data_off = + tokeniser->context.current_chars.data.off = pos; } tokeniser->context.current_chars.len++; @@ -495,7 +501,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) hubbub_tokeniser_emit_token(tokeniser, &token); - tokeniser->context.current_chars.data_off = 0; + tokeniser->context.current_chars.data.off = 0; tokeniser->context.current_chars.len = 0; } @@ -524,7 +530,8 @@ bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser) /* Emit character */ token.type = HUBBUB_TOKEN_CHARACTER; - token.data.character.data_off = + token.data.character.type = HUBBUB_STRING_OFF; + token.data.character.data.off = 
hubbub_inputstream_cur_pos(tokeniser->input, &token.data.character.len); @@ -601,7 +608,7 @@ bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser) tokeniser->context.current_tag_type = HUBBUB_TOKEN_START_TAG; - ctag->name.data_off = + ctag->name.data.off = hubbub_inputstream_cur_pos(tokeniser->input, &ctag->name.len); ctag->n_attributes = 0; @@ -613,7 +620,7 @@ bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser) tokeniser->context.current_tag_type = HUBBUB_TOKEN_START_TAG; - ctag->name.data_off = + ctag->name.data.off = hubbub_inputstream_cur_pos(tokeniser->input, &ctag->name.len); ctag->n_attributes = 0; @@ -644,7 +651,7 @@ bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser) &len); tokeniser->context.current_chars.len += len; - tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.data.off = pos; tokeniser->context.current_comment.len = len; tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; @@ -688,7 +695,7 @@ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser) tokeniser->context.current_tag_type = HUBBUB_TOKEN_END_TAG; - ctag->name.data_off = pos; + ctag->name.data.off = pos; ctag->name.len = len; ctag->n_attributes = 0; @@ -700,7 +707,7 @@ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser) tokeniser->context.current_tag_type = HUBBUB_TOKEN_END_TAG; - ctag->name.data_off = pos; + ctag->name.data.off = pos; ctag->name.len = len; ctag->n_attributes = 0; @@ -724,7 +731,7 @@ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.data.off = pos; tokeniser->context.current_comment.len = len; tokeniser->state = @@ -756,7 +763,7 @@ bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser) off = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (ctx->close_tag_match.tag.len == 0) { - ctx->close_tag_match.tag.data_off = off; + ctx->close_tag_match.tag.data.off = off; ctx->close_tag_match.tag.len = len; } else { ctx->close_tag_match.tag.len += len; @@ -768,8 +775,8 @@ bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser) (ctx->close_tag_match.tag.len == ctag->name.len && hubbub_inputstream_compare_range_ci( tokeniser->input, - ctag->name.data_off, - ctx->close_tag_match.tag.data_off, + ctag->name.data.off, + ctx->close_tag_match.tag.data.off, ctag->name.len) != 0)) { hubbub_token token; @@ -792,8 +799,8 @@ bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser) } else if (ctx->close_tag_match.tag.len == ctag->name.len && hubbub_inputstream_compare_range_ci( tokeniser->input, - ctag->name.data_off, - ctx->close_tag_match.tag.data_off, + ctag->name.data.off, + ctx->close_tag_match.tag.data.off, ctag->name.len) == 0) { /* Matched => stop searching */ break; @@ -968,9 +975,11 @@ bool hubbub_tokeniser_handle_before_attribute_name( ctag->attributes = attr; - attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].name.data.off = pos; attr[ctag->n_attributes].name.len = len; - attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].value.data.off = 0; attr[ctag->n_attributes].value.len = 0; ctag->n_attributes++; @@ -1008,9 +1017,11 @@ bool hubbub_tokeniser_handle_before_attribute_name( ctag->attributes = attr; - 
attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].name.data.off = pos; attr[ctag->n_attributes].name.len = len; - attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].value.data.off = 0; attr[ctag->n_attributes].value.len = 0; ctag->n_attributes++; @@ -1135,9 +1146,11 @@ bool hubbub_tokeniser_handle_after_attribute_name( ctag->attributes = attr; - attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].name.data.off = pos; attr[ctag->n_attributes].name.len = len; - attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].value.data.off = 0; attr[ctag->n_attributes].value.len = 0; ctag->n_attributes++; @@ -1179,9 +1192,11 @@ bool hubbub_tokeniser_handle_after_attribute_name( ctag->attributes = attr; - attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].name.data.off = pos; attr[ctag->n_attributes].name.len = len; - attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].value.data.off = 0; attr[ctag->n_attributes].value.len = 0; ctag->n_attributes++; @@ -1240,7 +1255,7 @@ bool hubbub_tokeniser_handle_before_attribute_value( pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - ctag->attributes[ctag->n_attributes - 1].value.data_off = pos; + ctag->attributes[ctag->n_attributes - 1].value.data.off = pos; ctag->attributes[ctag->n_attributes - 1].value.len = len; tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ; @@ -1285,7 +1300,7 @@ bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { - ctag->attributes[ctag->n_attributes - 1].value.data_off = + ctag->attributes[ctag->n_attributes - 1].value.data.off = pos; } @@ -1331,7 +1346,7 @@ bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { - ctag->attributes[ctag->n_attributes - 1].value.data_off = + ctag->attributes[ctag->n_attributes - 1].value.data.off = pos; } @@ -1388,7 +1403,7 @@ bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { - ctag->attributes[ctag->n_attributes - 1].value.data_off = + ctag->attributes[ctag->n_attributes - 1].value.data.off = pos; } @@ -1421,7 +1436,7 @@ bool hubbub_tokeniser_handle_entity_in_attribute_value( pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { - ctag->attributes[ctag->n_attributes - 1].value.data_off = + ctag->attributes[ctag->n_attributes - 1].value.data.off = pos; } @@ -1458,7 +1473,7 @@ bool hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (tokeniser->context.current_comment.len == 0) - tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.data.off = pos; tokeniser->context.current_comment.len += len; 
hubbub_inputstream_advance(tokeniser->input); @@ -1495,7 +1510,7 @@ bool hubbub_tokeniser_handle_markup_declaration_open( tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE; hubbub_inputstream_advance(tokeniser->input); } else { - tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.data.off = 0; tokeniser->context.current_comment.len = 0; tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; @@ -1511,7 +1526,7 @@ bool hubbub_tokeniser_handle_comment_start(hubbub_tokeniser *tokeniser) if (c == HUBBUB_INPUTSTREAM_OOD) return false; - tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.data.off = 0; tokeniser->context.current_comment.len = 0; @@ -1553,7 +1568,7 @@ bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (tokeniser->context.current_comment.len == 0) - tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.data.off = pos; tokeniser->context.current_comment.len += len; hubbub_inputstream_advance(tokeniser->input); @@ -1589,11 +1604,11 @@ bool hubbub_tokeniser_handle_comment_dash(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (tokeniser->context.current_comment.len == 0) { - tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.data.off = pos; } else { /* Need to do this to get length of '-' */ len += pos - - tokeniser->context.current_comment.data_off; + tokeniser->context.current_comment.data.off; } tokeniser->context.current_comment.len = len; @@ -1631,12 +1646,12 @@ bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (tokeniser->context.current_comment.len == 0) { - tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.data.off = pos; tokeniser->context.current_comment.len = len; } else { /* Need to do this to get length of '-' */ len = pos - - tokeniser->context.current_comment.data_off; + tokeniser->context.current_comment.data.off; } tokeniser->context.current_comment.len = len; @@ -1660,11 +1675,11 @@ bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (tokeniser->context.current_comment.len == 0) { - tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.data.off = pos; } else { /* Need to do this to get length of '--' */ len += pos - - tokeniser->context.current_comment.data_off; + tokeniser->context.current_comment.data.off; } tokeniser->context.current_comment.len = len; @@ -1724,7 +1739,7 @@ bool hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser) case 1: hubbub_inputstream_push_back(tokeniser->input, 'D'); } - tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.data.off = 0; tokeniser->context.current_comment.len = 0; tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; @@ -1768,7 +1783,7 @@ bool hubbub_tokeniser_handle_before_doctype_name( pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - cdoc->name.data_off = pos; + cdoc->name.data.off = pos; cdoc->name.len = len; cdoc->correct = false; @@ -1802,7 +1817,7 @@ bool hubbub_tokeniser_handle_before_doctype_name( pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - cdoc->name.data_off = pos; + cdoc->name.data.off = pos; cdoc->name.len = len; cdoc->correct = false; @@ -1834,7 
+1849,7 @@ bool hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser) token.data.doctype.correct = (hubbub_inputstream_compare_range_ascii( tokeniser->input, - token.data.doctype.name.data_off, + token.data.doctype.name.data.off, token.data.doctype.name.len, "HTML", SLEN("HTML")) == 0); @@ -1896,7 +1911,7 @@ bool hubbub_tokeniser_handle_after_doctype_name(hubbub_tokeniser *tokeniser) token.data.doctype.correct = (hubbub_inputstream_compare_range_ascii( tokeniser->input, - token.data.doctype.name.data_off, + token.data.doctype.name.data.off, token.data.doctype.name.len, "HTML", SLEN("HTML")) == 0); @@ -1969,7 +1984,7 @@ bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser) if (tokeniser->context.match_entity.done_setup == false) { pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - tokeniser->context.match_entity.str.data_off = pos; + tokeniser->context.match_entity.str.data.off = pos; tokeniser->context.match_entity.str.len = len; tokeniser->context.match_entity.base = 0; tokeniser->context.match_entity.codepoint = 0; @@ -2095,7 +2110,7 @@ bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser) /* And replace the matched range with it */ error = hubbub_inputstream_replace_range(tokeniser->input, - ctx->match_entity.str.data_off, + ctx->match_entity.str.data.off, ctx->match_entity.str.len, ctx->match_entity.codepoint); if (error != HUBBUB_OK) { @@ -2177,7 +2192,7 @@ bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser) /* Now, replace range, if we found a named entity */ if (ctx->match_entity.codepoint != 0) { error = hubbub_inputstream_replace_range(tokeniser->input, - ctx->match_entity.str.data_off, + ctx->match_entity.str.data.off, ctx->match_entity.prev_len, ctx->match_entity.codepoint); if (error != HUBBUB_OK) { @@ -2249,8 +2264,8 @@ void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, attrs[j].name.len || hubbub_inputstream_compare_range_cs( tokeniser->input, - attrs[i].name.data_off, - attrs[j].name.data_off, + attrs[i].name.data.off, + attrs[j].name.data.off, attrs[i].name.len) != 0) { /* Attributes don't match */ continue; diff --git a/src/treebuilder/treebuilder.c b/src/treebuilder/treebuilder.c index 3d2b295..01e31e4 100644 --- a/src/treebuilder/treebuilder.c +++ b/src/treebuilder/treebuilder.c @@ -241,6 +241,8 @@ hubbub_treebuilder *hubbub_treebuilder_create(hubbub_tokeniser *tokeniser, assert(HTML != 0); tb->context.element_stack[0].type = 0; + tb->context.collect.string.type = HUBBUB_STRING_OFF; + tb->buffer_handler = NULL; tb->buffer_pw = NULL; @@ -1070,8 +1072,8 @@ bool handle_generic_rcdata(hubbub_treebuilder *treebuilder, switch (token->type) { case HUBBUB_TOKEN_CHARACTER: if (treebuilder->context.collect.string.len == 0) { - treebuilder->context.collect.string.data_off = - token->data.character.data_off; + treebuilder->context.collect.string.data.off = + token->data.character.data.off; } treebuilder->context.collect.string.len += token->data.character.len; @@ -1158,8 +1160,8 @@ bool handle_script_collect_characters(hubbub_treebuilder *treebuilder, switch (token->type) { case HUBBUB_TOKEN_CHARACTER: if (treebuilder->context.collect.string.len == 0) { - treebuilder->context.collect.string.data_off = - token->data.character.data_off; + treebuilder->context.collect.string.data.off = + token->data.character.data.off; } treebuilder->context.collect.string.len += token->data.character.len; @@ -1265,7 +1267,7 @@ bool process_characters_expect_whitespace(hubbub_treebuilder *treebuilder, const hubbub_token 
*token, bool insert_into_current_node) { const uint8_t *data = treebuilder->input_buffer + - token->data.character.data_off; + token->data.character.data.off; size_t len = token->data.character.len; size_t c; @@ -1284,7 +1286,7 @@ bool process_characters_expect_whitespace(hubbub_treebuilder *treebuilder, int success; void *text, *appended; - temp.data_off = token->data.character.data_off; + temp.data.off = token->data.character.data.off; temp.len = len - c; /** \todo Append to pre-existing text child, iff @@ -1318,7 +1320,7 @@ bool process_characters_expect_whitespace(hubbub_treebuilder *treebuilder, } /* Update token data to strip leading whitespace */ - ((hubbub_token *) token)->data.character.data_off += + ((hubbub_token *) token)->data.character.data.off += len - c; ((hubbub_token *) token)->data.character.len -= c; @@ -1409,7 +1411,7 @@ void parse_generic_rcdata(hubbub_treebuilder *treebuilder, treebuilder->context.collect.mode = treebuilder->context.mode; treebuilder->context.collect.type = type; treebuilder->context.collect.node = node; - treebuilder->context.collect.string.data_off = 0; + treebuilder->context.collect.string.data.off = 0; treebuilder->context.collect.string.len = 0; treebuilder->tree_handler->unref_node( @@ -1472,7 +1474,7 @@ void process_script_in_head(hubbub_treebuilder *treebuilder, treebuilder->context.collect.mode = treebuilder->context.mode; treebuilder->context.collect.node = script; treebuilder->context.collect.type = SCRIPT; - treebuilder->context.collect.string.data_off = 0; + treebuilder->context.collect.string.data.off = 0; treebuilder->context.collect.string.len = 0; treebuilder->context.mode = SCRIPT_COLLECT_CHARACTERS; @@ -1846,7 +1848,7 @@ void reset_insertion_mode(hubbub_treebuilder *treebuilder) element_type element_type_from_name(hubbub_treebuilder *treebuilder, const hubbub_string *tag_name) { - const uint8_t *name = treebuilder->input_buffer + tag_name->data_off; + const uint8_t *name = treebuilder->input_buffer + tag_name->data.off; return element_type_from_verbatim_name(name, tag_name->len); } diff --git a/test/parser-utf16.c b/test/parser-utf16.c index 9056dd1..86024a6 100644 --- a/test/parser-utf16.c +++ b/test/parser-utf16.c @@ -129,44 +129,44 @@ void token_handler(const hubbub_token *token, void *pw) case HUBBUB_TOKEN_DOCTYPE: printf("'%.*s' (%svalid)\n", (int) token->data.doctype.name.len, - pbuffer + token->data.doctype.name.data_off, + pbuffer + token->data.doctype.name.data.off, token->data.doctype.correct ? "" : "in"); break; case HUBBUB_TOKEN_START_TAG: printf("'%.*s' %s\n", (int) token->data.tag.name.len, - pbuffer + token->data.tag.name.data_off, + pbuffer + token->data.tag.name.data.off, (token->data.tag.n_attributes > 0) ? "attributes:" : ""); for (i = 0; i < token->data.tag.n_attributes; i++) { printf("\t'%.*s' = '%.*s'\n", (int) token->data.tag.attributes[i].name.len, - pbuffer + token->data.tag.attributes[i].name.data_off, + pbuffer + token->data.tag.attributes[i].name.data.off, (int) token->data.tag.attributes[i].value.len, - pbuffer + token->data.tag.attributes[i].value.data_off); + pbuffer + token->data.tag.attributes[i].value.data.off); } break; case HUBBUB_TOKEN_END_TAG: printf("'%.*s' %s\n", (int) token->data.tag.name.len, - pbuffer + token->data.tag.name.data_off, + pbuffer + token->data.tag.name.data.off, (token->data.tag.n_attributes > 0) ? 
"attributes:" : ""); for (i = 0; i < token->data.tag.n_attributes; i++) { printf("\t'%.*s' = '%.*s'\n", (int) token->data.tag.attributes[i].name.len, - pbuffer + token->data.tag.attributes[i].name.data_off, + pbuffer + token->data.tag.attributes[i].name.data.off, (int) token->data.tag.attributes[i].value.len, - pbuffer + token->data.tag.attributes[i].value.data_off); + pbuffer + token->data.tag.attributes[i].value.data.off); } break; case HUBBUB_TOKEN_COMMENT: printf("'%.*s'\n", (int) token->data.comment.len, - pbuffer + token->data.comment.data_off); + pbuffer + token->data.comment.data.off); break; case HUBBUB_TOKEN_CHARACTER: printf("'%.*s'\n", (int) token->data.character.len, - pbuffer + token->data.character.data_off); + pbuffer + token->data.character.data.off); break; case HUBBUB_TOKEN_EOF: printf("\n"); diff --git a/test/parser.c b/test/parser.c index fe2659d..fa2afb8 100644 --- a/test/parser.c +++ b/test/parser.c @@ -129,44 +129,44 @@ void token_handler(const hubbub_token *token, void *pw) case HUBBUB_TOKEN_DOCTYPE: printf("'%.*s' (%svalid)\n", (int) token->data.doctype.name.len, - pbuffer + token->data.doctype.name.data_off, + pbuffer + token->data.doctype.name.data.off, token->data.doctype.correct ? "" : "in"); break; case HUBBUB_TOKEN_START_TAG: printf("'%.*s' %s\n", (int) token->data.tag.name.len, - pbuffer + token->data.tag.name.data_off, + pbuffer + token->data.tag.name.data.off, (token->data.tag.n_attributes > 0) ? "attributes:" : ""); for (i = 0; i < token->data.tag.n_attributes; i++) { printf("\t'%.*s' = '%.*s'\n", (int) token->data.tag.attributes[i].name.len, - pbuffer + token->data.tag.attributes[i].name.data_off, + pbuffer + token->data.tag.attributes[i].name.data.off, (int) token->data.tag.attributes[i].value.len, - pbuffer + token->data.tag.attributes[i].value.data_off); + pbuffer + token->data.tag.attributes[i].value.data.off); } break; case HUBBUB_TOKEN_END_TAG: printf("'%.*s' %s\n", (int) token->data.tag.name.len, - pbuffer + token->data.tag.name.data_off, + pbuffer + token->data.tag.name.data.off, (token->data.tag.n_attributes > 0) ? "attributes:" : ""); for (i = 0; i < token->data.tag.n_attributes; i++) { printf("\t'%.*s' = '%.*s'\n", (int) token->data.tag.attributes[i].name.len, - pbuffer + token->data.tag.attributes[i].name.data_off, + pbuffer + token->data.tag.attributes[i].name.data.off, (int) token->data.tag.attributes[i].value.len, - pbuffer + token->data.tag.attributes[i].value.data_off); + pbuffer + token->data.tag.attributes[i].value.data.off); } break; case HUBBUB_TOKEN_COMMENT: printf("'%.*s'\n", (int) token->data.comment.len, - pbuffer + token->data.comment.data_off); + pbuffer + token->data.comment.data.off); break; case HUBBUB_TOKEN_CHARACTER: printf("'%.*s'\n", (int) token->data.character.len, - pbuffer + token->data.character.data_off); + pbuffer + token->data.character.data.off); break; case HUBBUB_TOKEN_EOF: printf("\n"); diff --git a/test/tokeniser.c b/test/tokeniser.c index 271b986..32ecdbc 100644 --- a/test/tokeniser.c +++ b/test/tokeniser.c @@ -128,44 +128,44 @@ void token_handler(const hubbub_token *token, void *pw) case HUBBUB_TOKEN_DOCTYPE: printf("'%.*s' (%svalid)\n", (int) token->data.doctype.name.len, - pbuffer + token->data.doctype.name.data_off, + pbuffer + token->data.doctype.name.data.off, token->data.doctype.correct ? 
"" : "in"); break; case HUBBUB_TOKEN_START_TAG: printf("'%.*s' %s\n", (int) token->data.tag.name.len, - pbuffer + token->data.tag.name.data_off, + pbuffer + token->data.tag.name.data.off, (token->data.tag.n_attributes > 0) ? "attributes:" : ""); for (i = 0; i < token->data.tag.n_attributes; i++) { printf("\t'%.*s' = '%.*s'\n", (int) token->data.tag.attributes[i].name.len, - pbuffer + token->data.tag.attributes[i].name.data_off, + pbuffer + token->data.tag.attributes[i].name.data.off, (int) token->data.tag.attributes[i].value.len, - pbuffer + token->data.tag.attributes[i].value.data_off); + pbuffer + token->data.tag.attributes[i].value.data.off); } break; case HUBBUB_TOKEN_END_TAG: printf("'%.*s' %s\n", (int) token->data.tag.name.len, - pbuffer + token->data.tag.name.data_off, + pbuffer + token->data.tag.name.data.off, (token->data.tag.n_attributes > 0) ? "attributes:" : ""); for (i = 0; i < token->data.tag.n_attributes; i++) { printf("\t'%.*s' = '%.*s'\n", (int) token->data.tag.attributes[i].name.len, - pbuffer + token->data.tag.attributes[i].name.data_off, + pbuffer + token->data.tag.attributes[i].name.data.off, (int) token->data.tag.attributes[i].value.len, - pbuffer + token->data.tag.attributes[i].value.data_off); + pbuffer + token->data.tag.attributes[i].value.data.off); } break; case HUBBUB_TOKEN_COMMENT: printf("'%.*s'\n", (int) token->data.comment.len, - pbuffer + token->data.comment.data_off); + pbuffer + token->data.comment.data.off); break; case HUBBUB_TOKEN_CHARACTER: printf("'%.*s'\n", (int) token->data.character.len, - pbuffer + token->data.character.data_off); + pbuffer + token->data.character.data.off); break; case HUBBUB_TOKEN_EOF: printf("\n"); diff --git a/test/tokeniser2.c b/test/tokeniser2.c index 103a3d5..f72e0d7 100644 --- a/test/tokeniser2.c +++ b/test/tokeniser2.c @@ -280,7 +280,7 @@ void token_handler(const hubbub_token *token, void *pw) bool expvalid = json_object_get_boolean((struct json_object *) array_list_get_idx(items, 2)); char *gotname = (char *) (ctx->pbuffer + - token->data.doctype.name.data_off); + token->data.doctype.name.data.off); printf("'%.*s' (%svalid)\n", (int) token->data.doctype.name.len, @@ -302,7 +302,7 @@ void token_handler(const hubbub_token *token, void *pw) (struct json_object *) array_list_get_idx(items, 2))->head; char *tagname = (char *) (ctx->pbuffer + - token->data.tag.name.data_off); + token->data.tag.name.data.off); printf("'%.*s' %s\n", (int) token->data.tag.name.len, @@ -318,11 +318,11 @@ void token_handler(const hubbub_token *token, void *pw) char *expval = json_object_get_string( (struct json_object *) expattrs->v); char *gotname = (char *) (ctx->pbuffer + - token->data.tag.attributes[i].name.data_off); + token->data.tag.attributes[i].name.data.off); size_t namelen = token->data.tag.attributes[i].name.len; char *gotval = (char *) (ctx->pbuffer + - token->data.tag.attributes[i].value.data_off); + token->data.tag.attributes[i].value.data.off); size_t vallen = token->data.tag.attributes[i].value.len; @@ -347,7 +347,7 @@ void token_handler(const hubbub_token *token, void *pw) char *expname = json_object_get_string((struct json_object *) array_list_get_idx(items, 1)); char *tagname = (char *) (ctx->pbuffer + - token->data.tag.name.data_off); + token->data.tag.name.data.off); printf("'%.*s' %s\n", (int) token->data.tag.name.len, @@ -364,7 +364,7 @@ void token_handler(const hubbub_token *token, void *pw) char *expstr = json_object_get_string((struct json_object *) array_list_get_idx(items, 1)); char *gotstr = (char *) (ctx->pbuffer + - 
token->data.comment.data_off); + token->data.comment.data.off); printf("'%.*s'\n", (int) token->data.comment.len, gotstr); @@ -377,7 +377,7 @@ void token_handler(const hubbub_token *token, void *pw) char *expstr = json_object_get_string((struct json_object *) array_list_get_idx(items, 1)); char *gotstr = (char *) (ctx->pbuffer + - token->data.character.data_off); + token->data.character.data.off); size_t len = min(token->data.character.len, strlen(expstr + ctx->char_off)); @@ -392,7 +392,7 @@ void token_handler(const hubbub_token *token, void *pw) hubbub_token t; t.type = HUBBUB_TOKEN_CHARACTER; - t.data.character.data_off += len; + t.data.character.data.off += len; t.data.character.len -= len; ctx->char_off = 0; diff --git a/test/tree.c b/test/tree.c index 04ce026..f4e6c3c 100644 --- a/test/tree.c +++ b/test/tree.c @@ -11,7 +11,7 @@ #include "testutils.h" -#define NODE_REF_CHUNK 1024 +#define NODE_REF_CHUNK 8192 static uint16_t *node_ref; static uintptr_t node_ref_alloc; static uintptr_t node_counter; @@ -72,6 +72,22 @@ static void *myrealloc(void *ptr, size_t len, void *pw) return realloc(ptr, len); } +static const uint8_t *ptr_from_hubbub_string(const hubbub_string *string) +{ + const uint8_t *data; + + switch (string->type) { + case HUBBUB_STRING_OFF: + data = pbuffer + string->data.off; + break; + case HUBBUB_STRING_PTR: + data = string->data.ptr; + break; + } + + return data; +} + int main(int argc, char **argv) { hubbub_parser *parser; @@ -188,7 +204,7 @@ void buffer_handler(const uint8_t *buffer, size_t len, void *pw) int create_comment(void *ctx, const hubbub_string *data, void **result) { printf("Creating (%u) [comment '%.*s']\n", ++node_counter, - data->len, pbuffer + data->data_off); + data->len, ptr_from_hubbub_string(data)); GROW_REF node_ref[node_counter] = 0; @@ -208,7 +224,7 @@ int create_doctype(void *ctx, const hubbub_string *qname, UNUSED(system_id); printf("Creating (%u) [doctype '%.*s']\n", ++node_counter, - qname->len, pbuffer + qname->data_off); + qname->len, ptr_from_hubbub_string(qname)); GROW_REF node_ref[node_counter] = 0; @@ -223,7 +239,7 @@ int create_doctype(void *ctx, const hubbub_string *qname, int create_element(void *ctx, const hubbub_tag *tag, void **result) { printf("Creating (%u) [element '%.*s']\n", ++node_counter, - tag->name.len, pbuffer + tag->name.data_off); + tag->name.len, ptr_from_hubbub_string(&tag->name)); GROW_REF node_ref[node_counter] = 0; @@ -254,7 +270,7 @@ int create_element_verbatim(void *ctx, const uint8_t *name, size_t len, int create_text(void *ctx, const hubbub_string *data, void **result) { printf("Creating (%u) [text '%.*s']\n", ++node_counter, - data->len, pbuffer + data->data_off); + data->len, ptr_from_hubbub_string(data)); GROW_REF node_ref[node_counter] = 0; -- cgit v1.2.3
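
For reference, a minimal standalone sketch of how a consumer is expected to handle both hubbub_string variants after this change. The struct definition is copied from the include/hubbub/types.h hunk above; the helper name hubbub_string_data and the example buffer are illustrative assumptions (the real test code uses ptr_from_hubbub_string() against its pbuffer), and unlike that test helper this sketch returns NULL for an unrecognised type rather than leaving the result uninitialised.

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Mirror of the tagged-union string type introduced by this patch
	 * (see the include/hubbub/types.h hunk above). */
	typedef struct hubbub_string {
		enum {
			HUBBUB_STRING_OFF,
			HUBBUB_STRING_PTR
		} type;

		union {
			const uint8_t *ptr;	/**< Pointer to data */
			uint32_t off;		/**< Byte offset of string start */
		} data;

		size_t len;			/**< Byte length of string */
	} hubbub_string;

	/* Resolve a hubbub_string to a pointer into real memory.
	 * "buffer" is the document data buffer that HUBBUB_STRING_OFF
	 * offsets index into. Returns NULL for an unknown type. */
	static const uint8_t *hubbub_string_data(const hubbub_string *s,
			const uint8_t *buffer)
	{
		switch (s->type) {
		case HUBBUB_STRING_OFF:
			return buffer + s->data.off;
		case HUBBUB_STRING_PTR:
			return s->data.ptr;
		}

		return NULL;
	}

	int main(void)
	{
		const uint8_t buffer[] = "<html></html>";

		/* Offset form: the only kind the tokeniser ever emits */
		hubbub_string off_str = {
			.type = HUBBUB_STRING_OFF,
			.data.off = 1,
			.len = 4
		};

		/* Pointer form: constant data, as the treebuilder may emit */
		hubbub_string ptr_str = {
			.type = HUBBUB_STRING_PTR,
			.data.ptr = (const uint8_t *) "head",
			.len = 4
		};

		printf("'%.*s'\n", (int) off_str.len,
				hubbub_string_data(&off_str, buffer));
		printf("'%.*s'\n", (int) ptr_str.len,
				hubbub_string_data(&ptr_str, buffer));

		return 0;
	}

Offsets presumably remain the common case because they index into the parser's input buffer, which can be reallocated as more data arrives; HUBBUB_STRING_PTR covers constant strings with no position in that buffer, which is why the treebuilder may emit either kind while the tokeniser sticks to HUBBUB_STRING_OFF.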