author    John Mark Bell <jmb@netsurf-browser.org>  2008-03-21 13:20:22 +0000
committer John Mark Bell <jmb@netsurf-browser.org>  2008-03-21 13:20:22 +0000
commit    4da6a038c15a5fa3d1c754b7278ae47627a44718 (patch)
tree      bd3f06540ff87963913848c6fea5cb1b7eec62e3 /src
parent    eebff1268e029fa6a31d391c175a66e159a45ecf (diff)
hubbub_strings may now be either an offset into the data buffer or a pointer to constant data.
Fix up tokeniser and treebuilder to deal with this. Fix up testcases, too.

The tokeniser will only ever emit strings of type HUBBUB_STRING_OFF; anything
else is a bug which should be fixed. The treebuilder may emit strings of
either type.

svn path=/trunk/hubbub/; revision=4014
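For reference, the hunks below suggest that hubbub_string now carries a type
tag alongside a union payload: the old flat data_off field becomes data.off.
A minimal sketch of the presumed layout follows; the pointer-variant names
are assumed for illustration rather than taken from this diff, which never
shows the header itself.

/* Requires <stdint.h> and <stddef.h>. Sketch of the revised hubbub_string
 * implied by this commit; only HUBBUB_STRING_OFF appears in the hunks. */
typedef enum hubbub_string_type {
	HUBBUB_STRING_OFF,	/* offset into the parser's data buffer */
	HUBBUB_STRING_PTR	/* pointer to constant data; name assumed */
} hubbub_string_type;

typedef struct hubbub_string {
	hubbub_string_type type;
	union {
		uint32_t off;		/* valid when type == HUBBUB_STRING_OFF */
		const uint8_t *ptr;	/* valid for the pointer variant */
	} data;
	size_t len;			/* byte length, meaningful for both variants */
} hubbub_string;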
Diffstat (limited to 'src')
-rw-r--r--  src/tokeniser/tokeniser.c      | 113
-rw-r--r--  src/treebuilder/treebuilder.c  |  22
2 files changed, 76 insertions(+), 59 deletions(-)
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index f8b6bb3..3d69797 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -236,6 +236,12 @@ hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
}
memset(&tok->context, 0, sizeof(hubbub_tokeniser_context));
+ tok->context.current_tag.name.type = HUBBUB_STRING_OFF;
+ tok->context.current_comment.type = HUBBUB_STRING_OFF;
+ tok->context.current_doctype.name.type = HUBBUB_STRING_OFF;
+ tok->context.current_chars.type = HUBBUB_STRING_OFF;
+ tok->context.close_tag_match.tag.type = HUBBUB_STRING_OFF;
+ tok->context.match_entity.str.type = HUBBUB_STRING_OFF;
return tok;
}
@@ -434,7 +440,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
uint32_t c;
/* Clear current characters */
- tokeniser->context.current_chars.data_off = 0;
+ tokeniser->context.current_chars.data.off = 0;
tokeniser->context.current_chars.len = 0;
while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
@@ -462,7 +468,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
}
/* Buffer '<' */
- tokeniser->context.current_chars.data_off =
+ tokeniser->context.current_chars.data.off =
hubbub_inputstream_cur_pos(tokeniser->input,
&tokeniser->context.current_chars.len);
@@ -478,7 +484,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
&len);
if (tokeniser->context.current_chars.len == 0) {
- tokeniser->context.current_chars.data_off =
+ tokeniser->context.current_chars.data.off =
pos;
}
tokeniser->context.current_chars.len++;
@@ -495,7 +501,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
hubbub_tokeniser_emit_token(tokeniser, &token);
- tokeniser->context.current_chars.data_off = 0;
+ tokeniser->context.current_chars.data.off = 0;
tokeniser->context.current_chars.len = 0;
}
@@ -524,7 +530,8 @@ bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser)
/* Emit character */
token.type = HUBBUB_TOKEN_CHARACTER;
- token.data.character.data_off =
+ token.data.character.type = HUBBUB_STRING_OFF;
+ token.data.character.data.off =
hubbub_inputstream_cur_pos(tokeniser->input,
&token.data.character.len);
@@ -601,7 +608,7 @@ bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
tokeniser->context.current_tag_type =
HUBBUB_TOKEN_START_TAG;
- ctag->name.data_off =
+ ctag->name.data.off =
hubbub_inputstream_cur_pos(tokeniser->input,
&ctag->name.len);
ctag->n_attributes = 0;
@@ -613,7 +620,7 @@ bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
tokeniser->context.current_tag_type =
HUBBUB_TOKEN_START_TAG;
- ctag->name.data_off =
+ ctag->name.data.off =
hubbub_inputstream_cur_pos(tokeniser->input,
&ctag->name.len);
ctag->n_attributes = 0;
@@ -644,7 +651,7 @@ bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
&len);
tokeniser->context.current_chars.len += len;
- tokeniser->context.current_comment.data_off = pos;
+ tokeniser->context.current_comment.data.off = pos;
tokeniser->context.current_comment.len = len;
tokeniser->state =
HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
@@ -688,7 +695,7 @@ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
tokeniser->context.current_tag_type =
HUBBUB_TOKEN_END_TAG;
- ctag->name.data_off = pos;
+ ctag->name.data.off = pos;
ctag->name.len = len;
ctag->n_attributes = 0;
@@ -700,7 +707,7 @@ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
tokeniser->context.current_tag_type =
HUBBUB_TOKEN_END_TAG;
- ctag->name.data_off = pos;
+ ctag->name.data.off = pos;
ctag->name.len = len;
ctag->n_attributes = 0;
@@ -724,7 +731,7 @@ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
pos = hubbub_inputstream_cur_pos(tokeniser->input,
&len);
- tokeniser->context.current_comment.data_off = pos;
+ tokeniser->context.current_comment.data.off = pos;
tokeniser->context.current_comment.len = len;
tokeniser->state =
@@ -756,7 +763,7 @@ bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser)
off = hubbub_inputstream_cur_pos(tokeniser->input, &len);
if (ctx->close_tag_match.tag.len == 0) {
- ctx->close_tag_match.tag.data_off = off;
+ ctx->close_tag_match.tag.data.off = off;
ctx->close_tag_match.tag.len = len;
} else {
ctx->close_tag_match.tag.len += len;
@@ -768,8 +775,8 @@ bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser)
(ctx->close_tag_match.tag.len == ctag->name.len &&
hubbub_inputstream_compare_range_ci(
tokeniser->input,
- ctag->name.data_off,
- ctx->close_tag_match.tag.data_off,
+ ctag->name.data.off,
+ ctx->close_tag_match.tag.data.off,
ctag->name.len) != 0)) {
hubbub_token token;
@@ -792,8 +799,8 @@ bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser)
} else if (ctx->close_tag_match.tag.len == ctag->name.len &&
hubbub_inputstream_compare_range_ci(
tokeniser->input,
- ctag->name.data_off,
- ctx->close_tag_match.tag.data_off,
+ ctag->name.data.off,
+ ctx->close_tag_match.tag.data.off,
ctag->name.len) == 0) {
/* Matched => stop searching */
break;
@@ -968,9 +975,11 @@ bool hubbub_tokeniser_handle_before_attribute_name(
ctag->attributes = attr;
- attr[ctag->n_attributes].name.data_off = pos;
+ attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF;
+ attr[ctag->n_attributes].name.data.off = pos;
attr[ctag->n_attributes].name.len = len;
- attr[ctag->n_attributes].value.data_off = 0;
+ attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF;
+ attr[ctag->n_attributes].value.data.off = 0;
attr[ctag->n_attributes].value.len = 0;
ctag->n_attributes++;
@@ -1008,9 +1017,11 @@ bool hubbub_tokeniser_handle_before_attribute_name(
ctag->attributes = attr;
- attr[ctag->n_attributes].name.data_off = pos;
+ attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF;
+ attr[ctag->n_attributes].name.data.off = pos;
attr[ctag->n_attributes].name.len = len;
- attr[ctag->n_attributes].value.data_off = 0;
+ attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF;
+ attr[ctag->n_attributes].value.data.off = 0;
attr[ctag->n_attributes].value.len = 0;
ctag->n_attributes++;
@@ -1135,9 +1146,11 @@ bool hubbub_tokeniser_handle_after_attribute_name(
ctag->attributes = attr;
- attr[ctag->n_attributes].name.data_off = pos;
+ attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF;
+ attr[ctag->n_attributes].name.data.off = pos;
attr[ctag->n_attributes].name.len = len;
- attr[ctag->n_attributes].value.data_off = 0;
+ attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF;
+ attr[ctag->n_attributes].value.data.off = 0;
attr[ctag->n_attributes].value.len = 0;
ctag->n_attributes++;
@@ -1179,9 +1192,11 @@ bool hubbub_tokeniser_handle_after_attribute_name(
ctag->attributes = attr;
- attr[ctag->n_attributes].name.data_off = pos;
+ attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF;
+ attr[ctag->n_attributes].name.data.off = pos;
attr[ctag->n_attributes].name.len = len;
- attr[ctag->n_attributes].value.data_off = 0;
+ attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF;
+ attr[ctag->n_attributes].value.data.off = 0;
attr[ctag->n_attributes].value.len = 0;
ctag->n_attributes++;
@@ -1240,7 +1255,7 @@ bool hubbub_tokeniser_handle_before_attribute_value(
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
- ctag->attributes[ctag->n_attributes - 1].value.data_off = pos;
+ ctag->attributes[ctag->n_attributes - 1].value.data.off = pos;
ctag->attributes[ctag->n_attributes - 1].value.len = len;
tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ;
@@ -1285,7 +1300,7 @@ bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser)
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) {
- ctag->attributes[ctag->n_attributes - 1].value.data_off =
+ ctag->attributes[ctag->n_attributes - 1].value.data.off =
pos;
}
@@ -1331,7 +1346,7 @@ bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser)
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) {
- ctag->attributes[ctag->n_attributes - 1].value.data_off =
+ ctag->attributes[ctag->n_attributes - 1].value.data.off =
pos;
}
@@ -1388,7 +1403,7 @@ bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser)
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) {
- ctag->attributes[ctag->n_attributes - 1].value.data_off =
+ ctag->attributes[ctag->n_attributes - 1].value.data.off =
pos;
}
@@ -1421,7 +1436,7 @@ bool hubbub_tokeniser_handle_entity_in_attribute_value(
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) {
- ctag->attributes[ctag->n_attributes - 1].value.data_off =
+ ctag->attributes[ctag->n_attributes - 1].value.data.off =
pos;
}
@@ -1458,7 +1473,7 @@ bool hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
if (tokeniser->context.current_comment.len == 0)
- tokeniser->context.current_comment.data_off = pos;
+ tokeniser->context.current_comment.data.off = pos;
tokeniser->context.current_comment.len += len;
hubbub_inputstream_advance(tokeniser->input);
@@ -1495,7 +1510,7 @@ bool hubbub_tokeniser_handle_markup_declaration_open(
tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE;
hubbub_inputstream_advance(tokeniser->input);
} else {
- tokeniser->context.current_comment.data_off = 0;
+ tokeniser->context.current_comment.data.off = 0;
tokeniser->context.current_comment.len = 0;
tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
@@ -1511,7 +1526,7 @@ bool hubbub_tokeniser_handle_comment_start(hubbub_tokeniser *tokeniser)
if (c == HUBBUB_INPUTSTREAM_OOD)
return false;
- tokeniser->context.current_comment.data_off = 0;
+ tokeniser->context.current_comment.data.off = 0;
tokeniser->context.current_comment.len = 0;
@@ -1553,7 +1568,7 @@ bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
if (tokeniser->context.current_comment.len == 0)
- tokeniser->context.current_comment.data_off = pos;
+ tokeniser->context.current_comment.data.off = pos;
tokeniser->context.current_comment.len += len;
hubbub_inputstream_advance(tokeniser->input);
@@ -1589,11 +1604,11 @@ bool hubbub_tokeniser_handle_comment_dash(hubbub_tokeniser *tokeniser)
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
if (tokeniser->context.current_comment.len == 0) {
- tokeniser->context.current_comment.data_off = pos;
+ tokeniser->context.current_comment.data.off = pos;
} else {
/* Need to do this to get length of '-' */
len += pos -
- tokeniser->context.current_comment.data_off;
+ tokeniser->context.current_comment.data.off;
}
tokeniser->context.current_comment.len = len;
@@ -1631,12 +1646,12 @@ bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser)
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
if (tokeniser->context.current_comment.len == 0) {
- tokeniser->context.current_comment.data_off = pos;
+ tokeniser->context.current_comment.data.off = pos;
tokeniser->context.current_comment.len = len;
} else {
/* Need to do this to get length of '-' */
len = pos -
- tokeniser->context.current_comment.data_off;
+ tokeniser->context.current_comment.data.off;
}
tokeniser->context.current_comment.len = len;
@@ -1660,11 +1675,11 @@ bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser)
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
if (tokeniser->context.current_comment.len == 0) {
- tokeniser->context.current_comment.data_off = pos;
+ tokeniser->context.current_comment.data.off = pos;
} else {
/* Need to do this to get length of '--' */
len += pos -
- tokeniser->context.current_comment.data_off;
+ tokeniser->context.current_comment.data.off;
}
tokeniser->context.current_comment.len = len;
@@ -1724,7 +1739,7 @@ bool hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
case 1: hubbub_inputstream_push_back(tokeniser->input, 'D');
}
- tokeniser->context.current_comment.data_off = 0;
+ tokeniser->context.current_comment.data.off = 0;
tokeniser->context.current_comment.len = 0;
tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
@@ -1768,7 +1783,7 @@ bool hubbub_tokeniser_handle_before_doctype_name(
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
- cdoc->name.data_off = pos;
+ cdoc->name.data.off = pos;
cdoc->name.len = len;
cdoc->correct = false;
@@ -1802,7 +1817,7 @@ bool hubbub_tokeniser_handle_before_doctype_name(
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
- cdoc->name.data_off = pos;
+ cdoc->name.data.off = pos;
cdoc->name.len = len;
cdoc->correct = false;
@@ -1834,7 +1849,7 @@ bool hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
token.data.doctype.correct =
(hubbub_inputstream_compare_range_ascii(
tokeniser->input,
- token.data.doctype.name.data_off,
+ token.data.doctype.name.data.off,
token.data.doctype.name.len,
"HTML", SLEN("HTML")) == 0);
@@ -1896,7 +1911,7 @@ bool hubbub_tokeniser_handle_after_doctype_name(hubbub_tokeniser *tokeniser)
token.data.doctype.correct =
(hubbub_inputstream_compare_range_ascii(
tokeniser->input,
- token.data.doctype.name.data_off,
+ token.data.doctype.name.data.off,
token.data.doctype.name.len,
"HTML", SLEN("HTML")) == 0);
@@ -1969,7 +1984,7 @@ bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser)
if (tokeniser->context.match_entity.done_setup == false) {
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
- tokeniser->context.match_entity.str.data_off = pos;
+ tokeniser->context.match_entity.str.data.off = pos;
tokeniser->context.match_entity.str.len = len;
tokeniser->context.match_entity.base = 0;
tokeniser->context.match_entity.codepoint = 0;
@@ -2095,7 +2110,7 @@ bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser)
/* And replace the matched range with it */
error = hubbub_inputstream_replace_range(tokeniser->input,
- ctx->match_entity.str.data_off,
+ ctx->match_entity.str.data.off,
ctx->match_entity.str.len,
ctx->match_entity.codepoint);
if (error != HUBBUB_OK) {
@@ -2177,7 +2192,7 @@ bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
/* Now, replace range, if we found a named entity */
if (ctx->match_entity.codepoint != 0) {
error = hubbub_inputstream_replace_range(tokeniser->input,
- ctx->match_entity.str.data_off,
+ ctx->match_entity.str.data.off,
ctx->match_entity.prev_len,
ctx->match_entity.codepoint);
if (error != HUBBUB_OK) {
@@ -2249,8 +2264,8 @@ void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
attrs[j].name.len ||
hubbub_inputstream_compare_range_cs(
tokeniser->input,
- attrs[i].name.data_off,
- attrs[j].name.data_off,
+ attrs[i].name.data.off,
+ attrs[j].name.data.off,
attrs[i].name.len) != 0) {
/* Attributes don't match */
continue;
diff --git a/src/treebuilder/treebuilder.c b/src/treebuilder/treebuilder.c
index 3d2b295..01e31e4 100644
--- a/src/treebuilder/treebuilder.c
+++ b/src/treebuilder/treebuilder.c
@@ -241,6 +241,8 @@ hubbub_treebuilder *hubbub_treebuilder_create(hubbub_tokeniser *tokeniser,
assert(HTML != 0);
tb->context.element_stack[0].type = 0;
+ tb->context.collect.string.type = HUBBUB_STRING_OFF;
+
tb->buffer_handler = NULL;
tb->buffer_pw = NULL;
@@ -1070,8 +1072,8 @@ bool handle_generic_rcdata(hubbub_treebuilder *treebuilder,
switch (token->type) {
case HUBBUB_TOKEN_CHARACTER:
if (treebuilder->context.collect.string.len == 0) {
- treebuilder->context.collect.string.data_off =
- token->data.character.data_off;
+ treebuilder->context.collect.string.data.off =
+ token->data.character.data.off;
}
treebuilder->context.collect.string.len +=
token->data.character.len;
@@ -1158,8 +1160,8 @@ bool handle_script_collect_characters(hubbub_treebuilder *treebuilder,
switch (token->type) {
case HUBBUB_TOKEN_CHARACTER:
if (treebuilder->context.collect.string.len == 0) {
- treebuilder->context.collect.string.data_off =
- token->data.character.data_off;
+ treebuilder->context.collect.string.data.off =
+ token->data.character.data.off;
}
treebuilder->context.collect.string.len +=
token->data.character.len;
@@ -1265,7 +1267,7 @@ bool process_characters_expect_whitespace(hubbub_treebuilder *treebuilder,
const hubbub_token *token, bool insert_into_current_node)
{
const uint8_t *data = treebuilder->input_buffer +
- token->data.character.data_off;
+ token->data.character.data.off;
size_t len = token->data.character.len;
size_t c;
@@ -1284,7 +1286,7 @@ bool process_characters_expect_whitespace(hubbub_treebuilder *treebuilder,
int success;
void *text, *appended;
- temp.data_off = token->data.character.data_off;
+ temp.data.off = token->data.character.data.off;
temp.len = len - c;
/** \todo Append to pre-existing text child, iff
@@ -1318,7 +1320,7 @@ bool process_characters_expect_whitespace(hubbub_treebuilder *treebuilder,
}
/* Update token data to strip leading whitespace */
- ((hubbub_token *) token)->data.character.data_off +=
+ ((hubbub_token *) token)->data.character.data.off +=
len - c;
((hubbub_token *) token)->data.character.len -= c;
@@ -1409,7 +1411,7 @@ void parse_generic_rcdata(hubbub_treebuilder *treebuilder,
treebuilder->context.collect.mode = treebuilder->context.mode;
treebuilder->context.collect.type = type;
treebuilder->context.collect.node = node;
- treebuilder->context.collect.string.data_off = 0;
+ treebuilder->context.collect.string.data.off = 0;
treebuilder->context.collect.string.len = 0;
treebuilder->tree_handler->unref_node(
@@ -1472,7 +1474,7 @@ void process_script_in_head(hubbub_treebuilder *treebuilder,
treebuilder->context.collect.mode = treebuilder->context.mode;
treebuilder->context.collect.node = script;
treebuilder->context.collect.type = SCRIPT;
- treebuilder->context.collect.string.data_off = 0;
+ treebuilder->context.collect.string.data.off = 0;
treebuilder->context.collect.string.len = 0;
treebuilder->context.mode = SCRIPT_COLLECT_CHARACTERS;
@@ -1846,7 +1848,7 @@ void reset_insertion_mode(hubbub_treebuilder *treebuilder)
element_type element_type_from_name(hubbub_treebuilder *treebuilder,
const hubbub_string *tag_name)
{
- const uint8_t *name = treebuilder->input_buffer + tag_name->data_off;
+ const uint8_t *name = treebuilder->input_buffer + tag_name->data.off;
return element_type_from_verbatim_name(name, tag_name->len);
}
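Because the treebuilder may emit strings of either type, a consumer has to
branch on the tag before touching the bytes. Note that element_type_from_name
above can still index input_buffer directly: tag names come from the
tokeniser, which per the commit message only emits HUBBUB_STRING_OFF strings.
A minimal sketch of a resolver for the general case, assuming the union
layout sketched after the commit message; the helper itself is hypothetical
and not part of this diff:

/* Resolve a hubbub_string to its underlying bytes. input_buffer is the
 * parser's data buffer, as used by the treebuilder above. */
static const uint8_t *hubbub_string_bytes(const hubbub_string *str,
		const uint8_t *input_buffer)
{
	if (str->type == HUBBUB_STRING_OFF)
		return input_buffer + str->data.off;

	/* Pointer variant: the string already references constant data */
	return str->data.ptr;
}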