From 4da6a038c15a5fa3d1c754b7278ae47627a44718 Mon Sep 17 00:00:00 2001
From: John Mark Bell
Date: Fri, 21 Mar 2008 13:20:22 +0000
Subject: hubbub_strings may now be either an offset into the data buffer or
 a pointer to constant data.

Fix up tokeniser and treebuilder to deal with this. Fix up testcases, too.

The tokeniser will only ever emit strings of type HUBBUB_STRING_OFF.
Anything else is a bug which should be fixed.

The treebuilder may emit strings of either type.

svn path=/trunk/hubbub/; revision=4014
---
 src/tokeniser/tokeniser.c | 113 ++++++++++++++++++++++++++--------------------
 1 file changed, 64 insertions(+), 49 deletions(-)
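For reference, the hubbub_string type that this change builds on looks
roughly like the sketch below. This is reconstructed from the usage
visible in this diff (the .type, .data.off and .len accesses); the
HUBBUB_STRING_PTR variant, its data.ptr member, and the exact field
types are assumptions, not taken from this patch:

	/* A string is either an offset into the input data buffer or a
	 * pointer to constant data; the type tag says which union member
	 * is valid. */
	typedef enum hubbub_string_type {
		HUBBUB_STRING_OFF,	/* data.off is valid */
		HUBBUB_STRING_PTR	/* data.ptr is valid (assumed name) */
	} hubbub_string_type;

	typedef struct hubbub_string {
		hubbub_string_type type;
		union {
			uint32_t off;		/* offset into data buffer */
			const uint8_t *ptr;	/* pointer to constant data */
		} data;
		size_t len;			/* length, in bytes */
	} hubbub_string;

Consumers must check type before reading data: strings coming straight
from the tokeniser are guaranteed to be HUBBUB_STRING_OFF, but strings
emitted by the treebuilder may be either variant.
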
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index f8b6bb3..3d69797 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -236,6 +236,12 @@ hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
 	}
 
 	memset(&tok->context, 0, sizeof(hubbub_tokeniser_context));
+	tok->context.current_tag.name.type = HUBBUB_STRING_OFF;
+	tok->context.current_comment.type = HUBBUB_STRING_OFF;
+	tok->context.current_doctype.name.type = HUBBUB_STRING_OFF;
+	tok->context.current_chars.type = HUBBUB_STRING_OFF;
+	tok->context.close_tag_match.tag.type = HUBBUB_STRING_OFF;
+	tok->context.match_entity.str.type = HUBBUB_STRING_OFF;
 
 	return tok;
 }
@@ -434,7 +440,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
 	uint32_t c;
 
 	/* Clear current characters */
-	tokeniser->context.current_chars.data_off = 0;
+	tokeniser->context.current_chars.data.off = 0;
 	tokeniser->context.current_chars.len = 0;
 
 	while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
@@ -462,7 +468,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
 		}
 
 		/* Buffer '<' */
-		tokeniser->context.current_chars.data_off =
+		tokeniser->context.current_chars.data.off =
 				hubbub_inputstream_cur_pos(tokeniser->input,
 				&tokeniser->context.current_chars.len);
 
@@ -478,7 +484,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
 					&len);
 
 			if (tokeniser->context.current_chars.len == 0) {
-				tokeniser->context.current_chars.data_off =
+				tokeniser->context.current_chars.data.off =
 						pos;
 			}
 			tokeniser->context.current_chars.len++;
@@ -495,7 +501,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
 
 		hubbub_tokeniser_emit_token(tokeniser, &token);
 
-		tokeniser->context.current_chars.data_off = 0;
+		tokeniser->context.current_chars.data.off = 0;
 		tokeniser->context.current_chars.len = 0;
 	}
 
@@ -524,7 +530,8 @@ bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser)
 
 		/* Emit character */
 		token.type = HUBBUB_TOKEN_CHARACTER;
-		token.data.character.data_off =
+		token.data.character.type = HUBBUB_STRING_OFF;
+		token.data.character.data.off =
 				hubbub_inputstream_cur_pos(tokeniser->input,
 				&token.data.character.len);
 
@@ -601,7 +608,7 @@ bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
 		tokeniser->context.current_tag_type =
 				HUBBUB_TOKEN_START_TAG;
 
-		ctag->name.data_off =
+		ctag->name.data.off =
 				hubbub_inputstream_cur_pos(tokeniser->input,
 				&ctag->name.len);
 		ctag->n_attributes = 0;
@@ -613,7 +620,7 @@ bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
 		tokeniser->context.current_tag_type =
 				HUBBUB_TOKEN_START_TAG;
 
-		ctag->name.data_off =
+		ctag->name.data.off =
 				hubbub_inputstream_cur_pos(tokeniser->input,
 				&ctag->name.len);
 		ctag->n_attributes = 0;
@@ -644,7 +651,7 @@ bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
 				&len);
 		tokeniser->context.current_chars.len += len;
 
-		tokeniser->context.current_comment.data_off = pos;
+		tokeniser->context.current_comment.data.off = pos;
 		tokeniser->context.current_comment.len = len;
 
 		tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
@@ -688,7 +695,7 @@ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
 			tokeniser->context.current_tag_type =
 					HUBBUB_TOKEN_END_TAG;
 
-			ctag->name.data_off = pos;
+			ctag->name.data.off = pos;
 			ctag->name.len = len;
 			ctag->n_attributes = 0;
 
@@ -700,7 +707,7 @@ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
 			tokeniser->context.current_tag_type =
 					HUBBUB_TOKEN_END_TAG;
 
-			ctag->name.data_off = pos;
+			ctag->name.data.off = pos;
 			ctag->name.len = len;
 			ctag->n_attributes = 0;
 
@@ -724,7 +731,7 @@ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
 		pos = hubbub_inputstream_cur_pos(tokeniser->input,
 				&len);
 
-		tokeniser->context.current_comment.data_off = pos;
+		tokeniser->context.current_comment.data.off = pos;
 		tokeniser->context.current_comment.len = len;
 
 		tokeniser->state =
@@ -756,7 +763,7 @@ bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser)
 			off = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
 			if (ctx->close_tag_match.tag.len == 0) {
-				ctx->close_tag_match.tag.data_off = off;
+				ctx->close_tag_match.tag.data.off = off;
 				ctx->close_tag_match.tag.len = len;
 			} else {
 				ctx->close_tag_match.tag.len += len;
@@ -768,8 +775,8 @@ bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser)
 				(ctx->close_tag_match.tag.len ==
 					ctag->name.len &&
 				hubbub_inputstream_compare_range_ci(
 					tokeniser->input,
-					ctag->name.data_off,
-					ctx->close_tag_match.tag.data_off,
+					ctag->name.data.off,
+					ctx->close_tag_match.tag.data.off,
 					ctag->name.len) != 0)) {
 			hubbub_token token;
@@ -792,8 +799,8 @@ bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser)
 		} else if (ctx->close_tag_match.tag.len == ctag->name.len &&
 				hubbub_inputstream_compare_range_ci(
 					tokeniser->input,
-					ctag->name.data_off,
-					ctx->close_tag_match.tag.data_off,
+					ctag->name.data.off,
+					ctx->close_tag_match.tag.data.off,
 					ctag->name.len) == 0) {
 			/* Matched => stop searching */
 			break;
@@ -968,9 +975,11 @@ bool hubbub_tokeniser_handle_before_attribute_name(
 
 		ctag->attributes = attr;
 
-		attr[ctag->n_attributes].name.data_off = pos;
+		attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF;
+		attr[ctag->n_attributes].name.data.off = pos;
 		attr[ctag->n_attributes].name.len = len;
-		attr[ctag->n_attributes].value.data_off = 0;
+		attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF;
+		attr[ctag->n_attributes].value.data.off = 0;
 		attr[ctag->n_attributes].value.len = 0;
 
 		ctag->n_attributes++;
@@ -1008,9 +1017,11 @@ bool hubbub_tokeniser_handle_before_attribute_name(
 
 		ctag->attributes = attr;
 
-		attr[ctag->n_attributes].name.data_off = pos;
+		attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF;
+		attr[ctag->n_attributes].name.data.off = pos;
 		attr[ctag->n_attributes].name.len = len;
-		attr[ctag->n_attributes].value.data_off = 0;
+		attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF;
+		attr[ctag->n_attributes].value.data.off = 0;
 		attr[ctag->n_attributes].value.len = 0;
 
 		ctag->n_attributes++;
@@ -1135,9 +1146,11 @@ bool hubbub_tokeniser_handle_after_attribute_name(
 
 		ctag->attributes = attr;
 
-		attr[ctag->n_attributes].name.data_off = pos;
+		attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF;
+		attr[ctag->n_attributes].name.data.off = pos;
 		attr[ctag->n_attributes].name.len = len;
-		attr[ctag->n_attributes].value.data_off = 0;
+		attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF;
+		attr[ctag->n_attributes].value.data.off = 0;
 		attr[ctag->n_attributes].value.len = 0;
 
 		ctag->n_attributes++;
@@ -1179,9 +1192,11 @@ bool hubbub_tokeniser_handle_after_attribute_name(
 
 		ctag->attributes = attr;
 
-		attr[ctag->n_attributes].name.data_off = pos;
+		attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF;
+		attr[ctag->n_attributes].name.data.off = pos;
 		attr[ctag->n_attributes].name.len = len;
-		attr[ctag->n_attributes].value.data_off = 0;
+		attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF;
+		attr[ctag->n_attributes].value.data.off = 0;
 		attr[ctag->n_attributes].value.len = 0;
 
 		ctag->n_attributes++;
@@ -1240,7 +1255,7 @@ bool hubbub_tokeniser_handle_before_attribute_value(
 		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
-		ctag->attributes[ctag->n_attributes - 1].value.data_off = pos;
+		ctag->attributes[ctag->n_attributes - 1].value.data.off = pos;
 		ctag->attributes[ctag->n_attributes - 1].value.len = len;
 
 		tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ;
 
@@ -1285,7 +1300,7 @@ bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser)
 		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
 		if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) {
-			ctag->attributes[ctag->n_attributes - 1].value.data_off =
+			ctag->attributes[ctag->n_attributes - 1].value.data.off =
 					pos;
 		}
 
@@ -1331,7 +1346,7 @@ bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser)
 		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
 		if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) {
-			ctag->attributes[ctag->n_attributes - 1].value.data_off =
+			ctag->attributes[ctag->n_attributes - 1].value.data.off =
 					pos;
 		}
 
@@ -1388,7 +1403,7 @@ bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser)
 		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
 		if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) {
-			ctag->attributes[ctag->n_attributes - 1].value.data_off =
+			ctag->attributes[ctag->n_attributes - 1].value.data.off =
 					pos;
 		}
 
@@ -1421,7 +1436,7 @@ bool hubbub_tokeniser_handle_entity_in_attribute_value(
 		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
 		if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) {
-			ctag->attributes[ctag->n_attributes - 1].value.data_off =
+			ctag->attributes[ctag->n_attributes - 1].value.data.off =
 					pos;
 		}
 
@@ -1458,7 +1473,7 @@ bool hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
 	pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
 	if (tokeniser->context.current_comment.len == 0)
-		tokeniser->context.current_comment.data_off = pos;
+		tokeniser->context.current_comment.data.off = pos;
 	tokeniser->context.current_comment.len += len;
 
 	hubbub_inputstream_advance(tokeniser->input);
@@ -1495,7 +1510,7 @@ bool hubbub_tokeniser_handle_markup_declaration_open(
 		tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE;
 		hubbub_inputstream_advance(tokeniser->input);
 	} else {
-		tokeniser->context.current_comment.data_off = 0;
+		tokeniser->context.current_comment.data.off = 0;
 		tokeniser->context.current_comment.len = 0;
 
 		tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
@@ -1511,7 +1526,7 @@ bool hubbub_tokeniser_handle_comment_start(hubbub_tokeniser *tokeniser)
 	if (c == HUBBUB_INPUTSTREAM_OOD)
 		return false;
 
-	tokeniser->context.current_comment.data_off = 0;
+	tokeniser->context.current_comment.data.off = 0;
 	tokeniser->context.current_comment.len = 0;
 
 
@@ -1553,7 +1568,7 @@ bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
 	pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
 	if (tokeniser->context.current_comment.len == 0)
-		tokeniser->context.current_comment.data_off = pos;
+		tokeniser->context.current_comment.data.off = pos;
 	tokeniser->context.current_comment.len += len;
 
 	hubbub_inputstream_advance(tokeniser->input);
@@ -1589,11 +1604,11 @@ bool hubbub_tokeniser_handle_comment_dash(hubbub_tokeniser *tokeniser)
 	pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
 	if (tokeniser->context.current_comment.len == 0) {
-		tokeniser->context.current_comment.data_off = pos;
+		tokeniser->context.current_comment.data.off = pos;
 	} else {
 		/* Need to do this to get length of '-' */
 		len += pos -
-			tokeniser->context.current_comment.data_off;
+			tokeniser->context.current_comment.data.off;
 	}
 	tokeniser->context.current_comment.len = len;
 
@@ -1631,12 +1646,12 @@ bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser)
 		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
 		if (tokeniser->context.current_comment.len == 0) {
-			tokeniser->context.current_comment.data_off = pos;
+			tokeniser->context.current_comment.data.off = pos;
 			tokeniser->context.current_comment.len = len;
 		} else {
 			/* Need to do this to get length of '-' */
 			len = pos -
-				tokeniser->context.current_comment.data_off;
+				tokeniser->context.current_comment.data.off;
 		}
 		tokeniser->context.current_comment.len = len;
 
@@ -1660,11 +1675,11 @@ bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser)
 		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
 		if (tokeniser->context.current_comment.len == 0) {
-			tokeniser->context.current_comment.data_off = pos;
+			tokeniser->context.current_comment.data.off = pos;
 		} else {
 			/* Need to do this to get length of '--' */
 			len += pos -
-				tokeniser->context.current_comment.data_off;
+				tokeniser->context.current_comment.data.off;
 		}
 		tokeniser->context.current_comment.len = len;
 
@@ -1724,7 +1739,7 @@ bool hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
 		case 1:
 			hubbub_inputstream_push_back(tokeniser->input, 'D');
 		}
-		tokeniser->context.current_comment.data_off = 0;
+		tokeniser->context.current_comment.data.off = 0;
 		tokeniser->context.current_comment.len = 0;
 
 		tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
@@ -1768,7 +1783,7 @@ bool hubbub_tokeniser_handle_before_doctype_name(
 		pos = hubbub_inputstream_cur_pos(tokeniser->input,
 				&len);
 
-		cdoc->name.data_off = pos;
+		cdoc->name.data.off = pos;
 		cdoc->name.len = len;
 		cdoc->correct = false;
 
@@ -1802,7 +1817,7 @@ bool hubbub_tokeniser_handle_before_doctype_name(
 		pos = hubbub_inputstream_cur_pos(tokeniser->input,
 				&len);
 
-		cdoc->name.data_off = pos;
+		cdoc->name.data.off = pos;
 		cdoc->name.len = len;
 		cdoc->correct = false;
 
@@ -1834,7 +1849,7 @@ bool hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
 		token.data.doctype.correct =
 			(hubbub_inputstream_compare_range_ascii(
 				tokeniser->input,
-				token.data.doctype.name.data_off,
+				token.data.doctype.name.data.off,
 				token.data.doctype.name.len,
 				"HTML", SLEN("HTML")) == 0);
 
@@ -1896,7 +1911,7 @@ bool hubbub_tokeniser_handle_after_doctype_name(hubbub_tokeniser *tokeniser)
 		token.data.doctype.correct =
 			(hubbub_inputstream_compare_range_ascii(
 				tokeniser->input,
-				token.data.doctype.name.data_off,
+				token.data.doctype.name.data.off,
 				token.data.doctype.name.len,
 				"HTML", SLEN("HTML")) == 0);
 
@@ -1969,7 +1984,7 @@ bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser)
 	if (tokeniser->context.match_entity.done_setup == false) {
 		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
-		tokeniser->context.match_entity.str.data_off = pos;
+		tokeniser->context.match_entity.str.data.off = pos;
 		tokeniser->context.match_entity.str.len = len;
 		tokeniser->context.match_entity.base = 0;
 		tokeniser->context.match_entity.codepoint = 0;
@@ -2095,7 +2110,7 @@ bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser)
 
 	/* And replace the matched range with it */
 	error = hubbub_inputstream_replace_range(tokeniser->input,
-			ctx->match_entity.str.data_off,
+			ctx->match_entity.str.data.off,
 			ctx->match_entity.str.len,
 			ctx->match_entity.codepoint);
 	if (error != HUBBUB_OK) {
@@ -2177,7 +2192,7 @@ bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
 	/* Now, replace range, if we found a named entity */
 	if (ctx->match_entity.codepoint != 0) {
 		error = hubbub_inputstream_replace_range(tokeniser->input,
-				ctx->match_entity.str.data_off,
+				ctx->match_entity.str.data.off,
 				ctx->match_entity.prev_len,
 				ctx->match_entity.codepoint);
 		if (error != HUBBUB_OK) {
@@ -2249,8 +2264,8 @@ void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
 					attrs[j].name.len ||
 				hubbub_inputstream_compare_range_cs(
 					tokeniser->input,
-					attrs[i].name.data_off,
-					attrs[j].name.data_off,
+					attrs[i].name.data.off,
+					attrs[j].name.data.off,
 					attrs[i].name.len) != 0) {
 				/* Attributes don't match */
 				continue;
-- 
cgit v1.2.3