From 4da6a038c15a5fa3d1c754b7278ae47627a44718 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Fri, 21 Mar 2008 13:20:22 +0000 Subject: hubbub_strings may now be either an offset into the data buffer or a pointer to constant data. Fix up tokeniser and treebuilder to deal with this. Fix up testcases, too. The tokeniser will only ever emit strings of type HUBBUB_STRING_OFF. Anything else is a bug which should be fixed. The treebuilder may emit strings of either type. svn path=/trunk/hubbub/; revision=4014 --- include/hubbub/types.h | 11 +++- src/tokeniser/tokeniser.c | 113 ++++++++++++++++++++++++------------------ src/treebuilder/treebuilder.c | 22 ++++---- test/parser-utf16.c | 18 +++---- test/parser.c | 18 +++---- test/tokeniser.c | 18 +++---- test/tokeniser2.c | 16 +++--- test/tree.c | 26 ++++++++-- 8 files changed, 142 insertions(+), 100 deletions(-) diff --git a/include/hubbub/types.h b/include/hubbub/types.h index 922bdbb..e58a88b 100644 --- a/include/hubbub/types.h +++ b/include/hubbub/types.h @@ -57,7 +57,16 @@ typedef enum hubbub_token_type { * Tokeniser string type */ typedef struct hubbub_string { - uint32_t data_off; /**< Byte offset of string start */ + enum { + HUBBUB_STRING_OFF, + HUBBUB_STRING_PTR + } type; + + union { + const uint8_t *ptr; /**< Pointer to data */ + uint32_t off; /**< Byte offset of string start */ + } data; + size_t len; /**< Byte length of string */ } hubbub_string; diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c index f8b6bb3..3d69797 100644 --- a/src/tokeniser/tokeniser.c +++ b/src/tokeniser/tokeniser.c @@ -236,6 +236,12 @@ hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input, } memset(&tok->context, 0, sizeof(hubbub_tokeniser_context)); + tok->context.current_tag.name.type = HUBBUB_STRING_OFF; + tok->context.current_comment.type = HUBBUB_STRING_OFF; + tok->context.current_doctype.name.type = HUBBUB_STRING_OFF; + tok->context.current_chars.type = HUBBUB_STRING_OFF; + tok->context.close_tag_match.tag.type = HUBBUB_STRING_OFF; + tok->context.match_entity.str.type = HUBBUB_STRING_OFF; return tok; } @@ -434,7 +440,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) uint32_t c; /* Clear current characters */ - tokeniser->context.current_chars.data_off = 0; + tokeniser->context.current_chars.data.off = 0; tokeniser->context.current_chars.len = 0; while ((c = hubbub_inputstream_peek(tokeniser->input)) != @@ -462,7 +468,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) } /* Buffer '<' */ - tokeniser->context.current_chars.data_off = + tokeniser->context.current_chars.data.off = hubbub_inputstream_cur_pos(tokeniser->input, &tokeniser->context.current_chars.len); @@ -478,7 +484,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) &len); if (tokeniser->context.current_chars.len == 0) { - tokeniser->context.current_chars.data_off = + tokeniser->context.current_chars.data.off = pos; } tokeniser->context.current_chars.len++; @@ -495,7 +501,7 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) hubbub_tokeniser_emit_token(tokeniser, &token); - tokeniser->context.current_chars.data_off = 0; + tokeniser->context.current_chars.data.off = 0; tokeniser->context.current_chars.len = 0; } @@ -524,7 +530,8 @@ bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser) /* Emit character */ token.type = HUBBUB_TOKEN_CHARACTER; - token.data.character.data_off = + token.data.character.type = HUBBUB_STRING_OFF; + token.data.character.data.off = 
hubbub_inputstream_cur_pos(tokeniser->input, &token.data.character.len); @@ -601,7 +608,7 @@ bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser) tokeniser->context.current_tag_type = HUBBUB_TOKEN_START_TAG; - ctag->name.data_off = + ctag->name.data.off = hubbub_inputstream_cur_pos(tokeniser->input, &ctag->name.len); ctag->n_attributes = 0; @@ -613,7 +620,7 @@ bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser) tokeniser->context.current_tag_type = HUBBUB_TOKEN_START_TAG; - ctag->name.data_off = + ctag->name.data.off = hubbub_inputstream_cur_pos(tokeniser->input, &ctag->name.len); ctag->n_attributes = 0; @@ -644,7 +651,7 @@ bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser) &len); tokeniser->context.current_chars.len += len; - tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.data.off = pos; tokeniser->context.current_comment.len = len; tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; @@ -688,7 +695,7 @@ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser) tokeniser->context.current_tag_type = HUBBUB_TOKEN_END_TAG; - ctag->name.data_off = pos; + ctag->name.data.off = pos; ctag->name.len = len; ctag->n_attributes = 0; @@ -700,7 +707,7 @@ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser) tokeniser->context.current_tag_type = HUBBUB_TOKEN_END_TAG; - ctag->name.data_off = pos; + ctag->name.data.off = pos; ctag->name.len = len; ctag->n_attributes = 0; @@ -724,7 +731,7 @@ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.data.off = pos; tokeniser->context.current_comment.len = len; tokeniser->state = @@ -756,7 +763,7 @@ bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser) off = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (ctx->close_tag_match.tag.len == 0) { - ctx->close_tag_match.tag.data_off = off; + ctx->close_tag_match.tag.data.off = off; ctx->close_tag_match.tag.len = len; } else { ctx->close_tag_match.tag.len += len; @@ -768,8 +775,8 @@ bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser) (ctx->close_tag_match.tag.len == ctag->name.len && hubbub_inputstream_compare_range_ci( tokeniser->input, - ctag->name.data_off, - ctx->close_tag_match.tag.data_off, + ctag->name.data.off, + ctx->close_tag_match.tag.data.off, ctag->name.len) != 0)) { hubbub_token token; @@ -792,8 +799,8 @@ bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser) } else if (ctx->close_tag_match.tag.len == ctag->name.len && hubbub_inputstream_compare_range_ci( tokeniser->input, - ctag->name.data_off, - ctx->close_tag_match.tag.data_off, + ctag->name.data.off, + ctx->close_tag_match.tag.data.off, ctag->name.len) == 0) { /* Matched => stop searching */ break; @@ -968,9 +975,11 @@ bool hubbub_tokeniser_handle_before_attribute_name( ctag->attributes = attr; - attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].name.data.off = pos; attr[ctag->n_attributes].name.len = len; - attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].value.data.off = 0; attr[ctag->n_attributes].value.len = 0; ctag->n_attributes++; @@ -1008,9 +1017,11 @@ bool hubbub_tokeniser_handle_before_attribute_name( ctag->attributes = attr; - 
attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].name.data.off = pos; attr[ctag->n_attributes].name.len = len; - attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].value.data.off = 0; attr[ctag->n_attributes].value.len = 0; ctag->n_attributes++; @@ -1135,9 +1146,11 @@ bool hubbub_tokeniser_handle_after_attribute_name( ctag->attributes = attr; - attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].name.data.off = pos; attr[ctag->n_attributes].name.len = len; - attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].value.data.off = 0; attr[ctag->n_attributes].value.len = 0; ctag->n_attributes++; @@ -1179,9 +1192,11 @@ bool hubbub_tokeniser_handle_after_attribute_name( ctag->attributes = attr; - attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].name.data.off = pos; attr[ctag->n_attributes].name.len = len; - attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF; + attr[ctag->n_attributes].value.data.off = 0; attr[ctag->n_attributes].value.len = 0; ctag->n_attributes++; @@ -1240,7 +1255,7 @@ bool hubbub_tokeniser_handle_before_attribute_value( pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - ctag->attributes[ctag->n_attributes - 1].value.data_off = pos; + ctag->attributes[ctag->n_attributes - 1].value.data.off = pos; ctag->attributes[ctag->n_attributes - 1].value.len = len; tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ; @@ -1285,7 +1300,7 @@ bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { - ctag->attributes[ctag->n_attributes - 1].value.data_off = + ctag->attributes[ctag->n_attributes - 1].value.data.off = pos; } @@ -1331,7 +1346,7 @@ bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { - ctag->attributes[ctag->n_attributes - 1].value.data_off = + ctag->attributes[ctag->n_attributes - 1].value.data.off = pos; } @@ -1388,7 +1403,7 @@ bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { - ctag->attributes[ctag->n_attributes - 1].value.data_off = + ctag->attributes[ctag->n_attributes - 1].value.data.off = pos; } @@ -1421,7 +1436,7 @@ bool hubbub_tokeniser_handle_entity_in_attribute_value( pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { - ctag->attributes[ctag->n_attributes - 1].value.data_off = + ctag->attributes[ctag->n_attributes - 1].value.data.off = pos; } @@ -1458,7 +1473,7 @@ bool hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (tokeniser->context.current_comment.len == 0) - tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.data.off = pos; tokeniser->context.current_comment.len += len; 
hubbub_inputstream_advance(tokeniser->input); @@ -1495,7 +1510,7 @@ bool hubbub_tokeniser_handle_markup_declaration_open( tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE; hubbub_inputstream_advance(tokeniser->input); } else { - tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.data.off = 0; tokeniser->context.current_comment.len = 0; tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; @@ -1511,7 +1526,7 @@ bool hubbub_tokeniser_handle_comment_start(hubbub_tokeniser *tokeniser) if (c == HUBBUB_INPUTSTREAM_OOD) return false; - tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.data.off = 0; tokeniser->context.current_comment.len = 0; @@ -1553,7 +1568,7 @@ bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (tokeniser->context.current_comment.len == 0) - tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.data.off = pos; tokeniser->context.current_comment.len += len; hubbub_inputstream_advance(tokeniser->input); @@ -1589,11 +1604,11 @@ bool hubbub_tokeniser_handle_comment_dash(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (tokeniser->context.current_comment.len == 0) { - tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.data.off = pos; } else { /* Need to do this to get length of '-' */ len += pos - - tokeniser->context.current_comment.data_off; + tokeniser->context.current_comment.data.off; } tokeniser->context.current_comment.len = len; @@ -1631,12 +1646,12 @@ bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (tokeniser->context.current_comment.len == 0) { - tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.data.off = pos; tokeniser->context.current_comment.len = len; } else { /* Need to do this to get length of '-' */ len = pos - - tokeniser->context.current_comment.data_off; + tokeniser->context.current_comment.data.off; } tokeniser->context.current_comment.len = len; @@ -1660,11 +1675,11 @@ bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser) pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); if (tokeniser->context.current_comment.len == 0) { - tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.data.off = pos; } else { /* Need to do this to get length of '--' */ len += pos - - tokeniser->context.current_comment.data_off; + tokeniser->context.current_comment.data.off; } tokeniser->context.current_comment.len = len; @@ -1724,7 +1739,7 @@ bool hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser) case 1: hubbub_inputstream_push_back(tokeniser->input, 'D'); } - tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.data.off = 0; tokeniser->context.current_comment.len = 0; tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; @@ -1768,7 +1783,7 @@ bool hubbub_tokeniser_handle_before_doctype_name( pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - cdoc->name.data_off = pos; + cdoc->name.data.off = pos; cdoc->name.len = len; cdoc->correct = false; @@ -1802,7 +1817,7 @@ bool hubbub_tokeniser_handle_before_doctype_name( pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - cdoc->name.data_off = pos; + cdoc->name.data.off = pos; cdoc->name.len = len; cdoc->correct = false; @@ -1834,7 
+1849,7 @@ bool hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser) token.data.doctype.correct = (hubbub_inputstream_compare_range_ascii( tokeniser->input, - token.data.doctype.name.data_off, + token.data.doctype.name.data.off, token.data.doctype.name.len, "HTML", SLEN("HTML")) == 0); @@ -1896,7 +1911,7 @@ bool hubbub_tokeniser_handle_after_doctype_name(hubbub_tokeniser *tokeniser) token.data.doctype.correct = (hubbub_inputstream_compare_range_ascii( tokeniser->input, - token.data.doctype.name.data_off, + token.data.doctype.name.data.off, token.data.doctype.name.len, "HTML", SLEN("HTML")) == 0); @@ -1969,7 +1984,7 @@ bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser) if (tokeniser->context.match_entity.done_setup == false) { pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - tokeniser->context.match_entity.str.data_off = pos; + tokeniser->context.match_entity.str.data.off = pos; tokeniser->context.match_entity.str.len = len; tokeniser->context.match_entity.base = 0; tokeniser->context.match_entity.codepoint = 0; @@ -2095,7 +2110,7 @@ bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser) /* And replace the matched range with it */ error = hubbub_inputstream_replace_range(tokeniser->input, - ctx->match_entity.str.data_off, + ctx->match_entity.str.data.off, ctx->match_entity.str.len, ctx->match_entity.codepoint); if (error != HUBBUB_OK) { @@ -2177,7 +2192,7 @@ bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser) /* Now, replace range, if we found a named entity */ if (ctx->match_entity.codepoint != 0) { error = hubbub_inputstream_replace_range(tokeniser->input, - ctx->match_entity.str.data_off, + ctx->match_entity.str.data.off, ctx->match_entity.prev_len, ctx->match_entity.codepoint); if (error != HUBBUB_OK) { @@ -2249,8 +2264,8 @@ void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, attrs[j].name.len || hubbub_inputstream_compare_range_cs( tokeniser->input, - attrs[i].name.data_off, - attrs[j].name.data_off, + attrs[i].name.data.off, + attrs[j].name.data.off, attrs[i].name.len) != 0) { /* Attributes don't match */ continue; diff --git a/src/treebuilder/treebuilder.c b/src/treebuilder/treebuilder.c index 3d2b295..01e31e4 100644 --- a/src/treebuilder/treebuilder.c +++ b/src/treebuilder/treebuilder.c @@ -241,6 +241,8 @@ hubbub_treebuilder *hubbub_treebuilder_create(hubbub_tokeniser *tokeniser, assert(HTML != 0); tb->context.element_stack[0].type = 0; + tb->context.collect.string.type = HUBBUB_STRING_OFF; + tb->buffer_handler = NULL; tb->buffer_pw = NULL; @@ -1070,8 +1072,8 @@ bool handle_generic_rcdata(hubbub_treebuilder *treebuilder, switch (token->type) { case HUBBUB_TOKEN_CHARACTER: if (treebuilder->context.collect.string.len == 0) { - treebuilder->context.collect.string.data_off = - token->data.character.data_off; + treebuilder->context.collect.string.data.off = + token->data.character.data.off; } treebuilder->context.collect.string.len += token->data.character.len; @@ -1158,8 +1160,8 @@ bool handle_script_collect_characters(hubbub_treebuilder *treebuilder, switch (token->type) { case HUBBUB_TOKEN_CHARACTER: if (treebuilder->context.collect.string.len == 0) { - treebuilder->context.collect.string.data_off = - token->data.character.data_off; + treebuilder->context.collect.string.data.off = + token->data.character.data.off; } treebuilder->context.collect.string.len += token->data.character.len; @@ -1265,7 +1267,7 @@ bool process_characters_expect_whitespace(hubbub_treebuilder *treebuilder, const hubbub_token 
*token, bool insert_into_current_node) { const uint8_t *data = treebuilder->input_buffer + - token->data.character.data_off; + token->data.character.data.off; size_t len = token->data.character.len; size_t c; @@ -1284,7 +1286,7 @@ bool process_characters_expect_whitespace(hubbub_treebuilder *treebuilder, int success; void *text, *appended; - temp.data_off = token->data.character.data_off; + temp.data.off = token->data.character.data.off; temp.len = len - c; /** \todo Append to pre-existing text child, iff @@ -1318,7 +1320,7 @@ bool process_characters_expect_whitespace(hubbub_treebuilder *treebuilder, } /* Update token data to strip leading whitespace */ - ((hubbub_token *) token)->data.character.data_off += + ((hubbub_token *) token)->data.character.data.off += len - c; ((hubbub_token *) token)->data.character.len -= c; @@ -1409,7 +1411,7 @@ void parse_generic_rcdata(hubbub_treebuilder *treebuilder, treebuilder->context.collect.mode = treebuilder->context.mode; treebuilder->context.collect.type = type; treebuilder->context.collect.node = node; - treebuilder->context.collect.string.data_off = 0; + treebuilder->context.collect.string.data.off = 0; treebuilder->context.collect.string.len = 0; treebuilder->tree_handler->unref_node( @@ -1472,7 +1474,7 @@ void process_script_in_head(hubbub_treebuilder *treebuilder, treebuilder->context.collect.mode = treebuilder->context.mode; treebuilder->context.collect.node = script; treebuilder->context.collect.type = SCRIPT; - treebuilder->context.collect.string.data_off = 0; + treebuilder->context.collect.string.data.off = 0; treebuilder->context.collect.string.len = 0; treebuilder->context.mode = SCRIPT_COLLECT_CHARACTERS; @@ -1846,7 +1848,7 @@ void reset_insertion_mode(hubbub_treebuilder *treebuilder) element_type element_type_from_name(hubbub_treebuilder *treebuilder, const hubbub_string *tag_name) { - const uint8_t *name = treebuilder->input_buffer + tag_name->data_off; + const uint8_t *name = treebuilder->input_buffer + tag_name->data.off; return element_type_from_verbatim_name(name, tag_name->len); } diff --git a/test/parser-utf16.c b/test/parser-utf16.c index 9056dd1..86024a6 100644 --- a/test/parser-utf16.c +++ b/test/parser-utf16.c @@ -129,44 +129,44 @@ void token_handler(const hubbub_token *token, void *pw) case HUBBUB_TOKEN_DOCTYPE: printf("'%.*s' (%svalid)\n", (int) token->data.doctype.name.len, - pbuffer + token->data.doctype.name.data_off, + pbuffer + token->data.doctype.name.data.off, token->data.doctype.correct ? "" : "in"); break; case HUBBUB_TOKEN_START_TAG: printf("'%.*s' %s\n", (int) token->data.tag.name.len, - pbuffer + token->data.tag.name.data_off, + pbuffer + token->data.tag.name.data.off, (token->data.tag.n_attributes > 0) ? "attributes:" : ""); for (i = 0; i < token->data.tag.n_attributes; i++) { printf("\t'%.*s' = '%.*s'\n", (int) token->data.tag.attributes[i].name.len, - pbuffer + token->data.tag.attributes[i].name.data_off, + pbuffer + token->data.tag.attributes[i].name.data.off, (int) token->data.tag.attributes[i].value.len, - pbuffer + token->data.tag.attributes[i].value.data_off); + pbuffer + token->data.tag.attributes[i].value.data.off); } break; case HUBBUB_TOKEN_END_TAG: printf("'%.*s' %s\n", (int) token->data.tag.name.len, - pbuffer + token->data.tag.name.data_off, + pbuffer + token->data.tag.name.data.off, (token->data.tag.n_attributes > 0) ? 
"attributes:" : ""); for (i = 0; i < token->data.tag.n_attributes; i++) { printf("\t'%.*s' = '%.*s'\n", (int) token->data.tag.attributes[i].name.len, - pbuffer + token->data.tag.attributes[i].name.data_off, + pbuffer + token->data.tag.attributes[i].name.data.off, (int) token->data.tag.attributes[i].value.len, - pbuffer + token->data.tag.attributes[i].value.data_off); + pbuffer + token->data.tag.attributes[i].value.data.off); } break; case HUBBUB_TOKEN_COMMENT: printf("'%.*s'\n", (int) token->data.comment.len, - pbuffer + token->data.comment.data_off); + pbuffer + token->data.comment.data.off); break; case HUBBUB_TOKEN_CHARACTER: printf("'%.*s'\n", (int) token->data.character.len, - pbuffer + token->data.character.data_off); + pbuffer + token->data.character.data.off); break; case HUBBUB_TOKEN_EOF: printf("\n"); diff --git a/test/parser.c b/test/parser.c index fe2659d..fa2afb8 100644 --- a/test/parser.c +++ b/test/parser.c @@ -129,44 +129,44 @@ void token_handler(const hubbub_token *token, void *pw) case HUBBUB_TOKEN_DOCTYPE: printf("'%.*s' (%svalid)\n", (int) token->data.doctype.name.len, - pbuffer + token->data.doctype.name.data_off, + pbuffer + token->data.doctype.name.data.off, token->data.doctype.correct ? "" : "in"); break; case HUBBUB_TOKEN_START_TAG: printf("'%.*s' %s\n", (int) token->data.tag.name.len, - pbuffer + token->data.tag.name.data_off, + pbuffer + token->data.tag.name.data.off, (token->data.tag.n_attributes > 0) ? "attributes:" : ""); for (i = 0; i < token->data.tag.n_attributes; i++) { printf("\t'%.*s' = '%.*s'\n", (int) token->data.tag.attributes[i].name.len, - pbuffer + token->data.tag.attributes[i].name.data_off, + pbuffer + token->data.tag.attributes[i].name.data.off, (int) token->data.tag.attributes[i].value.len, - pbuffer + token->data.tag.attributes[i].value.data_off); + pbuffer + token->data.tag.attributes[i].value.data.off); } break; case HUBBUB_TOKEN_END_TAG: printf("'%.*s' %s\n", (int) token->data.tag.name.len, - pbuffer + token->data.tag.name.data_off, + pbuffer + token->data.tag.name.data.off, (token->data.tag.n_attributes > 0) ? "attributes:" : ""); for (i = 0; i < token->data.tag.n_attributes; i++) { printf("\t'%.*s' = '%.*s'\n", (int) token->data.tag.attributes[i].name.len, - pbuffer + token->data.tag.attributes[i].name.data_off, + pbuffer + token->data.tag.attributes[i].name.data.off, (int) token->data.tag.attributes[i].value.len, - pbuffer + token->data.tag.attributes[i].value.data_off); + pbuffer + token->data.tag.attributes[i].value.data.off); } break; case HUBBUB_TOKEN_COMMENT: printf("'%.*s'\n", (int) token->data.comment.len, - pbuffer + token->data.comment.data_off); + pbuffer + token->data.comment.data.off); break; case HUBBUB_TOKEN_CHARACTER: printf("'%.*s'\n", (int) token->data.character.len, - pbuffer + token->data.character.data_off); + pbuffer + token->data.character.data.off); break; case HUBBUB_TOKEN_EOF: printf("\n"); diff --git a/test/tokeniser.c b/test/tokeniser.c index 271b986..32ecdbc 100644 --- a/test/tokeniser.c +++ b/test/tokeniser.c @@ -128,44 +128,44 @@ void token_handler(const hubbub_token *token, void *pw) case HUBBUB_TOKEN_DOCTYPE: printf("'%.*s' (%svalid)\n", (int) token->data.doctype.name.len, - pbuffer + token->data.doctype.name.data_off, + pbuffer + token->data.doctype.name.data.off, token->data.doctype.correct ? 
"" : "in"); break; case HUBBUB_TOKEN_START_TAG: printf("'%.*s' %s\n", (int) token->data.tag.name.len, - pbuffer + token->data.tag.name.data_off, + pbuffer + token->data.tag.name.data.off, (token->data.tag.n_attributes > 0) ? "attributes:" : ""); for (i = 0; i < token->data.tag.n_attributes; i++) { printf("\t'%.*s' = '%.*s'\n", (int) token->data.tag.attributes[i].name.len, - pbuffer + token->data.tag.attributes[i].name.data_off, + pbuffer + token->data.tag.attributes[i].name.data.off, (int) token->data.tag.attributes[i].value.len, - pbuffer + token->data.tag.attributes[i].value.data_off); + pbuffer + token->data.tag.attributes[i].value.data.off); } break; case HUBBUB_TOKEN_END_TAG: printf("'%.*s' %s\n", (int) token->data.tag.name.len, - pbuffer + token->data.tag.name.data_off, + pbuffer + token->data.tag.name.data.off, (token->data.tag.n_attributes > 0) ? "attributes:" : ""); for (i = 0; i < token->data.tag.n_attributes; i++) { printf("\t'%.*s' = '%.*s'\n", (int) token->data.tag.attributes[i].name.len, - pbuffer + token->data.tag.attributes[i].name.data_off, + pbuffer + token->data.tag.attributes[i].name.data.off, (int) token->data.tag.attributes[i].value.len, - pbuffer + token->data.tag.attributes[i].value.data_off); + pbuffer + token->data.tag.attributes[i].value.data.off); } break; case HUBBUB_TOKEN_COMMENT: printf("'%.*s'\n", (int) token->data.comment.len, - pbuffer + token->data.comment.data_off); + pbuffer + token->data.comment.data.off); break; case HUBBUB_TOKEN_CHARACTER: printf("'%.*s'\n", (int) token->data.character.len, - pbuffer + token->data.character.data_off); + pbuffer + token->data.character.data.off); break; case HUBBUB_TOKEN_EOF: printf("\n"); diff --git a/test/tokeniser2.c b/test/tokeniser2.c index 103a3d5..f72e0d7 100644 --- a/test/tokeniser2.c +++ b/test/tokeniser2.c @@ -280,7 +280,7 @@ void token_handler(const hubbub_token *token, void *pw) bool expvalid = json_object_get_boolean((struct json_object *) array_list_get_idx(items, 2)); char *gotname = (char *) (ctx->pbuffer + - token->data.doctype.name.data_off); + token->data.doctype.name.data.off); printf("'%.*s' (%svalid)\n", (int) token->data.doctype.name.len, @@ -302,7 +302,7 @@ void token_handler(const hubbub_token *token, void *pw) (struct json_object *) array_list_get_idx(items, 2))->head; char *tagname = (char *) (ctx->pbuffer + - token->data.tag.name.data_off); + token->data.tag.name.data.off); printf("'%.*s' %s\n", (int) token->data.tag.name.len, @@ -318,11 +318,11 @@ void token_handler(const hubbub_token *token, void *pw) char *expval = json_object_get_string( (struct json_object *) expattrs->v); char *gotname = (char *) (ctx->pbuffer + - token->data.tag.attributes[i].name.data_off); + token->data.tag.attributes[i].name.data.off); size_t namelen = token->data.tag.attributes[i].name.len; char *gotval = (char *) (ctx->pbuffer + - token->data.tag.attributes[i].value.data_off); + token->data.tag.attributes[i].value.data.off); size_t vallen = token->data.tag.attributes[i].value.len; @@ -347,7 +347,7 @@ void token_handler(const hubbub_token *token, void *pw) char *expname = json_object_get_string((struct json_object *) array_list_get_idx(items, 1)); char *tagname = (char *) (ctx->pbuffer + - token->data.tag.name.data_off); + token->data.tag.name.data.off); printf("'%.*s' %s\n", (int) token->data.tag.name.len, @@ -364,7 +364,7 @@ void token_handler(const hubbub_token *token, void *pw) char *expstr = json_object_get_string((struct json_object *) array_list_get_idx(items, 1)); char *gotstr = (char *) (ctx->pbuffer + - 
token->data.comment.data_off); + token->data.comment.data.off); printf("'%.*s'\n", (int) token->data.comment.len, gotstr); @@ -377,7 +377,7 @@ void token_handler(const hubbub_token *token, void *pw) char *expstr = json_object_get_string((struct json_object *) array_list_get_idx(items, 1)); char *gotstr = (char *) (ctx->pbuffer + - token->data.character.data_off); + token->data.character.data.off); size_t len = min(token->data.character.len, strlen(expstr + ctx->char_off)); @@ -392,7 +392,7 @@ void token_handler(const hubbub_token *token, void *pw) hubbub_token t; t.type = HUBBUB_TOKEN_CHARACTER; - t.data.character.data_off += len; + t.data.character.data.off += len; t.data.character.len -= len; ctx->char_off = 0; diff --git a/test/tree.c b/test/tree.c index 04ce026..f4e6c3c 100644 --- a/test/tree.c +++ b/test/tree.c @@ -11,7 +11,7 @@ #include "testutils.h" -#define NODE_REF_CHUNK 1024 +#define NODE_REF_CHUNK 8192 static uint16_t *node_ref; static uintptr_t node_ref_alloc; static uintptr_t node_counter; @@ -72,6 +72,22 @@ static void *myrealloc(void *ptr, size_t len, void *pw) return realloc(ptr, len); } +static const uint8_t *ptr_from_hubbub_string(const hubbub_string *string) +{ + const uint8_t *data; + + switch (string->type) { + case HUBBUB_STRING_OFF: + data = pbuffer + string->data.off; + break; + case HUBBUB_STRING_PTR: + data = string->data.ptr; + break; + } + + return data; +} + int main(int argc, char **argv) { hubbub_parser *parser; @@ -188,7 +204,7 @@ void buffer_handler(const uint8_t *buffer, size_t len, void *pw) int create_comment(void *ctx, const hubbub_string *data, void **result) { printf("Creating (%u) [comment '%.*s']\n", ++node_counter, - data->len, pbuffer + data->data_off); + data->len, ptr_from_hubbub_string(data)); GROW_REF node_ref[node_counter] = 0; @@ -208,7 +224,7 @@ int create_doctype(void *ctx, const hubbub_string *qname, UNUSED(system_id); printf("Creating (%u) [doctype '%.*s']\n", ++node_counter, - qname->len, pbuffer + qname->data_off); + qname->len, ptr_from_hubbub_string(qname)); GROW_REF node_ref[node_counter] = 0; @@ -223,7 +239,7 @@ int create_doctype(void *ctx, const hubbub_string *qname, int create_element(void *ctx, const hubbub_tag *tag, void **result) { printf("Creating (%u) [element '%.*s']\n", ++node_counter, - tag->name.len, pbuffer + tag->name.data_off); + tag->name.len, ptr_from_hubbub_string(&tag->name)); GROW_REF node_ref[node_counter] = 0; @@ -254,7 +270,7 @@ int create_element_verbatim(void *ctx, const uint8_t *name, size_t len, int create_text(void *ctx, const hubbub_string *data, void **result) { printf("Creating (%u) [text '%.*s']\n", ++node_counter, - data->len, pbuffer + data->data_off); + data->len, ptr_from_hubbub_string(data)); GROW_REF node_ref[node_counter] = 0; -- cgit v1.2.3
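
For reference, a minimal standalone sketch of how a consumer is expected to handle both hubbub_string variants after this change. The struct definition is copied from the include/hubbub/types.h hunk above; the helper name hubbub_string_data and the example buffer are illustrative assumptions (the real test code uses ptr_from_hubbub_string() against its pbuffer), and unlike that test helper this sketch returns NULL for an unrecognised type rather than leaving the result uninitialised.

	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Mirror of the tagged-union string type introduced by this patch
	 * (see the include/hubbub/types.h hunk above). */
	typedef struct hubbub_string {
		enum {
			HUBBUB_STRING_OFF,
			HUBBUB_STRING_PTR
		} type;

		union {
			const uint8_t *ptr;	/**< Pointer to data */
			uint32_t off;		/**< Byte offset of string start */
		} data;

		size_t len;			/**< Byte length of string */
	} hubbub_string;

	/* Resolve a hubbub_string to a pointer into real memory.
	 * "buffer" is the document data buffer that HUBBUB_STRING_OFF
	 * offsets index into. Returns NULL for an unknown type. */
	static const uint8_t *hubbub_string_data(const hubbub_string *s,
			const uint8_t *buffer)
	{
		switch (s->type) {
		case HUBBUB_STRING_OFF:
			return buffer + s->data.off;
		case HUBBUB_STRING_PTR:
			return s->data.ptr;
		}

		return NULL;
	}

	int main(void)
	{
		const uint8_t buffer[] = "<html></html>";

		/* Offset form: the only kind the tokeniser ever emits */
		hubbub_string off_str = {
			.type = HUBBUB_STRING_OFF,
			.data.off = 1,
			.len = 4
		};

		/* Pointer form: constant data, as the treebuilder may emit */
		hubbub_string ptr_str = {
			.type = HUBBUB_STRING_PTR,
			.data.ptr = (const uint8_t *) "head",
			.len = 4
		};

		printf("'%.*s'\n", (int) off_str.len,
				hubbub_string_data(&off_str, buffer));
		printf("'%.*s'\n", (int) ptr_str.len,
				hubbub_string_data(&ptr_str, buffer));

		return 0;
	}

Offsets presumably remain the common case because they index into the parser's input buffer, which can be reallocated as more data arrives; HUBBUB_STRING_PTR covers constant strings with no position in that buffer, which is why the treebuilder may emit either kind while the tokeniser sticks to HUBBUB_STRING_OFF.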