From 012d3f6431ee2c5a7f0214d93dbc60813bc79f5b Mon Sep 17 00:00:00 2001
From: Andrew Sidwell
Date: Tue, 17 Jun 2008 01:47:02 +0000
Subject: Fix entity consumption. This gets us to the second set of html5lib
 tests.

svn path=/trunk/hubbub/; revision=4366
---
 src/tokeniser/tokeniser.c | 87 ++++++++++++++++++++++++++++-------------------
 src/utils/dict.c          | 21 +++++++----
 2 files changed, 66 insertions(+), 42 deletions(-)

diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index a5a35a0..5c268b6 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -103,6 +103,7 @@ typedef struct hubbub_tokeniser_context {
 	struct {
 		hubbub_string str;	/**< Pending string */
+		uint32_t poss_len;
 		uint8_t base;		/**< Base for numeric
 					 * entities */
 		uint32_t codepoint;	/**< UCS4 codepoint */
@@ -266,6 +267,9 @@ hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
 	tok->state = HUBBUB_TOKENISER_STATE_DATA;
 	tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
 
+	tok->escape_flag = false;
+	tok->process_cdata_section = false;
+
 	tok->input = input;
 	tok->input_buffer = NULL;
 	tok->input_buffer_len = 0;
@@ -580,7 +584,8 @@ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
 				tokeniser->escape_flag == false) {
 			tokeniser->state =
 				HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_DATA;
-			hubbub_inputstream_advance(tokeniser->input);
+			/* Don't eat the '&'; it'll be handled by entity
+			 * consumption */
 			break;
 		} else if (c == '-') {
 			size_t len;
@@ -1474,7 +1479,7 @@ bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser)
 		tokeniser->state =
 			HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
 		tokeniser->context.allowed_char = '"';
-		hubbub_inputstream_advance(tokeniser->input);
+		/* Don't eat the '&'; it'll be handled by entity consumption */
 
 	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
 		hubbub_token token;
@@ -1521,7 +1526,7 @@ bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser)
 		tokeniser->state =
 			HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
 		tokeniser->context.allowed_char = '\'';
-		hubbub_inputstream_advance(tokeniser->input);
+		/* Don't eat the '&'; it'll be handled by entity consumption */
 
 	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
 		hubbub_token token;
@@ -1567,7 +1572,7 @@ bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser)
 		tokeniser->context.prev_state = tokeniser->state;
 		tokeniser->state =
 			HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE;
-		hubbub_inputstream_advance(tokeniser->input);
+		/* Don't eat the '&'; it'll be handled by entity consumption */
 
 	} else if (c == '>') {
 		hubbub_token token;
@@ -2922,23 +2927,22 @@ bool hubbub_tokeniser_consume_character_reference(hubbub_tokeniser *tokeniser)
 	uint32_t pos;
 	size_t len;
 
-	if (tokeniser->context.match_entity.done_setup == false) {
-		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+	pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
-		tokeniser->context.match_entity.str.data.off = pos;
-		tokeniser->context.match_entity.str.len = len;
-		tokeniser->context.match_entity.base = 0;
-		tokeniser->context.match_entity.codepoint = 0;
-		tokeniser->context.match_entity.had_data = false;
-		tokeniser->context.match_entity.return_state =
-				tokeniser->state;
-		tokeniser->context.match_entity.complete = false;
-		tokeniser->context.match_entity.done_setup = true;
-		tokeniser->context.match_entity.context = NULL;
-		tokeniser->context.match_entity.prev_len = len;
+	/* Set things up */
+	tokeniser->context.match_entity.str.data.off = pos;
+	tokeniser->context.match_entity.str.len = len;
+	tokeniser->context.match_entity.poss_len = 0;
+	tokeniser->context.match_entity.base = 0;
+	tokeniser->context.match_entity.codepoint = 0;
+	tokeniser->context.match_entity.had_data = false;
+	tokeniser->context.match_entity.return_state = tokeniser->state;
+	tokeniser->context.match_entity.complete = false;
+	tokeniser->context.match_entity.done_setup = true;
+	tokeniser->context.match_entity.context = NULL;
+	tokeniser->context.match_entity.prev_len = len;
 
-		hubbub_inputstream_advance(tokeniser->input);
-	}
+	hubbub_inputstream_advance(tokeniser->input);
 
 	c = hubbub_inputstream_peek(tokeniser->input);
@@ -3048,25 +3052,29 @@ bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser)
 	hubbub_inputstream_rewind(tokeniser->input,
 			ctx->match_entity.str.len);
 
+	/* Had data, so calculate final codepoint */
 	if (ctx->match_entity.had_data) {
-		/* Had data, so calculate final codepoint */
-		if (0x80 <= ctx->match_entity.codepoint &&
-				ctx->match_entity.codepoint <= 0x9F) {
-			ctx->match_entity.codepoint =
-				cp1252Table[ctx->match_entity.codepoint -
-						0x80];
-		} else if (ctx->match_entity.codepoint == 0 ||
-				ctx->match_entity.codepoint > 0x10FFFF) {
-			ctx->match_entity.codepoint = 0xFFFD;
-		} else if (ctx->match_entity.codepoint == 0x0D) {
-			ctx->match_entity.codepoint = 0x000A;
+		uint32_t cp = ctx->match_entity.codepoint;
+
+		if (0x80 <= cp && cp <= 0x9F) {
+			cp = cp1252Table[cp - 0x80];
+		} else if (cp == 0x0D) {
+			cp = 0x000A;
+		} else if (cp <= 0x0008 ||
+				(0x000E <= cp && cp <= 0x001F) ||
+				(0x007F <= cp && cp <= 0x009F) ||
+				(0xD800 <= cp && cp <= 0xDFFF) ||
+				(0xFDD0 <= cp && cp <= 0xFDDF) ||
+				(cp & 0xFFFE) == 0xFFFE ||
+				cp > 0x10FFFF) {
+			cp = 0xFFFD;
 		}
 
+		/* And replace the matched range with it */
 		error = hubbub_inputstream_replace_range(tokeniser->input,
 				ctx->match_entity.str.data.off,
 				ctx->match_entity.str.len,
-				ctx->match_entity.codepoint);
+				cp);
 		if (error != HUBBUB_OK) {
 			/** \todo handle memory exhaustion */
 		}
@@ -3112,6 +3120,8 @@ bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
 			pos = hubbub_inputstream_cur_pos(tokeniser->input,
 					&len);
 			ctx->match_entity.str.len += len;
+			ctx->match_entity.str.len += ctx->match_entity.poss_len;
+			ctx->match_entity.poss_len = 0;
 
 			/* And cache length, for replacement */
 			ctx->match_entity.prev_len =
@@ -3122,28 +3132,35 @@ bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
 		} else {
 			pos = hubbub_inputstream_cur_pos(tokeniser->input,
 					&len);
-			ctx->match_entity.str.len += len;
+			ctx->match_entity.poss_len += len;
 		}
 
 		hubbub_inputstream_advance(tokeniser->input);
 	}
 
+	/* Rewind back possible matches, if any */
+	hubbub_inputstream_rewind(tokeniser->input,
+			ctx->match_entity.poss_len);
+
 	if (c == HUBBUB_INPUTSTREAM_OOD)
 		return false;
 
 	c = hubbub_inputstream_peek(tokeniser->input);
 
-	if (ctx->match_entity.codepoint != 0 && c != ';' &&
+	if ((tokeniser->context.match_entity.return_state ==
+			HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) &&
+			(c != ';') &&
 			((0x0030 <= c && c <= 0x0039) ||
 			(0x0041 <= c && c <= 0x005A) ||
 			(0x0061 <= c && c <= 0x007A))) {
 		ctx->match_entity.codepoint = 0;
 	}
 
-	/* Rewind the inputstream to start of processed sequence */
 	hubbub_inputstream_rewind(tokeniser->input,
-			ctx->match_entity.str.len);
+		ctx->match_entity.str.len);
+
+	pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
 
 	/* Now, replace range, if we found a named entity */
 	if (ctx->match_entity.codepoint != 0) {

diff --git a/src/utils/dict.c b/src/utils/dict.c
index f50ffab..59eb4be 100644
--- a/src/utils/dict.c
+++ b/src/utils/dict.c
@@ -14,12 +14,11 @@ typedef struct hubbub_dict_node {
 	uint8_t split;			/**< Data to split on */
 	struct hubbub_dict_node *lt;	/**< Subtree for data less than
 					 * split */
-	struct hubbub_dict_node *eq;	/**< Subtree for data equal to split
-					 * If split == '\0', this stores the
-					 * pointer to the actual data, not a
-					 * subtree */
+	struct hubbub_dict_node *eq;	/**< Subtree for data equal to split */
 	struct hubbub_dict_node *gt;	/**< Subtree for data greater than
 					 * split */
+
+	const void *value;		/**< Data for this node */
 } hubbub_dict_node;
 
 /** Dictionary object */
@@ -143,10 +142,14 @@ hubbub_dict_node *hubbub_dict_insert_internal(hubbub_dict *dict,
 				parent->lt, key, value);
 	} else if ((uint8_t) key[0] == parent->split) {
 		if (key[0] == '\0') {
-			parent->eq = (hubbub_dict_node *) value;
+			parent->value = value;
+		} else if (key[1] == '\0') {
+			parent->value = value;
+			parent->eq = hubbub_dict_insert_internal(dict,
+					parent->eq, key + 1, value);
 		} else {
 			parent->eq = hubbub_dict_insert_internal(dict,
-					parent->eq, ++key, value);
+					parent->eq, key + 1, value);
 		}
 	} else {
 		parent->gt = hubbub_dict_insert_internal(dict,
@@ -200,7 +203,11 @@ hubbub_error hubbub_dict_search_step(hubbub_dict *dict, uint8_t c,
 		p = NULL;
 	} else if (p->eq != NULL && p->eq->split == '\0') {
 		match = true;
-		*result = (const void *) p->eq->eq;
+		*result = p->eq->value;
+		p = p->eq;
+	} else if (p->value) {
+		match = true;
+		*result = p->value;
+		p = p->eq;
 	} else {
 		p = p->eq;
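
For reference, the codepoint sanitisation this patch introduces in
hubbub_tokeniser_handle_numbered_entity() can be exercised in isolation.
The following is a minimal standalone sketch, not hubbub's code:
sanitise_codepoint() is a hypothetical name, and cp1252_table is assumed
to follow the standard Windows-1252 mapping for 0x80..0x9F (hubbub's own
cp1252Table may treat the unmapped slots 0x81, 0x8D, 0x8F, 0x90 and 0x9D
differently).

	#include <inttypes.h>
	#include <stdint.h>
	#include <stdio.h>

	/* Standard Windows-1252 mapping for 0x80..0x9F; unmapped slots
	 * pass through unchanged here. */
	static const uint32_t cp1252_table[32] = {
		0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
		0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
		0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
		0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
	};

	static uint32_t sanitise_codepoint(uint32_t cp)
	{
		if (0x80 <= cp && cp <= 0x9F) {
			/* Windows-1252 compatibility remapping */
			cp = cp1252_table[cp - 0x80];
		} else if (cp == 0x0D) {
			/* CR becomes LF */
			cp = 0x000A;
		} else if (cp <= 0x0008 ||
				(0x000E <= cp && cp <= 0x001F) ||
				(0x007F <= cp && cp <= 0x009F) ||
				(0xD800 <= cp && cp <= 0xDFFF) ||	/* surrogates */
				(0xFDD0 <= cp && cp <= 0xFDDF) ||	/* noncharacters */
				(cp & 0xFFFE) == 0xFFFE ||	/* U+xFFFE / U+xFFFF */
				cp > 0x10FFFF) {
			/* Invalid; substitute the replacement character */
			cp = 0xFFFD;
		}

		return cp;
	}

	int main(void)
	{
		/* "&#150;" remaps to U+2013; "&#xD800;" becomes U+FFFD */
		printf("%04" PRIX32 " %04" PRIX32 "\n",
				sanitise_codepoint(150),
				sanitise_codepoint(0xD800));
		return 0;
	}

Note that, as in the patch, the 0x007F..0x009F term of the invalid-range
test can only ever catch 0x7F, since raw 0x80..0x9F input has already
been remapped by the first branch.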
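The dict.c change moves matched data into a per-node value field instead
of smuggling it through the eq pointer of a '\0' terminal node. That is
what lets an entity name that is a prefix of a longer one (for example
"not" and "notin") report a match mid-search while the tokeniser keeps
scanning, with poss_len above tracking the tentative characters to
rewind if no longer match completes. Below is a simplified sketch of the
idea, not hubbub's exact code: the node fields follow the diff, but
insert() and step() are reduced stand-ins for
hubbub_dict_insert_internal() and hubbub_dict_search_step().

	#include <stdbool.h>
	#include <stdio.h>
	#include <stdlib.h>

	typedef struct node {
		char split;		/* character this node splits on */
		struct node *lt, *eq, *gt;
		const void *value;	/* data for the key ending here, if any */
	} node;

	static node *insert(node *n, const char *key, const void *value)
	{
		if (n == NULL) {
			n = calloc(1, sizeof *n);
			n->split = key[0];
		}

		if (key[0] < n->split) {
			n->lt = insert(n->lt, key, value);
		} else if (key[0] > n->split) {
			n->gt = insert(n->gt, key, value);
		} else if (key[1] == '\0') {
			n->value = value;	/* key ends at this node */
		} else {
			n->eq = insert(n->eq, key + 1, value);
		}

		return n;
	}

	/* Advance one character; reports a match whenever the node
	 * reached carries a value, even if longer keys continue below. */
	static bool step(node **pos, char c, const void **result)
	{
		node *p = *pos;

		while (p != NULL && c != p->split)
			p = (c < p->split) ? p->lt : p->gt;

		if (p == NULL) {
			*pos = NULL;
			return false;
		}

		*pos = p->eq;

		if (p->value != NULL) {
			*result = p->value;
			return true;
		}

		return false;
	}

	int main(void)
	{
		node *root = NULL, *pos;
		const void *result = NULL;

		root = insert(root, "not", "U+00AC");
		root = insert(root, "notin", "U+2209");

		pos = root;
		for (const char *s = "notin"; *s != '\0'; s++) {
			if (step(&pos, *s, &result))
				printf("match after '%c': %s\n",
						*s, (const char *)result);
		}

		return 0;
	}

Feeding "notin" one character at a time reports a match at "not" and
again at "notin"; the caller keeps the longest one and rewinds the rest,
which is exactly the role poss_len plays in the tokeniser above.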