Fix the remaining issues around passing the test suite, and make sure all of the tokeniser is tested.

svn path=/trunk/hubbub/; revision=4387
author: Andrew Sidwell <andy@entai.co.uk> 2008-06-18 17:33:24 +0000
committer: Andrew Sidwell <andy@entai.co.uk> 2008-06-18 17:33:24 +0000
commit: 48ba3bdbd561645a78ef5e5cb99ead7ef3a10661 (patch)
tree: ff11bfb8ff547502d5dd2691da9ce3f34c382223
parent: fcc857c2b72ecc43388a0ee34f0a8ddfed8d13d8 (diff)
download: libhubbub-48ba3bdbd561645a78ef5e5cb99ead7ef3a10661.tar.gz
libhubbub-48ba3bdbd561645a78ef5e5cb99ead7ef3a10661.tar.bz2
3 files changed, 16 insertions, 155 deletions
diff --git a/src/input/filter.c b/src/input/filter.c
index 5ac5391..7a97840 100644
--- a/src/input/filter.c
+++ b/src/input/filter.c
@@ -348,24 +348,24 @@ hubbub_error read_character_filter(uint32_t c, uint32_t **output,
 #define LF  (0x0000000A)
 #define REP (0x0000FFFD)
 
-	if (c == NUL) {
-		/* Replace NUL (U+0000) characters in input with U+FFFD */
-		input->filter_output[0] = REP;
+	/* Replace NUL (U+0000) characters in input with U+FFFD */
+	if (c == NUL)
+		c = REP;
+
+	if (c == CR) {
+		/* Convert CRs to LFs straight away */
+		input->filter_output[0] = LF;
 		len = 1;
-	} else if (c == CR) {
-		/* Trap CR characters */
+	} else if (input->last_filter_char == CR && c == LF) {
+		/* Trap this LF */
 		len = 0;
-	} else if (input->last_filter_char == CR && c != LF) {
-		/* Last char was CR and this isn't LF => CR -> LF */
-		input->filter_output[0] = LF;
-		input->filter_output[1] = c;
-		len = 2;
 	} else {
 		/* Let character through unchanged */
 		input->filter_output[0] = c;
 		len = 1;
 	}
 
+
 #undef NUL
 #undef CR
 #undef LF
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index f3c902d..6c564a8 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -4,7 +4,6 @@
  *                http://www.opensource.org/licenses/mit-license.php
  * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
  */
-#include <stdio.h>
 #include <stdbool.h>
 #include <string.h>
 
@@ -31,7 +30,6 @@ typedef enum hubbub_tokeniser_state {
 	HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_DATA,
 	HUBBUB_TOKENISER_STATE_TAG_OPEN,
 	HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN,
-	HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH,
 	HUBBUB_TOKENISER_STATE_TAG_NAME,
 	HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME,
 	HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME,
@@ -92,7 +90,6 @@ typedef struct hubbub_tokeniser_context {
 	hubbub_string last_start_tag_name;	/**< Name of the last start tag
 						 * emitted */
 	struct {
-		hubbub_string tag;		/**< Pending close tag */
 		uint32_t count;
 	} close_tag_match;
 
@@ -170,8 +167,6 @@ static bool hubbub_tokeniser_handle_character_reference_data(
 static bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser);
 static bool hubbub_tokeniser_handle_close_tag_open(
 		hubbub_tokeniser *tokeniser);
-static bool hubbub_tokeniser_handle_close_tag_match(
-		hubbub_tokeniser *tokeniser);
 static bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser);
 static bool hubbub_tokeniser_handle_before_attribute_name(
 		hubbub_tokeniser *tokeniser);
@@ -301,7 +296,6 @@ hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
 	tok->context.current_tag.name.type = HUBBUB_STRING_OFF;
 	tok->context.current_comment.type = HUBBUB_STRING_OFF;
 	tok->context.current_chars.type = HUBBUB_STRING_OFF;
-	tok->context.close_tag_match.tag.type = HUBBUB_STRING_OFF;
 	tok->context.match_entity.str.type = HUBBUB_STRING_OFF;
 
 	return tok;
@@ -396,10 +390,6 @@ hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
 			cont = hubbub_tokeniser_handle_close_tag_open(
 					tokeniser);
 			break;
-		case HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH:
-			cont = hubbub_tokeniser_handle_close_tag_match(
-					tokeniser);
-			break;
 		case HUBBUB_TOKENISER_STATE_TAG_NAME:
 			cont = hubbub_tokeniser_handle_tag_name(tokeniser);
 			break;
@@ -1016,132 +1006,6 @@ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
 	return true;
 }
 
-bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser)
-{
-	hubbub_tokeniser_context *ctx = &tokeniser->context;
-	hubbub_tag *ctag = &tokeniser->context.current_tag;
-	uint32_t c = 0;
-
-	while (ctx->close_tag_match.tag.len < ctag->name.len &&
-			(c = hubbub_inputstream_peek(tokeniser->input)) !=
-			HUBBUB_INPUTSTREAM_EOF &&
-			c != HUBBUB_INPUTSTREAM_OOD) {
-		/* Match last open tag */
-		uint32_t off;
-		size_t len;
-
-		off = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
-		if (ctx->close_tag_match.tag.len == 0) {
-			ctx->close_tag_match.tag.data.off = off;
-			ctx->close_tag_match.tag.len = len;
-		} else {
-			ctx->close_tag_match.tag.len += len;
-		}
-
-		hubbub_inputstream_advance(tokeniser->input);
-
-		if (ctx->close_tag_match.tag.len > ctag->name.len ||
-			(ctx->close_tag_match.tag.len == ctag->name.len &&
-				hubbub_inputstream_compare_range_ci(
-					tokeniser->input,
-					ctag->name.data.off,
-					ctx->close_tag_match.tag.data.off,
-					ctag->name.len) != 0)) {
-			hubbub_token token;
-
-			/* Rewind input stream to start of tag name */
-			if (hubbub_inputstream_rewind(tokeniser->input,
-					ctx->close_tag_match.tag.len) !=
-					HUBBUB_OK)
-				abort();
-
-			/* Emit "</" */
-			token.type = HUBBUB_TOKEN_CHARACTER;
-			token.data.character =
-					tokeniser->context.current_chars;
-
-			hubbub_tokeniser_emit_token(tokeniser, &token);
-
-			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
-			hubbub_inputstream_advance(tokeniser->input);
-
-			return true;
-		} else if (ctx->close_tag_match.tag.len == ctag->name.len &&
-				hubbub_inputstream_compare_range_ci(
-					tokeniser->input,
-					ctag->name.data.off,
-					ctx->close_tag_match.tag.data.off,
-					ctag->name.len) == 0) {
-			/* Matched => stop searching */
-			break;
-		}
-	}
-
-	if (c == HUBBUB_INPUTSTREAM_OOD) {
-		/* Need more data */
-		return false;
-	}
-
-	if (c == HUBBUB_INPUTSTREAM_EOF) {
-		/* Ran out of data - parse error */
-		hubbub_token token;
-
-		/* Rewind input stream to start of tag name */
-		if (hubbub_inputstream_rewind(tokeniser->input,
-				ctx->close_tag_match.tag.len) != HUBBUB_OK)
-			abort();
-
-		/* Emit "</" */
-		token.type = HUBBUB_TOKEN_CHARACTER;
-		token.data.character = tokeniser->context.current_chars;
-
-		hubbub_tokeniser_emit_token(tokeniser, &token);
-
-		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
-
-		return true;
-	}
-
-	/* Match following char */
-	c = hubbub_inputstream_peek(tokeniser->input);
-
-	if (c == HUBBUB_INPUTSTREAM_OOD) {
-		/* Need more data */
-		return false;
-	}
-
-	/* Rewind input stream to start of tag name */
-	if (hubbub_inputstream_rewind(tokeniser->input,
-			ctx->close_tag_match.tag.len) != HUBBUB_OK)
-		abort();
-
-	/* Check that following char was valid */
-	if (c != '\t' && c != '\n' && c != '\f' && c != ' ' && c != '>' &&
-			c != '/' && c != HUBBUB_INPUTSTREAM_EOF) {
-		hubbub_token token;
-
-		/* Emit "</" */
-		token.type = HUBBUB_TOKEN_CHARACTER;
-		token.data.character = tokeniser->context.current_chars;
-
-		hubbub_tokeniser_emit_token(tokeniser, &token);
-
-		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
-		hubbub_inputstream_advance(tokeniser->input);
-
-		return true;
-	}
-
-	/* Switch the content model back to PCDATA */
-	tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
-
-	/* Finally, transition back to close tag open state */
-	tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN;
-
-	return true;
-}
-
 bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
 {
 	hubbub_tag *ctag = &tokeniser->context.current_tag;
@@ -1673,12 +1537,8 @@ bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser)
 		size_t len;
 
 		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
-		if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) {
-			ctag->attributes[ctag->n_attributes - 1].value.data.off =
-					pos;
-		}
-
+		/* don't worry about setting the offset -- this is
+		 * always done before this state is reached */
 		ctag->attributes[ctag->n_attributes - 1].value.len += len;
 
 		hubbub_inputstream_advance(tokeniser->input);
diff --git a/test/data/tokeniser2/test2.test b/test/data/tokeniser2/test2.test
index 8b4f516..1a15fde 100644
--- a/test/data/tokeniser2/test2.test
+++ b/test/data/tokeniser2/test2.test
@@ -132,12 +132,9 @@
 "input":"foo < bar",
 "output":[["Character", "foo "], "ParseError", ["Character", "< bar"]]},
 
-/* jmb -- libjson uses C strings internally, thus the input gets truncated before the 	 
-	  * data is fed to the input stream (and thus the tokeniser)
 {"description":"Null Byte Replacement",
 "input":"\u0000",
 "output":["ParseError", ["Character", "\ufffd"]]},
-*/
 
 {"description":"Comment with dash",
 "input":"<!---x",
@@ -147,6 +144,10 @@
 "input":"\nx\n&gt;\n",
 "output":[["Character","\nx\n>\n"]]}
 
+{"description":"Empty attribute followed by uppercase attribute",
+"input":"<h a B=''>",
+"output":[["StartTag", "h", {"a":"", "b":""}]]},
+
 ]}
author	Andrew Sidwell <andy@entai.co.uk>	2008-06-18 17:33:24 +0000
committer	Andrew Sidwell <andy@entai.co.uk>	2008-06-18 17:33:24 +0000
commit	48ba3bdbd561645a78ef5e5cb99ead7ef3a10661 (patch)
tree	ff11bfb8ff547502d5dd2691da9ce3f34c382223
parent	fcc857c2b72ecc43388a0ee34f0a8ddfed8d13d8 (diff)
download	libhubbub-48ba3bdbd561645a78ef5e5cb99ead7ef3a10661.tar.gz libhubbub-48ba3bdbd561645a78ef5e5cb99ead7ef3a10661.tar.bz2