author     Rupinder Singh Khokhar <rsk1coder99@gmail.com>  2014-06-17 00:54:12 +0530
committer  Rupinder Singh Khokhar <rsk1coder99@gmail.com>  2014-07-09 10:04:21 +0530
commit     8c55e32256f4081d097cd7114fcf5e307a8a9288 (patch)
tree       4377a7d60ab894e15b3da93d1191a5090603f09b
parent     e68a4b8ac410f402d12308ce7d63083b78d7ee89 (diff)
Added the RAWTEXT content model. Also removed an if (c == '-') condition because I felt it was extraneous: it had no clear logic and is not in the spec. Also fixed a severe bug in the handling of the tag name state. In all, 3 more test files now give a PASS.
-rw-r--r--  src/tokeniser/tokeniser.c   | 188
-rw-r--r--  test/data/tokeniser2/INDEX  |   6
2 files changed, 96 insertions, 98 deletions
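
For context on the tokeniser.c hunks that follow: in the RCDATA/RAWTEXT (and old CDATA) content models, "<" only begins markup when it is followed by "/" and an ASCII letter that can start an end tag; anything else has to be re-emitted as character data. The standalone sketch below is illustrative only (the enum and function are not hubbub's real definitions, whose names follow the HUBBUB_CONTENT_MODEL_* pattern seen in the diff) but captures the decision the changed code is making:

#include <stdbool.h>

/* Illustrative content models only; hubbub's real enum lives in its
 * public headers. */
enum content_model { PCDATA, RCDATA, RAWTEXT, CDATA };

/* Decide what to do once "</" has been seen.  Returns true if a
 * candidate end-tag name should be collected, false if the buffered
 * "</" must be re-emitted as character data. */
static bool end_tag_can_start(enum content_model model, int c)
{
	if (model == PCDATA)
		return true;	/* ordinary markup: "</" always opens an end tag */

	/* In RCDATA/RAWTEXT (and CDATA) only an ASCII letter may begin
	 * an end tag; "</ ", "</3" or "</!" are plain text. */
	return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
}
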
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index 64eaf30..7a54df9 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -693,33 +693,13 @@ hubbub_error hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
/* Don't eat the '&'; it'll be handled by entity
* consumption */
break;
- } else if (c == '-' &&
- tokeniser->escape_flag == false &&
- (tokeniser->content_model ==
- HUBBUB_CONTENT_MODEL_RCDATA ||
- tokeniser->content_model ==
- HUBBUB_CONTENT_MODEL_CDATA) &&
- tokeniser->context.pending >= 3) {
- size_t ignore;
- error = parserutils_inputstream_peek(
- tokeniser->input,
- tokeniser->context.pending - 3,
- &cptr,
- &ignore);
-
- assert(error == PARSERUTILS_OK);
-
- if (strncmp((char *)cptr,
- "<!--", SLEN("<!--")) == 0) {
- tokeniser->escape_flag = true;
- }
-
- tokeniser->context.pending += len;
} else if (c == '<' && (tokeniser->content_model ==
HUBBUB_CONTENT_MODEL_PCDATA ||
((tokeniser->content_model ==
HUBBUB_CONTENT_MODEL_RCDATA ||
tokeniser->content_model ==
+ HUBBUB_CONTENT_MODEL_RAWTEXT ||
+ tokeniser->content_model ==
HUBBUB_CONTENT_MODEL_CDATA) &&
tokeniser->escape_flag == false))) {
if (tokeniser->context.pending > 0) {
@@ -910,6 +890,7 @@ hubbub_error hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
tokeniser->state = STATE_CLOSE_TAG_OPEN;
} else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
+ tokeniser->content_model == HUBBUB_CONTENT_MODEL_RAWTEXT ||
tokeniser->content_model ==
HUBBUB_CONTENT_MODEL_CDATA) {
/* Return to data state with '<' still in "chars" */
@@ -982,6 +963,7 @@ hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
/**\todo fragment case */
if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
+ tokeniser->content_model == HUBBUB_CONTENT_MODEL_RAWTEXT ||
tokeniser->content_model ==
HUBBUB_CONTENT_MODEL_CDATA) {
uint8_t *start_tag_name =
@@ -1037,73 +1019,67 @@ hubbub_error hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
}
}
- if (ctx->close_tag_match.match == false &&
- tokeniser->content_model !=
- HUBBUB_CONTENT_MODEL_PCDATA) {
- /* We should emit "</" here, but instead we leave it in the
- * buffer so the data state emits it with any characters
- * following it */
- tokeniser->state = STATE_DATA;
- } else {
- error = parserutils_inputstream_peek(tokeniser->input,
- tokeniser->context.pending, &cptr, &len);
- if (error == PARSERUTILS_EOF) {
- /** \todo parse error */
+ error = parserutils_inputstream_peek(tokeniser->input,
+ tokeniser->context.pending, &cptr, &len);
- /* Return to data state with "</" pending */
- tokeniser->state = STATE_DATA;
- return HUBBUB_OK;
- } else if (error != PARSERUTILS_OK) {
- return hubbub_error_from_parserutils_error(error);
- }
+ if (error == PARSERUTILS_EOF) {
+ /** \todo parse error */
+ /* Return to data state with "</" pending */
+ tokeniser->state = STATE_DATA;
+ return HUBBUB_OK;
+ } else if (error != PARSERUTILS_OK) {
+ return hubbub_error_from_parserutils_error(error);
+ }
- c = *cptr;
+ c = *cptr;
- if ('A' <= c && c <= 'Z') {
- uint8_t lc = (c + 0x20);
- START_BUF(tokeniser->context.current_tag.name,
- &lc, len);
- tokeniser->context.current_tag.n_attributes = 0;
+ if ('A' <= c && c <= 'Z') {
+ uint8_t lc = (c + 0x20);
+ START_BUF(tokeniser->context.current_tag.name,
+ &lc, len);
+ tokeniser->context.current_tag.n_attributes = 0;
- tokeniser->context.current_tag_type =
- HUBBUB_TOKEN_END_TAG;
+ tokeniser->context.current_tag_type =
+ HUBBUB_TOKEN_END_TAG;
- tokeniser->context.pending += len;
+ tokeniser->context.pending += len;
- tokeniser->state = STATE_TAG_NAME;
- } else if ('a' <= c && c <= 'z') {
- START_BUF(tokeniser->context.current_tag.name,
- cptr, len);
- tokeniser->context.current_tag.n_attributes = 0;
+ tokeniser->state = STATE_TAG_NAME;
+ } else if ('a' <= c && c <= 'z') {
+ START_BUF(tokeniser->context.current_tag.name,
+ cptr, len);
+ tokeniser->context.current_tag.n_attributes = 0;
- tokeniser->context.current_tag_type =
- HUBBUB_TOKEN_END_TAG;
+ tokeniser->context.current_tag_type =
+ HUBBUB_TOKEN_END_TAG;
- tokeniser->context.pending += len;
+ tokeniser->context.pending += len;
- tokeniser->state = STATE_TAG_NAME;
- } else if (c == '>') {
- /* Cursor still at "</", need to collect ">" */
- tokeniser->context.pending += len;
+ tokeniser->state = STATE_TAG_NAME;
+ } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
+ tokeniser->content_model == HUBBUB_CONTENT_MODEL_RAWTEXT) {
+ tokeniser->state = STATE_DATA;
+ } else if (c == '>') {
+ /* Cursor still at "</", need to collect ">" */
+ tokeniser->context.pending += len;
- /* Now need to advance past "</>" */
- parserutils_inputstream_advance(tokeniser->input,
- tokeniser->context.pending);
- tokeniser->context.pending = 0;
+ /* Now need to advance past "</>" */
+ parserutils_inputstream_advance(tokeniser->input,
+ tokeniser->context.pending);
+ tokeniser->context.pending = 0;
- /** \todo parse error */
- tokeniser->state = STATE_DATA;
- } else {
- /** \todo parse error */
+ /** \todo parse error */
+ tokeniser->state = STATE_DATA;
+ } else {
+ /** \todo parse error */
- /* Cursor still at "</", need to advance past it */
- parserutils_inputstream_advance(tokeniser->input,
- tokeniser->context.pending);
- tokeniser->context.pending = 0;
+ /* Cursor still at "</", need to advance past it */
+ parserutils_inputstream_advance(tokeniser->input,
+ tokeniser->context.pending);
+ tokeniser->context.pending = 0;
- tokeniser->state = STATE_BOGUS_COMMENT;
- }
+ tokeniser->state = STATE_BOGUS_COMMENT;
}
return HUBBUB_OK;
@@ -1131,36 +1107,58 @@ hubbub_error hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
if (error != PARSERUTILS_OK) {
if (error == PARSERUTILS_EOF) {
tokeniser->state = STATE_DATA;
- parserutils_inputstream_advance(tokeniser->input,
- tokeniser->context.pending);
- return HUBBUB_OK;
+ if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
+ tokeniser->content_model == HUBBUB_CONTENT_MODEL_RAWTEXT) {
+ return emit_current_chars(tokeniser);
+ } else {
+ parserutils_inputstream_advance(tokeniser->input,
+ tokeniser->context.pending);
+ return HUBBUB_OK;
+ }
} else {
return hubbub_error_from_parserutils_error(error);
}
}
c = *cptr;
-
- if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
- tokeniser->context.pending += len;
- tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
- } else if (c == '>') {
- tokeniser->context.pending += len;
- tokeniser->state = STATE_DATA;
- return emit_current_tag(tokeniser);
- } else if (c == '\0') {
- COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
- tokeniser->context.pending += len;
- } else if (c == '/') {
- tokeniser->context.pending += len;
- tokeniser->state = STATE_SELF_CLOSING_START_TAG;
- } else if ('A' <= c && c <= 'Z') {
+ if ('A' <= c && c <= 'Z') {
uint8_t lc = (c + 0x20);
COLLECT(ctag->name, &lc, len);
tokeniser->context.pending += len;
- } else {
- COLLECT(ctag->name, cptr, len);
+ } else if ('a' <= c && c <= 'z') {
+ COLLECT(ctag->name, &c, len);
tokeniser->context.pending += len;
+ } else if (tokeniser->context.close_tag_match.match == false &&
+ tokeniser->content_model !=
+ HUBBUB_CONTENT_MODEL_PCDATA) {
+ /* We should emit "</" here, but instead we leave it in the
+ * buffer so the data state emits it with any characters
+ * following it */
+ tokeniser->state = STATE_DATA;
+
+ return emit_current_chars(tokeniser);
+ } else {
+ if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') {
+ tokeniser->context.pending += len;
+ tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME;
+ } else if (c == '>') {
+ tokeniser->context.pending += len;
+ tokeniser->state = STATE_DATA;
+ return emit_current_tag(tokeniser);
+ } else if (c == '\0') {
+ COLLECT(ctag->name, u_fffd, sizeof(u_fffd));
+ tokeniser->context.pending += len;
+ } else if (c == '/') {
+ tokeniser->context.pending += len;
+ tokeniser->state = STATE_SELF_CLOSING_START_TAG;
+ } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA ||
+ tokeniser->content_model == HUBBUB_CONTENT_MODEL_RAWTEXT) {
+ tokeniser->state = STATE_DATA;
+ return emit_current_chars(tokeniser);
+ } else {
+ COLLECT(ctag->name, cptr, len);
+ tokeniser->context.pending += len;
+ }
}
return HUBBUB_OK;
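
The reworked tag name hunk above hinges on the "appropriate end tag" idea: in RCDATA/RAWTEXT a candidate end tag is only honoured when its name matches the element that switched the tokeniser into that mode (tracked here via close_tag_match), and on a mismatch or EOF the buffered "</name" characters are emitted rather than silently discarded. A rough sketch of such a comparison, with simplified names and buffers that are not hubbub's real structures:

#include <stdbool.h>
#include <stddef.h>

/* ASCII-only case-insensitive name comparison; the tokeniser itself
 * lowercases 'A'..'Z' by adding 0x20 while collecting the name. */
static bool tag_name_matches(const char *candidate, size_t cand_len,
		const char *open_element, size_t open_len)
{
	size_t i;

	if (cand_len != open_len)
		return false;

	for (i = 0; i < cand_len; i++) {
		char a = candidate[i];
		char b = open_element[i];

		if ('A' <= a && a <= 'Z')
			a += 0x20;
		if ('A' <= b && b <= 'Z')
			b += 0x20;

		if (a != b)
			return false;
	}

	return true;
}

/* On a mismatch in RCDATA/RAWTEXT the caller would fall back to the
 * data state and flush the pending characters, roughly:
 *
 *	tokeniser->state = STATE_DATA;
 *	return emit_current_chars(tokeniser);
 */
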
diff --git a/test/data/tokeniser2/INDEX b/test/data/tokeniser2/INDEX
index 33b9060..4b9e037 100644
--- a/test/data/tokeniser2/INDEX
+++ b/test/data/tokeniser2/INDEX
@@ -6,11 +6,11 @@ test1.test html5lib tests (part 1)
test2.test html5lib tests (part 2)
test3.test html5lib tests (part 3)
test4.test html5lib tests (part 4)
-#contentModelFlags.test html5lib content model tests
+contentModelFlags.test html5lib content model tests
entities.test html5lib entity tests
-#escapeFlag.test html5lib escape flag tests
+escapeFlag.test html5lib escape flag tests
numericEntities.test html5lib numeric entities tests
-#unicodeChars.test html5lib unicode character tests
+unicodeChars.test html5lib unicode character tests
cdata.test CDATA section tests
regression.test Regression tests
#domjs.test NA