summary refs log tree commit diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/input/filter.c 20
-rw-r--r-- src/tokeniser/tokeniser.c 144
2 files changed, 12 insertions, 152 deletions
diff --git a/src/input/filter.c b/src/input/filter.c
index 5ac5391..7a97840 100644
--- a/src/input/filter.c
+++ b/src/input/filter.c
@@ -348,24 +348,24 @@ hubbub_error read_character_filter(uint32_t c, uint32_t **output,
#define LF (0x0000000A)
#define REP (0x0000FFFD)
- if (c == NUL) {
- /* Replace NUL (U+0000) characters in input with U+FFFD */
- input->filter_output[0] = REP;
+ /* Replace NUL (U+0000) characters in input with U+FFFD */
+ if (c == NUL)
+ c = REP;
+
+ if (c == CR) {
+ /* Convert CRs to LFs straight away */
+ input->filter_output[0] = LF;
len = 1;
- } else if (c == CR) {
- /* Trap CR characters */
+ } else if (input->last_filter_char == CR && c == LF) {
+ /* Trap this LF */
len = 0;
- } else if (input->last_filter_char == CR && c != LF) {
- /* Last char was CR and this isn't LF => CR -> LF */
- input->filter_output[0] = LF;
- input->filter_output[1] = c;
- len = 2;
} else {
/* Let character through unchanged */
input->filter_output[0] = c;
len = 1;
}
+
#undef NUL
#undef CR
#undef LF
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
index f3c902d..6c564a8 100644
--- a/src/tokeniser/tokeniser.c
+++ b/src/tokeniser/tokeniser.c
@@ -4,7 +4,6 @@
* http://www.opensource.org/licenses/mit-license.php
* Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
*/
-#include <stdio.h>
#include <stdbool.h>
#include <string.h>
@@ -31,7 +30,6 @@ typedef enum hubbub_tokeniser_state {
HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_DATA,
HUBBUB_TOKENISER_STATE_TAG_OPEN,
HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN,
- HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH,
HUBBUB_TOKENISER_STATE_TAG_NAME,
HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME,
HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME,
@@ -92,7 +90,6 @@ typedef struct hubbub_tokeniser_context {
hubbub_string last_start_tag_name; /**< Name of the last start tag
* emitted */
struct {
- hubbub_string tag; /**< Pending close tag */
uint32_t count;
} close_tag_match;
@@ -170,8 +167,6 @@ static bool hubbub_tokeniser_handle_character_reference_data(
static bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser);
static bool hubbub_tokeniser_handle_close_tag_open(
hubbub_tokeniser *tokeniser);
-static bool hubbub_tokeniser_handle_close_tag_match(
- hubbub_tokeniser *tokeniser);
static bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser);
static bool hubbub_tokeniser_handle_before_attribute_name(
hubbub_tokeniser *tokeniser);
@@ -301,7 +296,6 @@ hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
tok->context.current_tag.name.type = HUBBUB_STRING_OFF;
tok->context.current_comment.type = HUBBUB_STRING_OFF;
tok->context.current_chars.type = HUBBUB_STRING_OFF;
- tok->context.close_tag_match.tag.type = HUBBUB_STRING_OFF;
tok->context.match_entity.str.type = HUBBUB_STRING_OFF;
return tok;
@@ -396,10 +390,6 @@ hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
cont = hubbub_tokeniser_handle_close_tag_open(
tokeniser);
break;
- case HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH:
- cont = hubbub_tokeniser_handle_close_tag_match(
- tokeniser);
- break;
case HUBBUB_TOKENISER_STATE_TAG_NAME:
cont = hubbub_tokeniser_handle_tag_name(tokeniser);
break;
@@ -1016,132 +1006,6 @@ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
return true;
}
-bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser)
-{
- hubbub_tokeniser_context *ctx = &tokeniser->context;
- hubbub_tag *ctag = &tokeniser->context.current_tag;
- uint32_t c = 0;
-
- while (ctx->close_tag_match.tag.len < ctag->name.len &&
- (c = hubbub_inputstream_peek(tokeniser->input)) !=
- HUBBUB_INPUTSTREAM_EOF &&
- c != HUBBUB_INPUTSTREAM_OOD) {
- /* Match last open tag */
- uint32_t off;
- size_t len;
-
- off = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- if (ctx->close_tag_match.tag.len == 0) {
- ctx->close_tag_match.tag.data.off = off;
- ctx->close_tag_match.tag.len = len;
- } else {
- ctx->close_tag_match.tag.len += len;
- }
-
- hubbub_inputstream_advance(tokeniser->input);
-
- if (ctx->close_tag_match.tag.len > ctag->name.len ||
- (ctx->close_tag_match.tag.len == ctag->name.len &&
- hubbub_inputstream_compare_range_ci(
- tokeniser->input,
- ctag->name.data.off,
- ctx->close_tag_match.tag.data.off,
- ctag->name.len) != 0)) {
- hubbub_token token;
-
- /* Rewind input stream to start of tag name */
- if (hubbub_inputstream_rewind(tokeniser->input,
- ctx->close_tag_match.tag.len) !=
- HUBBUB_OK)
- abort();
-
- /* Emit "</" */
- token.type = HUBBUB_TOKEN_CHARACTER;
- token.data.character =
- tokeniser->context.current_chars;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
-
- return true;
- } else if (ctx->close_tag_match.tag.len == ctag->name.len &&
- hubbub_inputstream_compare_range_ci(
- tokeniser->input,
- ctag->name.data.off,
- ctx->close_tag_match.tag.data.off,
- ctag->name.len) == 0) {
- /* Matched => stop searching */
- break;
- }
- }
-
- if (c == HUBBUB_INPUTSTREAM_OOD) {
- /* Need more data */
- return false;
- }
-
- if (c == HUBBUB_INPUTSTREAM_EOF) {
- /* Ran out of data - parse error */
- hubbub_token token;
-
- /* Rewind input stream to start of tag name */
- if (hubbub_inputstream_rewind(tokeniser->input,
- ctx->close_tag_match.tag.len) != HUBBUB_OK)
- abort();
-
- /* Emit "</" */
- token.type = HUBBUB_TOKEN_CHARACTER;
- token.data.character = tokeniser->context.current_chars;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
-
- return true;
- }
-
- /* Match following char */
- c = hubbub_inputstream_peek(tokeniser->input);
-
- if (c == HUBBUB_INPUTSTREAM_OOD) {
- /* Need more data */
- return false;
- }
-
- /* Rewind input stream to start of tag name */
- if (hubbub_inputstream_rewind(tokeniser->input,
- ctx->close_tag_match.tag.len) != HUBBUB_OK)
- abort();
-
- /* Check that following char was valid */
- if (c != '\t' && c != '\n' && c != '\f' && c != ' ' && c != '>' &&
- c != '/' && c != HUBBUB_INPUTSTREAM_EOF) {
- hubbub_token token;
-
- /* Emit "</" */
- token.type = HUBBUB_TOKEN_CHARACTER;
- token.data.character = tokeniser->context.current_chars;
-
- hubbub_tokeniser_emit_token(tokeniser, &token);
-
- tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
- hubbub_inputstream_advance(tokeniser->input);
-
- return true;
- }
-
- /* Switch the content model back to PCDATA */
- tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
-
- /* Finally, transition back to close tag open state */
- tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN;
-
- return true;
-}
-
bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
{
hubbub_tag *ctag = &tokeniser->context.current_tag;
@@ -1673,12 +1537,8 @@ bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser)
size_t len;
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
-
- if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) {
- ctag->attributes[ctag->n_attributes - 1].value.data.off =
- pos;
- }
-
+ /* don't worry about setting the offset -- this is
+ * always done before this state is reached */
ctag->attributes[ctag->n_attributes - 1].value.len += len;
hubbub_inputstream_advance(tokeniser->input);