diff options
Diffstat (limited to 'src/tokeniser/tokeniser.c')
-rw-r--r-- | src/tokeniser/tokeniser.c | 3565 |
1 files changed, 1646 insertions, 1919 deletions
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c index df8946b..dee0a76 100644 --- a/src/tokeniser/tokeniser.c +++ b/src/tokeniser/tokeniser.c @@ -3,11 +3,16 @@ * Licensed under the MIT License, * http://www.opensource.org/licenses/mit-license.php * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + * Copyright 2008 Andrew Sidwell <takkaria@netsurf-browser.org> */ #include <assert.h> #include <stdbool.h> #include <string.h> +#include <stdio.h> + +#include <parserutils/charset/utf8.h> + #include "utils/utils.h" #include "tokeniser/entities.h" @@ -24,72 +29,87 @@ static const uint32_t cp1252Table[32] = { }; /** + * UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER + */ +static const uint8_t u_fffd[3] = { '\xEF', '\xBF', '\xBD' }; +static const hubbub_string u_fffd_str = { u_fffd, sizeof(u_fffd) }; + + +/** + * String for when we want to emit newlines + */ +static const uint8_t lf = '\n'; +static const hubbub_string lf_str = { &lf, 1 }; + + +/** * Tokeniser states */ typedef enum hubbub_tokeniser_state { - HUBBUB_TOKENISER_STATE_DATA, - HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_DATA, - HUBBUB_TOKENISER_STATE_TAG_OPEN, - HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN, - HUBBUB_TOKENISER_STATE_TAG_NAME, - HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME, - HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME, - HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME, - HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE, - HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ, - HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ, - HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ, - HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE, - HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_VALUE_Q, - HUBBUB_TOKENISER_STATE_SELF_CLOSING_START_TAG, - HUBBUB_TOKENISER_STATE_BOGUS_COMMENT, - HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN, - HUBBUB_TOKENISER_STATE_MATCH_COMMENT, - HUBBUB_TOKENISER_STATE_COMMENT_START, - HUBBUB_TOKENISER_STATE_COMMENT_START_DASH, - HUBBUB_TOKENISER_STATE_COMMENT, - 
HUBBUB_TOKENISER_STATE_COMMENT_END_DASH, - HUBBUB_TOKENISER_STATE_COMMENT_END, - HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE, - HUBBUB_TOKENISER_STATE_DOCTYPE, - HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME, - HUBBUB_TOKENISER_STATE_DOCTYPE_NAME, - HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME, - HUBBUB_TOKENISER_STATE_MATCH_PUBLIC, - HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_PUBLIC, - HUBBUB_TOKENISER_STATE_DOCTYPE_PUBLIC_DQ, - HUBBUB_TOKENISER_STATE_DOCTYPE_PUBLIC_SQ, - HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_PUBLIC, - HUBBUB_TOKENISER_STATE_MATCH_SYSTEM, - HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_SYSTEM, - HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_DQ, - HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_SQ, - HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_SYSTEM, - HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE, - HUBBUB_TOKENISER_STATE_MATCH_CDATA, - HUBBUB_TOKENISER_STATE_CDATA_BLOCK, - HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY, - HUBBUB_TOKENISER_STATE_NAMED_ENTITY + STATE_DATA, + STATE_CHARACTER_REFERENCE_DATA, + STATE_TAG_OPEN, + STATE_CLOSE_TAG_OPEN, + STATE_TAG_NAME, + STATE_BEFORE_ATTRIBUTE_NAME, + STATE_ATTRIBUTE_NAME, + STATE_AFTER_ATTRIBUTE_NAME, + STATE_BEFORE_ATTRIBUTE_VALUE, + STATE_ATTRIBUTE_VALUE_DQ, + STATE_ATTRIBUTE_VALUE_SQ, + STATE_ATTRIBUTE_VALUE_UQ, + STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE, + STATE_AFTER_ATTRIBUTE_VALUE_Q, + STATE_SELF_CLOSING_START_TAG, + STATE_BOGUS_COMMENT, + STATE_MARKUP_DECLARATION_OPEN, + STATE_MATCH_COMMENT, + STATE_COMMENT_START, + STATE_COMMENT_START_DASH, + STATE_COMMENT, + STATE_COMMENT_END_DASH, + STATE_COMMENT_END, + STATE_MATCH_DOCTYPE, + STATE_DOCTYPE, + STATE_BEFORE_DOCTYPE_NAME, + STATE_DOCTYPE_NAME, + STATE_AFTER_DOCTYPE_NAME, + STATE_MATCH_PUBLIC, + STATE_BEFORE_DOCTYPE_PUBLIC, + STATE_DOCTYPE_PUBLIC_DQ, + STATE_DOCTYPE_PUBLIC_SQ, + STATE_AFTER_DOCTYPE_PUBLIC, + STATE_MATCH_SYSTEM, + STATE_BEFORE_DOCTYPE_SYSTEM, + STATE_DOCTYPE_SYSTEM_DQ, + STATE_DOCTYPE_SYSTEM_SQ, + STATE_AFTER_DOCTYPE_SYSTEM, + STATE_BOGUS_DOCTYPE, + STATE_MATCH_CDATA, + STATE_CDATA_BLOCK, + 
STATE_NUMBERED_ENTITY, + STATE_NAMED_ENTITY } hubbub_tokeniser_state; /** * Context for tokeniser */ typedef struct hubbub_tokeniser_context { - hubbub_token_type current_tag_type; /**< Type of current_tag */ - hubbub_tag current_tag; /**< Current tag */ + hubbub_string chars; /**< Pending characters */ - hubbub_string current_comment; /**< Current comment */ + hubbub_string current_comment; /**< Current comment text */ + hubbub_token_type current_tag_type; /**< Type of current_tag */ + hubbub_tag current_tag; /**< Current tag */ hubbub_doctype current_doctype; /**< Current doctype */ - - hubbub_string current_chars; /**< Pending characters */ - hubbub_tokeniser_state prev_state; /**< Previous state */ - - hubbub_string last_start_tag_name; /**< Name of the last start tag + uint8_t last_start_tag_name[10]; /**< Name of the last start tag * emitted */ + size_t last_start_tag_len; + + bool to_buf; + struct { uint32_t count; bool match; @@ -105,26 +125,27 @@ typedef struct hubbub_tokeniser_context { } match_cdata; struct { - hubbub_string str; /**< Pending string */ - uint32_t poss_len; + size_t offset; /**< Offset in buffer */ + uint32_t length; /**< Length of entity */ + uint32_t codepoint; /**< UCS4 codepoint */ + bool complete; /**< True if match complete */ + + uint32_t poss_length; /**< Optimistic length + * when matching named + * character references */ uint8_t base; /**< Base for numeric * entities */ - uint32_t codepoint; /**< UCS4 codepoint */ + void *context; /**< Context for named + * entity search */ + size_t prev_len; /**< Previous byte length + * of str */ bool had_data; /**< Whether we read * anything after &#(x)? 
*/ - hubbub_tokeniser_state return_state; /**< State we were - * called from */ - bool complete; /**< Flag that entity - * matching completed */ - bool done_setup; /**< Flag that match setup - * has completed */ bool overflow; /**< Whether this entity has * has overflowed the maximum * numeric entity value */ - void *context; /**< Context for named - * entity search */ - size_t prev_len; /**< Previous byte length - * of str */ + hubbub_tokeniser_state return_state; /**< State we were + * called from */ } match_entity; struct { @@ -146,19 +167,14 @@ struct hubbub_tokeniser { bool escape_flag; /**< Escape flag **/ bool process_cdata_section; - hubbub_inputstream *input; /**< Input stream */ - - const uint8_t *input_buffer; /**< Start of input stream's buffer */ - size_t input_buffer_len; /**< Length of input buffer */ + parserutils_inputstream *input; /**< Input stream */ + parserutils_buffer *buffer; /**< Input buffer */ hubbub_tokeniser_context context; /**< Tokeniser context */ hubbub_token_handler token_handler; void *token_pw; - hubbub_buffer_handler buffer_handler; - void *buffer_pw; - hubbub_error_handler error_handler; void *error_pw; @@ -198,14 +214,7 @@ static bool hubbub_tokeniser_handle_bogus_comment( static bool hubbub_tokeniser_handle_markup_declaration_open( hubbub_tokeniser *tokeniser); static bool hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser); -static bool hubbub_tokeniser_handle_comment_start( - hubbub_tokeniser *tokeniser); -static bool hubbub_tokeniser_handle_comment_start_dash( - hubbub_tokeniser *tokeniser); static bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser); -static bool hubbub_tokeniser_handle_comment_end_dash( - hubbub_tokeniser *tokeniser); -static bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser); static bool hubbub_tokeniser_handle_match_doctype( hubbub_tokeniser *tokeniser); static bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser); @@ -238,13 +247,12 @@ static 
bool hubbub_tokeniser_handle_bogus_doctype( static bool hubbub_tokeniser_handle_match_cdata(hubbub_tokeniser *tokeniser); static bool hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser); static bool hubbub_tokeniser_consume_character_reference( - hubbub_tokeniser *tokeniser); + hubbub_tokeniser *tokeniser, size_t off); static bool hubbub_tokeniser_handle_numbered_entity( hubbub_tokeniser *tokeniser); static bool hubbub_tokeniser_handle_named_entity( hubbub_tokeniser *tokeniser); -static void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer, - size_t len, void *pw); + static void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, hubbub_token *token); @@ -256,7 +264,7 @@ static void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, * \param pw Pointer to client-specific private data (may be NULL) * \return Pointer to tokeniser instance, or NULL on failure */ -hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input, +hubbub_tokeniser *hubbub_tokeniser_create(parserutils_inputstream *input, hubbub_alloc alloc, void *pw) { hubbub_tokeniser *tok; @@ -268,40 +276,30 @@ hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input, if (tok == NULL) return NULL; - tok->state = HUBBUB_TOKENISER_STATE_DATA; + tok->buffer = parserutils_buffer_create(alloc, pw); + if (tok->buffer == NULL) { + alloc(tok, 0, pw); + return NULL; + } + + tok->state = STATE_DATA; tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA; tok->escape_flag = false; tok->process_cdata_section = false; tok->input = input; - tok->input_buffer = NULL; - tok->input_buffer_len = 0; tok->token_handler = NULL; tok->token_pw = NULL; - tok->buffer_handler = NULL; - tok->buffer_pw = NULL; - tok->error_handler = NULL; tok->error_pw = NULL; tok->alloc = alloc; tok->alloc_pw = pw; - if (hubbub_inputstream_register_movehandler(input, - hubbub_tokeniser_buffer_moved_handler, tok) != - HUBBUB_OK) { - alloc(tok, 0, pw); - return NULL; - } - memset(&tok->context, 0, 
sizeof(hubbub_tokeniser_context)); - tok->context.current_tag.name.type = HUBBUB_STRING_OFF; - tok->context.current_comment.type = HUBBUB_STRING_OFF; - tok->context.current_chars.type = HUBBUB_STRING_OFF; - tok->context.match_entity.str.type = HUBBUB_STRING_OFF; return tok; } @@ -316,9 +314,6 @@ void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser) if (tokeniser == NULL) return; - hubbub_inputstream_deregister_movehandler(tokeniser->input, - hubbub_tokeniser_buffer_moved_handler, tokeniser); - if (tokeniser->context.current_tag.attributes != NULL) { tokeniser->alloc(tokeniser->context.current_tag.attributes, 0, tokeniser->alloc_pw); @@ -347,13 +342,6 @@ hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser, tokeniser->token_handler = params->token_handler.handler; tokeniser->token_pw = params->token_handler.pw; break; - case HUBBUB_TOKENISER_BUFFER_HANDLER: - tokeniser->buffer_handler = params->buffer_handler.handler; - tokeniser->buffer_pw = params->buffer_handler.pw; - tokeniser->buffer_handler(tokeniser->input_buffer, - tokeniser->input_buffer_len, - tokeniser->buffer_pw); - break; case HUBBUB_TOKENISER_ERROR_HANDLER: tokeniser->error_handler = params->error_handler.handler; tokeniser->error_pw = params->error_handler.pw; @@ -382,183 +370,174 @@ hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser) if (tokeniser == NULL) return HUBBUB_BADPARM; +#ifdef NDEBUG +#define state(x) \ + case x: +#else +#define state(x) \ + case x: \ + printf( #x "\n"); +#endif + while (cont) { switch (tokeniser->state) { - case HUBBUB_TOKENISER_STATE_DATA: + state(STATE_DATA) cont = hubbub_tokeniser_handle_data(tokeniser); break; - case HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_DATA: + state(STATE_CHARACTER_REFERENCE_DATA) cont = hubbub_tokeniser_handle_character_reference_data( tokeniser); break; - case HUBBUB_TOKENISER_STATE_TAG_OPEN: + state(STATE_TAG_OPEN) cont = hubbub_tokeniser_handle_tag_open(tokeniser); break; - case 
HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN: + state(STATE_CLOSE_TAG_OPEN) cont = hubbub_tokeniser_handle_close_tag_open( tokeniser); break; - case HUBBUB_TOKENISER_STATE_TAG_NAME: + state(STATE_TAG_NAME) cont = hubbub_tokeniser_handle_tag_name(tokeniser); break; - case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME: + state(STATE_BEFORE_ATTRIBUTE_NAME) cont = hubbub_tokeniser_handle_before_attribute_name( tokeniser); break; - case HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME: + state(STATE_ATTRIBUTE_NAME) cont = hubbub_tokeniser_handle_attribute_name( tokeniser); break; - case HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME: + state(STATE_AFTER_ATTRIBUTE_NAME) cont = hubbub_tokeniser_handle_after_attribute_name( tokeniser); break; - case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE: + state(STATE_BEFORE_ATTRIBUTE_VALUE) cont = hubbub_tokeniser_handle_before_attribute_value( tokeniser); break; - case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ: + state(STATE_ATTRIBUTE_VALUE_DQ) cont = hubbub_tokeniser_handle_attribute_value_dq( tokeniser); break; - case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ: + state(STATE_ATTRIBUTE_VALUE_SQ) cont = hubbub_tokeniser_handle_attribute_value_sq( tokeniser); break; - case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ: + state(STATE_ATTRIBUTE_VALUE_UQ) cont = hubbub_tokeniser_handle_attribute_value_uq( tokeniser); break; - case HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE: + state(STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) cont = hubbub_tokeniser_handle_character_reference_in_attribute_value( tokeniser); break; - case HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_VALUE_Q: + state(STATE_AFTER_ATTRIBUTE_VALUE_Q) cont = hubbub_tokeniser_handle_after_attribute_value_q( tokeniser); break; - case HUBBUB_TOKENISER_STATE_SELF_CLOSING_START_TAG: + state(STATE_SELF_CLOSING_START_TAG) cont = hubbub_tokeniser_handle_self_closing_start_tag( tokeniser); break; - case HUBBUB_TOKENISER_STATE_BOGUS_COMMENT: + state(STATE_BOGUS_COMMENT) cont = 
hubbub_tokeniser_handle_bogus_comment( tokeniser); break; - case HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN: + state(STATE_MARKUP_DECLARATION_OPEN) cont = hubbub_tokeniser_handle_markup_declaration_open( tokeniser); break; - case HUBBUB_TOKENISER_STATE_MATCH_COMMENT: + state(STATE_MATCH_COMMENT) cont = hubbub_tokeniser_handle_match_comment( tokeniser); break; - case HUBBUB_TOKENISER_STATE_COMMENT_START: - cont = hubbub_tokeniser_handle_comment_start( - tokeniser); - break; - case HUBBUB_TOKENISER_STATE_COMMENT_START_DASH: - cont = hubbub_tokeniser_handle_comment_start_dash( - tokeniser); - break; - case HUBBUB_TOKENISER_STATE_COMMENT: + case STATE_COMMENT_START: + case STATE_COMMENT_START_DASH: + case STATE_COMMENT: + case STATE_COMMENT_END_DASH: + case STATE_COMMENT_END: +#ifndef NDEBUG + printf("COMMENT %d\n", + tokeniser->state - STATE_COMMENT_START + 1); +#endif cont = hubbub_tokeniser_handle_comment(tokeniser); break; - case HUBBUB_TOKENISER_STATE_COMMENT_END_DASH: - cont = hubbub_tokeniser_handle_comment_end_dash( - tokeniser); - break; - case HUBBUB_TOKENISER_STATE_COMMENT_END: - cont = hubbub_tokeniser_handle_comment_end( - tokeniser); - break; - case HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE: + state(STATE_MATCH_DOCTYPE) cont = hubbub_tokeniser_handle_match_doctype( tokeniser); break; - case HUBBUB_TOKENISER_STATE_DOCTYPE: + state(STATE_DOCTYPE) cont = hubbub_tokeniser_handle_doctype(tokeniser); break; - case HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME: + state(STATE_BEFORE_DOCTYPE_NAME) cont = hubbub_tokeniser_handle_before_doctype_name( tokeniser); break; - case HUBBUB_TOKENISER_STATE_DOCTYPE_NAME: + state(STATE_DOCTYPE_NAME) cont = hubbub_tokeniser_handle_doctype_name( tokeniser); break; - case HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME: + state(STATE_AFTER_DOCTYPE_NAME) cont = hubbub_tokeniser_handle_after_doctype_name( tokeniser); break; - case HUBBUB_TOKENISER_STATE_MATCH_PUBLIC: + state(STATE_MATCH_PUBLIC) cont = hubbub_tokeniser_handle_match_public( 
tokeniser); break; - - case HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_PUBLIC: + state(STATE_BEFORE_DOCTYPE_PUBLIC) cont = hubbub_tokeniser_handle_before_doctype_public( tokeniser); break; - - case HUBBUB_TOKENISER_STATE_DOCTYPE_PUBLIC_DQ: + state(STATE_DOCTYPE_PUBLIC_DQ) cont = hubbub_tokeniser_handle_doctype_public_dq( tokeniser); break; - - case HUBBUB_TOKENISER_STATE_DOCTYPE_PUBLIC_SQ: + state(STATE_DOCTYPE_PUBLIC_SQ) cont = hubbub_tokeniser_handle_doctype_public_sq( tokeniser); break; - - case HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_PUBLIC: + state(STATE_AFTER_DOCTYPE_PUBLIC) cont = hubbub_tokeniser_handle_after_doctype_public( tokeniser); break; - - case HUBBUB_TOKENISER_STATE_MATCH_SYSTEM: + state(STATE_MATCH_SYSTEM) cont = hubbub_tokeniser_handle_match_system( tokeniser); break; - - case HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_SYSTEM: + state(STATE_BEFORE_DOCTYPE_SYSTEM) cont = hubbub_tokeniser_handle_before_doctype_system( tokeniser); break; - - case HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_DQ: + state(STATE_DOCTYPE_SYSTEM_DQ) cont = hubbub_tokeniser_handle_doctype_system_dq( tokeniser); break; - - case HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_SQ: + state(STATE_DOCTYPE_SYSTEM_SQ) cont = hubbub_tokeniser_handle_doctype_system_sq( tokeniser); break; - - case HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_SYSTEM: + state(STATE_AFTER_DOCTYPE_SYSTEM) cont = hubbub_tokeniser_handle_after_doctype_system( tokeniser); break; - - case HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE: + state(STATE_BOGUS_DOCTYPE) cont = hubbub_tokeniser_handle_bogus_doctype( tokeniser); break; - case HUBBUB_TOKENISER_STATE_MATCH_CDATA: + state(STATE_MATCH_CDATA) cont = hubbub_tokeniser_handle_match_cdata( tokeniser); break; - case HUBBUB_TOKENISER_STATE_CDATA_BLOCK: + state(STATE_CDATA_BLOCK) cont = hubbub_tokeniser_handle_cdata_block( tokeniser); break; - case HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY: + state(STATE_NUMBERED_ENTITY) cont = hubbub_tokeniser_handle_numbered_entity( tokeniser); break; - case 
HUBBUB_TOKENISER_STATE_NAMED_ENTITY: + state(STATE_NAMED_ENTITY) cont = hubbub_tokeniser_handle_named_entity( tokeniser); break; @@ -568,514 +547,691 @@ hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser) return HUBBUB_OK; } + +/** + * Macro to obtain the current character from the pointer "cptr". + * + * To be eliminated as soon as checks for EOF always happen before we want + * the current character. + */ +#define CHAR(cptr) \ + (((cptr) == PARSERUTILS_INPUTSTREAM_EOF) ? 0 : (*((uint8_t *) cptr))) + + +/** + * Various macros for manipulating buffers. + * + * \todo make some of these inline functions (type-safety) + * \todo document them properly here + */ + +#define START_BUF(str, cptr, lengt) \ + do { \ + uint8_t *data = tokeniser->buffer->data + \ + tokeniser->buffer->length; \ + parserutils_buffer_append( \ + tokeniser->buffer, \ + cptr, (lengt)); \ + (str).ptr = data; \ + (str).len = (lengt); \ + tokeniser->context.to_buf = true; \ + } while (0) + +#define START(str, cptr, length) \ + do { \ + if (tokeniser->context.to_buf) { \ + START_BUF(str, (uint8_t *)(cptr), length); \ + } else { \ + (str).ptr = (uint8_t *)(cptr); \ + (str).len = (length); \ + } \ + } while (0) + +#define COLLECT(str, cptr, length) \ + do { \ + assert(str.len != 0); \ + if (tokeniser->context.to_buf == true) { \ + parserutils_buffer_append(tokeniser->buffer, \ + (uint8_t *) cptr, (length)); \ + } \ + (str).len += (length); \ + } while (0) + +#define COLLECT_NOBUF(str, length) \ + do { \ + assert(str.len != 0); \ + (str).len += (length); \ + } while (0) + +#define COLLECT_MS(str, cptr, length) \ + do { \ + if ((str).len == 0) { \ + START(str, cptr, length); \ + } else { \ + COLLECT(str, cptr, length); \ + } \ + } while (0) + +#define COLLECT_MS_NOBUF(str, cptr, length) \ + do { \ + if ((str).len == 0) { \ + (str).ptr = (uint8_t *) cptr; \ + } \ + (str).len += (length); \ + } while (0) + +#define FINISH(str) \ + tokeniser->context.to_buf = false + +#define SWITCH(str) \ + do { \ 
+ uint8_t *data = tokeniser->buffer->data + \ + tokeniser->buffer->length; \ + parserutils_buffer_append( \ + tokeniser->buffer, \ + (str).ptr, (str).len); \ + (str).ptr = data; \ + tokeniser->context.to_buf = true; \ + } while (0) + +#define COLLECT_CHAR(str, cptr, length) \ + do { \ + assert(str.len != 0); \ + if (tokeniser->context.to_buf == false) { \ + SWITCH(str); \ + } \ + parserutils_buffer_append(tokeniser->buffer, cptr, (length)); \ + str.len += (length); \ + } while (0) + + + +/** + * Emit a character token. + * + * \param tokeniser Tokeniser instance + * \param chars Pointer to hubbub_string to emit + * \return true + */ +static inline bool emit_character_token(hubbub_tokeniser *tokeniser, + const hubbub_string *chars) +{ + hubbub_token token; + + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = *chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + return true; +} + +/** + * Emit the current pending characters being stored in the tokeniser context. + * + * \param tokeniser Tokeniser instance + * \return true + */ +static inline bool emit_current_chars(hubbub_tokeniser *tokeniser) +{ + hubbub_token token; + + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, 0, &len); + + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character.ptr = (uint8_t *) cptr; + token.data.character.len = tokeniser->context.chars.len; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + return true; +} + +/** + * Emit the current tag token being stored in the tokeniser context. + * + * \param tokeniser Tokeniser instance + * \return true + */ +static inline bool emit_current_tag(hubbub_tokeniser *tokeniser) +{ + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + return true; +} + +/** + * Emit the current comment token being stored in the tokeniser context. 
+ * + * \param tokeniser Tokeniser instance + * \return true + */ +static inline bool emit_current_comment(hubbub_tokeniser *tokeniser) +{ + hubbub_token token; + + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + return true; +} + +/** + * Emit the current doctype token being stored in the tokeniser context. + * + * \param tokeniser Tokeniser instance + * \param force_quirks Force quirks mode on this document + * \return true + */ +static inline bool emit_current_doctype(hubbub_tokeniser *tokeniser, + bool force_quirks) +{ + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + if (force_quirks == true) + token.data.doctype.force_quirks = true; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + return true; +} + + + + + +/* this should always be called with an empty "chars" buffer */ bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) { hubbub_token token; - uint32_t c; + uintptr_t cptr; + size_t len; - /* Clear current characters */ - tokeniser->context.current_chars.data.off = 0; - tokeniser->context.current_chars.len = 0; + while ((cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len)) != + PARSERUTILS_INPUTSTREAM_EOF && + cptr != PARSERUTILS_INPUTSTREAM_OOD) { + uint8_t c = CHAR(cptr); - while ((c = hubbub_inputstream_peek(tokeniser->input)) != - HUBBUB_INPUTSTREAM_EOF && - c != HUBBUB_INPUTSTREAM_OOD) { if (c == '&' && (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA || tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA) && tokeniser->escape_flag == false) { tokeniser->state = - HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_DATA; + STATE_CHARACTER_REFERENCE_DATA; /* Don't eat the '&'; it'll be handled by entity * consumption */ break; - } else if (c == '-') { - size_t len; - uint32_t pos; - pos = 
hubbub_inputstream_cur_pos(tokeniser->input, - &len); + } else if (c == '-') { + hubbub_string *chars = &tokeniser->context.chars; if (tokeniser->escape_flag == false && - (tokeniser->content_model == - HUBBUB_CONTENT_MODEL_RCDATA || - tokeniser->content_model == - HUBBUB_CONTENT_MODEL_CDATA) && - pos >= 3 && - hubbub_inputstream_compare_range_ascii( - tokeniser->input, pos - 3, 4, - "<!--", SLEN("<!--")) == 0) - { - tokeniser->escape_flag = true; - } + (tokeniser->content_model == + HUBBUB_CONTENT_MODEL_RCDATA || + tokeniser->content_model == + HUBBUB_CONTENT_MODEL_CDATA) && + chars->len >= 3) { + + cptr = parserutils_inputstream_peek( + tokeniser->input, + chars->len - 3, &len); - if (tokeniser->context.current_chars.len == 0) { - tokeniser->context.current_chars.data.off = - pos; + if (strncmp((char *)cptr, + "<!--", SLEN("<!--")) == 0) + tokeniser->escape_flag = true; } - tokeniser->context.current_chars.len += len; - hubbub_inputstream_advance(tokeniser->input); + COLLECT_MS(tokeniser->context.chars, cptr, len); } else if (c == '<' && (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA || - ((tokeniser->content_model == + ((tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || - tokeniser->content_model == + tokeniser->content_model == HUBBUB_CONTENT_MODEL_CDATA) && tokeniser->escape_flag == false))) { - if (tokeniser->context.current_chars.len > 0) { + if (tokeniser->context.chars.len > 0) { /* Emit any pending characters */ - token.type = HUBBUB_TOKEN_CHARACTER; - token.data.character = - tokeniser->context.current_chars; - - hubbub_tokeniser_emit_token(tokeniser, - &token); + emit_current_chars(tokeniser); } /* Buffer '<' */ - tokeniser->context.current_chars.data.off = - hubbub_inputstream_cur_pos(tokeniser->input, - &tokeniser->context.current_chars.len); - - tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_OPEN; - hubbub_inputstream_advance(tokeniser->input); + START(tokeniser->context.chars, cptr, len); + tokeniser->state = STATE_TAG_OPEN; break; 
} else if (c == '>') { - size_t len; - uint32_t pos; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, - &len); + hubbub_string *chars = &tokeniser->context.chars; /* no need to check that there are enough characters, * since you can only run into this if the flag is * true in the first place, which requires four * characters. */ if (tokeniser->escape_flag == true && - (tokeniser->content_model == - HUBBUB_CONTENT_MODEL_RCDATA || - tokeniser->content_model == - HUBBUB_CONTENT_MODEL_CDATA) && - hubbub_inputstream_compare_range_ascii( - tokeniser->input, pos - 2, 3, - "-->", SLEN("-->")) == 0) - { - tokeniser->escape_flag = false; + (tokeniser->content_model == + HUBBUB_CONTENT_MODEL_RCDATA || + tokeniser->content_model == + HUBBUB_CONTENT_MODEL_CDATA)) { + + cptr = parserutils_inputstream_peek( + tokeniser->input, + chars->len - 2, &len); + + if (strncmp((char *)cptr, + "-->", SLEN("-->")) == 0) { + tokeniser->escape_flag = false; + } } - if (tokeniser->context.current_chars.len == 0) { - tokeniser->context.current_chars.data.off = - pos; + COLLECT_MS(tokeniser->context.chars, cptr, len); + } else if (c == '\0') { + if (tokeniser->context.chars.len > 0) { + /* Emit any pending characters */ + emit_current_chars(tokeniser); } - tokeniser->context.current_chars.len += len; - hubbub_inputstream_advance(tokeniser->input); - } else { - uint32_t pos; - size_t len; + /* Emit a replacement character */ + emit_character_token(tokeniser, &u_fffd_str); - /* Accumulate characters into buffer */ - pos = hubbub_inputstream_cur_pos(tokeniser->input, + /* Advance past NUL */ + parserutils_inputstream_advance(tokeniser->input, 1); + } else if (c == '\r') { + cptr = parserutils_inputstream_peek( + tokeniser->input, + tokeniser->context.chars.len + len, &len); - if (tokeniser->context.current_chars.len == 0) { - tokeniser->context.current_chars.data.off = - pos; + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { + break; + } + + if (tokeniser->context.chars.len > 0) { + /* Emit any 
pending characters */ + emit_current_chars(tokeniser); } - tokeniser->context.current_chars.len += len; - hubbub_inputstream_advance(tokeniser->input); + c = CHAR(cptr); + if (c != '\n') { + /* Emit newline */ + emit_character_token(tokeniser, &lf_str); + } + + /* Advance over */ + parserutils_inputstream_advance(tokeniser->input, 1); + } else { + /* Just collect into buffer */ + COLLECT_MS(tokeniser->context.chars, cptr, len); } } - if (tokeniser->state != HUBBUB_TOKENISER_STATE_TAG_OPEN && - tokeniser->context.current_chars.len > 0) { + if (tokeniser->state != STATE_TAG_OPEN && + (tokeniser->state != STATE_DATA || + cptr == PARSERUTILS_INPUTSTREAM_EOF) && + tokeniser->context.chars.len > 0) { /* Emit any pending characters */ - token.type = HUBBUB_TOKEN_CHARACTER; - token.data.character = tokeniser->context.current_chars; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->context.current_chars.data.off = 0; - tokeniser->context.current_chars.len = 0; + emit_current_chars(tokeniser); } - if (c == HUBBUB_INPUTSTREAM_EOF) { + if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { token.type = HUBBUB_TOKEN_EOF; - hubbub_tokeniser_emit_token(tokeniser, &token); } - return (c != HUBBUB_INPUTSTREAM_EOF && c != HUBBUB_INPUTSTREAM_OOD); + return (cptr != PARSERUTILS_INPUTSTREAM_EOF && cptr != PARSERUTILS_INPUTSTREAM_OOD); } +/* emit any pending tokens before calling */ bool hubbub_tokeniser_handle_character_reference_data(hubbub_tokeniser *tokeniser) { + assert(tokeniser->context.chars.len == 0); + if (tokeniser->context.match_entity.complete == false) { - return hubbub_tokeniser_consume_character_reference(tokeniser); + return hubbub_tokeniser_consume_character_reference(tokeniser, + tokeniser->context.chars.len); } else { hubbub_token token; -#ifndef NDEBUG - uint32_t c = hubbub_inputstream_peek(tokeniser->input); - assert(c != HUBBUB_INPUTSTREAM_OOD && - c != HUBBUB_INPUTSTREAM_EOF); -#endif + uint8_t utf8[6]; + uint8_t *utf8ptr = utf8; + size_t len = sizeof(utf8); 
- /* Emit character */ token.type = HUBBUB_TOKEN_CHARACTER; - token.data.character.type = HUBBUB_STRING_OFF; - token.data.character.data.off = - hubbub_inputstream_cur_pos(tokeniser->input, - &token.data.character.len); - hubbub_tokeniser_emit_token(tokeniser, &token); + if (tokeniser->context.match_entity.codepoint) { + parserutils_charset_utf8_from_ucs4( + tokeniser->context.match_entity.codepoint, + &utf8ptr, &len); + + token.data.character.ptr = utf8; + token.data.character.len = sizeof(utf8) - len; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + /* +1 for ampersand */ + parserutils_inputstream_advance(tokeniser->input, + tokeniser->context.match_entity.length + + 1); + } else { + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, + tokeniser->context.chars.len, + &len); + + token.data.character.ptr = (uint8_t *)cptr; + token.data.character.len = len; + + hubbub_tokeniser_emit_token(tokeniser, &token); + parserutils_inputstream_advance(tokeniser->input, len); + } /* Reset for next time */ tokeniser->context.match_entity.complete = false; - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - - hubbub_inputstream_advance(tokeniser->input); + tokeniser->state = STATE_DATA; } return true; } +/* this state always switches to another state straight away */ +/* this state expects the current character to be '<' */ bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser) { - uint32_t c = hubbub_inputstream_peek(tokeniser->input); hubbub_tag *ctag = &tokeniser->context.current_tag; - uint32_t pos; + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) - return false; + assert(tokeniser->context.chars.len == 1); +/* assert(tokeniser->context.chars.ptr[0] == '<'); */ - if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || - tokeniser->content_model == - HUBBUB_CONTENT_MODEL_CDATA) { - if (c == '/') { - pos = 
hubbub_inputstream_cur_pos(tokeniser->input, - &len); + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { + return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + /* Emit '<' */ + emit_character_token(tokeniser, + &tokeniser->context.chars); - tokeniser->context.current_chars.len += len; + tokeniser->state = STATE_DATA; + return true; + } - tokeniser->context.close_tag_match.match = false; - tokeniser->context.close_tag_match.count = 0; + uint8_t c = CHAR(cptr); - tokeniser->state = - HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN; - hubbub_inputstream_advance(tokeniser->input); - } else { - hubbub_token token; + if (c == '/') { + COLLECT_NOBUF(tokeniser->context.chars, len); - /* Emit '<' */ - token.type = HUBBUB_TOKEN_CHARACTER; - token.data.character = - tokeniser->context.current_chars; - - hubbub_tokeniser_emit_token(tokeniser, &token); + tokeniser->context.close_tag_match.match = false; + tokeniser->context.close_tag_match.count = 0; - tokeniser->state = - HUBBUB_TOKENISER_STATE_DATA; - } + tokeniser->state = STATE_CLOSE_TAG_OPEN; + } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || + tokeniser->content_model == + HUBBUB_CONTENT_MODEL_CDATA) { + /* Return to data state with '<' still in "chars" */ + tokeniser->state = STATE_DATA; } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) { if (c == '!') { - pos = hubbub_inputstream_cur_pos(tokeniser->input, - &len); - - tokeniser->context.current_chars.len += len; - - tokeniser->state = - HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == '/') { - pos = hubbub_inputstream_cur_pos(tokeniser->input, - &len); - - tokeniser->context.current_chars.len += len; + parserutils_inputstream_advance(tokeniser->input, + SLEN("<!")); - tokeniser->context.close_tag_match.match = false; - tokeniser->context.close_tag_match.count = 0; - - tokeniser->state = - HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN; - 
hubbub_inputstream_advance(tokeniser->input); + tokeniser->context.chars.len = 0; + tokeniser->state = STATE_MARKUP_DECLARATION_OPEN; } else if ('A' <= c && c <= 'Z') { - hubbub_inputstream_lowercase(tokeniser->input); - + COLLECT_NOBUF(tokeniser->context.chars, len); tokeniser->context.current_tag_type = HUBBUB_TOKEN_START_TAG; - ctag->name.data.off = - hubbub_inputstream_cur_pos(tokeniser->input, - &ctag->name.len); + uint8_t lc = (c + 0x20); + START_BUF(ctag->name, &lc, len); ctag->n_attributes = 0; - tokeniser->state = - HUBBUB_TOKENISER_STATE_TAG_NAME; - hubbub_inputstream_advance(tokeniser->input); + tokeniser->state = STATE_TAG_NAME; } else if ('a' <= c && c <= 'z') { + COLLECT_NOBUF(tokeniser->context.chars, len); tokeniser->context.current_tag_type = HUBBUB_TOKEN_START_TAG; - ctag->name.data.off = - hubbub_inputstream_cur_pos(tokeniser->input, - &ctag->name.len); + START_BUF(ctag->name, (uint8_t *)cptr, len); ctag->n_attributes = 0; - tokeniser->state = - HUBBUB_TOKENISER_STATE_TAG_NAME; - hubbub_inputstream_advance(tokeniser->input); + tokeniser->state = STATE_TAG_NAME; } else if (c == '>') { - hubbub_token token; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, - &len); - tokeniser->context.current_chars.len += len; + /** \todo parse error */ - /* Emit "<>" */ - token.type = HUBBUB_TOKEN_CHARACTER; - token.data.character = - tokeniser->context.current_chars; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = - HUBBUB_TOKENISER_STATE_DATA; - - hubbub_inputstream_advance(tokeniser->input); + COLLECT_NOBUF(tokeniser->context.chars, len); + tokeniser->state = STATE_DATA; } else if (c == '?') { - pos = hubbub_inputstream_cur_pos(tokeniser->input, - &len); - tokeniser->context.current_chars.len += len; + /** \todo parse error */ - tokeniser->context.current_comment.data.off = pos; - tokeniser->context.current_comment.len = len; - tokeniser->state = - HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; - 
hubbub_inputstream_advance(tokeniser->input); - } else { - hubbub_token token; + /* Cursor still at "<", need to advance past it */ + parserutils_inputstream_advance( + tokeniser->input, SLEN("<")); + tokeniser->context.chars.len = 0; + tokeniser->state = STATE_BOGUS_COMMENT; + } else { /* Emit '<' */ - token.type = HUBBUB_TOKEN_CHARACTER; - token.data.character = - tokeniser->context.current_chars; - - hubbub_tokeniser_emit_token(tokeniser, &token); + emit_character_token(tokeniser, + &tokeniser->context.chars); - tokeniser->state = - HUBBUB_TOKENISER_STATE_DATA; + tokeniser->state = STATE_DATA; } } return true; } +/* this state expects tokeniser->context.chars to be "</" */ +/* this state never stays in this state for more than one character */ bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser) { hubbub_tokeniser_context *ctx = &tokeniser->context; - /**\todo Handle the fragment case here */ + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, tokeniser->context.chars.len, &len); + + assert(tokeniser->context.chars.len == 2); +/* assert(tokeniser->context.chars.ptr[0] == '<'); */ +/* assert(tokeniser->context.chars.ptr[1] == '/'); */ + + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { + return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + emit_current_chars(tokeniser); + tokeniser->state = STATE_DATA; + return true; + } + + uint8_t c = CHAR(cptr); + + /**\todo fragment case */ if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || tokeniser->content_model == HUBBUB_CONTENT_MODEL_CDATA) { - uint32_t c; - hubbub_string start_tag = + uint8_t *start_tag_name = tokeniser->context.last_start_tag_name; + size_t start_tag_len = + tokeniser->context.last_start_tag_len; - while ((c = hubbub_inputstream_peek(tokeniser->input)) != - HUBBUB_INPUTSTREAM_EOF && - c != HUBBUB_INPUTSTREAM_OOD && - ctx->close_tag_match.match != true) { - uint32_t pos; - size_t len; - - pos = 
hubbub_inputstream_cur_pos(tokeniser->input, - &len); - - if (ctx->close_tag_match.count+1 == - start_tag.len) { - ctx->close_tag_match.match = true; - } else if (hubbub_inputstream_compare_range_ci( - tokeniser->input, pos, - start_tag.data.off + + while ((cptr = parserutils_inputstream_peek(tokeniser->input, + ctx->chars.len + ctx->close_tag_match.count, - len) != 0) { + &len)) != + PARSERUTILS_INPUTSTREAM_EOF && + cptr != PARSERUTILS_INPUTSTREAM_OOD) { + c = CHAR(cptr); + + if ((start_tag_name[ctx->close_tag_match.count] & ~0x20) + != (c & ~0x20)) { break; } - hubbub_inputstream_advance(tokeniser->input); ctx->close_tag_match.count += len; + + if (ctx->close_tag_match.count == start_tag_len) { + ctx->close_tag_match.match = true; + break; + } } - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } - if (ctx->close_tag_match.match) { - c = hubbub_inputstream_peek(tokeniser->input); - if (c != '\t' && c != '\n' && c != '\f' && - c != ' ' && c != '>' && c != '/' && - c != HUBBUB_INPUTSTREAM_EOF) { - ctx->close_tag_match.match = false; + if (ctx->close_tag_match.match == true) { + cptr = parserutils_inputstream_peek( + tokeniser->input, + ctx->chars.len + + ctx->close_tag_match.count, + &len); + + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { + return false; + } else if (cptr != PARSERUTILS_INPUTSTREAM_EOF) { + c = CHAR(cptr); + + if (c != '\t' && c != '\n' && c != '\f' && + c != ' ' && c != '>' && + c != '/') { + ctx->close_tag_match.match = false; + } } } - - /* After a match (or not), rewind */ - hubbub_inputstream_rewind(tokeniser->input, - tokeniser->context.close_tag_match.count); - tokeniser->context.close_tag_match.count = 0; } - if (ctx->close_tag_match.match == false && tokeniser->content_model != - HUBBUB_CONTENT_MODEL_PCDATA) { - hubbub_token token; - - uint32_t pos; - size_t len; - - /* emit a '</' character token -- by rewinding */ - hubbub_inputstream_rewind(tokeniser->input, 2); - - pos = 
hubbub_inputstream_cur_pos(tokeniser->input, &len); + if (ctx->close_tag_match.match == false && + tokeniser->content_model != + HUBBUB_CONTENT_MODEL_PCDATA) { + /* We should emit "</" here, but instead we leave it in the + * buffer so the data state emits it with any characters + * following it */ + tokeniser->state = STATE_DATA; + } else { + cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - token.type = HUBBUB_TOKEN_CHARACTER; - token.data.character.type = HUBBUB_STRING_OFF; - token.data.character.data.off = pos; - token.data.character.len = 2; + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { + return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + /** \todo parse error */ - hubbub_tokeniser_emit_token(tokeniser, &token); + /* Emit "</" */ + emit_current_chars(tokeniser); + tokeniser->state = STATE_DATA; + return true; + } - hubbub_inputstream_advance(tokeniser->input); - hubbub_inputstream_advance(tokeniser->input); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - } else { - hubbub_tag *ctag = &tokeniser->context.current_tag; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); - uint32_t pos; - size_t len; + c = CHAR(cptr); if ('A' <= c && c <= 'Z') { - hubbub_inputstream_lowercase(tokeniser->input); - - pos = hubbub_inputstream_cur_pos(tokeniser->input, - &len); + COLLECT_NOBUF(tokeniser->context.chars, len); tokeniser->context.current_tag_type = HUBBUB_TOKEN_END_TAG; - ctag->name.data.off = pos; - ctag->name.len = len; - ctag->n_attributes = 0; - tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME; - hubbub_inputstream_advance(tokeniser->input); + uint8_t lc = (c + 0x20); + START_BUF(tokeniser->context.current_tag.name, + &lc, len); + tokeniser->context.current_tag.n_attributes = 0; + + tokeniser->state = STATE_TAG_NAME; } else if ('a' <= c && c <= 'z') { - pos = hubbub_inputstream_cur_pos(tokeniser->input, - &len); + COLLECT_NOBUF(tokeniser->context.chars, len); 
tokeniser->context.current_tag_type = HUBBUB_TOKEN_END_TAG; - ctag->name.data.off = pos; - ctag->name.len = len; - ctag->n_attributes = 0; + START_BUF(tokeniser->context.current_tag.name, + (uint8_t *) cptr, len); + tokeniser->context.current_tag.n_attributes = 0; - tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME; - hubbub_inputstream_advance(tokeniser->input); + tokeniser->state = STATE_TAG_NAME; } else if (c == '>') { - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; + /* Cursor still at "</", need to collect ">" */ + COLLECT_NOBUF(tokeniser->context.chars, len); - /* Emit "</" */ - token.type = HUBBUB_TOKEN_CHARACTER; - token.data.character = - tokeniser->context.current_chars; - - hubbub_tokeniser_emit_token(tokeniser, &token); + /* Now need to advance past "</>" */ + parserutils_inputstream_advance(tokeniser->input, + tokeniser->context.chars.len); + tokeniser->context.chars.len = 0; - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - } else if (c != HUBBUB_INPUTSTREAM_OOD) { - pos = hubbub_inputstream_cur_pos(tokeniser->input, - &len); + /** \todo parse error */ + tokeniser->state = STATE_DATA; + } else { + /** \todo parse error */ - tokeniser->context.current_comment.data.off = pos; - tokeniser->context.current_comment.len = len; + /* Cursor still at "</", need to advance past it */ + parserutils_inputstream_advance(tokeniser->input, + tokeniser->context.chars.len); + tokeniser->context.chars.len = 0; - tokeniser->state = - HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; - hubbub_inputstream_advance(tokeniser->input); - } else { - /* Out of data */ - return false; + tokeniser->state = STATE_BOGUS_COMMENT; } } return true; } +/* this state expects tokeniser->context.current_tag to already have its + first character set */ bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser) { hubbub_tag *ctag = &tokeniser->context.current_tag; - uint32_t c 
= hubbub_inputstream_peek(tokeniser->input); - if (c == HUBBUB_INPUTSTREAM_OOD) - return false; - - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - tokeniser->state = - HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == '>') { - hubbub_token token; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if ('A' <= c && c <= 'Z') { - uint32_t pos; - size_t len; + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, tokeniser->context.chars.len, &len); - hubbub_inputstream_lowercase(tokeniser->input); + assert(tokeniser->context.chars.len > 0); +/* assert(tokeniser->context.chars.ptr[0] == '<'); */ + assert(ctag->name.len > 0); + assert(ctag->name.ptr); - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { + return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->state = STATE_DATA; + return emit_current_tag(tokeniser); + } - ctag->name.len += len; + uint8_t c = CHAR(cptr); - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; + COLLECT_NOBUF(tokeniser->context.chars, len); - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { + FINISH(ctag->name); - hubbub_tokeniser_emit_token(tokeniser, &token); + tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME; + } else if (c == '>') { + FINISH(ctag->name); - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + emit_current_tag(tokeniser); + tokeniser->state = STATE_DATA; + } else if (c == '\0') { + COLLECT_CHAR(ctag->name, u_fffd, 
sizeof(u_fffd)); } else if (c == '/') { - tokeniser->state = - HUBBUB_TOKENISER_STATE_SELF_CLOSING_START_TAG; - hubbub_inputstream_advance(tokeniser->input); + FINISH(ctag->name); + tokeniser->state = STATE_SELF_CLOSING_START_TAG; + } else if ('A' <= c && c <= 'Z') { + uint8_t lc = (c + 0x20); + COLLECT_CHAR(ctag->name, &lc, len); } else { - uint32_t pos; - size_t len; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - ctag->name.len += len; - - hubbub_inputstream_advance(tokeniser->input); + COLLECT(ctag->name, cptr, len); } return true; @@ -1085,76 +1241,35 @@ bool hubbub_tokeniser_handle_before_attribute_name( hubbub_tokeniser *tokeniser) { hubbub_tag *ctag = &tokeniser->context.current_tag; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); - - if (c == HUBBUB_INPUTSTREAM_OOD) - return false; - - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - hubbub_inputstream_advance(tokeniser->input); - } else if (c == '>') { - hubbub_token token; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if ('A' <= c && c <= 'Z') { - uint32_t pos; - size_t len; - hubbub_attribute *attr; - - hubbub_inputstream_lowercase(tokeniser->input); - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - attr = tokeniser->alloc(ctag->attributes, - (ctag->n_attributes + 1) * - sizeof(hubbub_attribute), - tokeniser->alloc_pw); - if (attr == NULL) { - /** \todo handle memory exhaustion */ - } - - ctag->attributes = attr; + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, tokeniser->context.chars.len, &len); - attr[ctag->n_attributes].ns = HUBBUB_NS_NULL; - attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF; - attr[ctag->n_attributes].name.data.off = pos; - 
attr[ctag->n_attributes].name.len = len; - attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF; - attr[ctag->n_attributes].value.data.off = 0; - attr[ctag->n_attributes].value.len = 0; + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { + return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->state = STATE_DATA; + return emit_current_tag(tokeniser); + } - ctag->n_attributes++; + uint8_t c = CHAR(cptr); - tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + COLLECT_NOBUF(tokeniser->context.chars, len); - hubbub_inputstream_advance(tokeniser->input); + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { + /* pass over in silence */ + } else if (c == '>') { + emit_current_tag(tokeniser); + tokeniser->state = STATE_DATA; } else if (c == '/') { - tokeniser->state = - HUBBUB_TOKENISER_STATE_SELF_CLOSING_START_TAG; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + tokeniser->state = STATE_SELF_CLOSING_START_TAG; } else { - uint32_t pos; - size_t len; hubbub_attribute *attr; - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + if (c == '"' || c == '\'' || c == '=') { + /** \todo parse error */ + } attr = tokeniser->alloc(ctag->attributes, (ctag->n_attributes + 1) * @@ -1166,19 +1281,24 @@ bool hubbub_tokeniser_handle_before_attribute_name( ctag->attributes = attr; + if ('A' <= c && c <= 'Z') { + uint8_t lc = (c + 0x20); + START_BUF(attr[ctag->n_attributes].name, &lc, len); + } else if (c == '\0') { + START_BUF(attr[ctag->n_attributes].name, + u_fffd, sizeof(u_fffd)); + } else { + START_BUF(attr[ctag->n_attributes].name, + (uint8_t *) cptr, len); + } + attr[ctag->n_attributes].ns = HUBBUB_NS_NULL; - 
attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF; - attr[ctag->n_attributes].name.data.off = pos; - attr[ctag->n_attributes].name.len = len; - attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF; - attr[ctag->n_attributes].value.data.off = 0; + attr[ctag->n_attributes].value.ptr = NULL; attr[ctag->n_attributes].value.len = 0; ctag->n_attributes++; - tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; - - hubbub_inputstream_advance(tokeniser->input); + tokeniser->state = STATE_ATTRIBUTE_NAME; } return true; @@ -1187,150 +1307,89 @@ bool hubbub_tokeniser_handle_before_attribute_name( bool hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser) { hubbub_tag *ctag = &tokeniser->context.current_tag; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); - - if (c == HUBBUB_INPUTSTREAM_OOD) - return false; - - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - tokeniser->state = - HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == '=') { - tokeniser->state = - HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == '>') { - hubbub_token token; - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, tokeniser->context.chars.len, &len); - hubbub_tokeniser_emit_token(tokeniser, &token); + assert(ctag->attributes[ctag->n_attributes - 1].name.len > 0); - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if ('A' <= c && c <= 'Z') { - uint32_t pos; - size_t len; + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { + return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->state = STATE_DATA; + return emit_current_tag(tokeniser); + } - hubbub_inputstream_lowercase(tokeniser->input); + uint8_t c = CHAR(cptr); 
- pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + COLLECT_NOBUF(tokeniser->context.chars, len); - ctag->attributes[ctag->n_attributes - 1].name.len += len; + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { + FINISH(ctag->attributes[ctag->n_attributes - 1].name); + tokeniser->state = STATE_AFTER_ATTRIBUTE_NAME; + } else if (c == '=') { + FINISH(ctag->attributes[ctag->n_attributes - 1].name); + tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE; + } else if (c == '>') { + FINISH(ctag->attributes[ctag->n_attributes - 1].name); - hubbub_inputstream_advance(tokeniser->input); + emit_current_tag(tokeniser); + tokeniser->state = STATE_DATA; } else if (c == '/') { - tokeniser->state = - HUBBUB_TOKENISER_STATE_SELF_CLOSING_START_TAG; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + FINISH(ctag->attributes[ctag->n_attributes - 1].name); + tokeniser->state = STATE_SELF_CLOSING_START_TAG; + } else if (c == '\0') { + COLLECT_CHAR(ctag->attributes[ctag->n_attributes - 1].name, + u_fffd, sizeof(u_fffd)); + } else if ('A' <= c && c <= 'Z') { + uint8_t lc = (c + 0x20); + COLLECT_CHAR(ctag->attributes[ctag->n_attributes - 1].name, + &lc, len); } else { - uint32_t pos; - size_t len; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - ctag->attributes[ctag->n_attributes - 1].name.len += len; - - hubbub_inputstream_advance(tokeniser->input); + COLLECT(ctag->attributes[ctag->n_attributes - 1].name, + cptr, len); } return true; } -bool hubbub_tokeniser_handle_after_attribute_name( - hubbub_tokeniser *tokeniser) +bool hubbub_tokeniser_handle_after_attribute_name(hubbub_tokeniser *tokeniser) { hubbub_tag *ctag = &tokeniser->context.current_tag; - 
uint32_t c = hubbub_inputstream_peek(tokeniser->input); - if (c == HUBBUB_INPUTSTREAM_OOD) + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, tokeniser->context.chars.len, &len); + + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->state = STATE_DATA; + return emit_current_tag(tokeniser); + } - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - hubbub_inputstream_advance(tokeniser->input); + uint8_t c = CHAR(cptr); + + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { + COLLECT_NOBUF(tokeniser->context.chars, len); } else if (c == '=') { - tokeniser->state = - HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE; - hubbub_inputstream_advance(tokeniser->input); + COLLECT_NOBUF(tokeniser->context.chars, len); + tokeniser->state = STATE_BEFORE_ATTRIBUTE_VALUE; } else if (c == '>') { - hubbub_token token; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; + COLLECT_NOBUF(tokeniser->context.chars, len); - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if ('A' <= c && c <= 'Z') { - uint32_t pos; - size_t len; - hubbub_attribute *attr; - - hubbub_inputstream_lowercase(tokeniser->input); - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - attr = tokeniser->alloc(ctag->attributes, - (ctag->n_attributes + 1) * - sizeof(hubbub_attribute), - tokeniser->alloc_pw); - if (attr == NULL) { - /** \todo handle memory exhaustion */ - } - - ctag->attributes = attr; - - attr[ctag->n_attributes].ns = HUBBUB_NS_NULL; - attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF; - attr[ctag->n_attributes].name.data.off = pos; - attr[ctag->n_attributes].name.len = len; - attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF; - attr[ctag->n_attributes].value.data.off = 0; - 
attr[ctag->n_attributes].value.len = 0; - - ctag->n_attributes++; - - tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; - - hubbub_inputstream_advance(tokeniser->input); + emit_current_tag(tokeniser); + tokeniser->state = STATE_DATA; } else if (c == '/') { - /** \todo permitted slash */ - tokeniser->state = - HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + COLLECT_NOBUF(tokeniser->context.chars, len); + tokeniser->state = STATE_SELF_CLOSING_START_TAG; } else { - uint32_t pos; - size_t len; hubbub_attribute *attr; - hubbub_inputstream_lowercase(tokeniser->input); - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + if (c == '"' || c == '\'' || c == '=') { + /** \todo parse error */ + } attr = tokeniser->alloc(ctag->attributes, (ctag->n_attributes + 1) * @@ -1342,76 +1401,74 @@ bool hubbub_tokeniser_handle_after_attribute_name( ctag->attributes = attr; + if ('A' <= c && c <= 'Z') { + uint8_t lc = (c + 0x20); + START_BUF(attr[ctag->n_attributes].name, &lc, len); + } else if (c == '\0') { + START_BUF(attr[ctag->n_attributes].name, + u_fffd, sizeof(u_fffd)); + } else { + START_BUF(attr[ctag->n_attributes].name, + (uint8_t *)cptr, len); + } + attr[ctag->n_attributes].ns = HUBBUB_NS_NULL; - attr[ctag->n_attributes].name.type = HUBBUB_STRING_OFF; - attr[ctag->n_attributes].name.data.off = pos; - attr[ctag->n_attributes].name.len = len; - attr[ctag->n_attributes].value.type = HUBBUB_STRING_OFF; - attr[ctag->n_attributes].value.data.off = 0; + attr[ctag->n_attributes].value.ptr = NULL; attr[ctag->n_attributes].value.len = 0; ctag->n_attributes++; - tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; - - 
hubbub_inputstream_advance(tokeniser->input); + COLLECT_NOBUF(tokeniser->context.chars, len); + tokeniser->state = STATE_ATTRIBUTE_NAME; } return true; } +/* this state is only ever triggered by an '=' */ bool hubbub_tokeniser_handle_before_attribute_value( hubbub_tokeniser *tokeniser) { hubbub_tag *ctag = &tokeniser->context.current_tag; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); - if (c == HUBBUB_INPUTSTREAM_OOD) + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, tokeniser->context.chars.len, &len); + + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->state = STATE_DATA; + return emit_current_tag(tokeniser); + } + + uint8_t c = CHAR(cptr); - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - hubbub_inputstream_advance(tokeniser->input); + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { + COLLECT_NOBUF(tokeniser->context.chars, len); } else if (c == '"') { - tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ; - hubbub_inputstream_advance(tokeniser->input); + COLLECT_NOBUF(tokeniser->context.chars, len); + tokeniser->state = STATE_ATTRIBUTE_VALUE_DQ; } else if (c == '&') { - tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ; + tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ; } else if (c == '\'') { - tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ; - hubbub_inputstream_advance(tokeniser->input); + COLLECT_NOBUF(tokeniser->context.chars, len); + tokeniser->state = STATE_ATTRIBUTE_VALUE_SQ; } else if (c == '>') { - hubbub_token token; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit 
current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + COLLECT_NOBUF(tokeniser->context.chars, len); + + emit_current_tag(tokeniser); + tokeniser->state = STATE_DATA; + } else if (c == '\0') { + COLLECT_NOBUF(tokeniser->context.chars, len); + START_BUF(ctag->attributes[ctag->n_attributes - 1].value, + u_fffd, sizeof(u_fffd)); + tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ; } else { - uint32_t pos; - size_t len; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - ctag->attributes[ctag->n_attributes - 1].value.data.off = pos; - ctag->attributes[ctag->n_attributes - 1].value.len = len; - - tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ; - - hubbub_inputstream_advance(tokeniser->input); + COLLECT_NOBUF(tokeniser->context.chars, len); + START_BUF(ctag->attributes[ctag->n_attributes - 1].value, + (uint8_t *)cptr, len); + tokeniser->state = STATE_ATTRIBUTE_VALUE_UQ; } return true; @@ -1420,45 +1477,39 @@ bool hubbub_tokeniser_handle_before_attribute_value( bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser) { hubbub_tag *ctag = &tokeniser->context.current_tag; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); - if (c == HUBBUB_INPUTSTREAM_OOD) + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, tokeniser->context.chars.len, &len); + + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + FINISH(ctag->attributes[ctag->n_attributes - 1].value); + + tokeniser->state = STATE_DATA; + return emit_current_tag(tokeniser); + } + + uint8_t c = CHAR(cptr); if (c == '"') { - tokeniser->state = - HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_VALUE_Q; - hubbub_inputstream_advance(tokeniser->input); + COLLECT_NOBUF(tokeniser->context.chars, len); + 
FINISH(ctag->attributes[ctag->n_attributes - 1].value); + tokeniser->state = STATE_AFTER_ATTRIBUTE_VALUE_Q; } else if (c == '&') { tokeniser->context.prev_state = tokeniser->state; - tokeniser->state = - HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE; + tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE; tokeniser->context.allowed_char = '"'; /* Don't eat the '&'; it'll be handled by entity consumption */ - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else if (c == '\0') { + COLLECT_NOBUF(tokeniser->context.chars, len); + COLLECT_CHAR(ctag->attributes[ctag->n_attributes - 1].value, + u_fffd, sizeof(u_fffd)); } else { - uint32_t pos; - size_t len; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { - ctag->attributes[ctag->n_attributes - 1].value.data.off = - pos; - } - - ctag->attributes[ctag->n_attributes - 1].value.len += len; - - hubbub_inputstream_advance(tokeniser->input); + COLLECT_NOBUF(tokeniser->context.chars, len); + COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value, + cptr, len); } return true; @@ -1467,45 +1518,40 @@ bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser) bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser) { hubbub_tag *ctag = &tokeniser->context.current_tag; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); - if (c == HUBBUB_INPUTSTREAM_OOD) + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, tokeniser->context.chars.len, &len); + + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + FINISH(ctag->attributes[ctag->n_attributes - 
1].value); + + tokeniser->state = STATE_DATA; + return emit_current_tag(tokeniser); + } + + uint8_t c = CHAR(cptr); if (c == '\'') { + COLLECT_NOBUF(tokeniser->context.chars, len); + FINISH(ctag->attributes[ctag->n_attributes - 1].value); tokeniser->state = - HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_VALUE_Q; - hubbub_inputstream_advance(tokeniser->input); + STATE_AFTER_ATTRIBUTE_VALUE_Q; } else if (c == '&') { tokeniser->context.prev_state = tokeniser->state; - tokeniser->state = - HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE; + tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE; tokeniser->context.allowed_char = '\''; /* Don't eat the '&'; it'll be handled by entity consumption */ - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else if (c == '\0') { + COLLECT_NOBUF(tokeniser->context.chars, len); + COLLECT_CHAR(ctag->attributes[ctag->n_attributes - 1].value, + u_fffd, sizeof(u_fffd)); } else { - uint32_t pos; - size_t len; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { - ctag->attributes[ctag->n_attributes - 1].value.data.off = - pos; - } - - ctag->attributes[ctag->n_attributes - 1].value.len += len; - - hubbub_inputstream_advance(tokeniser->input); + COLLECT_NOBUF(tokeniser->context.chars, len); + COLLECT_MS(ctag->attributes[ctag->n_attributes - 1].value, + cptr, len); } return true; @@ -1514,51 +1560,49 @@ bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser) bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser) { hubbub_tag *ctag = &tokeniser->context.current_tag; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + uint8_t c; - if (c == 
HUBBUB_INPUTSTREAM_OOD) + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, tokeniser->context.chars.len, &len); + + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + FINISH(ctag->attributes[ctag->n_attributes - 1].value); - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - tokeniser->state = - HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; - hubbub_inputstream_advance(tokeniser->input); + tokeniser->state = STATE_DATA; + return emit_current_tag(tokeniser); + } + + c = CHAR(cptr); + + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { + COLLECT_NOBUF(tokeniser->context.chars, len); + FINISH(ctag->attributes[ctag->n_attributes - 1].value); + tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME; } else if (c == '&') { tokeniser->context.prev_state = tokeniser->state; - tokeniser->state = - HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE; + tokeniser->state = STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE; /* Don't eat the '&'; it'll be handled by entity consumption */ } else if (c == '>') { - hubbub_token token; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + COLLECT_NOBUF(tokeniser->context.chars, len); + FINISH(ctag->attributes[ctag->n_attributes - 1].value); + + emit_current_tag(tokeniser); + tokeniser->state = STATE_DATA; + } else if (c == '\0') { + COLLECT_NOBUF(tokeniser->context.chars, len); + 
COLLECT_CHAR(ctag->attributes[ctag->n_attributes - 1].value, + u_fffd, sizeof(u_fffd)); } else { - uint32_t pos; - size_t len; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - /* don't worry about setting the offset -- this is - * always done before this state is reached */ - ctag->attributes[ctag->n_attributes - 1].value.len += len; + if (c == '"' || c == '\'' || c == '=') { + /** \todo parse error */ + } - hubbub_inputstream_advance(tokeniser->input); + COLLECT_NOBUF(tokeniser->context.chars, len); + COLLECT(ctag->attributes[ctag->n_attributes - 1].value, + cptr, len); } return true; @@ -1567,80 +1611,87 @@ bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser) bool hubbub_tokeniser_handle_character_reference_in_attribute_value( hubbub_tokeniser *tokeniser) { - hubbub_tag *ctag = &tokeniser->context.current_tag; - uint32_t pos; - size_t len; - if (tokeniser->context.match_entity.complete == false) { - return hubbub_tokeniser_consume_character_reference(tokeniser); + return hubbub_tokeniser_consume_character_reference(tokeniser, + tokeniser->context.chars.len); } else { -#ifndef NDEBUG - uint32_t c = hubbub_inputstream_peek(tokeniser->input); - - assert(c != HUBBUB_INPUTSTREAM_OOD && - c != HUBBUB_INPUTSTREAM_EOF); -#endif - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + hubbub_tag *ctag = &tokeniser->context.current_tag; + hubbub_attribute *attr = &ctag->attributes[ + ctag->n_attributes - 1]; + + uint8_t utf8[6]; + uint8_t *utf8ptr = utf8; + size_t len = sizeof(utf8); + + if (tokeniser->context.match_entity.codepoint) { + parserutils_charset_utf8_from_ucs4( + tokeniser->context.match_entity.codepoint, + &utf8ptr, &len); + + /* +1 for the ampersand */ + COLLECT_NOBUF(tokeniser->context.chars, + tokeniser->context.match_entity.length + + 1); + + if (attr->value.len == 0) { + START_BUF(attr->value, + utf8, sizeof(utf8) - len); + } else { + SWITCH(attr->value); + COLLECT(attr->value, utf8, sizeof(utf8) - len); + } + 
} else { + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, + tokeniser->context.chars.len, &len); - if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { - ctag->attributes[ctag->n_attributes - 1].value.data.off = - pos; + /* Insert the ampersand */ + COLLECT(tokeniser->context.chars, cptr, len); + COLLECT_MS(attr->value, cptr, len); } - ctag->attributes[ctag->n_attributes - 1].value.len += len; - /* Reset for next time */ tokeniser->context.match_entity.complete = false; /* And back to the previous state */ tokeniser->state = tokeniser->context.prev_state; - - hubbub_inputstream_advance(tokeniser->input); } return true; } +/* always switches state */ bool hubbub_tokeniser_handle_after_attribute_value_q( hubbub_tokeniser *tokeniser) { - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->state = STATE_DATA; + return emit_current_tag(tokeniser); + } - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - tokeniser->state = - HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == '>') { - hubbub_token token; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; + uint8_t c = CHAR(cptr); - hubbub_tokeniser_emit_token(tokeniser, &token); + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { + COLLECT_NOBUF(tokeniser->context.chars, len); + tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME; + } else if (c == '>') { + COLLECT_NOBUF(tokeniser->context.chars, len); - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); + emit_current_tag(tokeniser); + 
tokeniser->state = STATE_DATA; } else if (c == '/') { - tokeniser->state = - HUBBUB_TOKENISER_STATE_SELF_CLOSING_START_TAG; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + COLLECT_NOBUF(tokeniser->context.chars, len); + tokeniser->state = STATE_SELF_CLOSING_START_TAG; } else { - tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + /** \todo parse error */ + tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME; } return true; @@ -1649,106 +1700,106 @@ bool hubbub_tokeniser_handle_after_attribute_value_q( bool hubbub_tokeniser_handle_self_closing_start_tag( hubbub_tokeniser *tokeniser) { - hubbub_tag *ctag = &tokeniser->context.current_tag; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->state = STATE_DATA; + return emit_current_tag(tokeniser); + } - if (c == '>') { - hubbub_token token; - - ctag->self_closing = true; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; - - hubbub_tokeniser_emit_token(tokeniser, &token); + uint8_t c = CHAR(cptr); - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit current tag */ - token.type = tokeniser->context.current_tag_type; - token.data.tag = tokeniser->context.current_tag; + if (c == '>') { + COLLECT_NOBUF(tokeniser->context.chars, len); - 
hubbub_tokeniser_emit_token(tokeniser, &token); + tokeniser->context.current_tag.self_closing = true; + emit_current_tag(tokeniser); - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + tokeniser->state = STATE_DATA; } else { - tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + tokeniser->state = STATE_BEFORE_ATTRIBUTE_NAME; } return true; } - +/* this state expects tokeniser->context.chars to be empty on first entry */ bool hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser) { - hubbub_token token; - uint32_t c; - - while ((c = hubbub_inputstream_peek(tokeniser->input)) != - HUBBUB_INPUTSTREAM_EOF && - c != HUBBUB_INPUTSTREAM_OOD) { - uint32_t pos; - size_t len; + hubbub_string *comment = &tokeniser->context.current_comment; - if (c == '>') { - hubbub_inputstream_advance(tokeniser->input); - break; - } - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - if (tokeniser->context.current_comment.len == 0) - tokeniser->context.current_comment.data.off = pos; - tokeniser->context.current_comment.len += len; - - hubbub_inputstream_advance(tokeniser->input); - } + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->state = STATE_DATA; + tokeniser->context.current_comment.ptr = + tokeniser->buffer->data; + return emit_current_comment(tokeniser); + } - /* Emit comment */ - token.type = HUBBUB_TOKEN_COMMENT; - token.data.comment = tokeniser->context.current_comment; + uint8_t c = CHAR(cptr); - hubbub_tokeniser_emit_token(tokeniser, &token); + COLLECT_MS_NOBUF(tokeniser->context.chars, cptr, len); - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + if (c == '>') { + tokeniser->context.current_comment.ptr = + tokeniser->buffer->data; + emit_current_comment(tokeniser); + tokeniser->state = STATE_DATA; + } else if 
(c == '\0') { + parserutils_buffer_append(tokeniser->buffer, + u_fffd, sizeof(u_fffd)); + comment->len += sizeof(u_fffd); + } else { + parserutils_buffer_append(tokeniser->buffer, + (uint8_t *)cptr, len); + comment->len += len; + } return true; } +/* this state always switches to another state straight away */ bool hubbub_tokeniser_handle_markup_declaration_open( hubbub_tokeniser *tokeniser) { - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + 0, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->state = STATE_BOGUS_COMMENT; + return true; + } + + uint8_t c = CHAR(cptr); if (c == '-') { - tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_COMMENT; - hubbub_inputstream_advance(tokeniser->input); + tokeniser->context.chars.ptr = (uint8_t *) cptr; + tokeniser->context.chars.len = len; + tokeniser->state = STATE_MATCH_COMMENT; } else if ((c & ~0x20) == 'D') { - tokeniser->context.match_doctype.count = 1; - tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE; - hubbub_inputstream_advance(tokeniser->input); + tokeniser->context.chars.ptr = (uint8_t *) cptr; + tokeniser->context.chars.len = len; + tokeniser->context.match_doctype.count = len; + tokeniser->state = STATE_MATCH_DOCTYPE; } else if (tokeniser->process_cdata_section == true && c == '[') { - tokeniser->context.match_cdata.count = 1; - tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_CDATA; - hubbub_inputstream_advance(tokeniser->input); + tokeniser->context.chars.ptr = (uint8_t *) cptr; + tokeniser->context.chars.len = len; + tokeniser->context.match_cdata.count = len; + tokeniser->state = STATE_MATCH_CDATA; } else { - tokeniser->context.current_comment.data.off = 0; - tokeniser->context.current_comment.len = 0; - - tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + tokeniser->state = 
STATE_BOGUS_COMMENT; } return true; @@ -1757,400 +1808,238 @@ bool hubbub_tokeniser_handle_markup_declaration_open( bool hubbub_tokeniser_handle_match_comment(hubbub_tokeniser *tokeniser) { - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->context.chars.len = + tokeniser->context.current_comment.len = + 0; + tokeniser->state = STATE_BOGUS_COMMENT; + return true; + } - tokeniser->context.current_comment.data.off = 0; - tokeniser->context.current_comment.len = 0; + tokeniser->context.chars.len = + tokeniser->context.current_comment.len = + 0; - if (c == '-') { - tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_START; - hubbub_inputstream_advance(tokeniser->input); + if (CHAR(cptr) == '-') { + parserutils_inputstream_advance(tokeniser->input, SLEN("--")); + tokeniser->state = STATE_COMMENT_START; } else { - /* Rewind to the first '-' */ - hubbub_inputstream_rewind(tokeniser->input, 1); - - tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + tokeniser->state = STATE_BOGUS_COMMENT; } return true; } -bool hubbub_tokeniser_handle_comment_start(hubbub_tokeniser *tokeniser) +bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser) { - uint32_t c = hubbub_inputstream_peek(tokeniser->input); - - if (c == HUBBUB_INPUTSTREAM_OOD) - return false; - - if (c == '-') { - tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_START_DASH; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == '>') { - hubbub_token token; - - /* Emit comment */ - token.type = HUBBUB_TOKEN_COMMENT; - token.data.comment = tokeniser->context.current_comment; - - hubbub_tokeniser_emit_token(tokeniser, &token); + hubbub_string *comment = &tokeniser->context.current_comment; - tokeniser->state = 
HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit comment */ - token.type = HUBBUB_TOKEN_COMMENT; - token.data.comment = tokeniser->context.current_comment; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - } else { - uint32_t pos; - size_t len; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - if (tokeniser->context.current_comment.len == 0) - tokeniser->context.current_comment.data.off = pos; - tokeniser->context.current_comment.len += len; - - tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT; - hubbub_inputstream_advance(tokeniser->input); - } - - return true; -} - -bool hubbub_tokeniser_handle_comment_start_dash(hubbub_tokeniser *tokeniser) -{ - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek( + tokeniser->input, tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; - - if (c == '-') { - tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == '>') { - hubbub_token token; - - /* Emit comment */ - token.type = HUBBUB_TOKEN_COMMENT; - token.data.comment = tokeniser->context.current_comment; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit comment */ - token.type = HUBBUB_TOKEN_COMMENT; - token.data.comment = tokeniser->context.current_comment; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - } else { - uint32_t pos; - size_t len; - - /* In order to get to this state, the previous character must - * be '-'. 
This means we can safely rewind and add to the - * comment buffer. */ - - hubbub_inputstream_rewind(tokeniser->input, 1); - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - if (tokeniser->context.current_comment.len == 0) - tokeniser->context.current_comment.data.off = pos; - - tokeniser->context.current_comment.len += len; - hubbub_inputstream_advance(tokeniser->input); - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - tokeniser->context.current_comment.len += len; - hubbub_inputstream_advance(tokeniser->input); - - tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->context.current_comment.ptr = + tokeniser->buffer->data; + emit_current_comment(tokeniser); + tokeniser->state = STATE_DATA; + return true; } - return true; -} - -bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser) -{ - uint32_t c = hubbub_inputstream_peek(tokeniser->input); - - if (c == HUBBUB_INPUTSTREAM_OOD) - return false; + uint8_t c = CHAR(cptr); - if (c == '-') { - tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END_DASH; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; + if (c == '>' && (tokeniser->state == STATE_COMMENT_START_DASH || + tokeniser->state == STATE_COMMENT_START || + tokeniser->state == STATE_COMMENT_END)) { + COLLECT_MS_NOBUF(tokeniser->context.chars, cptr, len); - /* Emit comment */ - token.type = HUBBUB_TOKEN_COMMENT; - token.data.comment = tokeniser->context.current_comment; + /** \todo parse error if state != COMMENT_END */ + tokeniser->context.current_comment.ptr = + tokeniser->buffer->data; + emit_current_comment(tokeniser); - hubbub_tokeniser_emit_token(tokeniser, &token); + tokeniser->state = STATE_DATA; + } else if (c == '-') { + if (tokeniser->state == STATE_COMMENT_START) { + tokeniser->state = STATE_COMMENT_START_DASH; + } else if (tokeniser->state == STATE_COMMENT_START_DASH) { + 
tokeniser->state = STATE_COMMENT_END; + } else if (tokeniser->state == STATE_COMMENT) { + tokeniser->state = STATE_COMMENT_END_DASH; + } else if (tokeniser->state == STATE_COMMENT_END_DASH) { + tokeniser->state = STATE_COMMENT_END; + } else if (tokeniser->state == STATE_COMMENT_END) { + parserutils_buffer_append(tokeniser->buffer, + (uint8_t *) "-", SLEN("-")); + comment->len += SLEN("-"); + } - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + COLLECT_MS_NOBUF(tokeniser->context.chars, cptr, len); } else { - uint32_t pos; - size_t len; + if (tokeniser->state == STATE_COMMENT_START_DASH || + tokeniser->state == STATE_COMMENT_END_DASH) { + parserutils_buffer_append(tokeniser->buffer, + (uint8_t *) "-", SLEN("-")); + comment->len += SLEN("-"); + } else if (tokeniser->state == STATE_COMMENT_END) { + parserutils_buffer_append(tokeniser->buffer, + (uint8_t *) "--", SLEN("--")); + comment->len += SLEN("--"); + } - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - tokeniser->context.current_comment.len += len; + if (c == '\0') { + parserutils_buffer_append(tokeniser->buffer, + u_fffd, sizeof(u_fffd)); + comment->len += sizeof(u_fffd); + } else if (c == '\r') { + cptr = parserutils_inputstream_peek( + tokeniser->input, + tokeniser->context.chars.len + 1, + &len); + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { + return false; + } else if (cptr != PARSERUTILS_INPUTSTREAM_EOF && + CHAR(cptr) != '\n') { + parserutils_buffer_append(tokeniser->buffer, + &lf, sizeof(lf)); + comment->len += sizeof(lf); + } + } else { + parserutils_buffer_append(tokeniser->buffer, + (uint8_t *)cptr, len); + comment->len += len; + } - hubbub_inputstream_advance(tokeniser->input); + COLLECT_MS_NOBUF(tokeniser->context.chars, cptr, len); + tokeniser->state = STATE_COMMENT; } return true; } -bool hubbub_tokeniser_handle_comment_end_dash(hubbub_tokeniser *tokeniser) -{ - uint32_t c = hubbub_inputstream_peek(tokeniser->input); - - if (c == HUBBUB_INPUTSTREAM_OOD) - return false; - - if (c == '-') 
{ - tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit comment */ - token.type = HUBBUB_TOKEN_COMMENT; - token.data.comment = tokeniser->context.current_comment; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - } else { - uint32_t pos; - size_t len; - /* In order to get to this state, the previous character must - * be '-'. This means we can safely rewind and add 1 to the - * comment buffer. */ - hubbub_inputstream_rewind(tokeniser->input, 1); - tokeniser->context.current_comment.len += 1; - /* Now add the input char */ - hubbub_inputstream_advance(tokeniser->input); - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - tokeniser->context.current_comment.len += len; - hubbub_inputstream_advance(tokeniser->input); +#define DOCTYPE "DOCTYPE" +#define DOCTYPE_LEN (SLEN(DOCTYPE) - 1) - tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT; - } - - return true; -} - -bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser) +bool hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser) { - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.match_doctype.count, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->context.current_comment.len = + tokeniser->context.chars.len = + 0; + tokeniser->state = STATE_BOGUS_COMMENT; + return true; + } - if (c == '>') { - hubbub_token token; - - /* Emit comment */ - token.type = HUBBUB_TOKEN_COMMENT; - token.data.comment = tokeniser->context.current_comment; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - 
} else if (c == '-') { - uint32_t pos; - size_t len; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - if (tokeniser->context.current_comment.len == 0) { - tokeniser->context.current_comment.data.off = pos; - } - - tokeniser->context.current_comment.len = len; - - tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit comment */ - token.type = HUBBUB_TOKEN_COMMENT; - token.data.comment = tokeniser->context.current_comment; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - } else { - uint32_t pos; - size_t len; - - /* In order to have got here, the previous two characters - * must be '--', so rewind two characters */ - hubbub_inputstream_rewind(tokeniser->input, 2); - - /* Add first '-' */ - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - if (tokeniser->context.current_comment.len == 0) - tokeniser->context.current_comment.data.off = pos; - tokeniser->context.current_comment.len += len; - hubbub_inputstream_advance(tokeniser->input); - - /* Add second '-' */ - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - tokeniser->context.current_comment.len += len; - hubbub_inputstream_advance(tokeniser->input); + uint8_t c = CHAR(cptr); - /* Add input character */ - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - tokeniser->context.current_comment.len += len; - hubbub_inputstream_advance(tokeniser->input); + assert(tokeniser->context.match_doctype.count <= DOCTYPE_LEN); - tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT; + if (DOCTYPE[tokeniser->context.match_doctype.count] != (c & ~0x20)) { + tokeniser->context.current_comment.len = + tokeniser->context.chars.len = + 0; + tokeniser->state = STATE_BOGUS_COMMENT; + return true; } - return true; -} - -bool hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser) -{ - uint32_t c = 
hubbub_inputstream_peek(tokeniser->input); + COLLECT_NOBUF(tokeniser->context.chars, len); - if (c == HUBBUB_INPUTSTREAM_OOD) - return false; - - if (tokeniser->context.match_doctype.count == 1 && - (c & ~0x20) == 'O') { - tokeniser->context.match_doctype.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_doctype.count == 2 && - (c & ~0x20) == 'C') { - tokeniser->context.match_doctype.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_doctype.count == 3 && - (c & ~0x20) == 'T') { - tokeniser->context.match_doctype.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_doctype.count == 4 && - (c & ~0x20) == 'Y') { - tokeniser->context.match_doctype.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_doctype.count == 5 && - (c & ~0x20) == 'P') { - tokeniser->context.match_doctype.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_doctype.count == 6 && - (c & ~0x20) == 'E') { - tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE; - hubbub_inputstream_advance(tokeniser->input); - } else { - /* Rewind as many characters as have been matched */ - hubbub_inputstream_rewind(tokeniser->input, - tokeniser->context.match_doctype.count); + if (tokeniser->context.match_doctype.count == DOCTYPE_LEN) { + /* Skip over the DOCTYPE bit */ + parserutils_inputstream_advance(tokeniser->input, + tokeniser->context.chars.len); - tokeniser->context.current_comment.data.off = 0; - tokeniser->context.current_comment.len = 0; + memset(&tokeniser->context.current_doctype, 0, + sizeof tokeniser->context.current_doctype); + tokeniser->context.current_doctype.public_missing = true; + tokeniser->context.current_doctype.system_missing = true; + tokeniser->context.chars.len = 0; - tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + tokeniser->state = STATE_DOCTYPE; } + 
tokeniser->context.match_doctype.count++; + return true; } +#undef DOCTYPE +#undef DOCTYPE_LEN + bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser) { - hubbub_doctype *cdoc = &tokeniser->context.current_doctype; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->state = STATE_BEFORE_DOCTYPE_NAME; + return true; + } - memset(cdoc, 0, sizeof *cdoc); - cdoc->name.type = HUBBUB_STRING_OFF; - cdoc->public_missing = true; - cdoc->public_id.type = HUBBUB_STRING_OFF; - cdoc->system_missing = true; - cdoc->system_id.type = HUBBUB_STRING_OFF; + uint8_t c = CHAR(cptr); - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - hubbub_inputstream_advance(tokeniser->input); + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { + COLLECT_MS_NOBUF(tokeniser->context.chars, cptr, len); } - tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME; + tokeniser->state = STATE_BEFORE_DOCTYPE_NAME; return true; } -bool hubbub_tokeniser_handle_before_doctype_name( - hubbub_tokeniser *tokeniser) +bool hubbub_tokeniser_handle_before_doctype_name(hubbub_tokeniser *tokeniser) { hubbub_doctype *cdoc = &tokeniser->context.current_doctype; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + /* Emit current doctype, force-quirks on */ + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + return true; + } - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - 
hubbub_inputstream_advance(tokeniser->input); - } else if (c == '>') { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); + uint8_t c = CHAR(cptr); + COLLECT_MS_NOBUF(tokeniser->context.chars, cptr, len); - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { + /* pass over in silence */ + } else if (c == '>') { + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; } else { - uint32_t pos; - size_t len; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - cdoc->name.data.off = pos; - cdoc->name.len = len; - - tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_NAME; + if (c == '\0') { + START_BUF(cdoc->name, u_fffd, sizeof(u_fffd)); + } else { + START_BUF(cdoc->name, (uint8_t *) cptr, len); + } - hubbub_inputstream_advance(tokeniser->input); + tokeniser->state = STATE_DOCTYPE_NAME; } return true; @@ -2159,45 +2048,36 @@ bool hubbub_tokeniser_handle_before_doctype_name( bool hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser) { hubbub_doctype *cdoc = &tokeniser->context.current_doctype; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + 
FINISH(cdoc->name); - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == '>') { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + return true; + } - hubbub_tokeniser_emit_token(tokeniser, &token); + uint8_t c = CHAR(cptr); + COLLECT_NOBUF(tokeniser->context.chars, len); - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { + FINISH(cdoc->name); + tokeniser->state = STATE_AFTER_DOCTYPE_NAME; + } else if (c == '>') { + FINISH(cdoc->name); + emit_current_doctype(tokeniser, false); + tokeniser->state = STATE_DATA; } else { - uint32_t pos; - size_t len; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - cdoc->name.len += len; - - hubbub_inputstream_advance(tokeniser->input); + if (c == '\0') { + COLLECT_CHAR(cdoc->name, u_fffd, sizeof(u_fffd)); + } else { + COLLECT(cdoc->name, cptr, len); + } } return true; @@ -2205,141 +2085,115 @@ bool hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser) bool hubbub_tokeniser_handle_after_doctype_name(hubbub_tokeniser *tokeniser) { - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == 
PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + return true; + } - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - hubbub_inputstream_advance(tokeniser->input); - } else if (c == '>') { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); + uint8_t c = CHAR(cptr); + COLLECT_NOBUF(tokeniser->context.chars, len); - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { + /* pass over in silence */ + } else if (c == '>') { + emit_current_doctype(tokeniser, false); + tokeniser->state = STATE_DATA; } else if ((c & ~0x20) == 'P') { - tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_PUBLIC; tokeniser->context.match_doctype.count = 1; - hubbub_inputstream_advance(tokeniser->input); + tokeniser->state = STATE_MATCH_PUBLIC; } else if ((c & ~0x20) == 'S') { - tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_SYSTEM; tokeniser->context.match_doctype.count = 1; - hubbub_inputstream_advance(tokeniser->input); + tokeniser->state = STATE_MATCH_SYSTEM; } else { - tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE; + tokeniser->state = STATE_BOGUS_DOCTYPE; tokeniser->context.current_doctype.force_quirks = true; - - hubbub_inputstream_advance(tokeniser->input); } return true; } +#define PUBLIC "PUBLIC" +#define PUBLIC_LEN (SLEN(PUBLIC) - 1) bool 
hubbub_tokeniser_handle_match_public(hubbub_tokeniser *tokeniser) { - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->context.current_doctype.force_quirks = true; + tokeniser->state = STATE_BOGUS_DOCTYPE; + return true; + } - if (tokeniser->context.match_doctype.count == 1 && - (c & ~0x20) == 'U') { - tokeniser->context.match_doctype.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_doctype.count == 2 && - (c & ~0x20) == 'B') { - tokeniser->context.match_doctype.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_doctype.count == 3 && - (c & ~0x20) == 'L') { - tokeniser->context.match_doctype.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_doctype.count == 4 && - (c & ~0x20) == 'I') { - tokeniser->context.match_doctype.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_doctype.count == 5 && - (c & ~0x20) == 'C') { - tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_PUBLIC; - hubbub_inputstream_advance(tokeniser->input); - } else { - /* Rewind as many characters as have been matched */ - hubbub_inputstream_rewind(tokeniser->input, - tokeniser->context.match_doctype.count); + uint8_t c = CHAR(cptr); + + assert(tokeniser->context.match_doctype.count <= PUBLIC_LEN); - tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE; + if (PUBLIC[tokeniser->context.match_doctype.count] != (c & ~0x20)) { tokeniser->context.current_doctype.force_quirks = true; + tokeniser->state = STATE_BOGUS_DOCTYPE; + return true; } - return true; -} + COLLECT_NOBUF(tokeniser->context.chars, len); + + if 
(tokeniser->context.match_doctype.count == PUBLIC_LEN) { + tokeniser->state = STATE_BEFORE_DOCTYPE_PUBLIC; + } + tokeniser->context.match_doctype.count++; + return true; +} +#undef PUBLIC +#undef PUBLIC_LEN bool hubbub_tokeniser_handle_before_doctype_public(hubbub_tokeniser *tokeniser) { hubbub_doctype *cdoc = &tokeniser->context.current_doctype; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + return true; + } + + uint8_t c = CHAR(cptr); + COLLECT_NOBUF(tokeniser->context.chars, len); - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - hubbub_inputstream_advance(tokeniser->input); + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { + /* pass over in silence */ } else if (c == '"') { cdoc->public_missing = false; - tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_PUBLIC_DQ; - hubbub_inputstream_advance(tokeniser->input); + cdoc->public_id.len = 0; + tokeniser->state = STATE_DOCTYPE_PUBLIC_DQ; } else if (c == '\'') { cdoc->public_missing = false; - tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_PUBLIC_SQ; - hubbub_inputstream_advance(tokeniser->input); + cdoc->public_id.len = 0; + tokeniser->state = STATE_DOCTYPE_PUBLIC_SQ; } else if (c == '>') { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit doctype */ - token.type = 
HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; } else { cdoc->force_quirks = true; - tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE; - hubbub_inputstream_advance(tokeniser->input); + tokeniser->state = STATE_BOGUS_DOCTYPE; } return true; @@ -2348,49 +2202,37 @@ bool hubbub_tokeniser_handle_before_doctype_public(hubbub_tokeniser *tokeniser) bool hubbub_tokeniser_handle_doctype_public_dq(hubbub_tokeniser *tokeniser) { hubbub_doctype *cdoc = &tokeniser->context.current_doctype; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + FINISH(cdoc->public_id); + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + return true; + } + + uint8_t c = CHAR(cptr); + COLLECT_NOBUF(tokeniser->context.chars, len); if (c == '"') { - tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_PUBLIC; - hubbub_inputstream_advance(tokeniser->input); + FINISH(cdoc->public_id); + tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC; } else if (c == '>') { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = 
tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + FINISH(cdoc->public_id); + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + } else if (c == '\0') { /* NUL is replaced by U+FFFD in the public identifier */ + if (cdoc->public_id.len == 0) { + START_BUF(cdoc->public_id, u_fffd, sizeof(u_fffd)); + } else { + COLLECT_CHAR(cdoc->public_id, u_fffd, sizeof(u_fffd)); + } } else { - uint32_t pos; - size_t len; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - if (cdoc->public_id.len == 0) - cdoc->public_id.data.off = pos; - - cdoc->public_id.len += len; - - hubbub_inputstream_advance(tokeniser->input); + COLLECT_MS(cdoc->public_id, cptr, len); } return true; @@ -2399,49 +2241,39 @@ bool hubbub_tokeniser_handle_doctype_public_dq(hubbub_tokeniser *tokeniser) bool hubbub_tokeniser_handle_doctype_public_sq(hubbub_tokeniser *tokeniser) { hubbub_doctype *cdoc = &tokeniser->context.current_doctype; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + FINISH(cdoc->public_id); + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + return true; + } + + uint8_t c = CHAR(cptr); + COLLECT_NOBUF(tokeniser->context.chars, len); if (c == '\'') { - tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_PUBLIC; - hubbub_inputstream_advance(tokeniser->input); + FINISH(cdoc->public_id); + tokeniser->state = STATE_AFTER_DOCTYPE_PUBLIC; } else if (c == '>') { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - 
tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + FINISH(cdoc->public_id); + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + } else if (c == '\0') { + if (cdoc->public_id.len == 0) { + START_BUF(cdoc->public_id, + u_fffd, sizeof(u_fffd)); + } else { + COLLECT_CHAR(cdoc->public_id, + u_fffd, sizeof(u_fffd)); + } } else { - uint32_t pos; - size_t len; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - if (cdoc->public_id.len == 0) - cdoc->public_id.data.off = pos; - - cdoc->public_id.len += len; - - hubbub_inputstream_advance(tokeniser->input); + COLLECT_MS(cdoc->public_id, cptr, len); } return true; @@ -2451,191 +2283,162 @@ bool hubbub_tokeniser_handle_doctype_public_sq(hubbub_tokeniser *tokeniser) bool hubbub_tokeniser_handle_after_doctype_public(hubbub_tokeniser *tokeniser) { hubbub_doctype *cdoc = &tokeniser->context.current_doctype; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + return true; + } - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - hubbub_inputstream_advance(tokeniser->input); + uint8_t c = CHAR(cptr); + COLLECT_NOBUF(tokeniser->context.chars, len); + + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { + /* pass over in silence */ } else if (c == '"') { 
cdoc->system_missing = false; - tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_DQ; - hubbub_inputstream_advance(tokeniser->input); + cdoc->system_id.len = 0; + + tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ; } else if (c == '\'') { cdoc->system_missing = false; - tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_SQ; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == '>') { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; + cdoc->system_id.len = 0; - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ; + } else if (c == '>') { + emit_current_doctype(tokeniser, false); + tokeniser->state = STATE_DATA; } else { cdoc->force_quirks = true; - tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE; - hubbub_inputstream_advance(tokeniser->input); + tokeniser->state = STATE_BOGUS_DOCTYPE; } return true; } + +#define SYSTEM "SYSTEM" +#define SYSTEM_LEN (SLEN(SYSTEM) - 1) + bool hubbub_tokeniser_handle_match_system(hubbub_tokeniser *tokeniser) { - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + tokeniser->context.current_doctype.force_quirks = true; + tokeniser->state = STATE_BOGUS_DOCTYPE; + return true; + } - if 
(tokeniser->context.match_doctype.count == 1 && - (c & ~0x20) == 'Y') { - tokeniser->context.match_doctype.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_doctype.count == 2 && - (c & ~0x20) == 'S') { - tokeniser->context.match_doctype.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_doctype.count == 3 && - (c & ~0x20) == 'T') { - tokeniser->context.match_doctype.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_doctype.count == 4 && - (c & ~0x20) == 'E') { - tokeniser->context.match_doctype.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_doctype.count == 5 && - (c & ~0x20) == 'M') { - tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_SYSTEM; - hubbub_inputstream_advance(tokeniser->input); - } else { - /* Rewind as many characters as have been matched */ - hubbub_inputstream_rewind(tokeniser->input, - tokeniser->context.match_doctype.count); + uint8_t c = CHAR(cptr); - tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE; + assert(tokeniser->context.match_doctype.count <= SYSTEM_LEN); + + if (SYSTEM[tokeniser->context.match_doctype.count] != (c & ~0x20)) { tokeniser->context.current_doctype.force_quirks = true; + tokeniser->state = STATE_BOGUS_DOCTYPE; + return true; } + COLLECT_NOBUF(tokeniser->context.chars, len); + + if (tokeniser->context.match_doctype.count == SYSTEM_LEN) { + tokeniser->state = STATE_BEFORE_DOCTYPE_SYSTEM; + } + + tokeniser->context.match_doctype.count++; + return true; } +#undef SYSTEM +#undef SYSTEM_LEN bool hubbub_tokeniser_handle_before_doctype_system(hubbub_tokeniser *tokeniser) { hubbub_doctype *cdoc = &tokeniser->context.current_doctype; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) 
+ if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + return true; + } + + uint8_t c = CHAR(cptr); + COLLECT_NOBUF(tokeniser->context.chars, len); - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - hubbub_inputstream_advance(tokeniser->input); + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { } else if (c == '"') { cdoc->system_missing = false; - tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_DQ; - hubbub_inputstream_advance(tokeniser->input); + cdoc->system_id.len = 0; + + tokeniser->state = STATE_DOCTYPE_SYSTEM_DQ; } else if (c == '\'') { cdoc->system_missing = false; - tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_SYSTEM_SQ; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == '>') { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); + cdoc->system_id.len = 0; - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + tokeniser->state = STATE_DOCTYPE_SYSTEM_SQ; + } else if (c == '>') { + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; } else { cdoc->force_quirks = true; - tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE; - hubbub_inputstream_advance(tokeniser->input); + tokeniser->state = STATE_BOGUS_DOCTYPE; } return true; } - - bool hubbub_tokeniser_handle_doctype_system_dq(hubbub_tokeniser 
*tokeniser) { hubbub_doctype *cdoc = &tokeniser->context.current_doctype; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + FINISH(cdoc->system_id); + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + return true; + } + + uint8_t c = CHAR(cptr); + COLLECT_NOBUF(tokeniser->context.chars, len); if (c == '"') { - tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_SYSTEM; - hubbub_inputstream_advance(tokeniser->input); + FINISH(cdoc->system_id); + tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM; } else if (c == '>') { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + FINISH(cdoc->system_id); + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + } else if (c == '\0') { /* NUL is replaced by U+FFFD in the system identifier */ + if (cdoc->system_id.len == 0) { + START_BUF(cdoc->system_id, u_fffd, sizeof(u_fffd)); + } else { + COLLECT_CHAR(cdoc->system_id, + u_fffd, sizeof(u_fffd)); + } } else { - uint32_t pos; - size_t len; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - if (cdoc->system_id.len == 0) - cdoc->system_id.data.off = pos; - - cdoc->system_id.len += len; - - 
hubbub_inputstream_advance(tokeniser->input); + COLLECT_MS(cdoc->system_id, cptr, len); } return true; @@ -2644,89 +2447,67 @@ bool hubbub_tokeniser_handle_doctype_system_dq(hubbub_tokeniser *tokeniser) bool hubbub_tokeniser_handle_doctype_system_sq(hubbub_tokeniser *tokeniser) { hubbub_doctype *cdoc = &tokeniser->context.current_doctype; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + FINISH(cdoc->system_id); + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + return true; + } + + uint8_t c = CHAR(cptr); + COLLECT_NOBUF(tokeniser->context.chars, len); if (c == '\'') { - tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_SYSTEM; - hubbub_inputstream_advance(tokeniser->input); + FINISH(cdoc->system_id); + tokeniser->state = STATE_AFTER_DOCTYPE_SYSTEM; } else if (c == '>') { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + FINISH(cdoc->system_id); + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + } else if (c == '\0') { /* NUL is replaced by U+FFFD in the system identifier */ + if (cdoc->system_id.len == 0) { + START_BUF(cdoc->system_id, u_fffd, sizeof(u_fffd)); + } else { + 
COLLECT_CHAR(cdoc->system_id, + u_fffd, sizeof(u_fffd)); + } } else { - uint32_t pos; - size_t len; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - if (cdoc->system_id.len == 0) - cdoc->system_id.data.off = pos; - - cdoc->system_id.len += len; - - hubbub_inputstream_advance(tokeniser->input); + COLLECT_MS(cdoc->system_id, cptr, len); } return true; } - bool hubbub_tokeniser_handle_after_doctype_system(hubbub_tokeniser *tokeniser) { - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + emit_current_doctype(tokeniser, true); + tokeniser->state = STATE_DATA; + return true; + } - if (c == '\t' || c == '\n' || c == '\f' || c == ' ') { - hubbub_inputstream_advance(tokeniser->input); - } else if (c == '>') { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - token.data.doctype.force_quirks = true; - - hubbub_tokeniser_emit_token(tokeniser, &token); + uint8_t c = CHAR(cptr); + COLLECT_NOBUF(tokeniser->context.chars, len); - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || c == '\r') { + /* pass over in silence */ + } else if (c == '>') { + emit_current_doctype(tokeniser, false); + tokeniser->state = STATE_DATA; } else { - tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE; - 
hubbub_inputstream_advance(tokeniser->input); + tokeniser->state = STATE_BOGUS_DOCTYPE; } return true; @@ -2735,192 +2516,181 @@ bool hubbub_tokeniser_handle_after_doctype_system(hubbub_tokeniser *tokeniser) bool hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser) { - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + emit_current_doctype(tokeniser, false); + tokeniser->state = STATE_DATA; + return true; + } - if (c == '>') { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit doctype */ - token.type = HUBBUB_TOKEN_DOCTYPE; - token.data.doctype = tokeniser->context.current_doctype; - - hubbub_tokeniser_emit_token(tokeniser, &token); + uint8_t c = CHAR(cptr); + COLLECT_NOBUF(tokeniser->context.chars, len); - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - } else { - hubbub_inputstream_advance(tokeniser->input); + if (c == '>') { + emit_current_doctype(tokeniser, false); + tokeniser->state = STATE_DATA; } return true; } + +#define CDATA "[CDATA[" +#define CDATA_LEN (SLEN(CDATA) - 1) + bool hubbub_tokeniser_handle_match_cdata(hubbub_tokeniser *tokeniser) { - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) 
{ + tokeniser->context.current_comment.len = + tokeniser->context.chars.len = + 0; + tokeniser->state = STATE_BOGUS_COMMENT; + return true; + } - if (tokeniser->context.match_cdata.count == 1 && c == 'C') { - tokeniser->context.match_cdata.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_cdata.count == 2 && c == 'D') { - tokeniser->context.match_cdata.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_cdata.count == 3 && c == 'A') { - tokeniser->context.match_cdata.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_cdata.count == 4 && c == 'T') { - tokeniser->context.match_cdata.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_cdata.count == 5 && c == 'A') { - tokeniser->context.match_cdata.count++; - hubbub_inputstream_advance(tokeniser->input); - } else if (tokeniser->context.match_cdata.count == 6 && c == '[') { - tokeniser->context.current_chars.data.off = 0; - tokeniser->context.current_chars.len = 0; - - tokeniser->state = HUBBUB_TOKENISER_STATE_CDATA_BLOCK; - hubbub_inputstream_advance(tokeniser->input); - } else { - /* Rewind as many characters as we matched */ - hubbub_inputstream_rewind(tokeniser->input, - tokeniser->context.match_cdata.count); + uint8_t c = CHAR(cptr); + + assert(tokeniser->context.match_cdata.count <= CDATA_LEN); + + if (CDATA[tokeniser->context.match_cdata.count] != c) { /* exact match: the CDATA marker is case-sensitive, and folding would also accept '{' as '[' */ + tokeniser->context.current_comment.len = + tokeniser->context.chars.len = + 0; + tokeniser->state = STATE_BOGUS_COMMENT; + return true; + } - tokeniser->context.current_comment.data.off = 0; - tokeniser->context.current_comment.len = 0; + COLLECT_NOBUF(tokeniser->context.chars, len); - tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + if (tokeniser->context.match_cdata.count == CDATA_LEN) { + parserutils_inputstream_advance(tokeniser->input, + 
tokeniser->context.match_cdata.count + len); + tokeniser->context.chars.len = 0; + tokeniser->context.match_cdata.end = 0; + tokeniser->state = STATE_CDATA_BLOCK; } + tokeniser->context.match_cdata.count += len; + return true; } +#undef CDATA +#undef CDATA_LEN + + bool hubbub_tokeniser_handle_cdata_block(hubbub_tokeniser *tokeniser) { - uint32_t c = hubbub_inputstream_peek(tokeniser->input); + size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + tokeniser->context.chars.len, &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) { return false; + } else if (cptr == PARSERUTILS_INPUTSTREAM_EOF) { + emit_character_token(tokeniser, &tokeniser->context.chars); + tokeniser->state = STATE_DATA; + return true; + } + + uint8_t c = CHAR(cptr); if (c == ']' && (tokeniser->context.match_cdata.end == 0 || tokeniser->context.match_cdata.end == 1)) { - tokeniser->context.match_cdata.end++; - hubbub_inputstream_advance(tokeniser->input); + COLLECT(tokeniser->context.chars, cptr, len); + tokeniser->context.match_cdata.end += len; } else if (c == '>' && tokeniser->context.match_cdata.end == 2) { - hubbub_token token; + /* Remove the previous two "]]" */ + tokeniser->context.chars.len -= 2; /* Emit any pending characters */ - token.type = HUBBUB_TOKEN_CHARACTER; - token.data.character = tokeniser->context.current_chars; - - hubbub_tokeniser_emit_token(tokeniser, &token); + emit_character_token(tokeniser, &tokeniser->context.chars); - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - hubbub_inputstream_advance(tokeniser->input); - } else if (c == HUBBUB_INPUTSTREAM_EOF) { - hubbub_token token; - - /* Emit any pending characters */ - token.type = HUBBUB_TOKEN_CHARACTER; - token.data.character = tokeniser->context.current_chars; - - hubbub_tokeniser_emit_token(tokeniser, &token); - - tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; - } else { - uint32_t pos; - size_t len; + /* Now move past the "]]>" bit */ + 
parserutils_inputstream_advance(tokeniser->input, SLEN("]]>")); - if (tokeniser->context.match_cdata.end) { - hubbub_inputstream_rewind(tokeniser->input, - tokeniser->context.match_cdata.end); - tokeniser->context.match_cdata.end = 0; + tokeniser->state = STATE_DATA; + } else if (c == '\0') { + if (tokeniser->context.chars.len > 0) { + /* Emit any pending characters */ + emit_character_token(tokeniser, + &tokeniser->context.chars); } - /* Accumulate characters into buffer */ - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + /* Perform NUL-byte replacement */ + emit_character_token(tokeniser, &u_fffd_str); - if (tokeniser->context.current_chars.len == 0) - tokeniser->context.current_chars.data.off = pos; - - tokeniser->context.current_chars.len += len; - - hubbub_inputstream_advance(tokeniser->input); + parserutils_inputstream_advance(tokeniser->input, len); + tokeniser->context.match_cdata.end = 0; + } else { + COLLECT_MS_NOBUF(tokeniser->context.chars, cptr, len); + tokeniser->context.match_cdata.end = 0; } return true; } -bool hubbub_tokeniser_consume_character_reference(hubbub_tokeniser *tokeniser) +bool hubbub_tokeniser_consume_character_reference(hubbub_tokeniser *tokeniser, size_t pos) { uint32_t allowed_char = tokeniser->context.allowed_char; - uint32_t c; - uint32_t pos; size_t len; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + pos, &len); + + /* We should always start on a non-OOD character */ + assert(cptr != PARSERUTILS_INPUTSTREAM_OOD); + + size_t off = pos + len; + + /* Look at the character after the ampersand */ + cptr = parserutils_inputstream_peek(tokeniser->input, off, &len); + + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) + return false; - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + uint8_t c = CHAR(cptr); /* Set things up */ - tokeniser->context.match_entity.str.data.off = pos; - tokeniser->context.match_entity.str.len = len; - tokeniser->context.match_entity.poss_len = 0; + 
tokeniser->context.match_entity.offset = off; + tokeniser->context.match_entity.poss_length = 0; + tokeniser->context.match_entity.length = 0; tokeniser->context.match_entity.base = 0; tokeniser->context.match_entity.codepoint = 0; tokeniser->context.match_entity.had_data = false; tokeniser->context.match_entity.return_state = tokeniser->state; tokeniser->context.match_entity.complete = false; - tokeniser->context.match_entity.done_setup = true; tokeniser->context.match_entity.overflow = false; tokeniser->context.match_entity.context = NULL; tokeniser->context.match_entity.prev_len = len; - hubbub_inputstream_advance(tokeniser->input); - - c = hubbub_inputstream_peek(tokeniser->input); - - if (c == HUBBUB_INPUTSTREAM_OOD) { - /* rewind because we need more data */ - hubbub_inputstream_rewind(tokeniser->input, 1); - return false; - } - /* Reset allowed character for future calls */ tokeniser->context.allowed_char = '\0'; if (c == '\t' || c == '\n' || c == '\f' || c == ' ' || - c == '<' || c == '&' || c == HUBBUB_INPUTSTREAM_EOF || - (allowed_char && c == allowed_char)) { + c == '<' || c == '&' || + cptr == PARSERUTILS_INPUTSTREAM_EOF || + (allowed_char && c == allowed_char)) { tokeniser->context.match_entity.complete = true; - /* rewind to the '&' (de-consume) */ - hubbub_inputstream_rewind(tokeniser->input, 1); - return true; + tokeniser->context.match_entity.codepoint = 0; } else if (c == '#') { - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - tokeniser->context.match_entity.str.len += len; - - tokeniser->state = HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY; - hubbub_inputstream_advance(tokeniser->input); + tokeniser->context.match_entity.length += len; + tokeniser->state = STATE_NUMBERED_ENTITY; } else { - tokeniser->state = HUBBUB_TOKENISER_STATE_NAMED_ENTITY; + tokeniser->state = STATE_NAMED_ENTITY; } return true; @@ -2930,47 +2700,44 @@ bool hubbub_tokeniser_consume_character_reference(hubbub_tokeniser *tokeniser) bool 
hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser) { hubbub_tokeniser_context *ctx = &tokeniser->context; - uint32_t c = hubbub_inputstream_peek(tokeniser->input); - uint32_t pos; + size_t len; - hubbub_error error; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + ctx->match_entity.offset + ctx->match_entity.length, + &len); - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) return false; + uint8_t c = CHAR(cptr); + if (ctx->match_entity.base == 0) { if ((c & ~0x20) == 'X') { ctx->match_entity.base = 16; - - pos = hubbub_inputstream_cur_pos(tokeniser->input, - &len); - ctx->match_entity.str.len += len; - - hubbub_inputstream_advance(tokeniser->input); + ctx->match_entity.length += len; } else { ctx->match_entity.base = 10; } } - while ((c = hubbub_inputstream_peek(tokeniser->input)) != - HUBBUB_INPUTSTREAM_EOF && - c != HUBBUB_INPUTSTREAM_OOD) { + while ((cptr = parserutils_inputstream_peek(tokeniser->input, + ctx->match_entity.offset + ctx->match_entity.length, + &len)) != PARSERUTILS_INPUTSTREAM_EOF && + cptr != PARSERUTILS_INPUTSTREAM_OOD) { + c = CHAR(cptr); + if (ctx->match_entity.base == 10 && ('0' <= c && c <= '9')) { ctx->match_entity.had_data = true; - ctx->match_entity.codepoint = ctx->match_entity.codepoint * 10 + (c - '0'); - pos = hubbub_inputstream_cur_pos(tokeniser->input, - &len); - ctx->match_entity.str.len += len; + ctx->match_entity.length += len; } else if (ctx->match_entity.base == 16 && (('0' <= c && c <= '9') || ('A' <= (c & ~0x20) && (c & ~0x20) <= 'F'))) { ctx->match_entity.had_data = true; - ctx->match_entity.codepoint *= 16; if ('0' <= c && c <= '9') { @@ -2980,9 +2747,7 @@ bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser) ((c & ~0x20) - 'A' + 10); } - pos = hubbub_inputstream_cur_pos(tokeniser->input, - &len); - ctx->match_entity.str.len += len; + ctx->match_entity.length += len; } else { break; } @@ -2990,25 +2755,18 @@ bool 
hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser) if (ctx->match_entity.codepoint >= 0x10FFFF) { ctx->match_entity.overflow = true; } - - hubbub_inputstream_advance(tokeniser->input); } - if (c == HUBBUB_INPUTSTREAM_OOD) + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) return false; + c = CHAR(cptr); + /* Eat trailing semicolon, if any */ if (c == ';') { - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - ctx->match_entity.str.len += len; - - hubbub_inputstream_advance(tokeniser->input); + ctx->match_entity.length += len; } - /* Rewind the inputstream to start of matched sequence */ - hubbub_inputstream_rewind(tokeniser->input, - ctx->match_entity.str.len); - /* Had data, so calculate final codepoint */ if (ctx->match_entity.had_data) { uint32_t cp = ctx->match_entity.codepoint; @@ -3028,19 +2786,9 @@ bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser) cp = 0xFFFD; } - /* And replace the matched range with it */ - error = hubbub_inputstream_replace_range(tokeniser->input, - ctx->match_entity.str.data.off, - ctx->match_entity.str.len, - cp); - if (error != HUBBUB_OK) { - /** \todo handle memory exhaustion */ - } + ctx->match_entity.codepoint = cp; } - /* Reset for next time */ - ctx->match_entity.done_setup = false; - /* Flag completion */ ctx->match_entity.complete = true; @@ -3053,61 +2801,60 @@ bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser) bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser) { hubbub_tokeniser_context *ctx = &tokeniser->context; - uint32_t c; - uint32_t pos; + size_t len; - hubbub_error error; + uintptr_t cptr = parserutils_inputstream_peek(tokeniser->input, + ctx->match_entity.offset, &len); + + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) + return false; + + uint8_t c = CHAR(cptr); - while ((c = hubbub_inputstream_peek(tokeniser->input)) != - HUBBUB_INPUTSTREAM_EOF && - c != HUBBUB_INPUTSTREAM_OOD) { + while ((cptr = 
parserutils_inputstream_peek(tokeniser->input, + ctx->match_entity.offset + + ctx->match_entity.poss_length, + &len)) != + PARSERUTILS_INPUTSTREAM_EOF && + cptr != PARSERUTILS_INPUTSTREAM_OOD) { uint32_t cp; + c = CHAR(cptr); + if (c > 0x7F) { /* Entity names are ASCII only */ break; } - error = hubbub_entities_search_step((uint8_t) c, - &cp, + hubbub_error error = hubbub_entities_search_step(c, &cp, &ctx->match_entity.context); if (error == HUBBUB_OK) { /* Had a match - store it for later */ ctx->match_entity.codepoint = cp; - pos = hubbub_inputstream_cur_pos(tokeniser->input, - &len); - ctx->match_entity.str.len += len; - ctx->match_entity.str.len += ctx->match_entity.poss_len; - ctx->match_entity.poss_len = 0; - - /* And cache length, for replacement */ - ctx->match_entity.prev_len = - ctx->match_entity.str.len; + ctx->match_entity.length = + ctx->match_entity.poss_length + len; + ctx->match_entity.poss_length = + ctx->match_entity.length; } else if (error == HUBBUB_INVALID) { /* No further matches - use last found */ break; } else { - pos = hubbub_inputstream_cur_pos(tokeniser->input, - &len); - ctx->match_entity.poss_len += len; + /* Need more data */ + ctx->match_entity.poss_length += len; } - - hubbub_inputstream_advance(tokeniser->input); } - if (c == HUBBUB_INPUTSTREAM_OOD) { + if (cptr == PARSERUTILS_INPUTSTREAM_OOD) return false; - } - /* Rewind back possible matches, if any */ - hubbub_inputstream_rewind(tokeniser->input, - ctx->match_entity.poss_len); - - c = hubbub_inputstream_peek(tokeniser->input); + cptr = parserutils_inputstream_peek(tokeniser->input, + ctx->match_entity.offset + ctx->match_entity.length, + &len); + c = CHAR(cptr); if ((tokeniser->context.match_entity.return_state == - HUBBUB_TOKENISER_STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) && + STATE_CHARACTER_REFERENCE_IN_ATTRIBUTE_VALUE) && (c != ';') && ((0x0030 <= c && c <= 0x0039) || (0x0041 <= c && c <= 0x005A) || @@ -3115,26 +2862,6 @@ bool 
hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser) ctx->match_entity.codepoint = 0; } - /* Rewind the inputstream to start of processed sequence */ - hubbub_inputstream_rewind(tokeniser->input, - ctx->match_entity.str.len); - - pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); - - /* Now, replace range, if we found a named entity */ - if (ctx->match_entity.codepoint != 0) { - error = hubbub_inputstream_replace_range(tokeniser->input, - ctx->match_entity.str.data.off, - ctx->match_entity.prev_len, - ctx->match_entity.codepoint); - if (error != HUBBUB_OK) { - /** \todo handle memory exhaustion */ - } - } - - /* Reset for next time */ - ctx->match_entity.done_setup = false; - /* Flag completion */ ctx->match_entity.complete = true; @@ -3144,24 +2871,6 @@ bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser) return true; } -/** - * Handle input stream buffer moving - * - * \param buffer Pointer to buffer - * \param len Length of data in buffer (bytes) - * \param pw Pointer to our context - */ -void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer, - size_t len, void *pw) -{ - hubbub_tokeniser *tok = (hubbub_tokeniser *) pw; - - tok->input_buffer = buffer; - tok->input_buffer_len = len; - - if (tok->buffer_handler != NULL) - tok->buffer_handler(buffer, len, tok->buffer_pw); -} /** * Emit a token, performing sanity checks if necessary @@ -3175,18 +2884,6 @@ void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, assert(tokeniser != NULL); assert(token != NULL); - if (token->type == HUBBUB_TOKEN_START_TAG) { - tokeniser->context.last_start_tag_name = token->data.tag.name; - token->data.tag.ns = HUBBUB_NS_HTML; - } else if (token->type == HUBBUB_TOKEN_END_TAG) { - tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA; - } - - - /* Nothing to do if there's no registered handler */ - if (tokeniser->token_handler == NULL) - return; - if (token->type == HUBBUB_TOKEN_START_TAG || token->type == HUBBUB_TOKEN_END_TAG) { 
uint32_t i, j; @@ -3194,6 +2891,8 @@ void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, hubbub_attribute *attrs = token->data.tag.attributes; + token->data.tag.ns = HUBBUB_NS_HTML; + /* Discard duplicate attributes */ for (i = 0; i < n_attributes; i++) { for (j = 0; j < n_attributes; j++) { @@ -3202,10 +2901,8 @@ void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, if (j == i || attrs[i].name.len != attrs[j].name.len || - hubbub_inputstream_compare_range_cs( - tokeniser->input, - attrs[i].name.data.off, - attrs[j].name.data.off, + strncmp((char *)attrs[i].name.ptr, + (char *)attrs[j].name.ptr, attrs[i].name.len) != 0) { /* Attributes don't match */ continue; @@ -3233,5 +2930,35 @@ void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, } /* Finally, emit token */ - tokeniser->token_handler(token, tokeniser->token_pw); + if (tokeniser->token_handler) + tokeniser->token_handler(token, tokeniser->token_pw); + + if (token->type == HUBBUB_TOKEN_START_TAG) { + if (token->data.tag.name.len < + sizeof(tokeniser->context.last_start_tag_name)) { + strncpy((char *)tokeniser->context.last_start_tag_name, + (const char *)token->data.tag.name.ptr, + token->data.tag.name.len); + tokeniser->context.last_start_tag_len = + token->data.tag.name.len; + } else { + tokeniser->context.last_start_tag_name[0] = '\0'; + tokeniser->context.last_start_tag_len = 0; + } + } else if (token->type == HUBBUB_TOKEN_END_TAG) { + tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA; + } + + if (tokeniser->buffer->length) { + /* Discard current buffer */ + parserutils_buffer_discard(tokeniser->buffer, 0, + tokeniser->buffer->length); + } + + /* Advance the pointer */ + if (tokeniser->context.chars.len) { + parserutils_inputstream_advance(tokeniser->input, + tokeniser->context.chars.len); + tokeniser->context.chars.len = 0; + } } |