diff options
author | John Mark Bell <jmb@netsurf-browser.org> | 2007-06-23 22:40:25 +0000 |
---|---|---|
committer | John Mark Bell <jmb@netsurf-browser.org> | 2007-06-23 22:40:25 +0000 |
commit | 7b30a5520cfb56e651f0eb4da85a3e07747da7dc (patch) | |
tree | 5d6281c071c089e1e7a8ae6f8044cecaf6a7db16 | |
download | libhubbub-7b30a5520cfb56e651f0eb4da85a3e07747da7dc.tar.gz libhubbub-7b30a5520cfb56e651f0eb4da85a3e07747da7dc.tar.bz2 |
Import hubbub -- an HTML parsing library.
Plenty of work still to do (like tree generation ;)
svn path=/trunk/hubbub/; revision=3359
81 files changed, 56908 insertions, 0 deletions
@@ -0,0 +1,19 @@ +Copyright (C) 2007 J-M Bell + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..db5a35b --- /dev/null +++ b/Makefile @@ -0,0 +1,34 @@ +# Toolchain definitions for building on the destination platform +export CC = gcc +export AR = ar +export LD = gcc + +export CP = cp +export RM = rm +export MKDIR = mkdir +export MV = mv +export ECHO = echo +export MAKE = make +export PERL = perl +export PKGCONFIG = pkg-config + +# Toolchain flags +WARNFLAGS = -Wall -Wextra -Wundef -Wpointer-arith -Wcast-align \ + -Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes \ + -Wmissing-declarations -Wnested-externs -Werror -pedantic +export CFLAGS = -std=c99 -D_BSD_SOURCE -I$(TOP)/include/ $(WARNFLAGS) +export ARFLAGS = -cru +export LDFLAGS = -L$(TOP)/ + +export CPFLAGS = +export RMFLAGS = +export MKDIRFLAGS = -p +export MVFLAGS = +export ECHOFLAGS = +export MAKEFLAGS = +export PKGCONFIGFLAGS = + +export EXEEXT = + + +include build/Makefile.common diff --git a/Makefile-riscos b/Makefile-riscos new file mode 100644 index 0000000..f1d8cf0 --- /dev/null +++ b/Makefile-riscos @@ -0,0 +1,38 @@ +# Toolchain definitions for building for RISC OS using the GCCSDK cross-compiler +GCCSDK_INSTALL_CROSSBIN ?= /home/riscos/cross/bin +GCCSDK_INSTALL_ENV ?= /home/riscos/env + +export CC = $(GCCSDK_INSTALL_CROSSBIN)/gcc +export AR = $(GCCSDK_INSTALL_CROSSBIN)/ar +export LD = $(GCCSDK_INSTALL_CROSSBIN)/gcc + +export CP = cp +export RM = rm +export MKDIR = mkdir +export MV = mv +export ECHO = echo +export MAKE = make +export PERL = perl +export PKGCONFIG = pkg-config + +# Toolchain flags +WARNFLAGS = -Wall -Wextra -Wundef -Wpointer-arith -Wcast-align \ + -Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes \ + -Wmissing-declarations -Wnested-externs -Werror -pedantic +export CFLAGS = -std=c99 -D_BSD_SOURCE -I$(TOP)/include/ $(WARNFLAGS) \ + -mpoke-function-name +export ARFLAGS = -cru +export LDFLAGS = -L$(TOP)/ + +export CPFLAGS = +export RMFLAGS = +export MKDIRFLAGS = -p +export MVFLAGS = +export ECHOFLAGS = 
+export MAKEFLAGS = +export PKGCONFIGFLAGS = + +export EXEEXT = ,ff8 + + +include build/Makefile.common @@ -0,0 +1,46 @@ +Hubbub -- an HTML parser +======================== + +Overview +-------- + + Hubbub is a flexible HTML parser. It aims to comply with the HTML5 + specification. + +Requirements +------------ + + Hubbub requires the following tools: + + + A C99 capable C compiler + + GNU make or compatible + + Perl (for the testcases) + + Pkg-config (for the testcases) + + Hubbub also requires the following libraries to be installed: + + + An iconv implementation (e.g. libiconv) + + JSON-C (for the testcases) -- see json/README for further information + +Compilation +----------- + + If necessary, modify the toolchain settings in the Makefile. + Invoke make: + $ make + +Verification +------------ + + To verify that the parser is working, it is necessary to specify a + different makefile target than that used for normal compilation, thus: + + $ make test + +API documentation +----------------- + + Currently, there is none. However, the code is well commented and the + public API may be found in the "include" directory. The testcase sources + may also be of use in working out how to use it. 
+ diff --git a/build/Makefile.common b/build/Makefile.common new file mode 100644 index 0000000..21c319a --- /dev/null +++ b/build/Makefile.common @@ -0,0 +1,39 @@ +# Top-level Makefile fragment for Hubbub + +# Name of component +export COMPONENT = libhubbub + +# Environment +export EXPORT = $(CURDIR)/dist +export TOP = $(CURDIR) + +.PHONY: release debug test clean setup export distclean + +# Rules +release: setup + @$(MAKE) $(MAKEFLAGS) -C src release + +debug: setup + @$(MAKE) $(MAKEFLAGS) -C src debug + +test: debug + @$(MAKE) $(MAKEFLAGS) -C test test + +clean: + @$(MAKE) $(MAKEFLAGS) -C src clean + @$(MAKE) $(MAKEFLAGS) -C test clean + +setup: + @$(MAKE) $(MAKEFLAGS) -C src setup + @$(MAKE) $(MAKEFLAGS) -C test setup + +export: release + @$(MKDIR) $(MKDIRFLAGS) $(TOP)/dist/lib + @$(CP) $(CPFLAGS) -r include $(EXPORT)/ + @$(MAKE) $(MAKEFLAGS) -C src export + @$(MAKE) $(MAKEFLAGS) -C test export + +distclean: clean + -@$(RM) $(RMFLAGS) -r $(TOP)/dist + @$(MAKE) $(MAKEFLAGS) -C src distclean + @$(MAKE) $(MAKEFLAGS) -C test distclean diff --git a/docs/Architecture b/docs/Architecture new file mode 100644 index 0000000..73966eb --- /dev/null +++ b/docs/Architecture @@ -0,0 +1,83 @@ +Hubbub parser architecture +========================== + +Introduction +------------ + + Hubbub is a flexible HTML parser. It offers two interfaces: + + * a SAX-style event interface + * a DOM-style tree-based interface + +Overview +-------- + + Hubbub is comprised of four parts: + + * a charset handler + * an input stream veneer + * a tokeniser + * a tree builder + + Charset handler + --------------- + + The charset handler converts the raw data input into a requested encoding. + + Input stream veneer + ------------------- + + The input stream veneer provides an abstract stream-like interface over + the document buffer. This is used by the tokeniser. The document buffer + will be encoded in either UTF-8 or UTF-16 (this is client-selectable). 
+ + Tokeniser + --------- + + The tokeniser divides the data held in the document buffer into chunks. + It sends SAX-style events for each chunk. The tokeniser is agnostic to + the charset the document buffer is stored in. + + Tree builder + ------------ + + The tree builder constructs a DOM tree from the SAX events emitted by the + tokeniser. The tree builder is tied to the document buffer charset. + +Memory usage and ownership +-------------------------- + + Memory usage within the library is well defined, as is ownership of allocated + memory. + + Raw input data provided by the library client is owned by the client. + + The document buffer is allocated on the fly by the library. + + The document buffer is created and resized by the charset handler. Its + location is passed to the tree builder through a dedicated event. While + parsing is occurring, the ownership of the document buffer lies with the + charset handler. Upon parse completion, the tree builder may request + ownership of the buffer. If it does not, the buffer will be freed on parser + destruction. + + SAX events which refer to document segments contain direct references into + the document buffer (i.e. no copying of data held in the document buffer + occurs). + + The tree builder will allocate memory for use as DOM nodes. References to + strings in the document buffer will be direct and will operate a + copy-on-write strategy. All strings (excepting those which comprise part of + the document buffer) and nodes within the DOM are reference counted. Upon a + reference count reaching 0, the item is freed. + + The above strategy permits data copying to be kept to a minimum, hence + minimising memory usage. + +Parse errors +------------ + + Notification of parse errors is made through a dedicated event similar to + that used for notification of movement of the document buffer. This event + contains the line/column offset of the error location, along with a message + detailing the error. 
diff --git a/docs/Todo b/docs/Todo new file mode 100644 index 0000000..2abce2b --- /dev/null +++ b/docs/Todo @@ -0,0 +1,12 @@ +TODO list +========= + + + Update tokeniser to comply with latest spec draft (currently complies + with 2007-06-12 draft) + + Implement one or more tree builders + + More charset convertors (or make the iconv codec significantly faster) + + Parse error reporting from the tokeniser + + Implement extraneous chunk insertion/tokenisation + + Statistical charset autodetection + + Shared library, for those platforms that support such things + + Optimise it diff --git a/include/hubbub/errors.h b/include/hubbub/errors.h new file mode 100644 index 0000000..c3b1f5d --- /dev/null +++ b/include/hubbub/errors.h @@ -0,0 +1,29 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_errors_h_ +#define hubbub_errors_h_ + +#include <stddef.h> + +typedef enum hubbub_error { + HUBBUB_OK = 0, + + HUBBUB_NOMEM = 1, + HUBBUB_BADPARM = 2, + HUBBUB_INVALID = 3, + HUBBUB_FILENOTFOUND = 4, + HUBBUB_NEEDDATA = 5, +} hubbub_error; + +/* Convert a hubbub error value to a string */ +const char *hubbub_error_to_string(hubbub_error error); +/* Convert a string to a hubbub error value */ +hubbub_error hubbub_error_from_string(const char *str, size_t len); + +#endif + diff --git a/include/hubbub/functypes.h b/include/hubbub/functypes.h new file mode 100644 index 0000000..aa3e649 --- /dev/null +++ b/include/hubbub/functypes.h @@ -0,0 +1,37 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_functypes_h_ +#define hubbub_functypes_h_ + +#include <stdlib.h> + +#include <hubbub/types.h> + +/* Type of allocation function for hubbub */ +typedef void *(*hubbub_alloc)(void *ptr, size_t size, void *pw); + +/** + * Type of token handling function + */ +typedef void (*hubbub_token_handler)(const hubbub_token *token, void *pw); + +/** + * Type of document buffer handling function + */ +typedef void (*hubbub_buffer_handler)(const uint8_t *data, + size_t len, void *pw); + +/** + * Type of parse error handling function + */ +typedef void (*hubbub_error_handler)(uint32_t line, uint32_t col, + const char *message, void *pw); + + +#endif + diff --git a/include/hubbub/hubbub.h b/include/hubbub/hubbub.h new file mode 100644 index 0000000..8a15eca --- /dev/null +++ b/include/hubbub/hubbub.h @@ -0,0 +1,23 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_h_ +#define hubbub_h_ + +#include <hubbub/errors.h> +#include <hubbub/functypes.h> +#include <hubbub/types.h> + +/* Initialise the Hubbub library for use */ +hubbub_error hubbub_initialise(const char *aliases_file, + hubbub_alloc alloc, void *pw); + +/* Clean up after Hubbub */ +hubbub_error hubbub_finalise(hubbub_alloc alloc, void *pw); + +#endif + diff --git a/include/hubbub/parser.h b/include/hubbub/parser.h new file mode 100644 index 0000000..cdf8664 --- /dev/null +++ b/include/hubbub/parser.h @@ -0,0 +1,84 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_parser_h_ +#define hubbub_parser_h_ + +#include <inttypes.h> + +#include <hubbub/errors.h> +#include <hubbub/functypes.h> +#include <hubbub/types.h> + +typedef struct hubbub_parser hubbub_parser; + +/** + * Hubbub parser option types + */ +typedef enum hubbub_parser_opttype { + HUBBUB_PARSER_TOKEN_HANDLER, + HUBBUB_PARSER_BUFFER_HANDLER, + HUBBUB_PARSER_ERROR_HANDLER, + HUBBUB_PARSER_CONTENT_MODEL, +} hubbub_parser_opttype; + +/** + * Hubbub parser option parameters + */ +typedef union hubbub_parser_optparams { + struct { + hubbub_token_handler handler; + void *pw; + } token_handler; + + struct { + hubbub_buffer_handler handler; + void *pw; + } buffer_handler; + + struct { + hubbub_error_handler handler; + void *pw; + } error_handler; + + struct { + hubbub_content_model model; + } content_model; +} hubbub_parser_optparams; + +/* Create a hubbub parser */ +hubbub_parser *hubbub_parser_create(const char *enc, const char *int_enc, + hubbub_alloc alloc, void *pw); +/* Destroy a hubbub parser */ +void hubbub_parser_destroy(hubbub_parser *parser); + +/* Configure a hubbub parser */ +hubbub_error hubbub_parser_setopt(hubbub_parser *parser, + hubbub_parser_opttype type, + hubbub_parser_optparams *params); + +/* Pass a chunk of data to a hubbub parser for parsing */ +/* This data is encoded in the input charset */ +hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser, + uint8_t *data, size_t len); +/* Pass a chunk of extraneous data to a hubbub parser for parsing */ +/* This data is UTF-8 encoded */ +hubbub_error hubbub_parser_parse_extraneous_chunk(hubbub_parser *parser, + uint8_t *data, size_t len); +/* Inform the parser that the last chunk of data has been parsed */ +hubbub_error hubbub_parser_completed(hubbub_parser *parser); + +/* Read the document charset */ +const char 
*hubbub_parser_read_charset(hubbub_parser *parser, + hubbub_charset_source *source); + +/* Claim ownership of the document buffer */ +hubbub_error hubbub_parser_claim_buffer(hubbub_parser *parser, + uint8_t **buffer, size_t *len); + +#endif + diff --git a/include/hubbub/types.h b/include/hubbub/types.h new file mode 100644 index 0000000..57518ae --- /dev/null +++ b/include/hubbub/types.h @@ -0,0 +1,97 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_types_h_ +#define hubbub_types_h_ + +#include <stdbool.h> +#include <inttypes.h> + +/** Source of charset information, in order of importance + * A client-dictated charset will override all others. + * A document-specified charset will override autodetection or the default */ +typedef enum hubbub_charset_source { + HUBBUB_CHARSET_UNKNOWN = 0, /**< Unknown */ + HUBBUB_CHARSET_DEFAULT = 1, /**< Default setting */ + HUBBUB_CHARSET_DETECTED = 2, /**< Autodetected */ + HUBBUB_CHARSET_DOCUMENT = 3, /**< Defined in document */ + HUBBUB_CHARSET_DICTATED = 4, /**< Dictated by client */ +} hubbub_charset_source; + +/** + * Content model flag + */ +typedef enum hubbub_content_model { + HUBBUB_CONTENT_MODEL_PCDATA, + HUBBUB_CONTENT_MODEL_RCDATA, + HUBBUB_CONTENT_MODEL_CDATA, + HUBBUB_CONTENT_MODEL_PLAINTEXT +} hubbub_content_model; + +/** + * Type of an emitted token + */ +typedef enum hubbub_token_type { + HUBBUB_TOKEN_DOCTYPE, + HUBBUB_TOKEN_START_TAG, + HUBBUB_TOKEN_END_TAG, + HUBBUB_TOKEN_COMMENT, + HUBBUB_TOKEN_CHARACTER, + HUBBUB_TOKEN_EOF +} hubbub_token_type; + +/** + * Tokeniser string type + */ +typedef struct hubbub_string { + uint32_t data_off; /**< Byte offset of string start */ + size_t len; /**< Byte length of string */ +} hubbub_string; + +/** + * Tag attribute data + */ +typedef struct hubbub_attribute { + hubbub_string name; /**< Attribute name */ + 
hubbub_string value; /**< Attribute value */ +} hubbub_attribute; + +/** + * Data for doctype token + */ +typedef struct hubbub_doctype { + hubbub_string name; /**< Doctype name */ + bool correct; /**< Doctype validity flag */ +} hubbub_doctype; + +/** + * Data for a tag + */ +typedef struct hubbub_tag { + hubbub_string name; /**< Tag name */ + uint32_t n_attributes; /**< Count of attributes */ + hubbub_attribute *attributes; /**< Array of attribute data */ +} hubbub_tag; + +/** + * Token data + */ +typedef struct hubbub_token { + hubbub_token_type type; + + union { + hubbub_doctype doctype; + + hubbub_tag tag; + + hubbub_string comment; + + hubbub_string character; + } data; +} hubbub_token; + +#endif diff --git a/json/README b/json/README new file mode 100644 index 0000000..50dcf79 --- /dev/null +++ b/json/README @@ -0,0 +1,26 @@ +JSON-C patches +============== + +This directory contains a couple of patches to JSON-C 0.7. +Upstream sources may be found at http://oss.metaparadigm.com/json-c/ + +hex-chars.jmb1.p: + + Fix handling of upper case hex digits. + The previous behaviour resulted in the likes of \uFFFD causing a parse + error. + +void-prototypes.jmb1.p: + + Fix compiler warnings about function prototypes in header files when + compiling client code in standards mode with pedantic warnings switched + on. + +Apply them as follows: + + $ cd json-c-0.7 + $ patch -p 1 -i ../hex-chars.jmb1.p + $ patch -p 1 -i ../void-prototypes.jmb1.p + +They have been submitted upstream, so will probably disappear in due +course. 
diff --git a/json/hex-chars.jmb1.p b/json/hex-chars.jmb1.p new file mode 100644 index 0000000..10ea30a --- /dev/null +++ b/json/hex-chars.jmb1.p @@ -0,0 +1,12 @@ +diff -urw json-c-0.7/json_object.c json-c-0.7-jmb/json_object.c +--- json-c-0.7/json_object.c 2007-03-13 08:25:39.000000000 +0000 ++++ json-c-0.7-jmb/json_object.c 2007-06-23 13:33:20.000000000 +0100 +@@ -30,7 +30,7 @@ + /* #define REFCOUNT_DEBUG 1 */ + + char *json_number_chars = "0123456789.+-e"; +-char *json_hex_chars = "0123456789abcdef"; ++char *json_hex_chars = "0123456789abcdefABCDEF"; + + #ifdef REFCOUNT_DEBUG + static char* json_type_name[] = { diff --git a/json/void-prototypes.jmb1.p b/json/void-prototypes.jmb1.p new file mode 100644 index 0000000..db71ffe --- /dev/null +++ b/json/void-prototypes.jmb1.p @@ -0,0 +1,45 @@ +diff -urw json-c-0.7/debug.h json-c-0.7-jmb/debug.h +--- json-c-0.7/debug.h 2007-03-13 08:25:39.000000000 +0000 ++++ json-c-0.7-jmb/debug.h 2007-06-22 23:52:37.000000000 +0100 +@@ -13,7 +13,7 @@ + #define _DEBUG_H_ + + extern void mc_set_debug(int debug); +-extern int mc_get_debug(); ++extern int mc_get_debug(void); + + extern void mc_set_syslog(int syslog); + extern void mc_abort(const char *msg, ...); +diff -urw json-c-0.7/json_object.h json-c-0.7-jmb/json_object.h +--- json-c-0.7/json_object.h 2007-03-13 08:25:39.000000000 +0000 ++++ json-c-0.7-jmb/json_object.h 2007-06-22 23:53:10.000000000 +0100 +@@ -98,7 +98,7 @@ + /** Create a new empty object + * @returns a json_object of type json_type_object + */ +-extern struct json_object* json_object_new_object(); ++extern struct json_object* json_object_new_object(void); + + /** Get the hashtable of a json_object of type json_type_object + * @param obj the json_object instance +@@ -167,7 +167,7 @@ + /** Create a new empty json_object of type json_type_array + * @returns a json_object of type json_type_array + */ +-extern struct json_object* json_object_new_array(); ++extern struct json_object* json_object_new_array(void); + + /** 
Get the arraylist of a json_object of type json_type_array + * @param obj the json_object instance +diff -urw json-c-0.7/json_tokener.h json-c-0.7-jmb/json_tokener.h +--- json-c-0.7/json_tokener.h 2007-03-13 08:25:39.000000000 +0000 ++++ json-c-0.7-jmb/json_tokener.h 2007-06-22 23:53:26.000000000 +0100 +@@ -79,7 +79,7 @@ + + extern const char* json_tokener_errors[]; + +-extern struct json_tokener* json_tokener_new(); ++extern struct json_tokener* json_tokener_new(void); + extern void json_tokener_free(struct json_tokener *tok); + extern void json_tokener_reset(struct json_tokener *tok); + extern struct json_object* json_tokener_parse(char *str); diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..b72a9e0 --- /dev/null +++ b/src/Makefile @@ -0,0 +1,79 @@ +# Makefile for libhubbub +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test cases + +# Manipulate include paths +CFLAGS += -I$(CURDIR) + +# Release output +RELEASE = ${TOP}/${COMPONENT}.a + +# Debug output +DEBUG = ${TOP}/${COMPONENT}-debug.a + +# Objects +OBJS = hubbub parser + +.PHONY: clean debug distclean export release setup test + +# Targets +release: $(addprefix Release/, $(addsuffix .o, $(OBJS))) + @${MAKE} -C charset release + @${MAKE} -C input release + @${MAKE} -C tokeniser release + @${MAKE} -C utils release + @${AR} ${ARFLAGS} $(RELEASE) Release/* + +debug: $(addprefix Debug/, $(addsuffix .o, $(OBJS))) + @${MAKE} -C charset debug + 
@${MAKE} -C input debug + @${MAKE} -C tokeniser debug + @${MAKE} -C utils debug + @${AR} ${ARFLAGS} $(DEBUG) Debug/* + +clean: + @${MAKE} -C charset clean + @${MAKE} -C input clean + @${MAKE} -C tokeniser clean + @${MAKE} -C utils clean + -@${RM} ${RMFLAGS} $(addprefix Release/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(addprefix Debug/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(RELEASE) $(DEBUG) + +distclean: + -@${RM} ${RMFLAGS} -r Release + -@${RM} ${RMFLAGS} -r Debug + +setup: + @${MKDIR} ${MKDIRFLAGS} Release + @${MKDIR} ${MKDIRFLAGS} Debug + +export: + @${CP} ${CPFLAGS} $(RELEASE) ${EXPORT}/lib/ + +test: + +# Pattern rules +Release/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $< + +Debug/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@ $< diff --git a/src/charset/Makefile b/src/charset/Makefile new file mode 100644 index 0000000..62817b3 --- /dev/null +++ b/src/charset/Makefile @@ -0,0 +1,53 @@ +# Makefile for libhubbub +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test cases + +# Manipulate include paths +CFLAGS += -I$(CURDIR) + +# Objects +OBJS = aliases codec codec_iconv codec_utf8 detect + +.PHONY: clean debug distclean export release setup test + +# Targets +release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS))) + +debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS))) + +clean: + -@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, 
${OBJS})) + -@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS})) + +distclean: + +setup: + +export: + +test: + +# Pattern rules +../Release/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $< + +../Debug/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@ $< diff --git a/src/charset/aliases.c b/src/charset/aliases.c new file mode 100644 index 0000000..dcf6de2 --- /dev/null +++ b/src/charset/aliases.c @@ -0,0 +1,361 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <ctype.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "charset/aliases.h" + +struct alias { + struct alias *next; + hubbub_aliases_canon *canon; + uint16_t name_len; + char name[1]; +}; + +#define HASH_SIZE (43) +static hubbub_aliases_canon *canon_tab[HASH_SIZE]; +static struct alias *alias_tab[HASH_SIZE]; + +static hubbub_error hubbub_create_alias(const char *alias, + hubbub_aliases_canon *c, hubbub_alloc alloc, void *pw); +static hubbub_aliases_canon *hubbub_create_canon(const char *canon, + uint16_t mibenum, hubbub_alloc alloc, void *pw); +static uint32_t hubbub_hash_val(const char *alias, size_t len); + +/** + * Create alias data from Aliases file + * + * \param filename The path to the Aliases file + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise. 
+ */ +hubbub_error hubbub_aliases_create(const char *filename, + hubbub_alloc alloc, void *pw) +{ + char buf[300]; + FILE *fp; + + if (filename == NULL || alloc == NULL) + return HUBBUB_BADPARM; + + fp = fopen(filename, "r"); + if (fp == NULL) + return HUBBUB_FILENOTFOUND; + + while (fgets(buf, sizeof buf, fp)) { + char *p, *aliases = 0, *mib, *end; + hubbub_aliases_canon *cf; + + if (buf[0] == 0 || buf[0] == '#') + /* skip blank lines or comments */ + continue; + + buf[strlen(buf) - 1] = 0; /* lose terminating newline */ + end = buf + strlen(buf); + + /* find end of canonical form */ + for (p = buf; *p && !isspace(*p) && !iscntrl(*p); p++) + ; /* do nothing */ + if (p >= end) + continue; + *p++ = '\0'; /* terminate canonical form */ + + /* skip whitespace */ + for (; *p && isspace(*p); p++) + ; /* do nothing */ + if (p >= end) + continue; + mib = p; + + /* find end of mibenum */ + for (; *p && !isspace(*p) && !iscntrl(*p); p++) + ; /* do nothing */ + if (p < end) + *p++ = '\0'; /* terminate mibenum */ + + cf = hubbub_create_canon(buf, atoi(mib), alloc, pw); + if (cf == NULL) + continue; + + /* skip whitespace */ + for (; p < end && *p && isspace(*p); p++) + ; /* do nothing */ + if (p >= end) + continue; + aliases = p; + + while (p < end) { + /* find end of alias */ + for (; *p && !isspace(*p) && !iscntrl(*p); p++) + ; /* do nothing */ + if (p > end) + /* stop if we've gone past the end */ + break; + /* terminate current alias */ + *p++ = '\0'; + + if (hubbub_create_alias(aliases, cf, + alloc, pw) != HUBBUB_OK) + break; + + /* in terminating, we may have advanced + * past the end - check this here */ + if (p >= end) + break; + + /* skip whitespace */ + for (; *p && isspace(*p); p++) + ; /* do nothing */ + + if (p >= end) + /* gone past end => stop */ + break; + + /* update pointer to current alias */ + aliases = p; + } + } + + fclose(fp); + + return HUBBUB_OK; +} + +/** + * Free all alias data + * + * \param alloc Memory (de)allocation function + * \param pw 
Pointer to client-specific private data + */ +void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw) +{ + hubbub_aliases_canon *c, *d; + struct alias *a, *b; + int i; + + for (i = 0; i != HASH_SIZE; i++) { + for (c = canon_tab[i]; c; c = d) { + d = c->next; + alloc(c, 0, pw); + } + canon_tab[i] = NULL; + + for (a = alias_tab[i]; a; a = b) { + b = a->next; + alloc(a, 0, pw); + } + alias_tab[i] = NULL; + } +} + +/** + * Retrieve the MIB enum value assigned to an encoding name + * + * \param alias The alias to lookup + * \param len The length of the alias string + * \return The MIB enum value, or 0 if not found + */ +uint16_t hubbub_mibenum_from_name(const char *alias, size_t len) +{ + hubbub_aliases_canon *c; + + if (alias == NULL) + return 0; + + c = hubbub_alias_canonicalise(alias, len); + if (c == NULL) + return 0; + + return c->mib_enum; +} + +/** + * Retrieve the canonical name of an encoding from the MIB enum + * + * \param mibenum The MIB enum value + * \return Pointer to canonical name, or NULL if not found + */ +const char *hubbub_mibenum_to_name(uint16_t mibenum) +{ + int i; + hubbub_aliases_canon *c; + + for (i = 0; i != HASH_SIZE; i++) + for (c = canon_tab[i]; c; c = c->next) + if (c->mib_enum == mibenum) + return c->name; + + return NULL; +} + + +/** + * Retrieve the canonical form of an alias name + * + * \param alias The alias name + * \param len The length of the alias name + * \return Pointer to canonical form or NULL if not found + */ +hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias, + size_t len) +{ + uint32_t hash; + hubbub_aliases_canon *c; + struct alias *a; + + if (alias == NULL) + return NULL; + + hash = hubbub_hash_val(alias, len); + + for (c = canon_tab[hash]; c; c = c->next) + if (c->name_len == len && + strncasecmp(c->name, alias, len) == 0) + break; + if (c) + return c; + + for (a = alias_tab[hash]; a; a = a->next) + if (a->name_len == len && + strncasecmp(a->name, alias, len) == 0) + break; + if (a) + return 
a->canon; + + return NULL; +} + + +/** + * Create an alias + * + * \param alias The alias name + * \param c The canonical form + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_create_alias(const char *alias, hubbub_aliases_canon *c, + hubbub_alloc alloc, void *pw) +{ + struct alias *a; + uint32_t hash; + + if (alias == NULL || c == NULL || alloc == NULL) + return HUBBUB_BADPARM; + + a = alloc(NULL, sizeof(struct alias) + strlen(alias) + 1, pw); + if (a == NULL) + return HUBBUB_NOMEM; + + a->canon = c; + a->name_len = strlen(alias); + strcpy(a->name, alias); + a->name[a->name_len] = '\0'; + + hash = hubbub_hash_val(alias, a->name_len); + + a->next = alias_tab[hash]; + alias_tab[hash] = a; + + return HUBBUB_OK; +} + +/** + * Create a canonical form + * + * \param canon The canonical name + * \param mibenum The MIB enum value + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to canonical form or NULL on error + */ +hubbub_aliases_canon *hubbub_create_canon(const char *canon, + uint16_t mibenum, hubbub_alloc alloc, void *pw) +{ + hubbub_aliases_canon *c; + uint32_t hash, len; + + if (canon == NULL || alloc == NULL) + return NULL; + + len = strlen(canon); + + c = alloc(NULL, sizeof(hubbub_aliases_canon) + len + 1, pw); + if (c == NULL) + return NULL; + + c->mib_enum = mibenum; + c->name_len = len; + strcpy(c->name, canon); + c->name[len] = '\0'; + + hash = hubbub_hash_val(canon, len); + + c->next = canon_tab[hash]; + canon_tab[hash] = c; + + return c; +} + +/** + * Hash function + * + * \param alias String to hash + * \return The hashed value + */ +uint32_t hubbub_hash_val(const char *alias, size_t len) +{ + const char *s = alias; + uint32_t h = 5381; + + if (alias == NULL) + return 0; + + while (len--) + h = (h * 33) ^ (*s++ & 
~0x20); /* case insensitive */ + + return h % HASH_SIZE; +} + + +#ifndef NDEBUG +/** + * Dump all alias data to stdout + */ +void hubbub_aliases_dump(void) +{ + hubbub_aliases_canon *c; + struct alias *a; + int i; + size_t size = 0; + + for (i = 0; i != HASH_SIZE; i++) { + for (c = canon_tab[i]; c; c = c->next) { + printf("%d %s\n", i, c->name); + size += offsetof(hubbub_aliases_canon, name) + + c->name_len; + } + + for (a = alias_tab[i]; a; a = a->next) { + printf("%d %s\n", i, a->name); + size += offsetof(struct alias, name) + a->name_len; + } + } + + size += (sizeof(canon_tab) / sizeof(canon_tab[0])); + size += (sizeof(alias_tab) / sizeof(alias_tab[0])); + + printf("%u\n", (unsigned int) size); +} +#endif diff --git a/src/charset/aliases.h b/src/charset/aliases.h new file mode 100644 index 0000000..e0505d0 --- /dev/null +++ b/src/charset/aliases.h @@ -0,0 +1,42 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_charset_aliases_h_ +#define hubbub_charset_aliases_h_ + +#include <inttypes.h> + +#include <hubbub/errors.h> +#include <hubbub/functypes.h> + +typedef struct hubbub_aliases_canon { + struct hubbub_aliases_canon *next; + uint16_t mib_enum; + uint16_t name_len; + char name[1]; +} hubbub_aliases_canon; + +/* Load encoding aliases from file */ +hubbub_error hubbub_aliases_create(const char *filename, + hubbub_alloc alloc, void *pw); +/* Destroy encoding aliases */ +void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw); + +/* Convert an encoding alias to a MIB enum value */ +uint16_t hubbub_mibenum_from_name(const char *alias, size_t len); +/* Convert a MIB enum value into an encoding alias */ +const char *hubbub_mibenum_to_name(uint16_t mibenum); + +/* Canonicalise an alias name */ +hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias, + size_t len); + +#ifndef NDEBUG +void 
hubbub_aliases_dump(void); +#endif + +#endif diff --git a/src/charset/codec.c b/src/charset/codec.c new file mode 100644 index 0000000..12a1bdc --- /dev/null +++ b/src/charset/codec.c @@ -0,0 +1,186 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <string.h> + +#include "charset/aliases.h" + +#include "codec_impl.h" + +extern hubbub_charsethandler hubbub_iconv_codec_handler; +extern hubbub_charsethandler hubbub_utf8_codec_handler; + +static hubbub_charsethandler *handler_table[] = { + &hubbub_utf8_codec_handler, + &hubbub_iconv_codec_handler, + NULL, +}; + +/** + * Create a charset codec + * + * \param charset Target charset + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec instance, or NULL on failure + */ +hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset, + hubbub_alloc alloc, void *pw) +{ + hubbub_charsetcodec *codec; + hubbub_charsethandler **handler; + const hubbub_aliases_canon * canon; + + if (charset == NULL || alloc == NULL) + return NULL; + + /* Canonicalise charset name. 
*/ + canon = hubbub_alias_canonicalise(charset, strlen(charset)); + if (canon == NULL) + return NULL; + + /* Search for handler class */ + for (handler = handler_table; *handler != NULL; handler++) { + if ((*handler)->handles_charset(canon->name)) + break; + } + + /* None found */ + if ((*handler) == NULL) + return NULL; + + /* Instantiate class */ + codec = (*handler)->create(canon->name, alloc, pw); + if (codec == NULL) + return NULL; + + /* and initialise it */ + codec->mibenum = canon->mib_enum; + + codec->filter = NULL; + codec->filter_pw = NULL; + + codec->errormode = HUBBUB_CHARSETCODEC_ERROR_LOOSE; + + codec->alloc = alloc; + codec->alloc_pw = pw; + + return codec; +} + +/** + * Destroy a charset codec + * + * \param codec The codec to destroy + */ +void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec) +{ + if (codec == NULL) + return; + + codec->handler.destroy(codec); + + codec->alloc(codec, 0, codec->alloc_pw); +} + +/** + * Configure a charset codec + * + * \param codec The codec to configure + * \parem type The codec option type to configure + * \param params Option-specific parameters + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec, + hubbub_charsetcodec_opttype type, + hubbub_charsetcodec_optparams *params) +{ + if (codec == NULL || params == NULL) + return HUBBUB_BADPARM; + + switch (type) { + case HUBBUB_CHARSETCODEC_FILTER_FUNC: + codec->filter = params->filter_func.filter; + codec->filter_pw = params->filter_func.pw; + break; + + case HUBBUB_CHARSETCODEC_ERROR_MODE: + codec->errormode = params->error_mode.mode; + break; + } + + return HUBBUB_OK; +} + +/** + * Encode a chunk of UCS4 data into a codec's charset + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in 
bytes) of output buffer + * \return HUBBUB_OK on success, appropriate error otherwise. + * + * source, sourcelen, dest and destlen will be updated appropriately on exit + */ +hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + if (codec == NULL || source == NULL || *source == NULL || + sourcelen == NULL || dest == NULL || *dest == NULL || + destlen == NULL) + return HUBBUB_BADPARM; + + return codec->handler.encode(codec, source, sourcelen, dest, destlen); +} + +/** + * Decode a chunk of data in a codec's charset into UCS4 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return HUBBUB_OK on success, appropriate error otherwise. + * + * source, sourcelen, dest and destlen will be updated appropriately on exit + * + * Call this with a source length of 0 to flush any buffers. 
+ */ +hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + if (codec == NULL || source == NULL || *source == NULL || + sourcelen == NULL || dest == NULL || *dest == NULL || + destlen == NULL) + return HUBBUB_BADPARM; + + return codec->handler.decode(codec, source, sourcelen, dest, destlen); +} + +/** + * Clear a charset codec's encoding state + * + * \param codec The codec to reset + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec) +{ + if (codec == NULL) + return HUBBUB_BADPARM; + + /* Reset filter */ + if (codec->filter) + codec->filter(HUBBUB_CHARSETCODEC_NULL, NULL, NULL, NULL); + + return codec->handler.reset(codec); +} + diff --git a/src/charset/codec.h b/src/charset/codec.h new file mode 100644 index 0000000..4cd94d8 --- /dev/null +++ b/src/charset/codec.h @@ -0,0 +1,153 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_charset_codec_h_ +#define hubbub_charset_codec_h_ + +#include <inttypes.h> + +#include <hubbub/errors.h> +#include <hubbub/functypes.h> + +typedef struct hubbub_charsetcodec hubbub_charsetcodec; + +#define HUBBUB_CHARSETCODEC_NULL (0xffffffffU) + +/** + * Type of charset codec filter function + * + * \param c UCS4 character (in host byte order) or + * HUBBUB_CHARSETCODEC_NULL to reset + * \param output Pointer to location to store output buffer location + * \param outputlen Pointer to location to store output buffer length + * \param pw Pointer to client-specific private data + * \return HUBBUB_OK on success, or appropriate error otherwise. + * + * The output buffer is owned by the filter code and will not be freed by + * any charset codec. 
It should contain the replacement UCS4 character(s) + * for the input. The replacement characters should be in host byte order. + * The contents of *output and *outputlen on entry are ignored and these + * will be filled in by the filter code. + * + * Filters may elect to replace the input character with no output. In this + * case, *output should be set to NULL and *outputlen should be set to 0 and + * HUBBUB_OK should be returned. + * + * The output length is in terms of the number of UCS4 characters in the + * output buffer. i.e.: + * + * for (size_t i = 0; i < outputlen; i++) { + * dest[curchar++] = output[i]; + * } + * + * would copy the contents of the filter output buffer to the codec's output + * buffer. + */ +typedef hubbub_error (*hubbub_charsetcodec_filter)(uint32_t c, + uint32_t **output, size_t *outputlen, void *pw); + +/** + * Charset codec error mode + * + * A codec's error mode determines its behaviour in the face of: + * + * + characters which are unrepresentable in the destination charset (if + * encoding data) or which cannot be converted to UCS4 (if decoding data). + * + invalid byte sequences (both encoding and decoding) + * + * The options provide a choice between the following approaches: + * + * + draconian, "stop processing" ("strict") + * + "replace the unrepresentable character with something else" ("loose") + * + "attempt to transliterate, or replace if unable" ("translit") + * + * The default error mode is "loose". + * + * + * In the "loose" case, the replacement character will depend upon: + * + * + Whether the operation was encoding or decoding + * + If encoding, what the destination charset is. + * + * If decoding, the replacement character will be: + * + * U+FFFD (REPLACEMENT CHARACTER) + * + * If encoding, the replacement character will be: + * + * U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32) + * U+FFFD (REPLACEMENT CHARACTER) otherwise. 
+ * + * + * In the "translit" case, the codec will attempt to transliterate into + * the destination charset, if encoding. If decoding, or if transliteration + * fails, this option is identical to "loose". + */ +typedef enum hubbub_charsetcodec_errormode { + /** Abort processing if unrepresentable character encountered */ + HUBBUB_CHARSETCODEC_ERROR_STRICT = 0, + /** Replace unrepresentable characters with single alternate */ + HUBBUB_CHARSETCODEC_ERROR_LOOSE = 1, + /** Transliterate unrepresentable characters, if possible */ + HUBBUB_CHARSETCODEC_ERROR_TRANSLIT = 2, +} hubbub_charsetcodec_errormode; + +/** + * Charset codec option types + */ +typedef enum hubbub_charsetcodec_opttype { + /** Register codec filter function */ + HUBBUB_CHARSETCODEC_FILTER_FUNC = 0, + /** Set codec error mode */ + HUBBUB_CHARSETCODEC_ERROR_MODE = 1, +} hubbub_charsetcodec_opttype; + +/** + * Charset codec option parameters + */ +typedef union hubbub_charsetcodec_optparams { + /** Parameters for filter function setting */ + struct { + /** Filter function */ + hubbub_charsetcodec_filter filter; + /** Client-specific private data */ + void *pw; + } filter_func; + + /** Parameters for error mode setting */ + struct { + /** The desired error handling mode */ + hubbub_charsetcodec_errormode mode; + } error_mode; +} hubbub_charsetcodec_optparams; + + +/* Create a charset codec */ +hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset, + hubbub_alloc alloc, void *pw); +/* Destroy a charset codec */ +void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec); + +/* Configure a charset codec */ +hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec, + hubbub_charsetcodec_opttype type, + hubbub_charsetcodec_optparams *params); + +/* Encode a chunk of UCS4 data into a codec's charset */ +hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + +/* Decode a chunk of data in a 
codec's charset into UCS4 */ +hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); + +/* Reset a charset codec */ +hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec); + +#endif diff --git a/src/charset/codec_iconv.c b/src/charset/codec_iconv.c new file mode 100644 index 0000000..097e82a --- /dev/null +++ b/src/charset/codec_iconv.c @@ -0,0 +1,837 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +/* This codec is hideously slow. Only use it as a last resort */ + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +#include <iconv.h> + +/* These two are for htonl / ntohl */ +#include <arpa/inet.h> +#include <netinet/in.h> + +#include "charset/aliases.h" +#include "utils/utils.h" + +#include "codec_impl.h" + +/** + * A note on endianness: + * + * UCS4 is big-endian by default. Therefore, this codec reads and writes + * big-endian values. This is fine, and causes no problems. However, to + * make life easier for client-supplied filter code, character values passed + * to a filter and those read back from a filter are in host-endian. + * Therefore, we need to convert from big-endian to host-endian when passing + * characters to a filter and perform the reverse translation when reading + * characters back. 
+ */ + +/** + * Iconv-based charset codec + */ +typedef struct hubbub_iconv_codec { + hubbub_charsetcodec base; /**< Base class */ + + iconv_t read_cd; /**< Iconv handle for reading */ +#define INVAL_BUFSIZE (32) + uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up + * incomplete input + * sequences */ + size_t inval_len; /**< Number of bytes in inval_buf */ + +#define READ_BUFSIZE (8) + uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial + * output sequences (decode) + */ + size_t read_len; /**< Number of characters in + * read_buf */ + + iconv_t write_cd; /**< Iconv handle for writing */ +#define WRITE_BUFSIZE (8) + uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial + * output sequences (encode) + */ + size_t write_len; /**< Number of characters in + * write_buf */ +} hubbub_iconv_codec; + + +static bool hubbub_iconv_codec_handles_charset(const char *charset); +static hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset, + hubbub_alloc alloc, void *pw); +static void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec); +static hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec); +static hubbub_error hubbub_iconv_codec_filter_decoded_char( + hubbub_iconv_codec *c, uint32_t ucs4, uint8_t **dest, + size_t *destlen); +static bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c); +static hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen); +static hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen); + +/** + * Determine whether this codec handles a 
specific charset + * + * \param charset Charset to test + * \return true if handleable, false otherwise + */ +bool hubbub_iconv_codec_handles_charset(const char *charset) +{ + iconv_t cd; + bool ret; + + cd = iconv_open("UCS-4", charset); + + ret = (cd != (iconv_t) -1); + + if (ret) + iconv_close(cd); + + return ret; +} + +/** + * Create an iconv-based codec + * + * \param charset The charset to read from / write to + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to codec, or NULL on failure + */ +hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset, + hubbub_alloc alloc, void *pw) +{ + hubbub_iconv_codec *codec; + + codec = alloc(NULL, sizeof(hubbub_iconv_codec), pw); + if (codec == NULL) + return NULL; + + codec->read_cd = iconv_open("UCS-4", charset); + if (codec->read_cd == (iconv_t) -1) { + alloc(codec, 0, pw); + return NULL; + } + + codec->write_cd = iconv_open(charset, "UCS-4"); + if (codec->write_cd == (iconv_t) -1) { + iconv_close(codec->read_cd); + alloc(codec, 0, pw); + return NULL; + } + + codec->inval_buf[0] = '\0'; + codec->inval_len = 0; + + codec->read_buf[0] = 0; + codec->read_len = 0; + + codec->write_buf[0] = 0; + codec->write_len = 0; + + /* Finally, populate vtable */ + codec->base.handler.destroy = hubbub_iconv_codec_destroy; + codec->base.handler.encode = hubbub_iconv_codec_encode; + codec->base.handler.decode = hubbub_iconv_codec_decode; + codec->base.handler.reset = hubbub_iconv_codec_reset; + + return (hubbub_charsetcodec *) codec; +} + +/** + * Destroy an iconv-based codec + * + * \param codec The codec to destroy + */ +void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec) +{ + hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; + + iconv_close(c->read_cd); + iconv_close(c->write_cd); + + return; +} + +/** + * Encode a chunk of UCS4 data into an iconv-based codec's charset + * + * \param codec The codec to use + * \param source 
Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * <any_other_error> as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read. Any remaining output for the character will be buffered by the + * codec for writing on the next call. This buffered data is post-filtering, + * so will not be refiltered on the next call. + * + * In the case of the filter function failing, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the encoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * There is no way to determine the output character which caused a + * failure (as it may be one in a filter-injected replacement sequence). + * It is, however, possible to determine which source character caused it + * (this being the character immediately before the location pointed to by + * ::source on exit). + * + * [I.e. the process of filtering results in a potential one-to-many mapping + * between source characters and output characters, and identification of + * individual output characters is impossible.] + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. 
+ */ +hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; + uint32_t ucs4; + const uint32_t *towrite; + size_t towritelen; + hubbub_error error; + + /* Process any outstanding characters from the previous call */ + if (c->write_len > 0) { + uint32_t *pwrite = c->write_buf; + + while (c->write_len > 0) { + error = hubbub_iconv_codec_write_char(c, pwrite[0], + dest, destlen); + if (error != HUBBUB_OK) { + /* Copy outstanding chars down, skipping + * invalid one, if present, so as to avoid + * reprocessing the invalid character */ + if (error == HUBBUB_INVALID) { + for (ucs4 = 1; ucs4 < c->write_len; + ucs4++) { + c->write_buf[ucs4] = + pwrite[ucs4]; + } + } + + return error; + } + + pwrite++; + c->write_len--; + } + } + + /* Now process the characters for this call */ + while (*sourcelen > 0) { + towrite = (const uint32_t *) (const void *) *source; + towritelen = 1; + ucs4 = *towrite; + + /* Run character we're about to output through the + * registered filter, so it can replace it, if it sees + * fit to do so */ + if (c->base.filter != NULL) { + uint32_t *replacement; + + error = c->base.filter(ntohl(ucs4), + &replacement, &towritelen, + c->base.filter_pw); + if (error != HUBBUB_OK) { + /* Don't eat character -- filter failed, + * so nothing gets written or buffered. + * It's up to the client to ensure that + * the filter works in the case where it + * reprocesses this character after the + * fault is fixed up. 
*/ + + return error; + } + + /* Convert filter output to big endian UCS4 */ + for (ucs4 = 0; ucs4 < towritelen; ucs4++) { + replacement[ucs4] = htonl(replacement[ucs4]); + } + + towrite = (const uint32_t *) replacement; + } + + /* Output current character(s) */ + while (towritelen > 0) { + error = hubbub_iconv_codec_write_char(c, towrite[0], + dest, destlen); + + if (error != HUBBUB_OK) { + ucs4 = (error == HUBBUB_INVALID) ? 1 : 0; + + if (towritelen - ucs4 >= WRITE_BUFSIZE) + abort(); + + c->write_len = towritelen - ucs4; + + /* Copy pending chars to save area, for + * processing next call; skipping invalid + * character, if present, so it's not + * reprocessed. */ + for (; ucs4 < towritelen; ucs4++) { + c->write_buf[ucs4] = towrite[ucs4]; + } + + /* Claim character we've just buffered, + * so it's not repreocessed */ + *source += 4; + *sourcelen -= 4; + + return error; + } + + towrite++; + towritelen--; + } + + *source += 4; + *sourcelen -= 4; + } + + return HUBBUB_OK; +} + +/** + * Decode a chunk of data in an iconv-based codec's charset into UCS4 + * + * \param codec The codec to use + * \param source Pointer to pointer to source data + * \param sourcelen Pointer to length (in bytes) of source data + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to length (in bytes) of output buffer + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * <any_other_error> as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * This buffered data is post-filtering, so will not be refiltered on the + * next call. 
+ * + * In the case of the result being _INVALID or the filter function failing, + * ::source will point _at_ the last input character read; nothing will be + * written or buffered for the failed character. It is up to the client to + * fix the cause of the failure and retry the decoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * There is no way to determine the output character which caused a + * failure (as it may be one in a filter-injected replacement sequence). + * It is, however, possible to determine which source character caused it + * (this being the character immediately at or before the location pointed + * to by ::source on exit). + * + * [I.e. the process of filtering results in a potential one-to-many mapping + * between source characters and output characters, and identification of + * individual output characters is impossible.] + * + * If STRICT error handling is configured and an illegal sequence is split + * over two calls, then _INVALID will be returned from the second call, + * but ::source will point mid-way through the invalid sequence (i.e. it + * will be unmodified over the second call). In addition, the internal + * incomplete-sequence buffer will be emptied, such that subsequent calls + * will progress, rather than re-evaluating the same invalid sequence. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + * + * Call this with a source length of 0 to flush the output buffer. 
+ */ +hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; + hubbub_error error; + + if (c->read_len > 0) { + /* Output left over from last decode + * Attempt to finish this here */ + uint32_t *pread = c->read_buf; + + while (c->read_len > 0 && *destlen >= c->read_len * 4) { + *((uint32_t *) (void *) *dest) = pread[0]; + + *dest += 4; + *destlen -= 4; + + pread++; + c->read_len--; + } + + if (*destlen < c->read_len * 4) { + /* Run out of output buffer */ + size_t i; + + /* Shuffle remaining output down */ + for (i = 0; i < c->read_len; i++) { + c->read_buf[i] = pread[i]; + } + + return HUBBUB_NOMEM; + } + } + + if (c->inval_len > 0) { + /* The last decode ended in an incomplete sequence. + * Fill up inval_buf with data from the start of the + * new chunk and process it. */ + uint8_t *in = c->inval_buf; + size_t ol = c->inval_len; + size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen); + size_t orig_l = l; + + memcpy(c->inval_buf + ol, *source, l); + + l += c->inval_len; + + error = hubbub_iconv_codec_read_char(c, + (const uint8_t **) &in, &l, dest, destlen); + if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { + return error; + } + + + /* And now, fix everything up so the normal processing + * does the right thing. */ + *source += max((signed) (orig_l - l), 0); + *sourcelen -= max((signed) (orig_l - l), 0); + + /* Failed to resolve an incomplete character and + * ran out of buffer space. No recovery strategy + * possible, so explode everywhere. 
*/ + if ((orig_l + ol) - l == 0) + abort(); + + /* Handle memry exhaustion case from above */ + if (error != HUBBUB_OK) + return error; + } + + while (*sourcelen > 0) { + error = hubbub_iconv_codec_read_char(c, + source, sourcelen, dest, destlen); + if (error != HUBBUB_OK) { + return error; + } + } + + return HUBBUB_OK; +} + +/** + * Clear an iconv-based codec's encoding state + * + * \param codec The codec to reset + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec) +{ + hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec; + + iconv(c->read_cd, NULL, NULL, NULL, NULL); + iconv(c->write_cd, NULL, NULL, NULL, NULL); + + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + c->read_buf[0] = 0; + c->read_len = 0; + + c->write_buf[0] = 0; + c->write_len = 0; + + return HUBBUB_OK; +} + +/** + * Feed a UCS4 character through the registered filter and output the result + * + * \param c Codec to use + * \param ucs4 UCS4 character (big endian) + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to output buffer length + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * <any_other_error> as a result of the failure of the + * client-provided filter function. 
+ */ +hubbub_error hubbub_iconv_codec_filter_decoded_char(hubbub_iconv_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + if (c->base.filter != NULL) { + uint32_t *rep; + size_t replen; + hubbub_error error; + + error = c->base.filter(ntohl(ucs4), &rep, &replen, + c->base.filter_pw); + if (error != HUBBUB_OK) { + return error; + } + + while (replen > 0 && *destlen >= replen * 4) { + *((uint32_t *) (void *) *dest) = htonl(*rep); + + *dest += 4; + *destlen -= 4; + + rep++; + replen--; + } + + if (*destlen < replen * 4) { + /* Run out of output buffer */ + size_t i; + + /* Buffer remaining output */ + c->read_len = replen; + + for (i = 0; i < replen; i++) { + c->read_buf[i] = htonl(rep[i]); + } + + return HUBBUB_NOMEM; + } + + } else { + if (*destlen < 4) { + /* Run out of output buffer */ + + c->read_len = 1; + c->read_buf[0] = ucs4; + + return HUBBUB_NOMEM; + } + + *((uint32_t *) (void *) *dest) = ucs4; + *dest += 4; + *destlen -= 4; + } + + return HUBBUB_OK; +} + +/** + * Detect if a codec's charset is Unicode capable + * + * \param c Codec to consider + * \return true if a Unicode variant, false otherwise + */ +bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c) +{ + static uint16_t ucs4; + static uint16_t ucs2; + static uint16_t utf8; + static uint16_t utf16; + static uint16_t utf16be; + static uint16_t utf16le; + static uint16_t utf32; + static uint16_t utf32be; + static uint16_t utf32le; + + if (ucs4 == 0) { + ucs4 = hubbub_mibenum_from_name("UCS-4", SLEN("UCS-4")); + ucs2 = hubbub_mibenum_from_name("UCS-2", SLEN("UCS-2")); + utf8 = hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")); + utf16 = hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16")); + utf16be = hubbub_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")); + utf16le = hubbub_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")); + utf32 = hubbub_mibenum_from_name("UTF-32", SLEN("UTF-32")); + utf32be = hubbub_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE")); + utf32le = 
hubbub_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")); + } + + return (c->base.mibenum == ucs4 || + c->base.mibenum == ucs2 || + c->base.mibenum == utf8 || + c->base.mibenum == utf16 || + c->base.mibenum == utf16be || + c->base.mibenum == utf16le || + c->base.mibenum == utf32 || + c->base.mibenum == utf32be || + c->base.mibenum == utf32le); +} + +/** + * Read a character from the codec's native charset to UCS4 (big endian) + * + * \param c The codec + * \param source Pointer to pointer to source buffer (updated on exit) + * \param sourcelen Pointer to length of source buffer (updated on exit) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if a character cannot be represented and the + * codec's error handling mode is set to STRICT, + * <any_other_error> as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * This buffered data is post-filtering, so will not be refiltered on the + * next call. + * + * In the case of the result being _INVALID or the filter function failing, + * ::source will point _at_ the last input character read; nothing will be + * written or buffered for the failed character. It is up to the client to + * fix the cause of the failure and retry the decoding process. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. 
+ */ +hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + size_t iconv_ret; + const uint8_t *origsrc = *source; + size_t origsrclen = *sourcelen; + uint32_t ucs4; + uint8_t *pucs4 = (uint8_t *) &ucs4; + size_t sucs4 = 4; + hubbub_error error; + + /* Use iconv to convert a single character + * Side effect: Updates *source to point at next input + * character and *sourcelen to reflect reduced input length + */ + iconv_ret = iconv(c->read_cd, (char **) source, sourcelen, + (char **) (void *) &pucs4, &sucs4); + + if (iconv_ret != (size_t) -1 || + (*source != origsrc && sucs4 == 0)) { + /* Read a character */ + error = hubbub_iconv_codec_filter_decoded_char(c, + ucs4, dest, destlen); + if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { + /* filter function failed; restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return error; + } else if (errno == E2BIG) { + /* Should never happen */ + abort(); + } else if (errno == EINVAL) { + /* Incomplete input sequence */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (const char *) *source, *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + return HUBBUB_OK; + } else if (errno == EILSEQ) { + /* Illegal input sequence */ + bool found = false; + const uint8_t *oldsrc; + size_t oldsrclen; + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + /* Strict errormode; simply flag invalid character */ + if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) { + /* restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + + return HUBBUB_INVALID; + } + + /* Ok, this becomes problematic. 
The iconv API here + * is particularly unhelpful; *source will point at + * the _start_ of the illegal sequence. This means + * that we must find the end of the sequence */ + + /* Search for the start of the next valid input + * sequence (or the end of the input stream) */ + while (*sourcelen > 1) { + pucs4 = (uint8_t *) &ucs4; + sucs4 = 4; + + (*source)++; + (*sourcelen)--; + + oldsrc = *source; + oldsrclen = *sourcelen; + + iconv_ret = iconv(c->read_cd, + (char **) source, sourcelen, + (char **) (void *) &pucs4, &sucs4); + if (iconv_ret != (size_t) -1 || errno != EILSEQ) { + found = true; + break; + } + } + + if (found) { + /* Found start of next valid sequence */ + *source = oldsrc; + *sourcelen = oldsrclen; + } else { + /* Not found - skip last byte in buffer */ + (*source)++; + (*sourcelen)--; + + if (*sourcelen != 0) + abort(); + } + + /* output U+FFFD and continue processing. */ + error = hubbub_iconv_codec_filter_decoded_char(c, + htonl(0xFFFD), dest, destlen); + if (error != HUBBUB_OK && error != HUBBUB_NOMEM) { + /* filter function failed; restore source pointers */ + *source = origsrc; + *sourcelen = origsrclen; + } + + return error; + } + + return HUBBUB_OK; +} + +/** + * Write a UCS4 character in a codec's native charset + * + * \param c The codec + * \param ucs4 The UCS4 character to write (big endian) + * \param dest Pointer to pointer to output buffer (updated on exit) + * \param destlen Pointer to length of output buffer (updated on exit) + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * HUBBUB_INVALID if character cannot be represented and the + * codec's error handling mode is set to STRICT. 
+ */
+hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	size_t iconv_ret;
+	/* iconv input cursor over the four bytes of the UCS4 value */
+	uint8_t *pucs4 = (uint8_t *) &ucs4;
+	size_t sucs4 = 4;
+	/* Remembered so "iconv succeeded but wrote nothing" can be detected */
+	uint8_t *origdest = *dest;
+
+	/* Convert the character; iconv updates *dest and *destlen itself */
+	iconv_ret = iconv(c->write_cd, (char **) (void *) &pucs4,
+			&sucs4, (char **) dest, destlen);
+
+	if (iconv_ret == (size_t) -1 && errno == E2BIG) {
+		/* Output buffer is too small */
+		return HUBBUB_NOMEM;
+	} else if (iconv_ret == (size_t) -1 && errno == EILSEQ) {
+		/* Illegal multibyte sequence */
+		/* This should never happen */
+		abort();
+	} else if (iconv_ret == (size_t) -1 && errno == EINVAL) {
+		/* Incomplete input character */
+		/* This should never happen */
+		abort();
+	} else if (*dest == origdest) {
+		/* Nothing was output: the character is treated as
+		 * unrepresentable and handled per the codec's error mode */
+		switch (c->base.errormode) {
+		case HUBBUB_CHARSETCODEC_ERROR_STRICT:
+			return HUBBUB_INVALID;
+
+		case HUBBUB_CHARSETCODEC_ERROR_TRANSLIT:
+			/** \todo transliteration */
+			/* Deliberate fall-through: behaves as LOOSE for now */
+		case HUBBUB_CHARSETCODEC_ERROR_LOOSE:
+		{
+			/* Rewind the input cursor and retry with a
+			 * substitute character */
+			pucs4 = (uint8_t *) &ucs4;
+			sucs4 = 4;
+
+			/* U+FFFD when hubbub_iconv_codec_is_unicode(c) is
+			 * true, otherwise 0x3F ('?'); both big endian */
+			ucs4 = hubbub_iconv_codec_is_unicode(c)
+					? htonl(0xFFFD) : htonl(0x3F);
+
+			iconv_ret = iconv(c->write_cd,
+					(char **) (void *) &pucs4, &sucs4,
+					(char **) dest, destlen);
+
+			if (iconv_ret == (size_t) -1 && errno == E2BIG) {
+				return HUBBUB_NOMEM;
+			} else if (iconv_ret == (size_t) -1 &&
+					errno == EILSEQ) {
+				/* Illegal multibyte sequence */
+				/* This should never happen */
+				abort();
+			} else if (iconv_ret == (size_t) -1 &&
+					errno == EINVAL) {
+				/* Incomplete input character */
+				/* This should never happen */
+				abort();
+			}
+		}
+		break;
+		}
+	}
+
+	return HUBBUB_OK;
+}
+
+/* Factory entry for the iconv codec: charset test, then constructor */
+const hubbub_charsethandler hubbub_iconv_codec_handler = {
+	hubbub_iconv_codec_handles_charset,
+	hubbub_iconv_codec_create
+};
diff --git a/src/charset/codec_impl.h b/src/charset/codec_impl.h
new file mode 100644
index 0000000..eb5116b
--- /dev/null
+++ b/src/charset/codec_impl.h
@@ -0,0 +1,51 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_charset_codecimpl_h_
+#define hubbub_charset_codecimpl_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include "codec.h"
+
+/**
+ * Core charset codec definition; implementations extend this
+ *
+ * Concrete codecs embed this structure as their first member and are
+ * cast to and from hubbub_charsetcodec pointers.
+ */
+struct hubbub_charsetcodec {
+	uint16_t mibenum;			/**< MIB enum for charset */
+
+	hubbub_charsetcodec_filter filter;	/**< filter function
+						 * (NULL if none registered) */
+	void *filter_pw;			/**< filter private word,
+						 * passed to each filter call */
+
+	hubbub_charsetcodec_errormode errormode;	/**< error mode */
+
+	hubbub_alloc alloc;			/**< allocation function */
+	void *alloc_pw;				/**< private word */
+
+	struct {
+		/** Release implementation-specific resources */
+		void (*destroy)(hubbub_charsetcodec *codec);
+		/** Convert UCS4 input into the codec's charset */
+		hubbub_error (*encode)(hubbub_charsetcodec *codec,
+				const uint8_t **source, size_t *sourcelen,
+				uint8_t **dest, size_t *destlen);
+		/** Convert input in the codec's charset into UCS4 */
+		hubbub_error (*decode)(hubbub_charsetcodec *codec,
+				const uint8_t **source, size_t *sourcelen,
+				uint8_t **dest, size_t *destlen);
+		/** Discard any buffered conversion state */
+		hubbub_error (*reset)(hubbub_charsetcodec *codec);
+	} handler; /**< Vtable for handler code */
+};
+
+/**
+ * Codec factory component definition
+ */
+typedef struct hubbub_charsethandler {
+	/** Report whether this factory can build a codec for charset */
+	bool (*handles_charset)(const char *charset);
+	/** Build a codec for charset using the given allocator */
+	hubbub_charsetcodec *(*create)(const char *charset,
+			hubbub_alloc alloc, void *pw);
+} hubbub_charsethandler;
+
+#endif
diff --git a/src/charset/codec_utf8.c b/src/charset/codec_utf8.c
new file mode 100644
index 0000000..86d667f
--- /dev/null
+++ b/src/charset/codec_utf8.c
@@ -0,0 +1,620 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include "charset/aliases.h"
+#include "utils/utf8.h"
+#include "utils/utils.h"
+
+#include "codec_impl.h"
+
+/**
+ * UTF-8 charset codec
+ */
+typedef struct hubbub_utf8_codec {
+	hubbub_charsetcodec base;	/**< Base class */
+
+#define INVAL_BUFSIZE (32)
+	uint8_t inval_buf[INVAL_BUFSIZE];	/**< Buffer for fixing up
+						 * incomplete input
+						 * sequences; users also
+						 * store a trailing NUL */
+	size_t inval_len;		/**< Byte length of inval_buf */
+
+#define READ_BUFSIZE (8)
+	uint32_t read_buf[READ_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (decode)
+						 * (host-endian) */
+	size_t read_len;		/**< Character length of read_buf */
+
+#define WRITE_BUFSIZE (8)
+	uint32_t write_buf[WRITE_BUFSIZE];	/**< Buffer for partial
+						 * output sequences (encode)
+						 * (host-endian) */
+	size_t write_len;		/**< Character length of write_buf */
+
+} hubbub_utf8_codec;
+
+static bool hubbub_utf8_codec_handles_charset(const char *charset);
+static hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset,
+		hubbub_alloc alloc, void *pw);
+static void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec);
+static hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec);
+static hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_utf8_codec_filter_decoded_char(
+		hubbub_utf8_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Determine whether this codec handles a specific charset
+ *
+ * The name is resolved to a MIB enum and compared with that of "UTF-8",
+ * so any name resolving to the same MIB enum is accepted.
+ *
+ * \param charset  Charset to test (NUL-terminated; strlen is applied)
+ * \return true if handleable, false otherwise
+ */
+bool hubbub_utf8_codec_handles_charset(const char *charset)
+{
+	return hubbub_mibenum_from_name(charset, strlen(charset)) ==
+			hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"));
+}
+
+/**
+ * Create a utf8 codec
+ *
+ * \param charset  The charset to read from / write to (unused here)
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ *
+ * Only the UTF-8 specific buffers and the vtable are initialised here.
+ * NOTE(review): the remaining base fields (mibenum, filter, errormode,
+ * alloc) are not set in this function -- presumably the caller fills
+ * them in; confirm against the generic codec constructor.
+ */
+hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_utf8_codec *codec;
+
+	UNUSED(charset);
+
+	codec = alloc(NULL, sizeof(hubbub_utf8_codec), pw);
+	if (codec == NULL)
+		return NULL;
+
+	/* Start with all three carry-over buffers empty */
+	codec->inval_buf[0] = '\0';
+	codec->inval_len = 0;
+
+	codec->read_buf[0] = 0;
+	codec->read_len = 0;
+
+	codec->write_buf[0] = 0;
+	codec->write_len = 0;
+
+	/* Finally, populate vtable */
+	codec->base.handler.destroy = hubbub_utf8_codec_destroy;
+	codec->base.handler.encode = hubbub_utf8_codec_encode;
+	codec->base.handler.decode = hubbub_utf8_codec_decode;
+	codec->base.handler.reset = hubbub_utf8_codec_reset;
+
+	return (hubbub_charsetcodec *) codec;
+}
+
+/**
+ * Destroy a utf8 codec
+ *
+ * Nothing is released here; the codec holds no resources beyond its own
+ * structure, which this function does not free.
+ *
+ * \param codec  The codec to destroy
+ */
+void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec)
+{
+	UNUSED(codec);
+}
+
+/**
+ * Encode a chunk of UCS4 data into utf8
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_NOMEM if output buffer is too small,
+ *         HUBBUB_INVALID if a character cannot be represented and the
+ *         codec's
error handling mode is set to STRICT, + * <any_other_error> as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read. Any remaining output for the character will be buffered by the + * codec for writing on the next call. This buffered data is post-filtering, + * so will not be refiltered on the next call. + * + * In the case of the filter function failing, ::source will point _at_ the + * last input character read; nothing will be written or buffered for the + * failed character. It is up to the client to fix the cause of the failure + * and retry the encoding process. + * + * Note that, if failure occurs whilst attempting to write any output + * buffered by the last call, then ::source and ::sourcelen will remain + * unchanged (as nothing more has been read). + * + * There is no way to determine the output character which caused a + * failure (as it may be one in a filter-injected replacement sequence). + * It is, however, possible to determine which source character caused it + * (this being the character immediately before the location pointed to by + * ::source on exit). + * + * [I.e. the process of filtering results in a potential one-to-many mapping + * between source characters and output characters, and identification of + * individual output characters is impossible.] + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. 
+ */
+hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec;
+	uint32_t ucs4;
+	uint32_t *towrite;
+	size_t towritelen;
+	hubbub_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+		uint8_t buf[6];
+		size_t len;
+
+		while (c->write_len > 0) {
+			error = hubbub_utf8_from_ucs4(pwrite[0], buf, &len);
+			if (error != HUBBUB_OK)
+				abort();
+
+			if (*destlen < len) {
+				/* Insufficient output buffer space.
+				 * Move what is still pending to the front
+				 * of write_buf for the next call. */
+				for (len = 0; len < c->write_len; len++)
+					c->write_buf[len] = pwrite[len];
+
+				return HUBBUB_NOMEM;
+			}
+
+			memcpy(*dest, buf, len);
+
+			*dest += len;
+			*destlen -= len;
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call.
+	 *
+	 * Each input character is one 4-byte big-endian UCS4 unit. Only
+	 * whole units are consumed: a trailing fragment of 1-3 bytes is
+	 * left in place rather than read past the end of the input (which
+	 * would also wrap *sourcelen around when 4 is subtracted). */
+	while (*sourcelen >= 4) {
+		ucs4 = ntohl(*((uint32_t *) (void *) *source));
+		towrite = &ucs4;
+		towritelen = 1;
+
+		/* Run character we're about to output through the
+		 * registered filter, so it can replace it. */
+		if (c->base.filter != NULL) {
+			error = c->base.filter(ucs4,
+					&towrite, &towritelen,
+					c->base.filter_pw);
+			if (error != HUBBUB_OK)
+				return error;
+		}
+
+		/* Output current characters */
+		while (towritelen > 0) {
+			uint8_t buf[6];
+			size_t len;
+
+			error = hubbub_utf8_from_ucs4(towrite[0], buf, &len);
+			if (error != HUBBUB_OK)
+				abort();
+
+			if (*destlen < len) {
+				/* Insufficient output space.
+				 * write_buf holds WRITE_BUFSIZE entries, so
+				 * only a longer remainder cannot be saved. */
+				if (towritelen > WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen;
+
+				/* Copy pending chars to save area, for
+				 * processing next call. */
+				for (len = 0; len < towritelen; len++)
+					c->write_buf[len] = towrite[len];
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return HUBBUB_NOMEM;
+			}
+
+			memcpy(*dest, buf, len);
+
+			*dest += len;
+			*destlen -= len;
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Decode a chunk of utf8 data into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_NOMEM if output buffer is too small,
+ *         HUBBUB_INVALID if a character cannot be represented and the
+ *         codec's error handling mode is set to STRICT,
+ *         <any_other_error> as a result of the failure of the
+ *         client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ * This buffered data is post-filtering, so will not be refiltered on the
+ * next call.
+ *
+ * In the case of the result being _INVALID or the filter function failing,
+ * ::source will point _at_ the last input character read; nothing will be
+ * written or buffered for the failed character. It is up to the client to
+ * fix the cause of the failure and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately at or before the location pointed
+ * to by ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec;
+	hubbub_error error;
+
+	/* Stage 1: flush output buffered by a previous call */
+	if (c->read_len > 0) {
+		/* Output left over from last decode */
+		uint32_t *pread = c->read_buf;
+
+		/* The loop condition demands room for the whole backlog,
+		 * so it either drains everything or writes nothing */
+		while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+			*((uint32_t *) (void *) *dest) = htonl(pread[0]);
+
+			*dest += 4;
+			*destlen -= 4;
+
+			pread++;
+			c->read_len--;
+		}
+
+		if (*destlen < c->read_len * 4) {
+			/* Ran out of output buffer */
+			size_t i;
+
+			/* Shuffle remaining output down */
+			for (i = 0; i < c->read_len; i++)
+				c->read_buf[i] = pread[i];
+
+			return HUBBUB_NOMEM;
+		}
+	}
+
+	/* Stage 2: complete a sequence left unfinished by the last call */
+	if (c->inval_len > 0) {
+		/* The last decode ended in an incomplete sequence.
+		 * Fill up inval_buf with data from the start of the
+		 * new chunk and process it. */
+		uint8_t *in = c->inval_buf;
+		size_t ol = c->inval_len;	/* bytes carried over */
+		/* One byte of inval_buf is kept free for the terminator
+		 * that read_char writes when it stashes data */
+		size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+		size_t orig_l = l;		/* bytes borrowed from source */
+
+		memcpy(c->inval_buf + ol, *source, l);
+
+		l += c->inval_len;
+
+		/* On return, l is the number of bytes of the combined
+		 * buffer that were not consumed */
+		error = hubbub_utf8_codec_read_char(c,
+				(const uint8_t **) &in, &l, dest, destlen);
+		if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
+			return error;
+		}
+
+		/* And now, fix up source pointers.
+		 * orig_l - l is the part of the consumed data that came
+		 * from the new chunk; it is clamped at zero. */
+		*source += max((signed) (orig_l - l), 0);
+		*sourcelen -= max((signed) (orig_l - l), 0);
+
+		/* Failed to resolve an incomplete character and
+		 * ran out of buffer space. No recovery strategy
+		 * possible, so explode everywhere. */
+		if ((orig_l + ol) - l == 0)
+			abort();
+
+		/* Report memory exhaustion case from above */
+		if (error != HUBBUB_OK)
+			return error;
+	}
+
+	/* Finally, the "normal" case; process all outstanding characters */
+	while (*sourcelen > 0) {
+		error = hubbub_utf8_codec_read_char(c,
+				source, sourcelen, dest, destlen);
+		if (error != HUBBUB_OK) {
+			return error;
+		}
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Clear a utf8 codec's encoding state
+ *
+ * Empties the incomplete-input, pending-decode-output and
+ * pending-encode-output buffers.
+ *
+ * \param codec  The codec to reset
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec)
+{
+	hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec;
+
+	c->inval_buf[0] = '\0';
+	c->inval_len = 0;
+
+	c->read_buf[0] = 0;
+	c->read_len = 0;
+
+	c->write_buf[0] = 0;
+	c->write_len = 0;
+
+	return HUBBUB_OK;
+}
+
+
+/**
+ * Read a character from the UTF-8 to UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_NOMEM if output buffer is too small,
+ *         HUBBUB_INVALID if a character cannot be represented and the
+
* codec's error handling mode is set to STRICT, + * <any_other_error> as a result of the failure of the + * client-provided filter function. + * + * On exit, ::source will point immediately _after_ the last input character + * read, if the result is _OK or _NOMEM. Any remaining output for the + * character will be buffered by the codec for writing on the next call. + * This buffered data is post-filtering, so will not be refiltered on the + * next call. + * + * In the case of the result being _INVALID or the filter function failing, + * ::source will point _at_ the last input character read; nothing will be + * written or buffered for the failed character. It is up to the client to + * fix the cause of the failure and retry the decoding process. + * + * ::sourcelen will be reduced appropriately on exit. + * + * ::dest will point immediately _after_ the last character written. + * + * ::destlen will be reduced appropriately on exit. + */ +hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c, + const uint8_t **source, size_t *sourcelen, + uint8_t **dest, size_t *destlen) +{ + uint32_t ucs4; + size_t sucs4; + hubbub_error error; + + /* Convert a single character */ + error = hubbub_utf8_to_ucs4(*source, *sourcelen, &ucs4, &sucs4); + if (error == HUBBUB_OK) { + /* Read a character */ + error = hubbub_utf8_codec_filter_decoded_char(c, + ucs4, dest, destlen); + if (error == HUBBUB_OK || error == HUBBUB_NOMEM) { + /* filter function succeeded; update source pointers */ + *source += sucs4; + *sourcelen -= sucs4; + } + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + return error; + } else if (error == HUBBUB_NEEDDATA) { + /* Incomplete input sequence */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (char *) *source, *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + return HUBBUB_OK; + } else if (error == HUBBUB_INVALID) { + /* Illegal 
input sequence */ + uint32_t nextchar; + + /* Clear inval buffer */ + c->inval_buf[0] = '\0'; + c->inval_len = 0; + + /* Strict errormode; simply flag invalid character */ + if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) { + return HUBBUB_INVALID; + } + + /* Find next valid UTF-8 sequence. + * We're processing client-provided data, so let's + * be paranoid about its validity. */ + error = hubbub_utf8_next_paranoid(*source, *sourcelen, + 0, &nextchar); + if (error != HUBBUB_OK) { + if (error == HUBBUB_NEEDDATA) { + /* Need more data to be sure */ + if (*sourcelen > INVAL_BUFSIZE) + abort(); + + memmove(c->inval_buf, (char *) *source, + *sourcelen); + c->inval_buf[*sourcelen] = '\0'; + c->inval_len = *sourcelen; + + *source += *sourcelen; + *sourcelen = 0; + + nextchar = 0; + } else { + return error; + } + } + + /* output U+FFFD and continue processing. */ + error = hubbub_utf8_codec_filter_decoded_char(c, + 0xFFFD, dest, destlen); + if (error == HUBBUB_OK || error == HUBBUB_NOMEM) { + /* filter function succeeded; update source pointers */ + *source += nextchar; + *sourcelen -= nextchar; + } + + return error; + } + + return HUBBUB_OK; +} + +/** + * Feed a UCS4 character through the registered filter and output the result + * + * \param c Codec to use + * \param ucs4 UCS4 character (host endian) + * \param dest Pointer to pointer to output buffer + * \param destlen Pointer to output buffer length + * \return HUBBUB_OK on success, + * HUBBUB_NOMEM if output buffer is too small, + * <any_other_error> as a result of the failure of the + * client-provided filter function. 
+ */ +hubbub_error hubbub_utf8_codec_filter_decoded_char(hubbub_utf8_codec *c, + uint32_t ucs4, uint8_t **dest, size_t *destlen) +{ + if (c->base.filter != NULL) { + uint32_t *rep; + size_t replen; + hubbub_error error; + + error = c->base.filter(ucs4, &rep, &replen, + c->base.filter_pw); + if (error != HUBBUB_OK) { + return error; + } + + while (replen > 0 && *destlen >= replen * 4) { + *((uint32_t *) (void *) *dest) = htonl(*rep); + + *dest += 4; + *destlen -= 4; + + rep++; + replen--; + } + + if (*destlen < replen * 4) { + /* Run out of output buffer */ + size_t i; + + /* Buffer remaining output */ + c->read_len = replen; + + for (i = 0; i < replen; i++) { + c->read_buf[i] = rep[i]; + } + + return HUBBUB_NOMEM; + } + + } else { + if (*destlen < 4) { + /* Run out of output buffer */ + c->read_len = 1; + c->read_buf[0] = ucs4; + + return HUBBUB_NOMEM; + } + + *((uint32_t *) (void *) *dest) = htonl(ucs4); + *dest += 4; + *destlen -= 4; + } + + return HUBBUB_OK; +} + + +const hubbub_charsethandler hubbub_utf8_codec_handler = { + hubbub_utf8_codec_handles_charset, + hubbub_utf8_codec_create +}; diff --git a/src/charset/detect.c b/src/charset/detect.c new file mode 100644 index 0000000..8ff3b87 --- /dev/null +++ b/src/charset/detect.c @@ -0,0 +1,673 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <stdbool.h> +#include <string.h> + +#include "charset/aliases.h" +#include "utils/utils.h" + +#include "detect.h" + +static uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len); +static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len); +static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, + const uint8_t *end); +static uint16_t hubbub_charset_parse_content(const uint8_t *value, + uint32_t valuelen); +static bool hubbub_charset_get_attribute(const uint8_t **data, + const uint8_t *end, + const uint8_t **name, uint32_t *namelen, + const uint8_t **value, uint32_t *valuelen); + +/** + * Extract a charset from a chunk of data + * + * \param data Pointer to pointer to buffer containing data + * \param len Pointer to buffer length + * \param mibenum Pointer to location to store MIB enum representing charset + * \param source Pointer to location to receive charset source + * \return HUBBUB_OK on success, appropriate error otherwise + * + * The data pointer and length will be modified by this function if + * a byte order mark is encountered at the start of the buffer. The updated + * data pointer will point to the first byte in the buffer after the BOM. + * The length will be modified appropriately. + * + * The larger a chunk of data fed to this routine, the better, as it allows + * charset autodetection access to a larger dataset for analysis. 
+ */
+hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len,
+		uint16_t *mibenum, hubbub_charset_source *source)
+{
+	uint16_t charset = 0;
+
+	if (data == NULL || *data == NULL || len == NULL ||
+			mibenum == NULL || source == NULL)
+		return HUBBUB_BADPARM;
+
+	/* We need at least 4 bytes of data */
+	/* With less, skip detection and report the default charset */
+	if (*len < 4)
+		goto default_encoding;
+
+	/* First, look for a BOM */
+	/* (on a match this also steps *data / *len past the BOM) */
+	charset = hubbub_charset_read_bom(data, len);
+	if (charset != 0) {
+		*mibenum = charset;
+		*source = HUBBUB_CHARSET_DOCUMENT;
+
+		return HUBBUB_OK;
+	}
+
+	/* No BOM was found, so we must look for a meta charset within
+	 * the document itself. */
+	charset = hubbub_charset_scan_meta(*data, *len);
+	if (charset != 0) {
+		/* ISO-8859-1 becomes Windows-1252 */
+		if (charset == hubbub_mibenum_from_name("ISO-8859-1",
+				SLEN("ISO-8859-1"))) {
+			charset = hubbub_mibenum_from_name("Windows-1252",
+					SLEN("Windows-1252"));
+			/* Fallback to 8859-1 if that failed */
+			if (charset == 0)
+				charset = hubbub_mibenum_from_name(
+					"ISO-8859-1", SLEN("ISO-8859-1"));
+		}
+
+		/* If we've encountered a meta charset for a non-ASCII-
+		 * compatible encoding, don't trust it.
+		 *
+		 * Firstly, it should have been sent with a BOM (and thus
+		 * detected above).
+		 *
+		 * Secondly, we've just used an ASCII-only parser to
+		 * extract the encoding from the document. Therefore,
+		 * the document plainly isn't what the meta charset
+		 * claims it is.
+		 *
+		 * What we do in this case is to ignore the meta charset's
+		 * claims and leave the charset determination to the
+		 * autodetection routines (or the fallback case if they
+		 * fail).
+		 */
+		if (charset != hubbub_mibenum_from_name("UTF-16",
+				SLEN("UTF-16")) &&
+			charset != hubbub_mibenum_from_name("UTF-16LE",
+				SLEN("UTF-16LE")) &&
+			charset != hubbub_mibenum_from_name("UTF-16BE",
+				SLEN("UTF-16BE")) &&
+			charset != hubbub_mibenum_from_name("UTF-32",
+				SLEN("UTF-32")) &&
+			charset != hubbub_mibenum_from_name("UTF-32LE",
+				SLEN("UTF-32LE")) &&
+			charset != hubbub_mibenum_from_name("UTF-32BE",
+				SLEN("UTF-32BE"))) {
+
+			/* Anything other than UTF-16/UTF-32 is accepted */
+			*mibenum = charset;
+			*source = HUBBUB_CHARSET_DOCUMENT;
+
+			return HUBBUB_OK;
+		}
+	}
+
+	/* No charset was specified within the document, attempt to
+	 * autodetect the encoding from the data that we have available. */
+
+	/** \todo Charset autodetection */
+
+	/* We failed to autodetect a charset, so use the default fallback */
+default_encoding:
+
+	/* Windows-1252 is preferred; ISO-8859-1 is used if that name
+	 * cannot be resolved to a MIB enum */
+	charset = hubbub_mibenum_from_name("Windows-1252",
+			SLEN("Windows-1252"));
+	if (charset == 0)
+		charset = hubbub_mibenum_from_name("ISO-8859-1",
+				SLEN("ISO-8859-1"));
+
+	*mibenum = charset;
+	*source = HUBBUB_CHARSET_DEFAULT;
+
+	return HUBBUB_OK;
+}
+
+
+/**
+ * Inspect the beginning of a buffer of data for the presence of a
+ * UTF Byte Order Mark.
+ *
+ * \param data  Pointer to pointer to buffer containing data
+ * \param len   Pointer to buffer length
+ * \return MIB enum representing encoding described by BOM, or 0 if not found
+ *
+ * If a BOM is found, the data pointer will be modified to point to the first
+ * byte in the buffer after the BOM. The length will also be modified
+ * appropriately.
+ */ +uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len) +{ + if (data == NULL || *data == NULL || len == NULL) + return 0; + + /* We require at least 4 bytes of data */ + if (*len < 4) + return 0; + +#define UTF32BOM_LEN (4) +#define UTF16BOM_LEN (2) +#define UTF8BOM_LEN (3) + + if ((*data)[0] == 0x00 && (*data)[1] == 0x00 && + (*data)[2] == 0xFE && (*data)[3] == 0xFF) { + *data += UTF32BOM_LEN; + *len -= UTF32BOM_LEN; + + return hubbub_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE")); + } else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE && + (*data)[2] == 0x00 && (*data)[3] == 0x00) { + *data += UTF32BOM_LEN; + *len -= UTF32BOM_LEN; + + return hubbub_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")); + } else if ((*data)[0] == 0xFE && (*data)[1] == 0xFF) { + *data += UTF16BOM_LEN; + *len -= UTF16BOM_LEN; + + return hubbub_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")); + } else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE) { + *data += UTF16BOM_LEN; + *len -= UTF16BOM_LEN; + + return hubbub_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")); + } else if ((*data)[0] == 0xEF && (*data)[1] == 0xBB && + (*data)[2] == 0xBF) { + *data += UTF8BOM_LEN; + *len -= UTF8BOM_LEN; + + return hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")); + } + +#undef UTF32BOM_LEN +#undef UTF16BOM_LEN +#undef UTF8BOM_LEN + + return 0; +} + +#define PEEK(a) \ + (pos < end - SLEN(a) && \ + strncasecmp((const char *) pos, a, SLEN(a)) == 0) + +#define ADVANCE(a) \ + while (pos < end - SLEN(a)) { \ + if (PEEK(a)) \ + break; \ + pos++; \ + } \ + \ + if (pos == end - SLEN(a)) \ + return 0; + +#define ISSPACE(a) \ + (a == 0x09 || a == 0x0a || a == 0x0b || \ + a == 0x0c || a == 0x0d || a == 0x20) + +/** + * Search for a meta charset within a buffer of data + * + * \param data Pointer to buffer containing data + * \param len Length of buffer in data + * \return MIB enum representing encoding, or 0 if none found + */ +uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t 
len) +{ + const uint8_t *pos = data; + const uint8_t *end; + uint16_t mibenum; + + if (data == NULL) + return 0; + + end = pos + min(512, len); + + /* 1. */ + while (pos < end) { + /* a */ + if (PEEK("<!--")) { + pos += SLEN("<!--"); + ADVANCE("-->"); + /* b */ + } else if (PEEK("<meta")) { + if (pos + SLEN("<meta") >= end - 1) + return 0; + + if (ISSPACE(*(pos + SLEN("<meta")))) { + /* 1 */ + pos += SLEN("<meta"); + + mibenum = hubbub_charset_parse_attributes( + &pos, end); + if (mibenum != 0) + return mibenum; + + if (pos >= end) + return 0; + } + /* c */ + } else if ((PEEK("</") && (pos < end - 3 && + (0x41 <= (*(pos + 2) & ~ 0x20) && + (*(pos + 2) & ~ 0x20) <= 0x5A))) || + (pos < end - 2 && *pos == '<' && + (0x41 <= (*(pos + 1) & ~ 0x20) && + (*(pos + 1) & ~ 0x20) <= 0x5A))) { + + /* skip '<' */ + pos++; + + /* 1. */ + while (pos < end) { + if (ISSPACE(*pos) || + *pos == '>' || *pos == '<') + break; + pos++; + } + + if (pos >= end) + return 0; + + /* 3 */ + if (*pos != '<') { + const uint8_t *n; + const uint8_t *v; + uint32_t nl, vl; + + while (hubbub_charset_get_attribute(&pos, end, + &n, &nl, &v, &vl)) + ; /* do nothing */ + /* 2 */ + } else + continue; + /* d */ + } else if (PEEK("<!") || PEEK("</") || PEEK("<?")) { + pos++; + ADVANCE(">"); + } + + /* e - do nothing */ + + /* 2 */ + pos++; + } + + return 0; +} + +/** + * Parse attributes on a meta tag + * + * \param pos Pointer to pointer to current location (updated on exit) + * \param end Pointer to end of data stream + * \return MIB enum of detected encoding, or 0 if none found + */ +uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, + const uint8_t *end) +{ + const uint8_t *name; + const uint8_t *value; + uint32_t namelen, valuelen; + uint16_t mibenum; + + if (pos == NULL || *pos == NULL || end == NULL) + return 0; + + /* 2 */ + while (hubbub_charset_get_attribute(pos, end, + &name, &namelen, &value, &valuelen)) { + /* 3 */ + /* a */ + if (namelen == SLEN("charset") && valuelen > 0 && + 
strncasecmp((const char *) name, "charset", + SLEN("charset")) == 0) { + /* strip value */ + while (ISSPACE(*value)) { + value++; + valuelen--; + } + + while (valuelen > 0 && ISSPACE(value[valuelen - 1])) + valuelen--; + + mibenum = hubbub_mibenum_from_name( + (const char *) value, valuelen); + if (mibenum != 0) + return mibenum; + /* b */ + } else if (namelen == SLEN("content") && valuelen > 0 && + strncasecmp((const char *) name, "content", + SLEN("content")) == 0) { + mibenum = hubbub_charset_parse_content(value, + valuelen); + if (mibenum != 0) + return mibenum; + } + + /* c - do nothing */ + + /* 1 */ + while (*pos < end) { + if (ISSPACE(**pos)) + break; + (*pos)++; + } + + if (*pos >= end) { + return 0; + } + } + + return 0; +} + +/** + * Parse a content= attribute's value + * + * \param value Attribute's value + * \param valuelen Length of value + * \return MIB enum of detected encoding, or 0 if none found + */ +uint16_t hubbub_charset_parse_content(const uint8_t *value, + uint32_t valuelen) +{ + const uint8_t *end; + const uint8_t *tentative = NULL; + uint32_t tentative_len = 0; + + if (value == NULL) + return 0; + + end = value + valuelen; + + /* 1 */ + while (value < end) { + if (*value == ';') { + value++; + break; + } + + value++; + } + + if (value >= end) + return 0; + + /* 2 */ + while (value < end && ISSPACE(*value)) { + value++; + } + + if (value >= end) + return 0; + + /* 3 */ + if (value < end - SLEN("charset") && + strncasecmp((const char *) value, + "charset", SLEN("charset")) != 0) + return 0; + + value += SLEN("charset"); + + /* 4 */ + while (value < end && ISSPACE(*value)) { + value++; + } + + if (value >= end) + return 0; + + /* 5 */ + if (*value != '=') + return 0; + /* skip '=' */ + value++; + + /* 6 */ + while (value < end && ISSPACE(*value)) { + value++; + } + + if (value >= end) + return 0; + + /* 7 */ + tentative = value; + + /* a */ + if (*value == '"') { + while (++value < end && *value != '"') { + tentative_len++; + } + + if (value 
< end) + tentative++; + else + tentative = NULL; + /* b */ + } else if (*value == '\'') { + while (++value < end && *value != '\'') { + tentative_len++; + } + + if (value < end) + tentative++; + else + tentative = NULL; + /* c */ + } else { + while (value < end && !ISSPACE(*value)) { + value++; + tentative_len++; + } + } + + /* 8 */ + if (tentative != NULL) { + return hubbub_mibenum_from_name((const char *) tentative, + tentative_len); + } + + /* 9 */ + return 0; +} + +/** + * Extract an attribute from the data stream + * + * \param data Pointer to pointer to current location (updated on exit) + * \param end Pointer to end of data stream + * \param name Pointer to location to receive attribute name + * \param namelen Pointer to location to receive attribute name length + * \param value Pointer to location to receive attribute value + * \param valuelen Pointer to location to receive attribute value length + * \return true if attribute extracted, false otherwise. + * + * Note: The caller should heed the returned lengths; these are the only + * indicator that useful content resides in name or value. + */ +bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, + const uint8_t **name, uint32_t *namelen, + const uint8_t **value, uint32_t *valuelen) +{ + const uint8_t *pos; + + if (data == NULL || *data == NULL || end == NULL || name == NULL || + namelen == NULL || value == NULL || valuelen == NULL) + return false; + + pos = *data; + + /* 1. Skip leading spaces or '/' characters */ + while (pos < end && (ISSPACE(*pos) || *pos == '/')) { + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* 2. Invalid element open character */ + if (*pos == '<') { + pos--; + *data = pos; + return false; + } + + /* 3. End of element */ + if (*pos == '>') { + *data = pos; + return false; + } + + /* 4. Initialise name & value to empty string */ + *name = pos; + *namelen = 0; + *value = (const uint8_t *) ""; + *valuelen = 0; + + /* 5. 
Extract name */ + while (pos < end) { + /* a */ + if (*pos == '=') { + break; + } + + /* b */ + if (ISSPACE(*pos)) { + break; + } + + /* c */ + if (*pos == '/' || *pos == '<' || *pos == '>') { + return true; + } + + /* d is handled by strncasecmp in _parse_attributes */ + + /* e */ + (*namelen)++; + + /* 6 */ + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + if (ISSPACE(*pos)) { + /* 7. Skip trailing spaces */ + while (pos < end && ISSPACE(*pos)) { + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* 8. Must be '=' */ + if (*pos != '=') { + pos--; + *data = pos; + return true; + } + } + + /* 9. Skip '=' */ + pos++; + + /* 10. Skip any spaces after '=' */ + while (pos < end && ISSPACE(*pos)) { + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* 11. Extract value, if quoted */ + /* a */ + if (*pos == '\'' || *pos == '"') { + /* 1 */ + const uint8_t *quote = pos; + + /* 2 */ + while (++pos < end) { + /* 3 */ + if (*pos == *quote) { + *value = (quote + 1); + *data = ++pos; + return true; + } + + /* 4 is handled by strncasecmp */ + + /* 5 */ + (*valuelen)++; + + /* 6 */ + } + + if (pos >= end) { + *data = pos; + return false; + } + } + + /* b */ + if (*pos == '<' || *pos == '>') { + *data = pos; + return true; + } + + /* c is handled by strncasecmp */ + + /* d */ + *value = pos; + + while (pos < end) { + /* 12. Extract unquoted value */ + /* a */ + if (ISSPACE(*pos) || *pos == '<' || *pos == '>') { + *data = pos; + return true; + } + + /* b is handled by strncasecmp */ + + /* c */ + (*valuelen)++; + + /* 13. Advance */ + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* should never be reached */ + abort(); + + return false; +} diff --git a/src/charset/detect.h b/src/charset/detect.h new file mode 100644 index 0000000..854a8d6 --- /dev/null +++ b/src/charset/detect.h @@ -0,0 +1,22 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_charset_detect_h_ +#define hubbub_charset_detect_h_ + +#include <inttypes.h> + +#include <hubbub/errors.h> +#include <hubbub/functypes.h> +#include <hubbub/types.h> + +/* Extract a charset from a chunk of data */ +hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len, + uint16_t *mibenum, hubbub_charset_source *source); + +#endif + diff --git a/src/hubbub.c b/src/hubbub.c new file mode 100644 index 0000000..32e0a1f --- /dev/null +++ b/src/hubbub.c @@ -0,0 +1,63 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <hubbub/hubbub.h> + +#include "charset/aliases.h" +#include "tokeniser/entities.h" + +/** + * Initialise the Hubbub library for use. + * + * This _must_ be called before using any hubbub functions + * + * \param aliases_file Pointer to name of file containing encoding alias data + * \param alloc Pointer to (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, applicable error otherwise. + */ +hubbub_error hubbub_initialise(const char *aliases_file, + hubbub_alloc alloc, void *pw) +{ + hubbub_error error; + + if (aliases_file == NULL || alloc == NULL) + return HUBBUB_BADPARM; + + error = hubbub_aliases_create(aliases_file, alloc, pw); + if (error != HUBBUB_OK) + return error; + + error = hubbub_entities_create(alloc, pw); + if (error != HUBBUB_OK) { + hubbub_aliases_destroy(alloc, pw); + return error; + } + + return HUBBUB_OK; +} + +/** + * Clean up after Hubbub + * + * \param alloc Pointer to (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, applicable error otherwise. 
+ */ +hubbub_error hubbub_finalise(hubbub_alloc alloc, void *pw) +{ + if (alloc == NULL) + return HUBBUB_BADPARM; + + hubbub_entities_destroy(alloc, pw); + + hubbub_aliases_destroy(alloc, pw); + + return HUBBUB_OK; +} + + diff --git a/src/input/Makefile b/src/input/Makefile new file mode 100644 index 0000000..8b06c63 --- /dev/null +++ b/src/input/Makefile @@ -0,0 +1,53 @@ +# Makefile for libhubbub +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test cases + +# Manipulate include paths +CFLAGS += -I$(CURDIR) + +# Objects +OBJS = filter inputstream utf8_stream + +.PHONY: clean debug distclean export release setup test + +# Targets +release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS))) + +debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS))) + +clean: + -@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS})) + +distclean: + +setup: + +export: + +test: + +# Pattern rules +../Release/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $< + +../Debug/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@ $< diff --git a/src/input/filter.c b/src/input/filter.c new file mode 100644 index 0000000..5ac5391 --- /dev/null +++ b/src/input/filter.c @@ -0,0 +1,380 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <errno.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> + +#include "charset/aliases.h" +#include "charset/codec.h" +#include "utils/utils.h" + +#include "input/filter.h" + + +/** Input filter */ +struct hubbub_filter { + hubbub_charsetcodec *read_codec; /**< Read codec */ + hubbub_charsetcodec *write_codec; /**< Write codec */ + + uint32_t filter_output[2]; /**< Filter output buffer */ + uint32_t last_filter_char; /**< Last filtered character */ + + uint32_t pivot_buf[64]; /**< Conversion pivot buffer */ + + bool leftover; /**< Data remains from last call */ + uint8_t *pivot_left; /**< Remaining pivot to write */ + size_t pivot_len; /**< Length of pivot remaining */ + + struct { + uint16_t encoding; /**< Input encoding */ + } settings; /**< Filter settings */ + + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Client private data */ +}; + +static hubbub_error hubbub_filter_set_defaults(hubbub_filter *input); +static hubbub_error hubbub_filter_set_encoding(hubbub_filter *input, + const char *enc); +static hubbub_error read_character_filter(uint32_t c, + uint32_t **output, size_t *outputlen, void *pw); + +/** + * Create an input filter + * + * \param int_enc Desired encoding of document + * \param alloc Function used to (de)allocate data + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to filter instance, or NULL on failure + */ +hubbub_filter *hubbub_filter_create(const char *int_enc, + hubbub_alloc alloc, void *pw) +{ + hubbub_filter *filter; + + if (alloc == NULL) + return NULL; + + filter = alloc(NULL, sizeof(*filter), pw); + if (!filter) + return NULL; + + filter->last_filter_char = 0; + + filter->leftover = false; + filter->pivot_left = NULL; + filter->pivot_len = 0; + + filter->alloc = alloc; + filter->pw = pw; 
+ + if (hubbub_filter_set_defaults(filter) != HUBBUB_OK) { + filter->alloc(filter, 0, pw); + return NULL; + } + + filter->write_codec = hubbub_charsetcodec_create(int_enc, alloc, pw); + if (filter->write_codec == NULL) { + if (filter->read_codec != NULL) + hubbub_charsetcodec_destroy(filter->read_codec); + filter->alloc(filter, 0, pw); + return NULL; + } + + return filter; +} + +/** + * Destroy an input filter + * + * \param input Pointer to filter instance + */ +void hubbub_filter_destroy(hubbub_filter *input) +{ + if (input == NULL) + return; + + if (input->read_codec != NULL) + hubbub_charsetcodec_destroy(input->read_codec); + + if (input->write_codec != NULL) + hubbub_charsetcodec_destroy(input->write_codec); + + input->alloc(input, 0, input->pw); + + return; +} + +/** + * Configure an input filter + * + * \param input Pointer to filter instance + * \param type Input option type to configure + * \param params Option-specific parameters + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_setopt(hubbub_filter *input, + hubbub_filter_opttype type, + hubbub_filter_optparams *params) +{ + hubbub_error error = HUBBUB_OK; + + if (input == NULL || params == NULL) + return HUBBUB_BADPARM; + + switch (type) { + case HUBBUB_FILTER_SET_ENCODING: + error = hubbub_filter_set_encoding(input, + params->encoding.name); + break; + } + + return error; +} + +/** + * Process a chunk of data + * + * \param input Pointer to filter instance + * \param data Pointer to pointer to input buffer + * \param len Pointer to length of input buffer + * \param output Pointer to pointer to output buffer + * \param outlen Pointer to length of output buffer + * \return HUBBUB_OK on success, appropriate error otherwise + * + * Call this with an input buffer length of 0 to flush any buffers. 
+ */ +hubbub_error hubbub_filter_process_chunk(hubbub_filter *input, + const uint8_t **data, size_t *len, + uint8_t **output, size_t *outlen) +{ + hubbub_error read_error, write_error; + + if (input == NULL || data == NULL || *data == NULL || len == NULL || + output == NULL || *output == NULL || outlen == NULL) + return HUBBUB_BADPARM; + + if (input->leftover) { + /* Some data left to be written from last call */ + + /* Attempt to flush the remaining data. */ + write_error = hubbub_charsetcodec_encode(input->write_codec, + (const uint8_t **) &input->pivot_left, + &input->pivot_len, + output, outlen); + + if (write_error != HUBBUB_OK) { + return write_error; + } + + /* And clear leftover */ + input->pivot_left = NULL; + input->pivot_len = 0; + input->leftover = false; + } + + while (*len > 0) { + size_t pivot_len = sizeof(input->pivot_buf); + uint8_t *pivot = (uint8_t *) input->pivot_buf; + + read_error = hubbub_charsetcodec_decode(input->read_codec, + data, len, + (uint8_t **) &pivot, &pivot_len); + + pivot = (uint8_t *) input->pivot_buf; + pivot_len = sizeof(input->pivot_buf) - pivot_len; + + if (pivot_len > 0) { + write_error = hubbub_charsetcodec_encode( + input->write_codec, + (const uint8_t **) &pivot, + &pivot_len, + output, outlen); + + if (write_error != HUBBUB_OK) { + input->leftover = true; + input->pivot_left = pivot; + input->pivot_len = pivot_len; + + return write_error; + } + } + + if (read_error != HUBBUB_OK && read_error != HUBBUB_NOMEM) + return read_error; + } + + return HUBBUB_OK; +} + +/** + * Reset an input filter's state + * + * \param input The input filter to reset + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_reset(hubbub_filter *input) +{ + hubbub_error error; + + if (input == NULL) + return HUBBUB_BADPARM; + + /* Clear pivot buffer leftovers */ + input->pivot_left = NULL; + input->pivot_len = 0; + input->leftover = false; + + /* Reset read codec */ + error = 
hubbub_charsetcodec_reset(input->read_codec); + if (error != HUBBUB_OK) + return error; + + /* Reset write codec */ + error = hubbub_charsetcodec_reset(input->write_codec); + if (error != HUBBUB_OK) + return error; + + return HUBBUB_OK; +} + +/** + * Set an input filter's default settings + * + * \param input Input filter to configure + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_set_defaults(hubbub_filter *input) +{ + hubbub_error error; + + if (input == NULL) + return HUBBUB_BADPARM; + + input->read_codec = NULL; + input->write_codec = NULL; + input->settings.encoding = 0; + error = hubbub_filter_set_encoding(input, "ISO-8859-1"); + if (error != HUBBUB_OK) + return error; + + return HUBBUB_OK; +} + +/** + * Set an input filter's encoding + * + * \param input Input filter to configure + * \param enc Encoding name + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_filter_set_encoding(hubbub_filter *input, + const char *enc) +{ + const char *old_enc; + uint16_t mibenum; + hubbub_error error; + hubbub_charsetcodec_optparams params; + + if (input == NULL || enc == NULL) + return HUBBUB_BADPARM; + + mibenum = hubbub_mibenum_from_name(enc, strlen(enc)); + if (mibenum == 0) + return HUBBUB_INVALID; + + /* Exit early if we're already using this encoding */ + if (input->settings.encoding == mibenum) + return HUBBUB_OK; + + old_enc = hubbub_mibenum_to_name(input->settings.encoding); + if (old_enc == NULL) + old_enc = "ISO-8859-1"; + + if (input->read_codec != NULL) + hubbub_charsetcodec_destroy(input->read_codec); + + input->read_codec = hubbub_charsetcodec_create(enc, input->alloc, + input->pw); + if (input->read_codec == NULL) + return HUBBUB_NOMEM; + + /* Register filter function */ + params.filter_func.filter = read_character_filter; + params.filter_func.pw = (void *) input; + error = hubbub_charsetcodec_setopt(input->read_codec, + HUBBUB_CHARSETCODEC_FILTER_FUNC, + 
(hubbub_charsetcodec_optparams *) ¶ms); + if (error != HUBBUB_OK) + return error; + + input->settings.encoding = mibenum; + + return HUBBUB_OK; +} + +/** + * Character filter function for read characters + * + * \param c The read character (UCS4 - host byte order) + * \param output Pointer to pointer to output buffer (filled on exit) + * \param outputlen Pointer to output buffer length (filled on exit) + * \param pw Pointer to client-specific private data. + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error read_character_filter(uint32_t c, uint32_t **output, + size_t *outputlen, void *pw) +{ + hubbub_filter *input = (hubbub_filter *) pw; + size_t len; + + if (output == NULL || outputlen == NULL || pw == NULL) + return HUBBUB_BADPARM; + + /* Line ending normalisation: + * CRLF -> LF (trap CR and let LF through unmodified) + * CR -> LF (trap CR and convert to LF if not CRLF) + * LF -> LF (leave LF alone) + */ + +#define NUL (0x00000000) +#define CR (0x0000000D) +#define LF (0x0000000A) +#define REP (0x0000FFFD) + + if (c == NUL) { + /* Replace NUL (U+0000) characters in input with U+FFFD */ + input->filter_output[0] = REP; + len = 1; + } else if (c == CR) { + /* Trap CR characters */ + len = 0; + } else if (input->last_filter_char == CR && c != LF) { + /* Last char was CR and this isn't LF => CR -> LF */ + input->filter_output[0] = LF; + input->filter_output[1] = c; + len = 2; + } else { + /* Let character through unchanged */ + input->filter_output[0] = c; + len = 1; + } + +#undef NUL +#undef CR +#undef LF +#undef REP + + input->last_filter_char = c; + + *output = input->filter_output; + *outputlen = len; + + return HUBBUB_OK; +} diff --git a/src/input/filter.h b/src/input/filter.h new file mode 100644 index 0000000..6650e09 --- /dev/null +++ b/src/input/filter.h @@ -0,0 +1,57 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_input_filter_h_ +#define hubbub_input_filter_h_ + +#include <inttypes.h> + +#include <hubbub/errors.h> +#include <hubbub/functypes.h> + +typedef struct hubbub_filter hubbub_filter; + +/** + * Input filter option types + */ +typedef enum hubbub_filter_opttype { + HUBBUB_FILTER_SET_ENCODING = 0, +} hubbub_filter_opttype; + +/** + * Input filter option parameters + */ +typedef union hubbub_filter_optparams { + /** Parameters for encoding setting */ + struct { + /** Encoding name */ + const char *name; + } encoding; +} hubbub_filter_optparams; + + +/* Create an input filter */ +hubbub_filter *hubbub_filter_create(const char *int_enc, + hubbub_alloc alloc, void *pw); +/* Destroy an input filter */ +void hubbub_filter_destroy(hubbub_filter *input); + +/* Configure an input filter */ +hubbub_error hubbub_filter_setopt(hubbub_filter *input, + hubbub_filter_opttype type, + hubbub_filter_optparams *params); + +/* Process a chunk of data */ +hubbub_error hubbub_filter_process_chunk(hubbub_filter *input, + const uint8_t **data, size_t *len, + uint8_t **output, size_t *outlen); + +/* Reset an input filter's state */ +hubbub_error hubbub_filter_reset(hubbub_filter *input); + +#endif + diff --git a/src/input/inputstream.c b/src/input/inputstream.c new file mode 100644 index 0000000..f82d279 --- /dev/null +++ b/src/input/inputstream.c @@ -0,0 +1,479 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <stdlib.h> + +#include "charset/aliases.h" +#include "input/streamimpl.h" + +/** + * Buffer moving claimant context + */ +struct hubbub_inputstream_bm_handler { + hubbub_inputstream_buffermoved handler; /**< Handler function */ + void *pw; /**< Client private data */ + + struct hubbub_inputstream_bm_handler *next; + struct hubbub_inputstream_bm_handler *prev; +}; + +extern hubbub_streamhandler utf8stream; + +static hubbub_streamhandler *handler_table[] = { + &utf8stream, + NULL +}; + +/** + * Create an input stream + * + * \param enc Document charset, or NULL to autodetect + * \param int_enc Desired encoding of document + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to stream instance, or NULL on failure + */ +hubbub_inputstream *hubbub_inputstream_create(const char *enc, + const char *int_enc, hubbub_alloc alloc, void *pw) +{ + hubbub_inputstream *stream; + hubbub_streamhandler **handler; + + if (int_enc == NULL || alloc == NULL) + return NULL; + + /* Search for handler class */ + for (handler = handler_table; *handler != NULL; handler++) { + if ((*handler)->uses_encoding(int_enc)) + break; + } + + /* None found */ + if ((*handler) == NULL) + return NULL; + + stream = (*handler)->create(enc, int_enc, alloc, pw); + if (stream == NULL) + return NULL; + + stream->handlers = NULL; + + stream->alloc = alloc; + stream->pw = pw; + + return stream; +} + +/** + * Destroy an input stream + * + * \param stream Input stream to destroy + */ +void hubbub_inputstream_destroy(hubbub_inputstream *stream) +{ + hubbub_inputstream_bm_handler *h, *i; + + if (stream == NULL) + return; + + for (h = stream->handlers; h; h = i) { + i = h->next; + + stream->alloc(h, 0, stream->pw); + } + + stream->destroy(stream); +} + +/** + * Append 
data to an input stream + * + * \param stream Input stream to append data to + * \param data Data to append (in document charset), or NULL to flag EOF + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream, + const uint8_t *data, size_t len) +{ + if (stream == NULL) + return HUBBUB_BADPARM; + + /* Calling this if we've disowned the buffer is foolish */ + if (stream->buffer == NULL) + return HUBBUB_INVALID; + + return stream->append(stream, data, len); +} + +/** + * Insert data into stream at current location + * + * \param stream Input stream to insert into + * \param data Data to insert (UTF-8 encoded) + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream, + const uint8_t *data, size_t len) +{ + if (stream == NULL || data == NULL) + return HUBBUB_BADPARM; + + /* Calling this if we've disowned the buffer is foolish */ + if (stream->buffer == NULL) + return HUBBUB_INVALID; + + return stream->insert(stream, data, len); +} + +/** + * Look at the next character in the stream + * + * \param stream Stream to look in + * \return UCS4 (host-endian) character code, or EOF or OOD. 
+ */ +uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_INPUTSTREAM_OOD; + + return stream->peek(stream);; +} + +/** + * Retrieve the byte index and length of the current character in the stream + * + * \param stream Stream to look in + * \param len Pointer to location to receive byte length of character + * \return Byte index of current character from start of stream, + * or (uint32_t) -1 on error + */ +uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream, + size_t *len) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || len == NULL || stream->buffer == NULL) + return (uint32_t) -1; + + return stream->cur_pos(stream, len); +} + +/** + * Convert the current character to lower case + * + * \param stream Stream to look in + */ +void hubbub_inputstream_lowercase(hubbub_inputstream *stream) +{ + if (stream == NULL || stream->buffer == NULL) + return; + + stream->lowercase(stream); +} + +/** + * Convert the current character to upper case + * + * \param stream Stream to look in + */ +void hubbub_inputstream_uppercase(hubbub_inputstream *stream) +{ + if (stream == NULL || stream->buffer == NULL) + return; + + stream->uppercase(stream); +} + +/** + * Advance the stream's current position + * + * \param stream The stream whose position to advance + */ +void hubbub_inputstream_advance(hubbub_inputstream *stream) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || stream->buffer == NULL) + return; + + if (stream->cursor == stream->buffer_len) + return; + + stream->advance(stream); +} + +/** + * Push a character back onto the stream + * + * \param stream Stream to push back to + * \param character UCS4 (host-endian) codepoint to push back + * \return HUBBUB_OK on success, appropriate error otherwise + * + * Note that this doesn't 
actually modify the data in the stream. + * It works by ensuring that the character located just before the + * current stream location is the same as ::character. If it is, + * then the stream pointer is moved back. If it is not, then an + * error is returned and the stream pointer remains unmodified. + */ +hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream, + uint32_t character) +{ + /* It is illegal to call this after the buffer has been disowned */ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_BADPARM; + + if (stream->cursor == 0) + return HUBBUB_INVALID; + + return stream->push_back(stream, character); +} + +/** + * Rewind the input stream by a number of bytes + * + * \param stream Stream to rewind + * \param n Number of bytes to go back + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n) +{ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_BADPARM; + + if (stream->cursor < n) + return HUBBUB_INVALID; + + stream->cursor -= n; + + return HUBBUB_OK; +} + +/** + * Claim ownership of an input stream's buffer + * + * \param stream Input stream whose buffer to claim + * \param buffer Pointer to location to receive buffer pointer + * \param len Pointer to location to receive byte length of buffer + * \return HUBBUB_OK on success, appropriate error otherwise. + * + * Once the buffer has been claimed by a client, the input stream disclaims + * all ownership rights (and invalidates any internal references it may have + * to the buffer). Therefore, the only input stream call which may be made + * after calling this function is to destroy the input stream. Therefore, + * unless the stream pointer is located at EOF, this call will return an + * error. 
+ */ +hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream, + uint8_t **buffer, size_t *len) +{ + if (stream == NULL || buffer == NULL || len == NULL) + return HUBBUB_BADPARM; + + if (stream->had_eof == false || + stream->cursor != stream->buffer_len) + return HUBBUB_INVALID; + + *buffer = stream->buffer; + *len = stream->buffer_len; + + stream->buffer = NULL; + + return HUBBUB_OK; +} + +/** + * Register interest in buffer moved events + * + * \param stream Input stream to register interest with + * \param handler Pointer to handler function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_register_movehandler( + hubbub_inputstream *stream, + hubbub_inputstream_buffermoved handler, void *pw) +{ + hubbub_inputstream_bm_handler *h; + + if (stream == NULL || handler == NULL) + return HUBBUB_BADPARM; + + h = stream->alloc(NULL, sizeof(hubbub_inputstream_bm_handler), + stream->pw); + if (h == NULL) + return HUBBUB_NOMEM; + + h->handler = handler; + h->pw = pw; + + h->prev = NULL; + h->next = stream->handlers; + + if (stream->handlers) + stream->handlers->prev = h; + stream->handlers = h; + + /* And notify claimant of current buffer location */ + handler(stream->buffer, stream->buffer_len, pw); + + return HUBBUB_OK; +} + +/** + * Deregister interest in buffer moved events + * + * \param stream Input stream to deregister from + * \param handler Pointer to handler function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_deregister_movehandler( + hubbub_inputstream *stream, + hubbub_inputstream_buffermoved handler, void *pw) +{ + hubbub_inputstream_bm_handler *h; + + if (stream == NULL || handler == NULL) + return HUBBUB_BADPARM; + + for (h = stream->handlers; h; h = h->next) { + if (h->handler == handler && h->pw == pw) 
+ break; + } + + if (h == NULL) + return HUBBUB_INVALID; + + if (h->next) + h->next->prev = h->prev; + if (h->prev) + h->prev->next = h->next; + else + stream->handlers = h->next; + + stream->alloc(h, 0, stream->pw); + + return HUBBUB_OK; +} + +/** + * Case insensitively compare a pair of ranges in the input stream + * + * \param stream Input stream to look in + * \param r1 Offset of start of first range + * \param r2 Offset of start of second range + * \param len Byte length of ranges + * \return 0 if ranges match, non-zero otherwise + */ +int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len) +{ + if (stream == NULL || stream->buffer == NULL) + return 1; /* arbitrary */ + + return stream->cmp_range_ci(stream, r1, r2, len); +} + +/** + * Case sensitively compare a pair of ranges in the input stream + * + * \param stream Input stream to look in + * \param r1 Offset of start of first range + * \param r2 Offset of start of second range + * \param len Byte length of ranges + * \return 0 if ranges match, non-zero otherwise + */ +int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len) +{ + if (stream == NULL || stream->buffer == NULL) + return 1; /* arbitrary */ + + return stream->cmp_range_cs(stream, r1, r2, len); +} + +/** + * Case sensitively compare a range of input stream against an ASCII string + * + * \param stream Input stream to look in + * \param off Offset of range start + * \param len Byte length of range + * \param data Comparison string + * \param dlen Byte length of comparison string + * \return 0 if match, non-zero otherwise + */ +int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream, + uint32_t off, size_t len, const char *data, size_t dlen) +{ + if (stream == NULL || stream->buffer == NULL) + return 1; /* arbitrary */ + + return stream->cmp_range_ascii(stream, off, len, data, dlen); +} + +/** + * Replace a range of bytes in the input 
stream with a single character + * + * \param stream Input stream containing data + * \param start Offset of start of range to replace + * \param len Length (in bytes) of range to replace + * \param ucs4 UCS4 (host endian) encoded replacement character + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4) +{ + if (stream == NULL || stream->buffer == NULL) + return HUBBUB_BADPARM; + + if (start >= stream->buffer_len) + return HUBBUB_INVALID; + + if (start < stream->cursor) + return HUBBUB_INVALID; + + return stream->replace_range(stream, start, len, ucs4); +} + +/** + * Read the document charset + * + * \param stream Input stream to query + * \param source Pointer to location to receive charset source + * \return Pointer to charset name (constant; do not free), or NULL if unknown + */ +const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream, + hubbub_charset_source *source) +{ + if (stream == NULL || source == NULL) + return NULL; + + *source = stream->encsrc; + + if (stream->encsrc == HUBBUB_CHARSET_UNKNOWN) + return NULL; + + return hubbub_mibenum_to_name(stream->mibenum); +} + +/** + * Inform interested parties that the buffer has moved + * + * \param stream Input stream + */ +void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream) +{ + hubbub_inputstream_bm_handler *h; + + if (stream == NULL) + return; + + for (h = stream->handlers; h; h = h->next) + h->handler(stream->buffer, stream->buffer_len, h->pw); +} + diff --git a/src/input/inputstream.h b/src/input/inputstream.h new file mode 100644 index 0000000..5325d14 --- /dev/null +++ b/src/input/inputstream.h @@ -0,0 +1,98 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_input_inputstream_h_ +#define hubbub_input_inputstream_h_ + +#include <inttypes.h> + +#include <hubbub/errors.h> +#include <hubbub/functypes.h> +#include <hubbub/types.h> + +typedef struct hubbub_inputstream hubbub_inputstream; + +/* EOF pseudo-character */ +#define HUBBUB_INPUTSTREAM_EOF (0xFFFFFFFFU) +/* Out-of-data indicator */ +#define HUBBUB_INPUTSTREAM_OOD (0xFFFFFFFEU) + +/* Type of input stream buffer moved handler function */ +typedef void (*hubbub_inputstream_buffermoved)(const uint8_t *buffer, + size_t len, void *pw); + +/* Create an input stream */ +hubbub_inputstream *hubbub_inputstream_create(const char *enc, + const char *int_enc, hubbub_alloc alloc, void *pw); +/* Destroy an input stream */ +void hubbub_inputstream_destroy(hubbub_inputstream *stream); + +/* Append data to an input stream */ +hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream, + const uint8_t *data, size_t len); +/* Insert data into stream at current location */ +hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream, + const uint8_t *data, size_t len); + +/* Look at the next character in the stream */ +uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream); + +/* Retrieve the byte index and length of the current character in the stream */ +uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream, size_t *len); + +/* Convert the current character to lowercase */ +void hubbub_inputstream_lowercase(hubbub_inputstream *stream); + +/* Convert the current character to uppercase */ +void hubbub_inputstream_uppercase(hubbub_inputstream *stream); + +/* Advance the stream's current position */ +void hubbub_inputstream_advance(hubbub_inputstream *stream); + +/* Push a character back onto the stream */ +hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream, + uint32_t 
character); + +/* Rewind the input stream by a number of bytes */ +hubbub_error hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n); + +/* Claim ownership of an input stream's buffer */ +hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream, + uint8_t **buffer, size_t *len); + +/* Register interest in buffer moved events */ +hubbub_error hubbub_inputstream_register_movehandler( + hubbub_inputstream *stream, + hubbub_inputstream_buffermoved handler, void *pw); + +/* Deregister interest in buffer moved events */ +hubbub_error hubbub_inputstream_deregister_movehandler( + hubbub_inputstream *stream, + hubbub_inputstream_buffermoved handler, void *pw); + +/* Case insensitively compare a pair of ranges in the input stream */ +int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len); + +/* Case sensitively compare a pair of ranges in the input stream */ +int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len); + +/* Case sensitively compare a range of input stream against an ASCII string */ +int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream, + uint32_t off, size_t len, const char *data, size_t dlen); + +/* Replace a range of bytes in the input stream with a single character */ +hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4); + +/* Read the document charset */ +const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream, + hubbub_charset_source *source); + +#endif + diff --git a/src/input/streamimpl.h b/src/input/streamimpl.h new file mode 100644 index 0000000..f44f6da --- /dev/null +++ b/src/input/streamimpl.h @@ -0,0 +1,77 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_input_streamimpl_h_ +#define hubbub_input_streamimpl_h_ + +#include <stdbool.h> + +#include <hubbub/types.h> + +#include "input/filter.h" +#include "input/inputstream.h" + +typedef struct hubbub_inputstream_bm_handler hubbub_inputstream_bm_handler; + +/** + * Input stream definition: implementations extend this + */ +struct hubbub_inputstream { + uint8_t *buffer; /**< Document buffer */ + size_t buffer_len; /**< Amount of data in buffer */ + size_t buffer_alloc; /**< Allocated size of buffer */ + + uint32_t cursor; /**< Byte offset of current position */ + + bool had_eof; /**< Whether EOF has been reached */ + + uint16_t mibenum; /**< MIB enum for charset, or 0 */ + hubbub_charset_source encsrc; /**< Charset source */ + + hubbub_filter *input; /**< Charset conversion filter */ + + hubbub_inputstream_bm_handler *handlers; /**< List of buffer + * moved handlers */ + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Client private data */ + + void (*destroy)(hubbub_inputstream *stream); + hubbub_error (*append)(hubbub_inputstream *stream, + const uint8_t *data, size_t len); + hubbub_error (*insert)(hubbub_inputstream *stream, + const uint8_t *data, size_t len); + uint32_t (*peek)(hubbub_inputstream *stream); + uint32_t (*cur_pos)(hubbub_inputstream *stream, size_t *len); + void (*lowercase)(hubbub_inputstream *stream); + void (*uppercase)(hubbub_inputstream *stream); + void (*advance)(hubbub_inputstream *stream); + hubbub_error (*push_back)(hubbub_inputstream *stream, + uint32_t character); + int (*cmp_range_ci)(hubbub_inputstream *stream, uint32_t r1, + uint32_t r2, size_t len); + int (*cmp_range_cs)(hubbub_inputstream *stream, uint32_t r1, + uint32_t r2, size_t len); + int (*cmp_range_ascii)(hubbub_inputstream *stream, + uint32_t off, size_t len, + const char *data, 
size_t dlen); + hubbub_error (*replace_range)(hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4); +}; + +/** + * Input stream factory component definition + */ +typedef struct hubbub_streamhandler { + bool (*uses_encoding)(const char *int_enc); + hubbub_inputstream *(*create)(const char *enc, const char *int_enc, + hubbub_alloc alloc, void *pw); +} hubbub_streamhandler; + +/* Notification of stream buffer moving */ +void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream); + +#endif diff --git a/src/input/utf8_stream.c b/src/input/utf8_stream.c new file mode 100644 index 0000000..5d08993 --- /dev/null +++ b/src/input/utf8_stream.c @@ -0,0 +1,567 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <stdbool.h> +#include <string.h> + +#include "charset/aliases.h" +#include "charset/detect.h" +#include "input/streamimpl.h" +#include "utils/utf8.h" +#include "utils/utils.h" + +#define BUFFER_CHUNK (4096) + +static bool hubbub_utf8stream_uses_encoding(const char *int_enc); +static hubbub_inputstream *hubbub_utf8stream_create(const char *enc, + const char *int_enc, hubbub_alloc alloc, void *pw); +static void hubbub_utf8stream_destroy(hubbub_inputstream *stream); +static hubbub_error hubbub_utf8stream_append(hubbub_inputstream *stream, + const uint8_t *data, size_t len); +static hubbub_error hubbub_utf8stream_insert(hubbub_inputstream *stream, + const uint8_t *data, size_t len); +static uint32_t hubbub_utf8stream_peek(hubbub_inputstream *stream); +static uint32_t hubbub_utf8stream_cur_pos(hubbub_inputstream *stream, + size_t *len); +static void hubbub_utf8stream_lowercase(hubbub_inputstream *stream); +static void hubbub_utf8stream_uppercase(hubbub_inputstream *stream); +static void hubbub_utf8stream_advance(hubbub_inputstream *stream); +static hubbub_error 
hubbub_utf8stream_push_back(hubbub_inputstream *stream, + uint32_t character); +static int hubbub_utf8stream_compare_range_ci(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len); +static int hubbub_utf8stream_compare_range_cs(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len); +static int hubbub_utf8stream_compare_range_ascii(hubbub_inputstream *stream, + uint32_t off, size_t len, const char *data, size_t dlen); +static hubbub_error hubbub_utf8stream_replace_range( + hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4); + +/** + * Determine whether a stream implementation uses an internal encoding + * + * \param int_enc The desired encoding + * \return true if handled, false otherwise + */ +bool hubbub_utf8stream_uses_encoding(const char *int_enc) +{ + return (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) == + hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"))); +} + +/** + * Create an input stream + * + * \param enc Document charset, or NULL if unknown + * \param int_enc Desired encoding of document + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to stream instance, or NULL on failure + */ +hubbub_inputstream *hubbub_utf8stream_create(const char *enc, + const char *int_enc, hubbub_alloc alloc, void *pw) +{ + hubbub_inputstream *stream; + + if (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) != + hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"))) + return NULL; + + stream = alloc(NULL, sizeof(hubbub_inputstream), pw); + if (stream == NULL) + return NULL; + + stream->buffer = alloc(NULL, BUFFER_CHUNK, pw); + if (stream->buffer == NULL) { + alloc(stream, 0, pw); + return NULL; + } + + stream->buffer_len = 0; + stream->buffer_alloc = BUFFER_CHUNK; + + stream->cursor = 0; + + stream->had_eof = false; + + stream->input = hubbub_filter_create(int_enc, alloc, pw); + if (stream->input == NULL) { + alloc(stream->buffer, 0, pw); + 
alloc(stream, 0, pw); + return NULL; + } + + if (enc != NULL) { + hubbub_error error; + hubbub_filter_optparams params; + + stream->mibenum = hubbub_mibenum_from_name(enc, strlen(enc)); + + if (stream->mibenum != 0) { + params.encoding.name = enc; + + error = hubbub_filter_setopt(stream->input, + HUBBUB_FILTER_SET_ENCODING, ¶ms); + if (error != HUBBUB_OK && error != HUBBUB_INVALID) { + hubbub_filter_destroy(stream->input); + alloc(stream->buffer, 0, pw); + alloc(stream, 0, pw); + return NULL; + } + + stream->encsrc = HUBBUB_CHARSET_DICTATED; + } + } else { + stream->mibenum = 0; + stream->encsrc = HUBBUB_CHARSET_UNKNOWN; + } + + stream->destroy = hubbub_utf8stream_destroy; + stream->append = hubbub_utf8stream_append; + stream->insert = hubbub_utf8stream_insert; + stream->peek = hubbub_utf8stream_peek; + stream->cur_pos = hubbub_utf8stream_cur_pos; + stream->lowercase = hubbub_utf8stream_lowercase; + stream->uppercase = hubbub_utf8stream_uppercase; + stream->advance = hubbub_utf8stream_advance; + stream->push_back = hubbub_utf8stream_push_back; + stream->cmp_range_ci = hubbub_utf8stream_compare_range_ci; + stream->cmp_range_cs = hubbub_utf8stream_compare_range_cs; + stream->cmp_range_ascii = hubbub_utf8stream_compare_range_ascii; + stream->replace_range = hubbub_utf8stream_replace_range; + + return stream; +} + +/** + * Destroy an input stream + * + * \param stream Input stream to destroy + */ +void hubbub_utf8stream_destroy(hubbub_inputstream *stream) +{ + if (stream->input != NULL) { + hubbub_filter_destroy(stream->input); + } + + if (stream->buffer != NULL) { + stream->alloc(stream->buffer, 0, stream->pw); + } + + stream->alloc(stream, 0, stream->pw); +} + +/** + * Append data to an input stream + * + * \param stream Input stream to append data to + * \param data Data to append (in document charset), or NULL to flag EOF + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error 
hubbub_utf8stream_append(hubbub_inputstream *stream, + const uint8_t *data, size_t len) +{ + hubbub_error error; + uint8_t *base; + size_t space; + + if (data == NULL) { + /* EOF indicated */ + size_t dummy_len = 0; + uint8_t *dummy_data = (uint8_t *) &dummy_len; + + base = stream->buffer + stream->buffer_len; + space = stream->buffer_alloc - stream->buffer_len; + + /* Forcibly flush through any remaining buffered data */ + while ((error = hubbub_filter_process_chunk(stream->input, + (const uint8_t **) &dummy_data, &dummy_len, + &base, &space)) == HUBBUB_NOMEM) { + bool moved = false; + uint8_t *temp = stream->alloc(stream->buffer, + stream->buffer_alloc + BUFFER_CHUNK, + stream->pw); + + if (temp == NULL) { + return HUBBUB_NOMEM; + } + + moved = (temp != stream->buffer); + + stream->buffer = temp; + stream->buffer_len += stream->buffer_alloc - + stream->buffer_len - space; + stream->buffer_alloc += BUFFER_CHUNK; + + base = stream->buffer + stream->buffer_len; + space = stream->buffer_alloc - stream->buffer_len; + + if (moved) + hubbub_inputstream_buffer_moved(stream); + } + + /* And fix up buffer length */ + stream->buffer_len += stream->buffer_alloc - + stream->buffer_len - space; + + stream->had_eof = true; + } else { + /* Normal data chunk */ + + if (stream->mibenum == 0) { + /* Haven't found charset yet; detect it */ + error = hubbub_charset_extract(&data, &len, + &stream->mibenum, &stream->encsrc); + if (error) { + return error; + } + + /* We should always have a charset by now */ + if (stream->mibenum == 0) + abort(); + } + + base = stream->buffer + stream->buffer_len; + space = stream->buffer_alloc - stream->buffer_len; + + /* Convert chunk to UTF-8 */ + while ((error = hubbub_filter_process_chunk(stream->input, + &data, &len, + &base, &space)) == HUBBUB_NOMEM) { + bool moved = false; + uint8_t *temp = stream->alloc(stream->buffer, + stream->buffer_alloc + BUFFER_CHUNK, + stream->pw); + + if (temp == NULL) { + return HUBBUB_NOMEM; + } + + moved = (temp != 
stream->buffer); + + stream->buffer = temp; + stream->buffer_len += stream->buffer_alloc - + stream->buffer_len - space; + stream->buffer_alloc += BUFFER_CHUNK; + + base = stream->buffer + stream->buffer_len; + space = stream->buffer_alloc - stream->buffer_len - + space; + + if (moved) + hubbub_inputstream_buffer_moved(stream); + } + + /* And fix up buffer length */ + stream->buffer_len += stream->buffer_alloc - + stream->buffer_len - space; + } + + return HUBBUB_OK; +} + +/** + * Insert data into stream at current location + * + * \param stream Input stream to insert into + * \param data Data to insert (UTF-8 encoded) + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_utf8stream_insert(hubbub_inputstream *stream, + const uint8_t *data, size_t len) +{ + size_t space; + uint8_t *curpos; + + space = stream->buffer_alloc - stream->buffer_len; + + /* Need to grow buffer, if there's insufficient space */ + if (space <= len) { + bool moved = false; + uint8_t *temp = stream->alloc(stream->buffer, + stream->buffer_alloc + + ((len + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK) + + BUFFER_CHUNK, + stream->pw); + + if (temp == NULL) + return HUBBUB_NOMEM; + + moved = (temp != stream->buffer); + + stream->buffer = temp; + stream->buffer_alloc += + ((len + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK); + + if (moved) + hubbub_inputstream_buffer_moved(stream); + } + + /* Find the insertion point + * (just before the next character to be read) */ + curpos = stream->buffer + stream->cursor; + + /* Move data above this point up */ + memmove(curpos + len, curpos, stream->buffer_len - stream->cursor); + + /* Copy new data into gap created by memmove */ + memcpy(curpos, data, len); + + /* Fix up buffer length */ + stream->buffer_len += len; + + return HUBBUB_OK; +} + +/** + * Look at the next character in the stream + * + * \param stream Stream to look in + * \return UCS4 (host-endian) character code, or EOF or OOD. 
+ */ +uint32_t hubbub_utf8stream_peek(hubbub_inputstream *stream) +{ + hubbub_error error; + size_t len; + uint32_t ret; + + if (stream->cursor == stream->buffer_len) { + return stream->had_eof ? HUBBUB_INPUTSTREAM_EOF + : HUBBUB_INPUTSTREAM_OOD; + } + + error = hubbub_utf8_to_ucs4(stream->buffer + stream->cursor, + stream->buffer_len - stream->cursor, + &ret, &len); + if (error != HUBBUB_OK && error != HUBBUB_NEEDDATA) + return HUBBUB_INPUTSTREAM_OOD; + + if (error == HUBBUB_NEEDDATA) { + if (stream->had_eof) + return HUBBUB_INPUTSTREAM_EOF; + else + return HUBBUB_INPUTSTREAM_OOD; + } + + return ret; +} + +/** + * Retrieve the byte index and length of the current character in the stream + * + * \param stream Stream to look in + * \param len Pointer to location to receive byte length of character + * \return Byte index of current character from start of stream, + * or (uint32_t) -1 on error + */ +uint32_t hubbub_utf8stream_cur_pos(hubbub_inputstream *stream, + size_t *len) +{ + hubbub_utf8_char_byte_length(stream->buffer + stream->cursor, len); + + return stream->cursor; +} + +/** + * Convert the current character to lower case + * + * \param stream Stream to look in + */ +void hubbub_utf8stream_lowercase(hubbub_inputstream *stream) +{ + if ('A' <= stream->buffer[stream->cursor] && + stream->buffer[stream->cursor] <= 'Z') + stream->buffer[stream->cursor] += 0x0020; +} + +/** + * Convert the current character to upper case + * + * \param stream Stream to look in + */ +void hubbub_utf8stream_uppercase(hubbub_inputstream *stream) +{ + if ('a' <= stream->buffer[stream->cursor] && + stream->buffer[stream->cursor] <= 'z') + stream->buffer[stream->cursor] -= 0x0020; +} + +/** + * Advance the stream's current position + * + * \param stream The stream whose position to advance + */ +void hubbub_utf8stream_advance(hubbub_inputstream *stream) +{ + hubbub_error error; + uint32_t next; + + error = hubbub_utf8_next(stream->buffer, stream->buffer_len, + stream->cursor, &next); + 
+ if (error == HUBBUB_OK) + stream->cursor = next; +} + +/** + * Push a character back onto the stream + * + * \param stream Stream to push back to + * \param character UCS4 (host-endian) codepoint to push back + * \return HUBBUB_OK on success, appropriate error otherwise + * + * Note that this doesn't actually modify the data in the stream. + * It works by ensuring that the character located just before the + * current stream location is the same as ::character. If it is, + * then the stream pointer is moved back. If it is not, then an + * error is returned and the stream pointer remains unmodified. + */ +hubbub_error hubbub_utf8stream_push_back(hubbub_inputstream *stream, + uint32_t character) +{ + hubbub_error error; + uint32_t prev; + uint8_t buf[6]; + size_t len; + + error = hubbub_utf8_prev(stream->buffer, stream->cursor, &prev); + if (error != HUBBUB_OK) + return error; + + error = hubbub_utf8_from_ucs4(character, buf, &len); + if (error != HUBBUB_OK) + return error; + + if ((stream->cursor - prev) != len || + memcmp(stream->buffer + prev, buf, len) != 0) + return HUBBUB_INVALID; + + stream->cursor = prev; + + return HUBBUB_OK; +} + +/** + * Case insensitively compare a pair of ranges in the input stream + * + * \param stream Input stream to look in + * \param r1 Offset of start of first range + * \param r2 Offset of start of second range + * \param len Byte length of ranges + * \return 0 if ranges match, non-zero otherwise + */ +int hubbub_utf8stream_compare_range_ci(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len) +{ + return strncasecmp((const char *) (stream->buffer + r1), + (const char *) (stream->buffer + r2), len); +} + +/** + * Case sensitively compare a pair of ranges in the input stream + * + * \param stream Input stream to look in + * \param r1 Offset of start of first range + * \param r2 Offset of start of second range + * \param len Byte length of ranges + * \return 0 if ranges match, non-zero otherwise + */ +int 
hubbub_utf8stream_compare_range_cs(hubbub_inputstream *stream, + uint32_t r1, uint32_t r2, size_t len) +{ + return strncmp((const char *) (stream->buffer + r1), + (const char *) (stream->buffer + r2), len); +} + +/** + * Case sensitively compare a range of input stream against an ASCII string + * + * \param stream Input stream to look in + * \param off Offset of range start + * \param len Byte length of range + * \param data Comparison string + * \param dlen Byte length of comparison string + * \return 0 if match, non-zero otherwise + */ +int hubbub_utf8stream_compare_range_ascii(hubbub_inputstream *stream, + uint32_t off, size_t len, const char *data, size_t dlen) +{ + /* Lengths don't match, so strings don't */ + if (len != dlen) + return 1; /* arbitrary */ + + return strncmp((const char *) (stream->buffer + off), + data, len); +} + +/** + * Replace a range of bytes in the input stream with a single character + * + * \param stream Input stream containing data + * \param start Offset of start of range to replace + * \param len Length (in bytes) of range to replace + * \param ucs4 UCS4 (host endian) encoded replacement character + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_utf8stream_replace_range(hubbub_inputstream *stream, + uint32_t start, size_t len, uint32_t ucs4) +{ + uint8_t buf[6]; + size_t replen; + int32_t diff; + hubbub_error error; + + /* Get UTF8 version of replacement character */ + error = hubbub_utf8_from_ucs4(ucs4, buf, &replen); + if (error) + return error; + + diff = replen - len; + + if (stream->buffer_len + diff >= stream->buffer_alloc) { + /* Need more buffer space */ + bool moved = false; + uint8_t *temp = stream->alloc(stream->buffer, + stream->buffer_alloc + + ((diff + BUFFER_CHUNK - 1) & ~BUFFER_CHUNK) + + BUFFER_CHUNK, + stream->pw); + + if (temp == NULL) + return HUBBUB_NOMEM; + + moved = (temp != stream->buffer); + + stream->buffer = temp; + stream->buffer_alloc += + ((diff + BUFFER_CHUNK - 1) 
& ~BUFFER_CHUNK); + + if (moved) + hubbub_inputstream_buffer_moved(stream); + } + + /* Move subsequent input to correct location */ + memmove(stream->buffer + start + len + diff, + stream->buffer + start + len, + stream->buffer_len - (start + len)); + + /* And fill the gap with the replacement character */ + memcpy(stream->buffer + start, buf, replen); + + /* Finally, update length */ + stream->buffer_len += diff; + + return HUBBUB_OK; +} + +hubbub_streamhandler utf8stream = { + hubbub_utf8stream_uses_encoding, + hubbub_utf8stream_create +}; diff --git a/src/parser.c b/src/parser.c new file mode 100644 index 0000000..e7a4fe8 --- /dev/null +++ b/src/parser.c @@ -0,0 +1,237 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <hubbub/parser.h> + +#include "input/inputstream.h" +#include "tokeniser/tokeniser.h" + +/** + * Hubbub parser object + */ +struct hubbub_parser { + hubbub_inputstream *stream; /**< Input stream instance */ + hubbub_tokeniser *tok; /**< Tokeniser instance */ + + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Client data */ +}; + +/** + * Create a hubbub parser + * + * \param enc Source document encoding, or NULL to autodetect + * \param int_enc Desired encoding of document + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to parser instance, or NULL on error + */ +hubbub_parser *hubbub_parser_create(const char *enc, const char *int_enc, + hubbub_alloc alloc, void *pw) +{ + hubbub_parser *parser; + + if (alloc == NULL) + return NULL; + + parser = alloc(NULL, sizeof(hubbub_parser), pw); + if (parser == NULL) + return NULL; + + parser->stream = hubbub_inputstream_create(enc, int_enc, alloc, pw); + if (parser->stream == NULL) { + alloc(parser, 0, pw); + return NULL; + } + + 
parser->tok = hubbub_tokeniser_create(parser->stream, alloc, pw); + if (parser->tok == NULL) { + hubbub_inputstream_destroy(parser->stream); + alloc(parser, 0, pw); + return NULL; + } + + parser->alloc = alloc; + parser->pw = pw; + + return parser; +} + +/** + * Destroy a hubbub parser + * + * \param parser Parser instance to destroy + */ +void hubbub_parser_destroy(hubbub_parser *parser) +{ + if (parser == NULL) + return; + + hubbub_tokeniser_destroy(parser->tok); + + hubbub_inputstream_destroy(parser->stream); + + parser->alloc(parser, 0, parser->pw); +} + +/** + * Configure a hubbub parser + * + * \param parser Parser instance to configure + * \param type Option to set + * \param params Option-specific parameters + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_parser_setopt(hubbub_parser *parser, + hubbub_parser_opttype type, + hubbub_parser_optparams *params) +{ + hubbub_tokeniser_opttype toktype; + + if (parser == NULL || params == NULL) + return HUBBUB_BADPARM; + + switch (type) { + case HUBBUB_PARSER_TOKEN_HANDLER: + toktype = HUBBUB_TOKENISER_TOKEN_HANDLER; + break; + case HUBBUB_PARSER_BUFFER_HANDLER: + toktype = HUBBUB_TOKENISER_BUFFER_HANDLER; + break; + case HUBBUB_PARSER_ERROR_HANDLER: + toktype = HUBBUB_TOKENISER_BUFFER_HANDLER; + break; + case HUBBUB_PARSER_CONTENT_MODEL: + toktype = HUBBUB_TOKENISER_CONTENT_MODEL; + break; + } + + return hubbub_tokeniser_setopt(parser->tok, toktype, + (hubbub_tokeniser_optparams *) params); +} + +/** + * Pass a chunk of data to a hubbub parser for parsing + * + * \param parser Parser instance to use + * \param data Data to parse (encoded in the input charset) + * \param len Length, in bytes, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser, + uint8_t *data, size_t len) +{ + hubbub_error error; + + if (parser == NULL || data == NULL) + return HUBBUB_BADPARM; + + error = 
hubbub_inputstream_append(parser->stream, data, len); + if (error != HUBBUB_OK) + return error; + + error = hubbub_tokeniser_run(parser->tok); + if (error != HUBBUB_OK) + return error; + + return HUBBUB_OK; +} + +/** + * Pass a chunk of extraneous data to a hubbub parser for parsing + * + * \param parser Parser instance to use + * \param data Data to parse (encoded in internal charset) + * \param len Length, in byte, of data + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_parser_parse_extraneous_chunk(hubbub_parser *parser, + uint8_t *data, size_t len) +{ + hubbub_error error; + + /** \todo In some cases, we don't actually want script-inserted + * data to be parsed until later. We'll need some way of flagging + * this through the public API, and the inputstream API will need + * some way of marking the insertion point so that, when the + * tokeniser is run, only the inserted chunk is parsed. */ + + if (parser == NULL || data == NULL) + return HUBBUB_BADPARM; + + error = hubbub_inputstream_insert(parser->stream, data, len); + if (error != HUBBUB_OK) + return error; + + error = hubbub_tokeniser_run(parser->tok); + if (error != HUBBUB_OK) + return error; + + return HUBBUB_OK; +} + +/** + * Inform the parser that the last chunk of data has been parsed + * + * \param parser Parser to inform + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_parser_completed(hubbub_parser *parser) +{ + hubbub_error error; + + if (parser == NULL) + return HUBBUB_BADPARM; + + error = hubbub_inputstream_append(parser->stream, NULL, 0); + if (error != HUBBUB_OK) + return error; + + error = hubbub_tokeniser_run(parser->tok); + if (error != HUBBUB_OK) + return error; + + return HUBBUB_OK; +} + +/** + * Read the document charset + * + * \param parser Parser instance to query + * \param source Pointer to location to receive charset source + * \return Pointer to charset name (constant; do not free), or NULL if unknown + 
*/ +const char *hubbub_parser_read_charset(hubbub_parser *parser, + hubbub_charset_source *source) +{ + if (parser == NULL || source == NULL) + return NULL; + + return hubbub_inputstream_read_charset(parser->stream, source); +} + +/** + * Claim ownership of the document buffer + * + * \param parser Parser whose buffer to claim + * \param buffer Pointer to location to receive buffer pointer + * \param len Pointer to location to receive byte length of buffer + * \return HUBBUB_OK on success, appropriate error otherwise. + * + * Once the buffer has been claimed by a client, the parser disclaims + * all ownership rights (and invalidates any internal references it may have + * to the buffer). Therefore, the only parser call which may be made + * after calling this function is to destroy the parser. + */ +hubbub_error hubbub_parser_claim_buffer(hubbub_parser *parser, + uint8_t **buffer, size_t *len) +{ + if (parser == NULL || buffer == NULL || len == NULL) + return HUBBUB_BADPARM; + + return hubbub_inputstream_claim_buffer(parser->stream, buffer, len); +} diff --git a/src/tokeniser/Makefile b/src/tokeniser/Makefile new file mode 100644 index 0000000..539625f --- /dev/null +++ b/src/tokeniser/Makefile @@ -0,0 +1,53 @@ +# Makefile for libhubbub +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test cases + +# Manipulate include paths +CFLAGS += -I$(CURDIR) + +# Objects +OBJS = entities tokeniser + +.PHONY: clean debug distclean export 
release setup test + +# Targets +release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS))) + +debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS))) + +clean: + -@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS})) + +distclean: + +setup: + +export: + +test: + +# Pattern rules +../Release/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $< + +../Debug/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@ $< diff --git a/src/tokeniser/entities.c b/src/tokeniser/entities.c new file mode 100644 index 0000000..8a9acf5 --- /dev/null +++ b/src/tokeniser/entities.c @@ -0,0 +1,363 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include "utils/dict.h" +#include "utils/utils.h" +#include "tokeniser/entities.h" + +typedef struct hubbub_entity hubbub_entity; + +static const struct hubbub_entity { + const char *name; + uint32_t ucs4; +} entities[] = { + { "AElig", 0x00C6 }, + { "Aacute", 0x00C1 }, + { "Acirc", 0x00C2 }, + { "Agrave", 0x00C0 }, + { "Alpha", 0x0391 }, + { "Aring", 0x00C5 }, + { "Atilde", 0x00C3 }, + { "Auml", 0x00C4 }, + { "Beta", 0x0392 }, + { "Ccedil", 0x00C7 }, + { "Chi", 0x03A7 }, + { "Dagger", 0x2021 }, + { "Delta", 0x0394 }, + { "ETH", 0x00D0 }, + { "Eacute", 0x00C9 }, + { "Ecirc", 0x00CA }, + { "Egrave", 0x00C8 }, + { "Epsilon", 0x0395 }, + { "Eta", 0x0397 }, + { "Euml", 0x00CB }, + { "Gamma", 0x0393 }, + { "Iacute", 0x00CD }, + { "Icirc", 0x00CE }, + { "Igrave", 0x00CC }, + { "Iota", 0x0399 }, + { "Iuml", 0x00CF }, + { "Kappa", 0x039A }, + { "Lambda", 0x039B }, + { "Mu", 0x039C }, + { "Ntilde", 0x00D1 }, + { "Nu", 0x039D }, + { "OElig", 0x0152 }, + { "Oacute", 0x00D3 }, + { "Ocirc", 0x00D4 }, + { "Ograve", 0x00D2 }, + { "Omega", 0x03A9 }, + { "Omicron", 0x039F }, + { 
"Oslash", 0x00D8 }, + { "Otilde", 0x00D5 }, + { "Ouml", 0x00D6 }, + { "Phi", 0x03A6 }, + { "Pi", 0x03A0 }, + { "Prime", 0x2033 }, + { "Psi", 0x03A8 }, + { "Rho", 0x03A1 }, + { "Scaron", 0x0160 }, + { "Sigma", 0x03A3 }, + { "THORN", 0x00DE }, + { "Tau", 0x03A4 }, + { "Theta", 0x0398 }, + { "Uacute", 0x00DA }, + { "Ucirc", 0x00DB }, + { "Ugrave", 0x00D9 }, + { "Upsilon", 0x03A5 }, + { "Uuml", 0x00DC }, + { "Xi", 0x039E }, + { "Yacute", 0x00DD }, + { "Yuml", 0x0178 }, + { "Zeta", 0x0396 }, + { "aacute", 0x00E1 }, + { "acirc", 0x00E2 }, + { "acute", 0x00B4 }, + { "aelig", 0x00E6 }, + { "agrave", 0x00E0 }, + { "alefsym", 0x2135 }, + { "alpha", 0x03B1 }, + { "amp", 0x0026 }, + { "AMP", 0x0026 }, + { "and", 0x2227 }, + { "ang", 0x2220 }, + { "apos", 0x0027 }, + { "aring", 0x00E5 }, + { "asymp", 0x2248 }, + { "atilde", 0x00E3 }, + { "auml", 0x00E4 }, + { "bdquo", 0x201E }, + { "beta", 0x03B2 }, + { "brvbar", 0x00A6 }, + { "bull", 0x2022 }, + { "cap", 0x2229 }, + { "ccedil", 0x00E7 }, + { "cedil", 0x00B8 }, + { "cent", 0x00A2 }, + { "chi", 0x03C7 }, + { "circ", 0x02C6 }, + { "clubs", 0x2663 }, + { "cong", 0x2245 }, + { "copy", 0x00A9 }, + { "COPY", 0x00A9 }, + { "crarr", 0x21B5 }, + { "cup", 0x222A }, + { "curren", 0x00A4 }, + { "dArr", 0x21D3 }, + { "dagger", 0x2020 }, + { "darr", 0x2193 }, + { "deg", 0x00B0 }, + { "delta", 0x03B4 }, + { "diams", 0x2666 }, + { "divide", 0x00F7 }, + { "eacute", 0x00E9 }, + { "ecirc", 0x00EA }, + { "egrave", 0x00E8 }, + { "empty", 0x2205 }, + { "emsp", 0x2003 }, + { "ensp", 0x2002 }, + { "epsilon", 0x03B5 }, + { "equiv", 0x2261 }, + { "eta", 0x03B7 }, + { "eth", 0x00F0 }, + { "euml", 0x00EB }, + { "euro", 0x20AC }, + { "exist", 0x2203 }, + { "fnof", 0x0192 }, + { "forall", 0x2200 }, + { "frac12", 0x00BD }, + { "frac14", 0x00BC }, + { "frac34", 0x00BE }, + { "frasl", 0x2044 }, + { "gamma", 0x03B3 }, + { "ge", 0x2265 }, + { "gt", 0x003E }, + { "GT", 0x003E }, + { "hArr", 0x21D4 }, + { "harr", 0x2194 }, + { "hearts", 0x2665 }, + { "hellip", 
0x2026 }, + { "iacute", 0x00ED }, + { "icirc", 0x00EE }, + { "iexcl", 0x00A1 }, + { "igrave", 0x00EC }, + { "image", 0x2111 }, + { "infin", 0x221E }, + { "int", 0x222B }, + { "iota", 0x03B9 }, + { "iquest", 0x00BF }, + { "isin", 0x2208 }, + { "iuml", 0x00EF }, + { "kappa", 0x03BA }, + { "lArr", 0x21D0 }, + { "lambda", 0x03BB }, + { "lang", 0x2329 }, + { "laquo", 0x00AB }, + { "larr", 0x2190 }, + { "lceil", 0x2308 }, + { "ldquo", 0x201C }, + { "le", 0x2264 }, + { "lfloor", 0x230A }, + { "lowast", 0x2217 }, + { "loz", 0x25CA }, + { "lrm", 0x200E }, + { "lsaquo", 0x2039 }, + { "lsquo", 0x2018 }, + { "lt", 0x003C }, + { "LT", 0x003C }, + { "macr", 0x00AF }, + { "mdash", 0x2014 }, + { "micro", 0x00B5 }, + { "middot", 0x00B7 }, + { "minus", 0x2212 }, + { "mu", 0x03BC }, + { "nabla", 0x2207 }, + { "nbsp", 0x00A0 }, + { "ndash", 0x2013 }, + { "ne", 0x2260 }, + { "ni", 0x220B }, + { "not", 0x00AC }, + { "notin", 0x2209 }, + { "nsub", 0x2284 }, + { "ntilde", 0x00F1 }, + { "nu", 0x03BD }, + { "oacute", 0x00F3 }, + { "ocirc", 0x00F4 }, + { "oelig", 0x0153 }, + { "ograve", 0x00F2 }, + { "oline", 0x203E }, + { "omega", 0x03C9 }, + { "omicron", 0x03BF }, + { "oplus", 0x2295 }, + { "or", 0x2228 }, + { "ordf", 0x00AA }, + { "ordm", 0x00BA }, + { "oslash", 0x00F8 }, + { "otilde", 0x00F5 }, + { "otimes", 0x2297 }, + { "ouml", 0x00F6 }, + { "para", 0x00B6 }, + { "part", 0x2202 }, + { "permil", 0x2030 }, + { "perp", 0x22A5 }, + { "phi", 0x03C6 }, + { "pi", 0x03C0 }, + { "piv", 0x03D6 }, + { "plusmn", 0x00B1 }, + { "pound", 0x00A3 }, + { "prime", 0x2032 }, + { "prod", 0x220F }, + { "prop", 0x221D }, + { "psi", 0x03C8 }, + { "quot", 0x0022 }, + { "QUOT", 0x0022 }, + { "rArr", 0x21D2 }, + { "radic", 0x221A }, + { "rang", 0x232A }, + { "raquo", 0x00BB }, + { "rarr", 0x2192 }, + { "rceil", 0x2309 }, + { "rdquo", 0x201D }, + { "real", 0x211C }, + { "reg", 0x00AE }, + { "REG", 0x00AE }, + { "rfloor", 0x230B }, + { "rho", 0x03C1 }, + { "rlm", 0x200F }, + { "rsaquo", 0x203A }, + { "rsquo", 
0x2019 }, + { "sbquo", 0x201A }, + { "scaron", 0x0161 }, + { "sdot", 0x22C5 }, + { "sect", 0x00A7 }, + { "shy", 0x00AD }, + { "sigma", 0x03C3 }, + { "sigmaf", 0x03C2 }, + { "sim", 0x223C }, + { "spades", 0x2660 }, + { "sub", 0x2282 }, + { "sube", 0x2286 }, + { "sum", 0x2211 }, + { "sup", 0x2283 }, + { "sup1", 0x00B9 }, + { "sup2", 0x00B2 }, + { "sup3", 0x00B3 }, + { "supe", 0x2287 }, + { "szlig", 0x00DF }, + { "tau", 0x03C4 }, + { "there4", 0x2234 }, + { "theta", 0x03B8 }, + { "thetasym", 0x03D1 }, + { "thinsp", 0x2009 }, + { "thorn", 0x00FE }, + { "tilde", 0x02DC }, + { "times", 0x00D7 }, + { "trade", 0x2122 }, + { "uArr", 0x21D1 }, + { "uacute", 0x00FA }, + { "uarr", 0x2191 }, + { "ucirc", 0x00FB }, + { "ugrave", 0x00F9 }, + { "uml", 0x00A8 }, + { "upsih", 0x03D2 }, + { "upsilon", 0x03C5 }, + { "uuml", 0x00FC }, + { "weierp", 0x2118 }, + { "xi", 0x03BE }, + { "yacute", 0x00FD }, + { "yen", 0x00A5 }, + { "yuml", 0x00FF }, + { "zeta", 0x03B6 }, + { "zwj", 0x200D }, + { "zwnj", 0x200C }, +}; + +static hubbub_dict *dict; + +/** + * Create the entities dictionary + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_entities_create(hubbub_alloc alloc, void *pw) +{ + hubbub_error error; + size_t i; + + if (alloc == NULL) + return HUBBUB_BADPARM; + + dict = hubbub_dict_create(alloc, pw); + if (dict == NULL) + return HUBBUB_NOMEM; + + for (i = 0; i < sizeof(entities) / sizeof(entities[0]); i++) { + error = hubbub_dict_insert(dict, entities[i].name, + &entities[i]); + if (error != HUBBUB_OK) { + hubbub_dict_destroy(dict); + return error; + } + } + + return HUBBUB_OK; +} + +/** + * Destroy the entities dictionary + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + */ +void hubbub_entities_destroy(hubbub_alloc alloc, void *pw) +{ + UNUSED(alloc); + 
UNUSED(pw); + + hubbub_dict_destroy(dict); +} + +/** + * Step-wise search for an entity in the dictionary + * + * \param c Character to look for + * \param result Pointer to location for result + * \param context Pointer to location for search context + * \return HUBBUB_OK if key found, + * HUBBUB_NEEDDATA if more steps are required + * HUBBUB_INVALID if nothing matches + * + * The value pointed to by ::context should be NULL for the first call. + * Thereafter, pass in the same value as returned by the previous call. + * The context is opaque to the caller and should not be inspected. + * + * The location pointed to by ::result will be set to U+FFFD unless a match + * is found. + */ +hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result, + void **context) +{ + const hubbub_entity *e; + hubbub_error error; + + if (result == NULL || context == NULL) + return HUBBUB_BADPARM; + + error = hubbub_dict_search_step(dict, c, + (const void **) (const void *) &e, + context); + if (error != HUBBUB_OK) { + *result = 0xFFFD; + return error; + } + + *result = e->ucs4; + + return HUBBUB_OK; +} diff --git a/src/tokeniser/entities.h b/src/tokeniser/entities.h new file mode 100644 index 0000000..efd1987 --- /dev/null +++ b/src/tokeniser/entities.h @@ -0,0 +1,25 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_tokeniser_entities_h_ +#define hubbub_tokeniser_entities_h_ + +#include <inttypes.h> + +#include <hubbub/errors.h> +#include <hubbub/functypes.h> + +/* Create the entities dictionary */ +hubbub_error hubbub_entities_create(hubbub_alloc alloc, void *pw); +/* Destroy the entities dictionary */ +void hubbub_entities_destroy(hubbub_alloc alloc, void *pw); + +/* Step-wise search for an entity in the dictionary */ +hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result, + void **context); + +#endif diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c new file mode 100644 index 0000000..f8b6bb3 --- /dev/null +++ b/src/tokeniser/tokeniser.c @@ -0,0 +1,2282 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <stdbool.h> +#include <string.h> + +#include "utils/utils.h" + +#include "tokeniser/entities.h" +#include "tokeniser/tokeniser.h" + +/** + * Table of mappings between Windows-1252 codepoints 128-159 and UCS4 + */ +static const uint32_t cp1252Table[32] = { + 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD, + 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178 +}; + +/** + * Tokeniser states + */ +typedef enum hubbub_tokeniser_state { + HUBBUB_TOKENISER_STATE_DATA, + HUBBUB_TOKENISER_STATE_ENTITY_DATA, + HUBBUB_TOKENISER_STATE_TAG_OPEN, + HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN, + HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH, + HUBBUB_TOKENISER_STATE_TAG_NAME, + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME, + HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME, + 
HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME, + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE, + HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ, + HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ, + HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ, + HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE, + HUBBUB_TOKENISER_STATE_BOGUS_COMMENT, + HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN, + HUBBUB_TOKENISER_STATE_COMMENT_START, + HUBBUB_TOKENISER_STATE_COMMENT, + HUBBUB_TOKENISER_STATE_COMMENT_DASH, + HUBBUB_TOKENISER_STATE_COMMENT_END, + HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE, + HUBBUB_TOKENISER_STATE_DOCTYPE, + HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME, + HUBBUB_TOKENISER_STATE_DOCTYPE_NAME, + HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME, + HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE, + HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY, + HUBBUB_TOKENISER_STATE_NAMED_ENTITY +} hubbub_tokeniser_state; + +/** + * Context for tokeniser + */ +typedef struct hubbub_tokeniser_context { + hubbub_token_type current_tag_type; /**< Type of current_tag */ + hubbub_tag current_tag; /**< Current tag */ + + hubbub_string current_comment; /**< Current comment */ + + hubbub_doctype current_doctype; /**< Current doctype */ + + hubbub_string current_chars; /**< Pending characters */ + + hubbub_tokeniser_state prev_state; /**< Previous state */ + + struct { + hubbub_string tag; /**< Pending close tag */ + } close_tag_match; + + struct { + uint32_t count; /**< Index into "DOCTYPE" */ + } match_doctype; + + struct { + hubbub_string str; /**< Pending string */ + uint8_t base; /**< Base for numeric + * entities */ + uint32_t codepoint; /**< UCS4 codepoint */ + bool had_data; /**< Whether we read + * anything after &#(x)? 
*/ + hubbub_tokeniser_state return_state; /**< State we were + * called from */ + bool complete; /**< Flag that entity + * matching completed */ + bool done_setup; /**< Flag that match setup + * has completed */ + void *context; /**< Context for named + * entity search */ + size_t prev_len; /**< Previous byte length + * of str */ + } match_entity; + + struct { + uint32_t line; /**< Current line of input */ + uint32_t col; /**< Current character in + * line */ + } position; +} hubbub_tokeniser_context; + +/** + * Tokeniser data structure + */ +struct hubbub_tokeniser { + hubbub_tokeniser_state state; /**< Current tokeniser state */ + hubbub_content_model content_model; /**< Current content + * model flag */ + + hubbub_inputstream *input; /**< Input stream */ + + const uint8_t *input_buffer; /**< Start of input stream's buffer */ + size_t input_buffer_len; /**< Length of input buffer */ + + hubbub_tokeniser_context context; /**< Tokeniser context */ + + hubbub_token_handler token_handler; + void *token_pw; + + hubbub_buffer_handler buffer_handler; + void *buffer_pw; + + hubbub_error_handler error_handler; + void *error_pw; + + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *alloc_pw; /**< Client private data */ +}; + +static bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_close_tag_open( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_close_tag_match( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_before_attribute_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_after_attribute_name( + hubbub_tokeniser *tokeniser); 
+static bool hubbub_tokeniser_handle_before_attribute_value( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_value_dq( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_value_sq( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_attribute_value_uq( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_entity_in_attribute_value( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_bogus_comment( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_markup_declaration_open( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment_start( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment_dash( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_match_doctype( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_before_doctype_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_doctype_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_after_doctype_name( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_bogus_doctype( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_numbered_entity( + hubbub_tokeniser *tokeniser); +static bool hubbub_tokeniser_handle_named_entity( + hubbub_tokeniser *tokeniser); +static void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer, + size_t len, void *pw); +static void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, + hubbub_token *token); + +/** + * Create a hubbub tokeniser + * + * \param input Input stream instance + * \param alloc 
Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to tokeniser instance, or NULL on failure + */ +hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input, + hubbub_alloc alloc, void *pw) +{ + hubbub_tokeniser *tok; + + if (input == NULL || alloc == NULL) + return NULL; + + tok = alloc(NULL, sizeof(hubbub_tokeniser), pw); + if (tok == NULL) + return NULL; + + tok->state = HUBBUB_TOKENISER_STATE_DATA; + tok->content_model = HUBBUB_CONTENT_MODEL_PCDATA; + + tok->input = input; + tok->input_buffer = NULL; + tok->input_buffer_len = 0; + + tok->token_handler = NULL; + tok->token_pw = NULL; + + tok->buffer_handler = NULL; + tok->buffer_pw = NULL; + + tok->error_handler = NULL; + tok->error_pw = NULL; + + tok->alloc = alloc; + tok->alloc_pw = pw; + + if (hubbub_inputstream_register_movehandler(input, + hubbub_tokeniser_buffer_moved_handler, tok) != + HUBBUB_OK) { + alloc(tok, 0, pw); + return NULL; + } + + memset(&tok->context, 0, sizeof(hubbub_tokeniser_context)); + + return tok; +} + +/** + * Destroy a hubbub tokeniser + * + * \param tokeniser The tokeniser instance to destroy + */ +void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser) +{ + if (tokeniser == NULL) + return; + + hubbub_inputstream_deregister_movehandler(tokeniser->input, + hubbub_tokeniser_buffer_moved_handler, tokeniser); + + if (tokeniser->context.current_tag.attributes != NULL) { + tokeniser->alloc(tokeniser->context.current_tag.attributes, + 0, tokeniser->alloc_pw); + } + + tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw); +} + +/** + * Configure a hubbub tokeniser + * + * \param tokeniser The tokeniser instance to configure + * \param type The option type to set + * \param params Option-specific parameters + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser, + hubbub_tokeniser_opttype type, + hubbub_tokeniser_optparams *params) +{ + 
if (tokeniser == NULL || params == NULL) + return HUBBUB_BADPARM; + + switch (type) { + case HUBBUB_TOKENISER_TOKEN_HANDLER: + tokeniser->token_handler = params->token_handler.handler; + tokeniser->token_pw = params->token_handler.pw; + break; + case HUBBUB_TOKENISER_BUFFER_HANDLER: + tokeniser->buffer_handler = params->buffer_handler.handler; + tokeniser->buffer_pw = params->buffer_handler.pw; + tokeniser->buffer_handler(tokeniser->input_buffer, + tokeniser->input_buffer_len, + tokeniser->buffer_pw); + break; + case HUBBUB_TOKENISER_ERROR_HANDLER: + tokeniser->error_handler = params->error_handler.handler; + tokeniser->error_pw = params->error_handler.pw; + break; + case HUBBUB_TOKENISER_CONTENT_MODEL: + tokeniser->content_model = params->content_model.model; + break; + } + + return HUBBUB_OK; +} + +/** + * Process remaining data in the input stream + * + * \param tokeniser The tokeniser instance to invoke + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser) +{ + bool cont = true; + + if (tokeniser == NULL) + return HUBBUB_BADPARM; + + while (cont) { + switch (tokeniser->state) { + case HUBBUB_TOKENISER_STATE_DATA: + cont = hubbub_tokeniser_handle_data(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ENTITY_DATA: + cont = hubbub_tokeniser_handle_entity_data( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_TAG_OPEN: + cont = hubbub_tokeniser_handle_tag_open(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN: + cont = hubbub_tokeniser_handle_close_tag_open( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH: + cont = hubbub_tokeniser_handle_close_tag_match( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_TAG_NAME: + cont = hubbub_tokeniser_handle_tag_name(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME: + cont = hubbub_tokeniser_handle_before_attribute_name( + tokeniser); + break; + case 
HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME: + cont = hubbub_tokeniser_handle_attribute_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME: + cont = hubbub_tokeniser_handle_after_attribute_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE: + cont = hubbub_tokeniser_handle_before_attribute_value( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ: + cont = hubbub_tokeniser_handle_attribute_value_dq( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ: + cont = hubbub_tokeniser_handle_attribute_value_sq( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ: + cont = hubbub_tokeniser_handle_attribute_value_uq( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE: + cont = hubbub_tokeniser_handle_entity_in_attribute_value( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BOGUS_COMMENT: + cont = hubbub_tokeniser_handle_bogus_comment( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN: + cont = hubbub_tokeniser_handle_markup_declaration_open( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT_START: + cont = hubbub_tokeniser_handle_comment_start( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT: + cont = hubbub_tokeniser_handle_comment(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT_DASH: + cont = hubbub_tokeniser_handle_comment_dash( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_COMMENT_END: + cont = hubbub_tokeniser_handle_comment_end( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE: + cont = hubbub_tokeniser_handle_match_doctype( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_DOCTYPE: + cont = hubbub_tokeniser_handle_doctype(tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME: + cont = hubbub_tokeniser_handle_before_doctype_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_DOCTYPE_NAME: + cont = 
hubbub_tokeniser_handle_doctype_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME: + cont = hubbub_tokeniser_handle_after_doctype_name( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE: + cont = hubbub_tokeniser_handle_bogus_doctype( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY: + cont = hubbub_tokeniser_handle_numbered_entity( + tokeniser); + break; + case HUBBUB_TOKENISER_STATE_NAMED_ENTITY: + cont = hubbub_tokeniser_handle_named_entity( + tokeniser); + break; + } + } + + return HUBBUB_OK; +} + +bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser) +{ + hubbub_token token; + uint32_t c; + + /* Clear current characters */ + tokeniser->context.current_chars.data_off = 0; + tokeniser->context.current_chars.len = 0; + + while ((c = hubbub_inputstream_peek(tokeniser->input)) != + HUBBUB_INPUTSTREAM_EOF && + c != HUBBUB_INPUTSTREAM_OOD) { + if (c == '&' && (tokeniser->content_model == + HUBBUB_CONTENT_MODEL_PCDATA || + tokeniser->content_model == + HUBBUB_CONTENT_MODEL_RCDATA)) { + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_DATA; + /* Don't eat the '&'; it'll be handled by + * entity consumption */ + break; + } else if (c == '<' && tokeniser->content_model != + HUBBUB_CONTENT_MODEL_PLAINTEXT) { + if (tokeniser->context.current_chars.len > 0) { + /* Emit any pending characters */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, + &token); + } + + /* Buffer '<' */ + tokeniser->context.current_chars.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &tokeniser->context.current_chars.len); + + tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_OPEN; + hubbub_inputstream_advance(tokeniser->input); + break; + } else { + uint32_t pos; + size_t len; + + /* Accumulate characters into buffer */ + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + if 
(tokeniser->context.current_chars.len == 0) { + tokeniser->context.current_chars.data_off = + pos; + } + tokeniser->context.current_chars.len++; + + hubbub_inputstream_advance(tokeniser->input); + } + } + + if (tokeniser->state != HUBBUB_TOKENISER_STATE_TAG_OPEN && + tokeniser->context.current_chars.len > 0) { + /* Emit any pending characters */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->context.current_chars.data_off = 0; + tokeniser->context.current_chars.len = 0; + } + + if (c == HUBBUB_INPUTSTREAM_EOF) { + token.type = HUBBUB_TOKEN_EOF; + + hubbub_tokeniser_emit_token(tokeniser, &token); + } + + return (c != HUBBUB_INPUTSTREAM_EOF && c != HUBBUB_INPUTSTREAM_OOD); +} + +bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser) +{ + if (tokeniser->context.match_entity.complete == false) { + return hubbub_tokeniser_consume_entity(tokeniser); + } else { + hubbub_token token; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD || + c == HUBBUB_INPUTSTREAM_EOF) { + /* Should never happen */ + abort(); + } + + /* Emit character */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &token.data.character.len); + + hubbub_tokeniser_emit_token(tokeniser, &token); + + /* Reset for next time */ + tokeniser->context.match_entity.complete = false; + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t pos; + size_t len; + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || + tokeniser->content_model == + 
HUBBUB_CONTENT_MODEL_CDATA) { + if (c == '/') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + tokeniser->context.current_chars.len += len; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN; + + hubbub_inputstream_advance(tokeniser->input); + } else { + hubbub_token token; + + /* Emit '<' */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = + HUBBUB_TOKENISER_STATE_DATA; + } + } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) { + if (c == '!') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_chars.len += len; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_chars.len += len; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + hubbub_inputstream_lowercase(tokeniser->input); + + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_START_TAG; + + ctag->name.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &ctag->name.len); + ctag->n_attributes = 0; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_TAG_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if ('a' <= c && c <= 'z') { + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_START_TAG; + + ctag->name.data_off = + hubbub_inputstream_cur_pos(tokeniser->input, + &ctag->name.len); + ctag->n_attributes = 0; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_TAG_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + tokeniser->context.current_chars.len += len; + + /* Emit "<>" */ + 
token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = + HUBBUB_TOKENISER_STATE_DATA; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '?') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + tokeniser->context.current_chars.len += len; + + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len = len; + tokeniser->state = + HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + hubbub_inputstream_advance(tokeniser->input); + } else { + hubbub_token token; + + /* Emit '<' */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = + HUBBUB_TOKENISER_STATE_DATA; + } + } + + return true; +} + +bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser) +{ + if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_RCDATA || + tokeniser->content_model == + HUBBUB_CONTENT_MODEL_CDATA) { + tokeniser->context.close_tag_match.tag.len = 0; + tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH; + } else if (tokeniser->content_model == HUBBUB_CONTENT_MODEL_PCDATA) { + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + uint32_t pos; + size_t len; + + if ('A' <= c && c <= 'Z') { + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_END_TAG; + ctag->name.data_off = pos; + ctag->name.len = len; + ctag->n_attributes = 0; + + tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if ('a' <= c && c <= 'z') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_tag_type = + HUBBUB_TOKEN_END_TAG; + ctag->name.data_off 
= pos; + ctag->name.len = len; + ctag->n_attributes = 0; + + tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit "</" */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else if (c != HUBBUB_INPUTSTREAM_OOD) { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len = len; + + tokeniser->state = + HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + hubbub_inputstream_advance(tokeniser->input); + } else { + /* Out of data */ + return false; + } + } + + return true; +} + +bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser) +{ + hubbub_tokeniser_context *ctx = &tokeniser->context; + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = 0; + + while (ctx->close_tag_match.tag.len < ctag->name.len && + (c = hubbub_inputstream_peek(tokeniser->input)) != + HUBBUB_INPUTSTREAM_EOF && + c != HUBBUB_INPUTSTREAM_OOD) { + /* Match last open tag */ + uint32_t off; + size_t len; + + off = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctx->close_tag_match.tag.len == 0) { + ctx->close_tag_match.tag.data_off = off; + ctx->close_tag_match.tag.len = len; + } else { + ctx->close_tag_match.tag.len += len; + } + + hubbub_inputstream_advance(tokeniser->input); + + if (ctx->close_tag_match.tag.len > ctag->name.len || + (ctx->close_tag_match.tag.len == ctag->name.len && + hubbub_inputstream_compare_range_ci( + tokeniser->input, + ctag->name.data_off, + ctx->close_tag_match.tag.data_off, + ctag->name.len) != 0)) { + hubbub_token token; + + /* Rewind input 
stream to start of tag name */ + if (hubbub_inputstream_rewind(tokeniser->input, + ctx->close_tag_match.tag.len) != + HUBBUB_OK) + abort(); + + /* Emit "</" */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = + tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + + return true; + } else if (ctx->close_tag_match.tag.len == ctag->name.len && + hubbub_inputstream_compare_range_ci( + tokeniser->input, + ctag->name.data_off, + ctx->close_tag_match.tag.data_off, + ctag->name.len) == 0) { + /* Matched => stop searching */ + break; + } + } + + if (c == HUBBUB_INPUTSTREAM_OOD) { + /* Need more data */ + return false; + } + + if (c == HUBBUB_INPUTSTREAM_EOF) { + /* Ran out of data - parse error */ + hubbub_token token; + + /* Rewind input stream to start of tag name */ + if (hubbub_inputstream_rewind(tokeniser->input, + ctx->close_tag_match.tag.len) != HUBBUB_OK) + abort(); + + /* Emit "</" */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + + return true; + } + + /* Match following char */ + c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) { + /* Need more data */ + return false; + } + + /* Rewind input stream to start of tag name */ + if (hubbub_inputstream_rewind(tokeniser->input, + ctx->close_tag_match.tag.len) != HUBBUB_OK) + abort(); + + /* Check that following char was valid */ + if (c != '\t' && c != '\n' && c != '\v' && c != '\f' && + c != ' ' && c != '>' && c != '/' && c != '<' && + c != HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit "</" */ + token.type = HUBBUB_TOKEN_CHARACTER; + token.data.character = tokeniser->context.current_chars; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + + return true; + } + + 
/* Switch the content model back to PCDATA */ + tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA; + + /* Finally, transition back to close tag open state */ + tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN; + + return true; +} + +bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + uint32_t pos; + size_t len; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else if (c == '/') { + /** \todo permitted slash */ + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool 
hubbub_tokeniser_handle_before_attribute_name( + hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + uint32_t pos; + size_t len; + hubbub_attribute *attr; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + attr = tokeniser->alloc(ctag->attributes, + (ctag->n_attributes + 1) * + sizeof(hubbub_attribute), + tokeniser->alloc_pw); + if (attr == NULL) { + /** \todo handle memory exhaustion */ + } + + ctag->attributes = attr; + + attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.len = len; + attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.len = 0; + + ctag->n_attributes++; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + /** \todo permitted slash */ + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + hubbub_attribute *attr; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + attr = 
tokeniser->alloc(ctag->attributes, + (ctag->n_attributes + 1) * + sizeof(hubbub_attribute), + tokeniser->alloc_pw); + if (attr == NULL) { + /** \todo handle memory exhaustion */ + } + + ctag->attributes = attr; + + attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.len = len; + attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.len = 0; + + ctag->n_attributes++; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '=') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + uint32_t pos; + size_t len; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->attributes[ctag->n_attributes - 1].name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + /** \todo permitted slash */ + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token 
token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->attributes[ctag->n_attributes - 1].name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_after_attribute_name( + hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '=') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if ('A' <= c && c <= 'Z') { + uint32_t pos; + size_t len; + hubbub_attribute *attr; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + attr = tokeniser->alloc(ctag->attributes, + (ctag->n_attributes + 1) * + sizeof(hubbub_attribute), + tokeniser->alloc_pw); + if (attr == NULL) { + /** \todo handle memory exhaustion */ + } + + ctag->attributes = attr; + + attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.len = len; + attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.len = 0; + + ctag->n_attributes++; + + tokeniser->state = 
HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '/') { + /** \todo permitted slash */ + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + hubbub_attribute *attr; + + hubbub_inputstream_lowercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + attr = tokeniser->alloc(ctag->attributes, + (ctag->n_attributes + 1) * + sizeof(hubbub_attribute), + tokeniser->alloc_pw); + if (attr == NULL) { + /** \todo handle memory exhaustion */ + } + + ctag->attributes = attr; + + attr[ctag->n_attributes].name.data_off = pos; + attr[ctag->n_attributes].name.len = len; + attr[ctag->n_attributes].value.data_off = 0; + attr[ctag->n_attributes].value.len = 0; + + ctag->n_attributes++; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_before_attribute_value( + hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '"') { + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ; + } else if (c == '\'') { + tokeniser->state = 
HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + ctag->attributes[ctag->n_attributes - 1].value.data_off = pos; + ctag->attributes[ctag->n_attributes - 1].value.len = len; + + tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '"') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->context.prev_state = tokeniser->state; + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE; + /* Don't eat the '&'; entity consumption handles this */ + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } 
else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\'') { + tokeniser->state = + HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->context.prev_state = tokeniser->state; + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE; + /* Don't eat the '&'; entity consumption handles this */ + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + tokeniser->state = + 
HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '&') { + tokeniser->context.prev_state = tokeniser->state; + tokeniser->state = + HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE; + /* Don't eat the '&'; entity consumption handles this */ + } else if (c == '>') { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '<' || c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit current tag */ + token.type = tokeniser->context.current_tag_type; + token.data.tag = tokeniser->context.current_tag; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + + ctag->attributes[ctag->n_attributes - 1].value.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_entity_in_attribute_value( + hubbub_tokeniser *tokeniser) +{ + hubbub_tag *ctag = &tokeniser->context.current_tag; + uint32_t pos; + size_t len; + + if (tokeniser->context.match_entity.complete == false) { + return hubbub_tokeniser_consume_entity(tokeniser); + } else { + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD || + c == HUBBUB_INPUTSTREAM_EOF) { + /* Should never happen */ + abort(); + } + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (ctag->attributes[ctag->n_attributes - 1].value.len == 0) { + ctag->attributes[ctag->n_attributes - 1].value.data_off = + pos; + } + 
+ ctag->attributes[ctag->n_attributes - 1].value.len += len; + + /* Reset for next time */ + tokeniser->context.match_entity.complete = false; + + /* And back to the previous state */ + tokeniser->state = tokeniser->context.prev_state; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser) +{ + hubbub_token token; + uint32_t c; + + while ((c = hubbub_inputstream_peek(tokeniser->input)) != + HUBBUB_INPUTSTREAM_EOF && + c != HUBBUB_INPUTSTREAM_OOD) { + uint32_t pos; + size_t len; + + if (c == '>') { + hubbub_inputstream_advance(tokeniser->input); + break; + } + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + + return true; +} + +bool hubbub_tokeniser_handle_markup_declaration_open( + hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '-') { + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_START; + hubbub_inputstream_advance(tokeniser->input); + } else if ((c & ~0x20) == 'D') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->context.match_doctype.count = 1; + tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE; + hubbub_inputstream_advance(tokeniser->input); + } else { + tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.len = 0; + + tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + } + + return true; +} + +bool 
hubbub_tokeniser_handle_comment_start(hubbub_tokeniser *tokeniser)
{
	/* Comment start state: one '-' has already been consumed after
	 * "<!". A second '-' opens a real comment; anything else is
	 * treated as a bogus comment.
	 * Returns false only when the input stream reports
	 * HUBBUB_INPUTSTREAM_OOD. */
	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);

	if (ch == HUBBUB_INPUTSTREAM_OOD)
		return false;

	/* Either way, the comment starts out empty */
	tokeniser->context.current_comment.data_off = 0;
	tokeniser->context.current_comment.len = 0;

	if (ch != '-') {
		/* Give back the '-' that was consumed before this state */
		hubbub_inputstream_push_back(tokeniser->input, '-');
		tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
		return true;
	}

	tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT;
	hubbub_inputstream_advance(tokeniser->input);

	return true;
}

/**
 * Handle the comment state: accumulate comment data until a '-' or the
 * end of input is seen.
 *
 * \param tokeniser  Tokeniser instance
 * \return false if the input stream reports HUBBUB_INPUTSTREAM_OOD,
 *         true otherwise
 */
bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
{
	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
	uint32_t offset;
	size_t width;

	if (ch == HUBBUB_INPUTSTREAM_OOD)
		return false;

	if (ch == '-') {
		/* Possibly the start of the closing "--" */
		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_DASH;
		hubbub_inputstream_advance(tokeniser->input);
		return true;
	}

	if (ch == HUBBUB_INPUTSTREAM_EOF) {
		hubbub_token comment;

		/* Input ended inside the comment: emit what we have */
		comment.type = HUBBUB_TOKEN_COMMENT;
		comment.data.comment = tokeniser->context.current_comment;

		hubbub_tokeniser_emit_token(tokeniser, &comment);

		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
		return true;
	}

	offset = hubbub_inputstream_cur_pos(tokeniser->input, &width);

	/* First character fixes the start of the comment range */
	if (tokeniser->context.current_comment.len == 0)
		tokeniser->context.current_comment.data_off = offset;
	tokeniser->context.current_comment.len += width;

	hubbub_inputstream_advance(tokeniser->input);

	return true;
}

bool hubbub_tokeniser_handle_comment_dash(hubbub_tokeniser *tokeniser)
{
	uint32_t c = hubbub_inputstream_peek(tokeniser->input);

	if (c == HUBBUB_INPUTSTREAM_OOD)
		return false;

	if (c == '-') {
		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END;
		hubbub_inputstream_advance(tokeniser->input);
	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
		hubbub_token token;

		/* Emit comment */
		token.type = HUBBUB_TOKEN_COMMENT;
		token.data.comment = tokeniser->context.current_comment;

hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) { + tokeniser->context.current_comment.data_off = pos; + } else { + /* Need to do this to get length of '-' */ + len += pos - + tokeniser->context.current_comment.data_off; + } + + tokeniser->context.current_comment.len = len; + + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '>') { + hubbub_token token; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '-') { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) { + tokeniser->context.current_comment.data_off = pos; + tokeniser->context.current_comment.len = len; + } else { + /* Need to do this to get length of '-' */ + len = pos - + tokeniser->context.current_comment.data_off; + } + + tokeniser->context.current_comment.len = len; + + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit comment */ + token.type = HUBBUB_TOKEN_COMMENT; + token.data.comment = tokeniser->context.current_comment; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + 
pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + if (tokeniser->context.current_comment.len == 0) { + tokeniser->context.current_comment.data_off = pos; + } else { + /* Need to do this to get length of '--' */ + len += pos - + tokeniser->context.current_comment.data_off; + } + + tokeniser->context.current_comment.len = len; + + tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (tokeniser->context.match_doctype.count == 1 && + (c & ~0x20) == 'O') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->context.match_doctype.count++; + hubbub_inputstream_advance(tokeniser->input); + } else if (tokeniser->context.match_doctype.count == 2 && + (c & ~0x20) == 'C') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->context.match_doctype.count++; + hubbub_inputstream_advance(tokeniser->input); + } else if (tokeniser->context.match_doctype.count == 3 && + (c & ~0x20) == 'T') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->context.match_doctype.count++; + hubbub_inputstream_advance(tokeniser->input); + } else if (tokeniser->context.match_doctype.count == 4 && + (c & ~0x20) == 'Y') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->context.match_doctype.count++; + hubbub_inputstream_advance(tokeniser->input); + } else if (tokeniser->context.match_doctype.count == 5 && + (c & ~0x20) == 'P') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->context.match_doctype.count++; + hubbub_inputstream_advance(tokeniser->input); + } else if (tokeniser->context.match_doctype.count == 6 && + (c & ~0x20) == 'E') { + hubbub_inputstream_uppercase(tokeniser->input); + tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE; + 
hubbub_inputstream_advance(tokeniser->input); + } else { + switch (tokeniser->context.match_doctype.count) { + case 6: hubbub_inputstream_push_back(tokeniser->input, 'P'); + case 5: hubbub_inputstream_push_back(tokeniser->input, 'Y'); + case 4: hubbub_inputstream_push_back(tokeniser->input, 'T'); + case 3: hubbub_inputstream_push_back(tokeniser->input, 'C'); + case 2: hubbub_inputstream_push_back(tokeniser->input, 'O'); + case 1: hubbub_inputstream_push_back(tokeniser->input, 'D'); + } + + tokeniser->context.current_comment.data_off = 0; + tokeniser->context.current_comment.len = 0; + + tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT; + } + + return true; +} + +bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } + + tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME; + + return true; +} + +bool hubbub_tokeniser_handle_before_doctype_name( + hubbub_tokeniser *tokeniser) +{ + hubbub_doctype *cdoc = &tokeniser->context.current_doctype; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } else if ('a' <= c && c <= 'z') { + uint32_t pos; + size_t len; + + hubbub_inputstream_uppercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + cdoc->name.data_off = pos; + cdoc->name.len = len; + cdoc->correct = false; + + tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + + 
hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + cdoc->name.data_off = pos; + cdoc->name.len = len; + cdoc->correct = false; + + tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_NAME; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser) +{ + hubbub_doctype *cdoc = &tokeniser->context.current_doctype; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + token.data.doctype.correct = + (hubbub_inputstream_compare_range_ascii( + tokeniser->input, + token.data.doctype.name.data_off, + token.data.doctype.name.len, + "HTML", SLEN("HTML")) == 0); + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if ('a' <= c && c <= 'z') { + uint32_t pos; + size_t len; + + hubbub_inputstream_uppercase(tokeniser->input); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + cdoc->name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { 
+ hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + uint32_t pos; + size_t len; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + cdoc->name.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_after_doctype_name(hubbub_tokeniser *tokeniser) +{ + hubbub_doctype *cdoc = &tokeniser->context.current_doctype; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') { + hubbub_inputstream_advance(tokeniser->input); + } else if (c == '>') { + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + token.data.doctype.correct = + (hubbub_inputstream_compare_range_ascii( + tokeniser->input, + token.data.doctype.name.data_off, + token.data.doctype.name.len, + "HTML", SLEN("HTML")) == 0); + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + cdoc->correct = false; + + tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE; + + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser) +{ + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == 
'>') { + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + hubbub_inputstream_advance(tokeniser->input); + } else if (c == HUBBUB_INPUTSTREAM_EOF) { + hubbub_token token; + + /* Emit doctype */ + token.type = HUBBUB_TOKEN_DOCTYPE; + token.data.doctype = tokeniser->context.current_doctype; + + hubbub_tokeniser_emit_token(tokeniser, &token); + + tokeniser->state = HUBBUB_TOKENISER_STATE_DATA; + } else { + hubbub_inputstream_advance(tokeniser->input); + } + + return true; +} + +bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser) +{ + uint32_t c; + uint32_t pos; + size_t len; + + if (tokeniser->context.match_entity.done_setup == false) { + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + tokeniser->context.match_entity.str.data_off = pos; + tokeniser->context.match_entity.str.len = len; + tokeniser->context.match_entity.base = 0; + tokeniser->context.match_entity.codepoint = 0; + tokeniser->context.match_entity.had_data = false; + tokeniser->context.match_entity.return_state = + tokeniser->state; + tokeniser->context.match_entity.complete = false; + tokeniser->context.match_entity.done_setup = true; + tokeniser->context.match_entity.context = NULL; + tokeniser->context.match_entity.prev_len = len; + + hubbub_inputstream_advance(tokeniser->input); + } + + c = hubbub_inputstream_peek(tokeniser->input); + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (c == '#') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + + tokeniser->context.match_entity.str.len += len; + + tokeniser->state = HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY; + hubbub_inputstream_advance(tokeniser->input); + } else { + tokeniser->state = HUBBUB_TOKENISER_STATE_NAMED_ENTITY; + } + + return true; +} + +bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser 
*tokeniser) +{ + hubbub_tokeniser_context *ctx = &tokeniser->context; + uint32_t c = hubbub_inputstream_peek(tokeniser->input); + uint32_t pos; + size_t len; + hubbub_error error; + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + if (ctx->match_entity.base == 0) { + if ((c & ~0x20) == 'X') { + ctx->match_entity.base = 16; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + ctx->match_entity.str.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } else { + ctx->match_entity.base = 10; + } + } + + while ((c = hubbub_inputstream_peek(tokeniser->input)) != + HUBBUB_INPUTSTREAM_EOF && + c != HUBBUB_INPUTSTREAM_OOD) { + if (ctx->match_entity.base == 10 && + ('0' <= c && c <= '9')) { + ctx->match_entity.had_data = true; + + ctx->match_entity.codepoint = + ctx->match_entity.codepoint * 10 + (c - '0'); + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + ctx->match_entity.str.len += len; + } else if (ctx->match_entity.base == 16 && + (('0' <= c && c <= '9') || + ('A' <= (c & ~0x20) && + (c & ~0x20) <= 'F'))) { + ctx->match_entity.had_data = true; + + ctx->match_entity.codepoint *= 16; + + if ('0' <= c && c <= '9') { + ctx->match_entity.codepoint += (c - '0'); + } else { + ctx->match_entity.codepoint += + ((c & ~0x20) - 'A' + 10); + } + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + ctx->match_entity.str.len += len; + } else { + break; + } + + hubbub_inputstream_advance(tokeniser->input); + } + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + /* Eat trailing semicolon, if any */ + if (c == ';') { + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + ctx->match_entity.str.len += len; + + hubbub_inputstream_advance(tokeniser->input); + } + + /* Rewind the inputstream to start of matched sequence */ + hubbub_inputstream_rewind(tokeniser->input, + ctx->match_entity.str.len); + + if (ctx->match_entity.had_data) { + /* Had data, so calculate final codepoint */ + if (0x80 <= ctx->match_entity.codepoint && 
+ ctx->match_entity.codepoint <= 0x9F) { + ctx->match_entity.codepoint = + cp1252Table[ctx->match_entity.codepoint - + 0x80]; + } else if (ctx->match_entity.codepoint == 0 || + ctx->match_entity.codepoint > 0x10FFFF) { + ctx->match_entity.codepoint = 0xFFFD; + } + + /* And replace the matched range with it */ + error = hubbub_inputstream_replace_range(tokeniser->input, + ctx->match_entity.str.data_off, + ctx->match_entity.str.len, + ctx->match_entity.codepoint); + if (error != HUBBUB_OK) { + /** \todo handle memory exhaustion */ + } + } + + /* Reset for next time */ + ctx->match_entity.done_setup = false; + + /* Flag completion */ + ctx->match_entity.complete = true; + + /* And back to the state we were entered in */ + tokeniser->state = ctx->match_entity.return_state; + + return true; +} + +bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser) +{ + hubbub_tokeniser_context *ctx = &tokeniser->context; + uint32_t c; + uint32_t pos; + size_t len; + hubbub_error error; + + while ((c = hubbub_inputstream_peek(tokeniser->input)) != + HUBBUB_INPUTSTREAM_EOF && + c != HUBBUB_INPUTSTREAM_OOD) { + uint32_t cp; + + if (c > 0x7F) { + /* Entity names are ASCII only */ + break; + } + + error = hubbub_entities_search_step((uint8_t) c, + &cp, + &ctx->match_entity.context); + if (error == HUBBUB_OK) { + /* Had a match - store it for later */ + ctx->match_entity.codepoint = cp; + + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + ctx->match_entity.str.len += len; + + /* And cache length, for replacement */ + ctx->match_entity.prev_len = + ctx->match_entity.str.len; + } else if (error == HUBBUB_INVALID) { + /* No further matches - use last found */ + break; + } else { + pos = hubbub_inputstream_cur_pos(tokeniser->input, + &len); + ctx->match_entity.str.len += len; + } + + hubbub_inputstream_advance(tokeniser->input); + } + + if (c == HUBBUB_INPUTSTREAM_OOD) + return false; + + /* Eat trailing semicolon, if any */ + if (ctx->match_entity.codepoint != 0 
&& c == ';' && + ctx->match_entity.prev_len == + ctx->match_entity.str.len) { + pos = hubbub_inputstream_cur_pos(tokeniser->input, &len); + ctx->match_entity.prev_len += len; + } + + /* Rewind the inputstream to start of processed sequence */ + hubbub_inputstream_rewind(tokeniser->input, + ctx->match_entity.str.len); + + /* Now, replace range, if we found a named entity */ + if (ctx->match_entity.codepoint != 0) { + error = hubbub_inputstream_replace_range(tokeniser->input, + ctx->match_entity.str.data_off, + ctx->match_entity.prev_len, + ctx->match_entity.codepoint); + if (error != HUBBUB_OK) { + /** \todo handle memory exhaustion */ + } + } + + /* Reset for next time */ + ctx->match_entity.done_setup = false; + + /* Flag completion */ + ctx->match_entity.complete = true; + + /* And back to the state from whence we came */ + tokeniser->state = ctx->match_entity.return_state; + + return true; +} + +/** + * Handle input stream buffer moving + * + * \param buffer Pointer to buffer + * \param len Length of data in buffer (bytes) + * \param pw Pointer to our context + */ +void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer, + size_t len, void *pw) +{ + hubbub_tokeniser *tok = (hubbub_tokeniser *) pw; + + tok->input_buffer = buffer; + tok->input_buffer_len = len; + + if (tok->buffer_handler != NULL) + tok->buffer_handler(buffer, len, tok->buffer_pw); +} + +/** + * Emit a token, performing sanity checks if necessary + * + * \param tokeniser Tokeniser instance + * \param token Token to emit + */ +void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser, + hubbub_token *token) +{ + if (tokeniser == NULL || token == NULL) + return; + + /* Nothing to do if there's no registered handler */ + if (tokeniser->token_handler == NULL) + return; + + if (token->type == HUBBUB_TOKEN_START_TAG || + token->type == HUBBUB_TOKEN_END_TAG) { + uint32_t i, j; + uint32_t n_attributes = token->data.tag.n_attributes; + hubbub_attribute *attrs = + token->data.tag.attributes; + + 
/* Discard duplicate attributes, keeping the first occurrence */ + for (i = 0; i < n_attributes; i++) { + for (j = i + 1; j < n_attributes; j++) { + uint32_t move; + + if (attrs[i].name.len != + attrs[j].name.len || + hubbub_inputstream_compare_range_cs( + tokeniser->input, + attrs[i].name.data_off, + attrs[j].name.data_off, + attrs[i].name.len) != 0) { + /* Attributes don't match */ + continue; + } + + /* Calculate amount to move */ + move = (n_attributes - 1 - j) * + sizeof(hubbub_attribute); + + if (move > 0) { + memmove(&attrs[j], &attrs[j+1], move); + } + + /* And reduce the number of attributes */ + n_attributes--; + + /* Slot j now holds the next attribute (if any), + * so examine it again on the next iteration */ + j--; + } + } + + token->data.tag.n_attributes = n_attributes; + } + + /* Finally, emit token */ + tokeniser->token_handler(token, tokeniser->token_pw); +} diff --git a/src/tokeniser/tokeniser.h b/src/tokeniser/tokeniser.h new file mode 100644 index 0000000..20bbe20 --- /dev/null +++ b/src/tokeniser/tokeniser.h @@ -0,0 +1,71 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_tokeniser_tokeniser_h_ +#define hubbub_tokeniser_tokeniser_h_ + +#include <stdbool.h> +#include <inttypes.h> + +#include <hubbub/errors.h> +#include <hubbub/functypes.h> +#include <hubbub/types.h> + +#include "input/inputstream.h" + +typedef struct hubbub_tokeniser hubbub_tokeniser; + +/** + * Hubbub tokeniser option types + */ +typedef enum hubbub_tokeniser_opttype { + HUBBUB_TOKENISER_TOKEN_HANDLER, + HUBBUB_TOKENISER_BUFFER_HANDLER, + HUBBUB_TOKENISER_ERROR_HANDLER, + HUBBUB_TOKENISER_CONTENT_MODEL, +} hubbub_tokeniser_opttype; + +/** + * Hubbub tokeniser option parameters + */ +typedef union hubbub_tokeniser_optparams { + struct { + hubbub_token_handler handler; + void *pw; + } token_handler; + + struct { + hubbub_buffer_handler handler; + void *pw; + } buffer_handler; + + struct { + hubbub_error_handler handler; + void *pw; + } error_handler; + + struct { + hubbub_content_model model; + } content_model; +} hubbub_tokeniser_optparams; + +/* Create a hubbub tokeniser */ +hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input, + hubbub_alloc alloc, void *pw); +/* Destroy a hubbub tokeniser */ +void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser); + +/* Configure a hubbub tokeniser */ +hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser, + hubbub_tokeniser_opttype type, + hubbub_tokeniser_optparams *params); + +/* Process remaining data in the input stream */ +hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser); + +#endif + diff --git a/src/utils/Makefile b/src/utils/Makefile new file mode 100644 index 0000000..59b5512 --- /dev/null +++ b/src/utils/Makefile @@ -0,0 +1,53 @@ +# Makefile for libhubbub +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# 
EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test cases + +# Manipulate include paths +CFLAGS += -I$(CURDIR) + +# Objects +OBJS = dict errors utf8 + +.PHONY: clean debug distclean export release setup test + +# Targets +release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS))) + +debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS))) + +clean: + -@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS})) + -@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS})) + +distclean: + +setup: + +export: + +test: + +# Pattern rules +../Release/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $< + +../Debug/%.o: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@ $< diff --git a/src/utils/dict.c b/src/utils/dict.c new file mode 100644 index 0000000..f50ffab --- /dev/null +++ b/src/utils/dict.c @@ -0,0 +1,219 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <stdbool.h> + +#include "utils/dict.h" + +/** Node in a dictionary tree */ +typedef struct hubbub_dict_node { + uint8_t split; /**< Data to split on */ + struct hubbub_dict_node *lt; /**< Subtree for data less than + * split */ + struct hubbub_dict_node *eq; /**< Subtree for data equal to split + * If split == '\0', this stores the + * pointer to the actual data, not a + * subtree */ + struct hubbub_dict_node *gt; /**< Subtree for data greater than + * split */ +} hubbub_dict_node; + +/** Dictionary object */ +struct hubbub_dict { + hubbub_dict_node *dict; /**< Root of tree */ + + hubbub_alloc alloc; /**< Memory (de)allocation function */ + void *pw; /**< Pointer to client data */ +}; + +static void hubbub_dict_destroy_internal(hubbub_dict *dict, + hubbub_dict_node *root); +static hubbub_dict_node *hubbub_dict_insert_internal(hubbub_dict *dict, + hubbub_dict_node *parent, const char *key, + const void *value); + + +/** + * Create a dictionary + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data (may be NULL) + * \return Pointer to dictionary instance, or NULL on error + */ +hubbub_dict *hubbub_dict_create(hubbub_alloc alloc, void *pw) +{ + hubbub_dict *dict; + + if (alloc == NULL) + return NULL; + + dict = alloc(NULL, sizeof(hubbub_dict), pw); + if (dict == NULL) + return NULL; + + dict->dict = NULL; + + dict->alloc = alloc; + dict->pw = pw; + + return dict; +} + +/** + * Destroy a dictionary + * + * \param dict Dictionary to destroy + */ +void hubbub_dict_destroy(hubbub_dict *dict) +{ + if (dict == NULL) + return; + + hubbub_dict_destroy_internal(dict, dict->dict); + + dict->alloc(dict, 0, dict->pw); +} + +/** + * Helper routine for dictionary destruction + * + * \param dict Dictionary being destroyed + * \param root Root node of dictionary 
(sub)tree to destroy + */ +void hubbub_dict_destroy_internal(hubbub_dict *dict, hubbub_dict_node *root) +{ + if (root == NULL) + return; + + hubbub_dict_destroy_internal(dict, root->lt); + if (root->split != '\0') + hubbub_dict_destroy_internal(dict, root->eq); + hubbub_dict_destroy_internal(dict, root->gt); + + dict->alloc(root, 0, dict->pw); +} + +/** + * Insert a key-value pair into a dictionary + * + * \param dict Dictionary to insert into + * \param key Key string + * \param value Value to associate with key (may be NULL) + * \return HUBBUB_OK on success, appropriate error otherwise + */ +hubbub_error hubbub_dict_insert(hubbub_dict *dict, const char *key, + const void *value) +{ + if (dict == NULL || key == NULL) + return HUBBUB_BADPARM; + + dict->dict = hubbub_dict_insert_internal(dict, dict->dict, + key, value); + + return HUBBUB_OK; +} + +/** + * Helper routine for insertion into dictionary + * + * \param dict Dictionary being inserted into + * \param parent Parent node of subtree to insert into + * \param key Key string + * \param value Value to associate with key + * \return Pointer to root of tree created + */ +hubbub_dict_node *hubbub_dict_insert_internal(hubbub_dict *dict, + hubbub_dict_node *parent, const char *key, const void *value) +{ + if (parent == NULL) { + parent = dict->alloc(NULL, + sizeof(hubbub_dict_node), dict->pw); + if (parent == NULL) + return NULL; + parent->split = (uint8_t) key[0]; + parent->lt = parent->eq = parent->gt = NULL; + } + + if ((uint8_t) key[0] < parent->split) { + parent->lt = hubbub_dict_insert_internal(dict, + parent->lt, key, value); + } else if ((uint8_t) key[0] == parent->split) { + if (key[0] == '\0') { + parent->eq = (hubbub_dict_node *) value; + } else { + parent->eq = hubbub_dict_insert_internal(dict, + parent->eq, ++key, value); + } + } else { + parent->gt = hubbub_dict_insert_internal(dict, + parent->gt, key, value); + } + + return parent; +} + +/** + * Step-wise search for a key in a dictionary + * + * \param 
dict Dictionary to search + * \param c Character to look for + * \param result Pointer to location for result + * \param context Pointer to location for search context + * \return HUBBUB_OK if key found, + * HUBBUB_NEEDDATA if more steps are required + * HUBBUB_INVALID if nothing matches + * + * The value pointed to by ::context must be NULL for the first call. + * Thereafter, pass in the same value as returned by the previous call. + * The context is opaque to the caller and should not be inspected. + * + * The location pointed to by ::result will be set to NULL unless a match + * is found. + */ +hubbub_error hubbub_dict_search_step(hubbub_dict *dict, uint8_t c, + const void **result, void **context) +{ + bool match = false; + hubbub_dict_node *p; + + if (dict == NULL || result == NULL || context == NULL) + return HUBBUB_BADPARM; + + *result = NULL; + + if (*context == NULL) { + p = dict->dict; + } else { + p = (hubbub_dict_node *) *context; + } + + while (p != NULL) { + if (c < p->split) { + p = p->lt; + } else if (c == p->split) { + if (p->split == '\0') { + match = true; + p = NULL; + } else if (p->eq != NULL && p->eq->split == '\0') { + match = true; + *result = (const void *) p->eq->eq; + p = p->eq; + } else { + p = p->eq; + } + + break; + } else { + p = p->gt; + } + } + + *context = (void *) p; + + return (match) ? HUBBUB_OK : + (p == NULL) ? HUBBUB_INVALID : HUBBUB_NEEDDATA; +} diff --git a/src/utils/dict.h b/src/utils/dict.h new file mode 100644 index 0000000..2cde01d --- /dev/null +++ b/src/utils/dict.h @@ -0,0 +1,31 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_utils_dict_h_ +#define hubbub_utils_dict_h_ + +#include <inttypes.h> + +#include <hubbub/errors.h> +#include <hubbub/hubbub.h> + +typedef struct hubbub_dict hubbub_dict; + +/* Create a dictionary */ +hubbub_dict *hubbub_dict_create(hubbub_alloc alloc, void *pw); +/* Destroy a dictionary */ +void hubbub_dict_destroy(hubbub_dict *dict); + +/* Insert a key-value pair into a dictionary */ +hubbub_error hubbub_dict_insert(hubbub_dict *dict, const char *key, + const void *value); + +/* Step-wise search for a key in a dictionary */ +hubbub_error hubbub_dict_search_step(hubbub_dict *dict, uint8_t c, + const void **result, void **context); + +#endif diff --git a/src/utils/errors.c b/src/utils/errors.c new file mode 100644 index 0000000..e57ba6a --- /dev/null +++ b/src/utils/errors.c @@ -0,0 +1,70 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#include <string.h> + +#include <hubbub/errors.h> + +/** + * Convert a hubbub error code to a string + * + * \param error The error code to convert + * \return Pointer to string representation of error, or NULL if unknown. 
+ */ +const char *hubbub_error_to_string(hubbub_error error) +{ + const char *result = NULL; + + switch (error) { + case HUBBUB_OK: + result = "No error"; + break; + case HUBBUB_NOMEM: + result = "Insufficient memory"; + break; + case HUBBUB_BADPARM: + result = "Bad parameter"; + break; + case HUBBUB_INVALID: + result = "Invalid input"; + break; + case HUBBUB_FILENOTFOUND: + result = "File not found"; + break; + case HUBBUB_NEEDDATA: + result = "Insufficient data"; + break; + } + + return result; +} + +/** + * Convert a string representation of an error name to a hubbub error code + * + * \param str String containing error name + * \param len Length of string (bytes) + * \return Hubbub error code, or HUBBUB_OK if unknown + */ +hubbub_error hubbub_error_from_string(const char *str, size_t len) +{ + if (strncmp(str, "HUBBUB_OK", len) == 0) { + return HUBBUB_OK; + } else if (strncmp(str, "HUBBUB_NOMEM", len) == 0) { + return HUBBUB_NOMEM; + } else if (strncmp(str, "HUBBUB_BADPARM", len) == 0) { + return HUBBUB_BADPARM; + } else if (strncmp(str, "HUBBUB_INVALID", len) == 0) { + return HUBBUB_INVALID; + } else if (strncmp(str, "HUBBUB_FILENOTFOUND", len) == 0) { + return HUBBUB_FILENOTFOUND; + } else if (strncmp(str, "HUBBUB_NEEDDATA", len) == 0) { + return HUBBUB_NEEDDATA; + } + + return HUBBUB_OK; +} diff --git a/src/utils/utf8.c b/src/utils/utf8.c new file mode 100644 index 0000000..062d629 --- /dev/null +++ b/src/utils/utf8.c @@ -0,0 +1,368 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +/** \file + * UTF-8 manipulation functions (implementation). 
+ */ + +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> + +#include "utils/utf8.h" + +/** Number of continuation bytes for a given start byte */ +static const uint8_t numContinuations[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, +}; + +/** + * Convert a UTF-8 multibyte sequence into a single UCS4 character + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This function conforms to RFC2279, however. 
+ * + * \param s The sequence to process + * \param len Length of sequence + * \param ucs4 Pointer to location to receive UCS4 character (host endian) + * \param clen Pointer to location to receive byte length of UTF-8 sequence + * \return HUBBUB_OK on success, appropriate error otherwise + */ +inline hubbub_error hubbub_utf8_to_ucs4(const uint8_t *s, size_t len, + uint32_t *ucs4, size_t *clen) +{ + if (s == NULL || ucs4 == NULL || clen == NULL) + return HUBBUB_BADPARM; + + if (len == 0) + return HUBBUB_NEEDDATA; + + if (*s < 0x80) { + *ucs4 = *s; + *clen = 1; + } else if ((*s & 0xE0) == 0xC0) { + if (len < 2) + return HUBBUB_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80) + return HUBBUB_INVALID; + else { + *ucs4 = ((*s & 0x1F) << 6) | (*(s+1) & 0x3F); + *clen = 2; + } + } else if ((*s & 0xF0) == 0xE0) { + if (len < 3) + return HUBBUB_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80 || + (*(s+2) & 0xC0) != 0x80) + return HUBBUB_INVALID; + else { + *ucs4 = ((*s & 0x0F) << 12) | + ((*(s+1) & 0x3F) << 6) | + (*(s+2) & 0x3F); + *clen = 3; + } + } else if ((*s & 0xF8) == 0xF0) { + if (len < 4) + return HUBBUB_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80 || + (*(s+2) & 0xC0) != 0x80 || + (*(s+3) & 0xC0) != 0x80) + return HUBBUB_INVALID; + else { + *ucs4 = ((*s & 0x0F) << 18) | + ((*(s+1) & 0x3F) << 12) | + ((*(s+2) & 0x3F) << 6) | + (*(s+3) & 0x3F); + *clen = 4; + } + } else if ((*s & 0xFC) == 0xF8) { + if (len < 5) + return HUBBUB_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80 || + (*(s+2) & 0xC0) != 0x80 || + (*(s+3) & 0xC0) != 0x80 || + (*(s+4) & 0xC0) != 0x80) + return HUBBUB_INVALID; + else { + *ucs4 = ((*s & 0x0F) << 24) | + ((*(s+1) & 0x3F) << 18) | + ((*(s+2) & 0x3F) << 12) | + ((*(s+3) & 0x3F) << 6) | + (*(s+4) & 0x3F); + *clen = 5; + } + } else if ((*s & 0xFE) == 0xFC) { + if (len < 6) + return HUBBUB_NEEDDATA; + else if ((*(s+1) & 0xC0) != 0x80 || + (*(s+2) & 0xC0) != 0x80 || + (*(s+3) & 0xC0) != 0x80 || + (*(s+4) & 0xC0) != 0x80 || + (*(s+5) & 0xC0) != 0x80) + 
return HUBBUB_INVALID; + else { + *ucs4 = ((*s & 0x0F) << 28) | + ((*(s+1) & 0x3F) << 24) | + ((*(s+2) & 0x3F) << 18) | + ((*(s+3) & 0x3F) << 12) | + ((*(s+4) & 0x3F) << 6) | + (*(s+5) & 0x3F); + *clen = 6; + } + } else { + return HUBBUB_INVALID; + } + + return HUBBUB_OK; +} + +/** + * Convert a single UCS4 character into a UTF-8 multibyte sequence + * + * Encoding of UCS values outside the UTF-16 plane has been removed from + * RFC3629. This function conforms to RFC2279, however. + * + * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian) + * \param s Pointer to 6 byte long output buffer + * \param len Pointer to location to receive length of multibyte sequence + * \return HUBBUB_OK on success, appropriate error otherwise + */ +inline hubbub_error hubbub_utf8_from_ucs4(uint32_t ucs4, uint8_t *s, + size_t *len) +{ + uint32_t l = 0; + + if (s == NULL || len == NULL) + return HUBBUB_BADPARM; + else if (ucs4 < 0x80) { + *s = (uint8_t) ucs4; + l = 1; + } else if (ucs4 < 0x800) { + *s = 0xC0 | ((ucs4 >> 6) & 0x1F); + *(s+1) = 0x80 | (ucs4 & 0x3F); + l = 2; + } else if (ucs4 < 0x10000) { + *s = 0xE0 | ((ucs4 >> 12) & 0xF); + *(s+1) = 0x80 | ((ucs4 >> 6) & 0x3F); + *(s+2) = 0x80 | (ucs4 & 0x3F); + l = 3; + } else if (ucs4 < 0x200000) { + *s = 0xF0 | ((ucs4 >> 18) & 0x7); + *(s+1) = 0x80 | ((ucs4 >> 12) & 0x3F); + *(s+2) = 0x80 | ((ucs4 >> 6) & 0x3F); + *(s+3) = 0x80 | (ucs4 & 0x3F); + l = 4; + } else if (ucs4 < 0x4000000) { + *s = 0xF8 | ((ucs4 >> 24) & 0x3); + *(s+1) = 0x80 | ((ucs4 >> 18) & 0x3F); + *(s+2) = 0x80 | ((ucs4 >> 12) & 0x3F); + *(s+3) = 0x80 | ((ucs4 >> 6) & 0x3F); + *(s+4) = 0x80 | (ucs4 & 0x3F); + l = 5; + } else if (ucs4 <= 0x7FFFFFFF) { + *s = 0xFC | ((ucs4 >> 30) & 0x1); + *(s+1) = 0x80 | ((ucs4 >> 24) & 0x3F); + *(s+2) = 0x80 | ((ucs4 >> 18) & 0x3F); + *(s+3) = 0x80 | ((ucs4 >> 12) & 0x3F); + *(s+4) = 0x80 | ((ucs4 >> 6) & 0x3F); + *(s+5) = 0x80 | (ucs4 & 0x3F); + l = 6; + } else { + return HUBBUB_INVALID; + } + + *len = l; + + 
return HUBBUB_OK; +} + +/** + * Calculate the length (in characters) of a bounded UTF-8 string + * + * \param s The string + * \param max Maximum length + * \param len Pointer to location to receive length of string + * \return HUBBUB_OK on success, appropriate error otherwise + */ +inline hubbub_error hubbub_utf8_length(const uint8_t *s, size_t max, + size_t *len) +{ + const uint8_t *end = s + max; + int l = 0; + + if (s == NULL || len == NULL) + return HUBBUB_BADPARM; + + while (s < end) { + if ((*s & 0x80) == 0x00) + s += 1; + else if ((*s & 0xE0) == 0xC0) + s += 2; + else if ((*s & 0xF0) == 0xE0) + s += 3; + else if ((*s & 0xF8) == 0xF0) + s += 4; + else if ((*s & 0xFC) == 0xF8) + s += 5; + else if ((*s & 0xFE) == 0xFC) + s += 6; + else + return HUBBUB_INVALID; + l++; + } + + *len = l; + + return HUBBUB_OK; +} + +/** + * Calculate the length (in bytes) of a UTF-8 character + * + * \param s Pointer to start of character + * \param len Pointer to location to receive length + * \return HUBBUB_OK on success, appropriate error otherwise + */ +inline hubbub_error hubbub_utf8_char_byte_length(const uint8_t *s, + size_t *len) +{ + if (s == NULL || len == NULL) + return HUBBUB_BADPARM; + + *len = numContinuations[s[0]] + 1 /* Start byte */; + + return HUBBUB_OK; +} + +/** + * Find previous legal UTF-8 char in string + * + * \param s The string + * \param off Offset in the string to start at + * \param prevoff Pointer to location to receive offset of first byte of + * previous legal character + * \return HUBBUB_OK on success, appropriate error otherwise + */ +inline hubbub_error hubbub_utf8_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff) +{ + if (s == NULL || prevoff == NULL) + return HUBBUB_BADPARM; + + while (off != 0 && (s[--off] & 0xC0) == 0x80) + /* do nothing */; + + *prevoff = off; + + return HUBBUB_OK; +} + +/** + * Find next legal UTF-8 char in string + * + * \param s The string (assumed valid) + * \param len Maximum offset in string + * \param off 
Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return HUBBUB_OK on success, appropriate error otherwise + */ +inline hubbub_error hubbub_utf8_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff) +{ + if (s == NULL || off >= len || nextoff == NULL) + return HUBBUB_BADPARM; + + /* Skip current start byte (if present - may be mid-sequence) */ + if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) + off++; + + while (off < len && (s[off] & 0xC0) == 0x80) + off++; + + *nextoff = off; + + return HUBBUB_OK; +} + +/** + * Find next legal UTF-8 char in string + * + * \param s The string (assumed to be of dubious validity) + * \param len Maximum offset in string + * \param off Offset in the string to start at + * \param nextoff Pointer to location to receive offset of first byte of + * next legal character + * \return HUBBUB_OK on success, appropriate error otherwise + */ +inline hubbub_error hubbub_utf8_next_paranoid(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff) +{ + bool valid; + + if (s == NULL || off >= len || nextoff == NULL) + return HUBBUB_BADPARM; + + /* Skip current start byte (if present - may be mid-sequence) */ + if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0) + off++; + + while (1) { + /* Find next possible start byte */ + while (off < len && (s[off] & 0xC0) == 0x80) + off++; + + /* Ran off end of data */ + if (off == len || off + numContinuations[s[off]] >= len) + return HUBBUB_NEEDDATA; + + /* Found if start byte is ascii, + * or next n bytes are valid continuations */ + valid = true; + + switch (numContinuations[s[off]]) { + case 5: + valid &= ((s[off + 5] & 0xC0) == 0x80); + case 4: + valid &= ((s[off + 4] & 0xC0) == 0x80); + case 3: + valid &= ((s[off + 3] & 0xC0) == 0x80); + case 2: + valid &= ((s[off + 2] & 0xC0) == 0x80); + case 1: + valid &= ((s[off + 1] & 0xC0) == 0x80); + case 0: + valid &= (s[off + 0] < 0x80); + } + + if (valid) + 
break; + + /* Otherwise, skip this (invalid) start byte and try again */ + off++; + } + + *nextoff = off; + + return HUBBUB_OK; +} + diff --git a/src/utils/utf8.h b/src/utils/utf8.h new file mode 100644 index 0000000..8836338 --- /dev/null +++ b/src/utils/utf8.h @@ -0,0 +1,38 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +/** \file + * UTF-8 manipulation functions (interface). + */ + +#ifndef hubbub_utils_utf8_h_ +#define hubbub_utils_utf8_h_ + +#include <inttypes.h> + +#include <hubbub/errors.h> + +inline hubbub_error hubbub_utf8_to_ucs4(const uint8_t *s, size_t len, + uint32_t *ucs4, size_t *clen); +inline hubbub_error hubbub_utf8_from_ucs4(uint32_t ucs4, uint8_t *s, + size_t *len); + +inline hubbub_error hubbub_utf8_length(const uint8_t *s, size_t max, + size_t *len); +inline hubbub_error hubbub_utf8_char_byte_length(const uint8_t *s, + size_t *len); + +inline hubbub_error hubbub_utf8_prev(const uint8_t *s, uint32_t off, + uint32_t *prevoff); +inline hubbub_error hubbub_utf8_next(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff); + +inline hubbub_error hubbub_utf8_next_paranoid(const uint8_t *s, uint32_t len, + uint32_t off, uint32_t *nextoff); + +#endif + diff --git a/src/utils/utils.h b/src/utils/utils.h new file mode 100644 index 0000000..a1e0230 --- /dev/null +++ b/src/utils/utils.h @@ -0,0 +1,28 @@ +/* + * This file is part of Hubbub. 
+ * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org> + */ + +#ifndef hubbub_utils_h_ +#define hubbub_utils_h_ + +#ifndef max +#define max(a,b) ((a)>(b)?(a):(b)) +#endif + +#ifndef min +#define min(a,b) ((a)<(b)?(a):(b)) +#endif + +#ifndef SLEN +/* Calculate length of a string constant */ +#define SLEN(s) (sizeof((s)) - 1) /* -1 for '\0' */ +#endif + +#ifndef UNUSED +#define UNUSED(x) ((x)=(x)) +#endif + +#endif diff --git a/test/INDEX b/test/INDEX new file mode 100644 index 0000000..100dd21 --- /dev/null +++ b/test/INDEX @@ -0,0 +1,15 @@ +# Index for libhubbub testcases +# +# Test Description DataDir + +aliases Encoding alias handling +cscodec Charset codec implementation cscodec +csdetect Charset detection csdetect +dict Generic string dictionary +entities Named entity dictionary +filter Input stream filtering +hubbub Library initialisation/finalisation +inputstream Buffered input stream html +parser Public parser API html +tokeniser HTML tokeniser html +tokeniser2 HTML tokeniser (again) tokeniser2
\ No newline at end of file diff --git a/test/Makefile b/test/Makefile new file mode 100644 index 0000000..ef50365 --- /dev/null +++ b/test/Makefile @@ -0,0 +1,63 @@ +# Makefile for Hubbub testcases +# +# Toolchain is exported by top-level makefile +# +# Top-level makefile also exports the following variables: +# +# COMPONENT Name of component +# EXPORT Absolute path of export directory +# TOP Absolute path of source tree root +# +# The top-level makefile requires the following targets to exist: +# +# clean Clean source tree +# debug Create a debug binary +# distclean Fully clean source tree, back to pristine condition +# export Export distributable components to ${EXPORT} +# release Create a release binary +# setup Perform any setup required prior to compilation +# test Execute any test cases + +# Extend toolchain settings +# We require the presence of libjson -- http://oss.metaparadigm.com/json-c/ +CFLAGS += -I${TOP}/src/ -I$(CURDIR) \ + `${PKGCONFIG} ${PKGCONFIGFLAGS} --cflags json` +LDFLAGS += `${PKGCONFIG} ${PKGCONFIGFLAGS} --libs json` + +# Release output +RELEASE = + +# Debug output +DEBUG = + +# Objects +OBJS = aliases cscodec csdetect dict entities filter hubbub \ + inputstream parser tokeniser tokeniser2 +OBJS += regression/cscodec-segv regression/filter-segv + +.PHONY: clean debug distclean export release setup test + +# Targets +release: + +debug: + +clean: + -@${RM} ${RMFLAGS} $(addsuffix ${EXEEXT}, $(OBJS)) + +distclean: + -@${RM} ${RMFLAGS} log + +setup: + +export: + +test: $(OBJS) + @${PERL} testrunner.pl ${EXEEXT} + +# Pattern rules +%: %.c + @${ECHO} ${ECHOFLAGS} "==> $<" + @${CC} -c -g ${CFLAGS} -o $@.o $< + @${LD} -g -o $@ $@.o ${LDFLAGS} -lhubbub-debug + @${RM} ${RMFLAGS} $@.o diff --git a/test/README b/test/README new file mode 100644 index 0000000..e4a895b --- /dev/null +++ b/test/README @@ -0,0 +1,84 @@ +Hubbub testcases +================ + +Testcases for hubbub are self-contained binaries which test various parts +of the hubbub library. 
These may make use of external data files to drive +the testing. + +Testcase command lines +---------------------- + +Testcase command lines are in a unified format, thus: + + <aliases_file> [ <data_file> ] + +The aliases file parameter will always be specified (as it is required for +the library to work at all). + +The data file parameter is optional and may be provided on a test-by-test +basis. + +Testcase output +--------------- + +Testcases may output anything at all to stdout. The final line of the +output must begin with either PASS or FAIL (case sensitive), indicating +the success status of the test. + +Test Index +---------- + +The test sources directory contains a file, named INDEX, which provides an +index of all available test binaries. Any new test applications should be +added to this index as they are created. + +The test index file format is as follows: + + file = *line + + line = ( entry / comment / blank ) LF + + entry = testname 1*HTAB description [ 1*HTAB datadir ] + comment = "#" *non-newline + blank = 0<OCTET> + + testname = 1*non-reserved + description = 1*non-reserved + datadir = 1*non-reserved + + non-newline = VCHAR / WSP + non-reserved = VCHAR / SP + +Each entry contains a mandatory binary name and description followed by +an optional data directory specifier. The data directory specifier is +used to state the name of the directory containing data files for the +test name. This directory will be searched for within the "data" +directory in the source tree. + +If a data directory is specified, the test binary will be invoked for +each data file listed within the data directory INDEX, passing the +filename as the second parameter (<data_file>, above). + +Data Index +---------- + +Each test data directory contains a file, named INDEX, which provides an +index of all available test data files. 
+ +The data index file format is as follows: + + file = *line + + line = ( entry / comment / blank ) LF + + entry = dataname 1*HTAB description + comment = "#" *non-newline + blank = 0<OCTET> + + dataname = 1*non-reserved + description = 1*non-reserved + + non-newline = VCHAR / WSP + non-reserved = VCHAR / SP + +Each entry contains a mandatory data file name and description. diff --git a/test/aliases.c b/test/aliases.c new file mode 100644 index 0000000..1cbf2a4 --- /dev/null +++ b/test/aliases.c @@ -0,0 +1,61 @@ +#include <stdio.h> +#include <string.h> + +#include "charset/aliases.h" + +#include "testutils.h" + +extern void hubbub_aliases_dump(void); + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main (int argc, char **argv) +{ + hubbub_aliases_canon *c; + + if (argc != 2) { + printf("Usage: %s <filename>\n", argv[0]); + return 1; + } + + hubbub_aliases_create(argv[1], myrealloc, NULL); + + hubbub_aliases_dump(); + + c = hubbub_alias_canonicalise("moose", 5); + if (c) { + printf("FAIL - found invalid encoding 'moose'\n"); + return 1; + } + + c = hubbub_alias_canonicalise("csinvariant", 11); + if (c) { + printf("%s %d\n", c->name, c->mib_enum); + } else { + printf("FAIL - failed finding encoding 'csinvariant'\n"); + return 1; + } + + c = hubbub_alias_canonicalise("nats-sefi-add", 13); + if (c) { + printf("%s %d\n", c->name, c->mib_enum); + } else { + printf("FAIL - failed finding encoding 'nats-sefi-add'\n"); + return 1; + } + + printf("%d\n", hubbub_mibenum_from_name(c->name, strlen(c->name))); + + printf("%s\n", hubbub_mibenum_to_name(c->mib_enum)); + + hubbub_aliases_destroy(myrealloc, NULL); + + printf("PASS\n"); + + return 0; +} diff --git a/test/cscodec.c b/test/cscodec.c new file mode 100644 index 0000000..525b275 --- /dev/null +++ b/test/cscodec.c @@ -0,0 +1,247 @@ +#include <stdio.h> +#include <string.h> + +#include <hubbub/hubbub.h> + +#include "charset/codec.h" +#include "utils/utils.h" 
+ +#include "testutils.h" + +typedef struct line_ctx { + hubbub_charsetcodec *codec; + + size_t buflen; + size_t bufused; + uint8_t *buf; + size_t explen; + size_t expused; + uint8_t *exp; + + bool indata; + bool inexp; + + hubbub_error exp_ret; + + enum { ENCODE, DECODE } dir; +} line_ctx; + +static bool handle_line(const char *data, size_t datalen, void *pw); +static void run_test(line_ctx *ctx); +static hubbub_error filter(uint32_t c, uint32_t **output, + size_t *outputlen, void *pw); + + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + line_ctx ctx; + + if (argc != 3) { + printf("Usage: %s <aliases_file> <filename>\n", argv[0]); + return 1; + } + + assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK); + + assert(hubbub_charsetcodec_create("NATS-SEFI-ADD", + myrealloc, NULL) == NULL); + + ctx.codec = hubbub_charsetcodec_create("UTF-8", myrealloc, NULL); + assert(ctx.codec != NULL); + + ctx.buflen = parse_filesize(argv[2]); + if (ctx.buflen == 0) + return 1; + + ctx.buf = malloc(2 * ctx.buflen); + if (ctx.buf == NULL) { + printf("Failed allocating %u bytes\n", + (unsigned int) ctx.buflen); + return 1; + } + + ctx.exp = ctx.buf + ctx.buflen; + ctx.explen = ctx.buflen; + + ctx.buf[0] = '\0'; + ctx.exp[0] = '\0'; + ctx.bufused = 0; + ctx.expused = 0; + ctx.indata = false; + ctx.inexp = false; + ctx.exp_ret = HUBBUB_OK; + + assert(parse_testfile(argv[2], handle_line, &ctx) == true); + + /* and run final test */ + if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n') + ctx.bufused -= 1; + + if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n') + ctx.expused -= 1; + + run_test(&ctx); + + free(ctx.buf); + + hubbub_charsetcodec_destroy(ctx.codec); + + assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK); + + printf("PASS\n"); + + return 0; +} + +bool handle_line(const char *data, size_t datalen, void *pw) +{ + line_ctx *ctx = (line_ctx *) pw; + + if 
(data[0] == '#') { + if (ctx->inexp) { + /* This marks end of testcase, so run it */ + + if (ctx->buf[ctx->bufused - 1] == '\n') + ctx->bufused -= 1; + + if (ctx->exp[ctx->expused - 1] == '\n') + ctx->expused -= 1; + + run_test(ctx); + + ctx->buf[0] = '\0'; + ctx->exp[0] = '\0'; + ctx->bufused = 0; + ctx->expused = 0; + ctx->exp_ret = HUBBUB_OK; + } + + if (strncasecmp(data+1, "data", 4) == 0) { + hubbub_charsetcodec_optparams params; + const char *ptr = data + 6; + + ctx->indata = true; + ctx->inexp = false; + + if (strncasecmp(ptr, "decode", 6) == 0) + ctx->dir = DECODE; + else + ctx->dir = ENCODE; + + ptr += 7; + + if (strncasecmp(ptr, "LOOSE", 5) == 0) { + params.error_mode.mode = + HUBBUB_CHARSETCODEC_ERROR_LOOSE; + ptr += 6; + } else if (strncasecmp(ptr, "STRICT", 6) == 0) { + params.error_mode.mode = + HUBBUB_CHARSETCODEC_ERROR_STRICT; + ptr += 7; + } else { + params.error_mode.mode = + HUBBUB_CHARSETCODEC_ERROR_TRANSLIT; + ptr += 9; + } + + assert(hubbub_charsetcodec_setopt(ctx->codec, + HUBBUB_CHARSETCODEC_ERROR_MODE, + (hubbub_charsetcodec_optparams *) ¶ms) + == HUBBUB_OK); + + if (strncasecmp(ptr, "filter", 6) == 0) { + params.filter_func.filter = filter; + params.filter_func.pw = ctx; + + assert(hubbub_charsetcodec_setopt(ctx->codec, + HUBBUB_CHARSETCODEC_FILTER_FUNC, + (hubbub_charsetcodec_optparams *) + ¶ms) == HUBBUB_OK); + } + } else if (strncasecmp(data+1, "expected", 8) == 0) { + ctx->indata = false; + ctx->inexp = true; + + ctx->exp_ret = hubbub_error_from_string(data + 10, + datalen - 10 - 1 /* \n */); + } else if (strncasecmp(data+1, "reset", 5) == 0) { + ctx->indata = false; + ctx->inexp = false; + + hubbub_charsetcodec_reset(ctx->codec); + } + } else { + if (ctx->indata) { + memcpy(ctx->buf + ctx->bufused, data, datalen); + ctx->bufused += datalen; + } + if (ctx->inexp) { + memcpy(ctx->exp + ctx->expused, data, datalen); + ctx->expused += datalen; + } + } + + return true; +} + +void run_test(line_ctx *ctx) +{ + static int testnum; + size_t 
destlen = ctx->bufused * 4; + uint8_t dest[destlen]; + uint8_t *pdest = dest; + const uint8_t *psrc = ctx->buf; + size_t srclen = ctx->bufused; + size_t i; + + if (ctx->dir == DECODE) { + assert(hubbub_charsetcodec_decode(ctx->codec, + &psrc, &srclen, + &pdest, &destlen) == ctx->exp_ret); + } else { + assert(hubbub_charsetcodec_encode(ctx->codec, + &psrc, &srclen, + &pdest, &destlen) == ctx->exp_ret); + } + + printf("%d: Read '", ++testnum); + for (i = 0; i < ctx->expused; i++) { + printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf], + "0123456789abcdef"[dest[i] & 0xf]); + } + printf("' Expected '"); + for (i = 0; i < ctx->expused; i++) { + printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf], + "0123456789abcdef"[ctx->exp[i] & 0xf]); + } + printf("'\n"); + + assert(memcmp(dest, ctx->exp, ctx->expused) == 0); +} + +hubbub_error filter(uint32_t c, uint32_t **output, + size_t *outputlen, void *pw) +{ + static uint32_t outbuf; + + UNUSED(pw); + + if (c == HUBBUB_CHARSETCODEC_NULL) { + outbuf = 0; + return HUBBUB_OK; + } + + outbuf = c; + + *output = &outbuf; + *outputlen = 1; + + return HUBBUB_OK; +} diff --git a/test/csdetect.c b/test/csdetect.c new file mode 100644 index 0000000..3b39972 --- /dev/null +++ b/test/csdetect.c @@ -0,0 +1,132 @@ +#include <inttypes.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <hubbub/hubbub.h> + +#include "charset/aliases.h" +#include "charset/detect.h" +#include "utils/utils.h" + +#include "testutils.h" + +typedef struct line_ctx { + size_t buflen; + size_t bufused; + uint8_t *buf; + char enc[64]; + bool indata; + bool inenc; +} line_ctx; + +static bool handle_line(const char *data, size_t datalen, void *pw); +static void run_test(const uint8_t *data, size_t len, char *expected); + +static void *myrealloc(void *ptr, size_t len, void *pw) +{ + UNUSED(pw); + + return realloc(ptr, len); +} + +int main(int argc, char **argv) +{ + line_ctx ctx; + + if (argc != 3) { + 
printf("Usage: %s <aliases_file> <filename>\n", argv[0]); + return 1; + } + + assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK); + + ctx.buflen = parse_filesize(argv[2]); + if (ctx.buflen == 0) + return 1; + + ctx.buf = malloc(ctx.buflen); + if (ctx.buf == NULL) { + printf("Failed allocating %u bytes\n", + (unsigned int) ctx.buflen); + return 1; + } + + ctx.buf[0] = '\0'; + ctx.enc[0] = '\0'; + ctx.bufused = 0; + ctx.indata = false; + ctx.inenc = false; + + assert(parse_testfile(argv[2], handle_line, &ctx) == true); + + /* and run final test */ + if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n') + ctx.bufused -= 1; + + run_test(ctx.buf, ctx.bufused, ctx.enc); + + free(ctx.buf); + + assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK); + + printf("PASS\n"); + + return 0; +} + +bool handle_line(const char *data, size_t datalen, void *pw) +{ + line_ctx *ctx = (line_ctx *) pw; + + if (data[0] == '#') { + if (ctx->inenc) { + /* This marks end of testcase, so run it */ + + if (ctx->buf[ctx->bufused - 1] == '\n') + ctx->bufused -= 1; + + run_test(ctx->buf, ctx->bufused, ctx->enc); + + ctx->buf[0] = '\0'; + ctx->enc[0] = '\0'; + ctx->bufused = 0; + } + + ctx->indata = (strncasecmp(data+1, "data", 4) == 0); + ctx->inenc = (strncasecmp(data+1, "encoding", 8) == 0); + } else { + if (ctx->indata) { + memcpy(ctx->buf + ctx->bufused, data, datalen); + ctx->bufused += datalen; + } + if (ctx->inenc) { + strcpy(ctx->enc, data); + if (ctx->enc[strlen(ctx->enc) - 1] == '\n') + ctx->enc[strlen(ctx->enc) - 1] = '\0'; + } + } + + return true; +} + +void run_test(const uint8_t *data, size_t len, char *expected) +{ + uint16_t mibenum; + hubbub_charset_source source; + static int testnum; + + assert(hubbub_charset_extract(&data, &len, + &mibenum, &source) == HUBBUB_OK); + + assert(mibenum != 0); + + printf("%d: Detected charset %s (%d) Source %d Expected %s (%d)\n", + ++testnum, hubbub_mibenum_to_name(mibenum), + mibenum, source, expected, + 
hubbub_mibenum_from_name(expected, strlen(expected))); + + assert(mibenum == + hubbub_mibenum_from_name(expected, strlen(expected))); +} diff --git a/test/data/Aliases b/test/data/Aliases new file mode 100644 index 0000000..db61ff1 --- /dev/null +++ b/test/data/Aliases @@ -0,0 +1,302 @@ +# > Unicode:Files.Aliases +# Mapping of character set encoding names to their canonical form +# +# Lines starting with a '#' are comments, blank lines are ignored. +# +# Based on http://www.iana.org/assignments/character-sets and +# http://www.iana.org/assignments/ianacharset-mib +# +# Canonical Form MIBenum Aliases... +# +US-ASCII 3 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ASCII ISO646-US ANSI_X3.4-1968 us IBM367 cp367 csASCII +ISO-10646-UTF-1 27 csISO10646UTF1 +ISO_646.basic:1983 28 ref csISO646basic1983 +INVARIANT 29 csINVARIANT +ISO_646.irv:1983 30 iso-ir-2 irv csISO2IntlRefVersion +BS_4730 20 iso-ir-4 ISO646-GB gb uk csISO4UnitedKingdom +NATS-SEFI 31 iso-ir-8-1 csNATSSEFI +NATS-SEFI-ADD 32 iso-ir-8-2 csNATSSEFIADD +NATS-DANO 33 iso-ir-9-1 csNATSDANO +NATS-DANO-ADD 34 iso-ir-9-2 csNATSDANOADD +SEN_850200_B 35 iso-ir-10 FI ISO646-FI ISO646-SE se csISO10Swedish +SEN_850200_C 21 iso-ir-11 ISO646-SE2 se2 csISO11SwedishForNames +KS_C_5601-1987 36 iso-ir-149 KS_C_5601-1989 KSC_5601 korean csKSC56011987 +ISO-2022-KR 37 csISO2022KR +EUC-KR 38 csEUCKR EUCKR +ISO-2022-JP 39 csISO2022JP +ISO-2022-JP-2 40 csISO2022JP2 +ISO-2022-CN 104 +ISO-2022-CN-EXT 105 +JIS_C6220-1969-jp 41 JIS_C6220-1969 iso-ir-13 katakana x0201-7 csISO13JISC6220jp +JIS_C6220-1969-ro 42 iso-ir-14 jp ISO646-JP csISO14JISC6220ro +IT 22 iso-ir-15 ISO646-IT csISO15Italian +PT 43 iso-ir-16 ISO646-PT csISO16Portuguese +ES 23 iso-ir-17 ISO646-ES csISO17Spanish +greek7-old 44 iso-ir-18 csISO18Greek7Old +latin-greek 45 iso-ir-19 csISO19LatinGreek +DIN_66003 24 iso-ir-21 de ISO646-DE csISO21German +NF_Z_62-010_(1973) 46 iso-ir-25 ISO646-FR1 csISO25French +Latin-greek-1 47 iso-ir-27 csISO27LatinGreek1 +ISO_5427 48 iso-ir-37 
csISO5427Cyrillic +JIS_C6226-1978 49 iso-ir-42 csISO42JISC62261978 +BS_viewdata 50 iso-ir-47 csISO47BSViewdata +INIS 51 iso-ir-49 csISO49INIS +INIS-8 52 iso-ir-50 csISO50INIS8 +INIS-cyrillic 53 iso-ir-51 csISO51INISCyrillic +ISO_5427:1981 54 iso-ir-54 ISO5427Cyrillic1981 +ISO_5428:1980 55 iso-ir-55 csISO5428Greek +GB_1988-80 56 iso-ir-57 cn ISO646-CN csISO57GB1988 +GB_2312-80 57 iso-ir-58 chinese csISO58GB231280 +NS_4551-1 25 iso-ir-60 ISO646-NO no csISO60DanishNorwegian csISO60Norwegian1 +NS_4551-2 58 ISO646-NO2 iso-ir-61 no2 csISO61Norwegian2 +NF_Z_62-010 26 iso-ir-69 ISO646-FR fr csISO69French +videotex-suppl 59 iso-ir-70 csISO70VideotexSupp1 +PT2 60 iso-ir-84 ISO646-PT2 csISO84Portuguese2 +ES2 61 iso-ir-85 ISO646-ES2 csISO85Spanish2 +MSZ_7795.3 62 iso-ir-86 ISO646-HU hu csISO86Hungarian +JIS_C6226-1983 63 iso-ir-87 x0208 JIS_X0208-1983 csISO87JISX0208 +greek7 64 iso-ir-88 csISO88Greek7 +ASMO_449 65 ISO_9036 arabic7 iso-ir-89 csISO89ASMO449 +iso-ir-90 66 csISO90 +JIS_C6229-1984-a 67 iso-ir-91 jp-ocr-a csISO91JISC62291984a +JIS_C6229-1984-b 68 iso-ir-92 ISO646-JP-OCR-B jp-ocr-b csISO92JISC62991984b +JIS_C6229-1984-b-add 69 iso-ir-93 jp-ocr-b-add csISO93JIS62291984badd +JIS_C6229-1984-hand 70 iso-ir-94 jp-ocr-hand csISO94JIS62291984hand +JIS_C6229-1984-hand-add 71 iso-ir-95 jp-ocr-hand-add csISO95JIS62291984handadd +JIS_C6229-1984-kana 72 iso-ir-96 csISO96JISC62291984kana +ISO_2033-1983 73 iso-ir-98 e13b csISO2033 +ANSI_X3.110-1983 74 iso-ir-99 CSA_T500-1983 NAPLPS csISO99NAPLPS +ISO-8859-1 4 iso-ir-100 ISO_8859-1 ISO_8859-1:1987 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 ISO8859-1 +ISO-8859-2 5 iso-ir-101 ISO_8859-2 ISO_8859-2:1987 latin2 l2 csISOLatin2 8859_2 ISO8859-2 +T.61-7bit 75 iso-ir-102 csISO102T617bit +T.61-8bit 76 T.61 iso-ir-103 csISO103T618bit +ISO-8859-3 6 iso-ir-109 ISO_8859-3 ISO_8859-3:1988 latin3 l3 csISOLatin3 8859_3 ISO8859-3 +ISO-8859-4 7 iso-ir-110 ISO_8859-4 ISO_8859-4:1988 latin4 l4 csISOLatin4 8859_4 ISO8859-4 +ECMA-cyrillic 77 iso-ir-111 
KOI8-E csISO111ECMACyrillic +CSA_Z243.4-1985-1 78 iso-ir-121 ISO646-CA csa7-1 ca csISO121Canadian1 +CSA_Z243.4-1985-2 79 iso-ir-122 ISO646-CA2 csa7-2 csISO122Canadian2 +CSA_Z243.4-1985-gr 80 iso-ir-123 csISO123CSAZ24341985gr +ISO-8859-6 9 iso-ir-127 ISO_8859-6 ISO_8859-6:1987 ECMA-114 ASMO-708 arabic csISOLatinArabic +ISO-8859-6-E 81 csISO88596E ISO_8859-6-E +ISO-8859-6-I 82 csISO88596I ISO_8859-6-I +ISO-8859-7 10 iso-ir-126 ISO_8859-7 ISO_8859-7:1987 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 ISO8859-7 +T.101-G2 83 iso-ir-128 csISO128T101G2 +ISO-8859-8 11 iso-ir-138 ISO_8859-8 ISO_8859-8:1988 hebrew csISOLatinHebrew 8859_8 ISO8859-8 +ISO-8859-8-E 84 csISO88598E ISO_8859-8-E +ISO-8859-8-I 85 csISO88598I ISO_8859-8-I +CSN_369103 86 iso-ir-139 csISO139CSN369103 +JUS_I.B1.002 87 iso-ir-141 ISO646-YU js yu csISO141JUSIB1002 +ISO_6937-2-add 14 iso-ir-142 csISOTextComm +IEC_P27-1 88 iso-ir-143 csISO143IECP271 +ISO-8859-5 8 iso-ir-144 ISO_8859-5 ISO_8859-5:1988 cyrillic csISOLatinCyrillic 8859_5 ISO8859-5 +JUS_I.B1.003-serb 89 iso-ir-146 serbian csISO146Serbian +JUS_I.B1.003-mac 90 macedonian iso-ir-147 csISO147Macedonian +ISO-8859-9 12 iso-ir-148 ISO_8859-9 ISO_8859-9:1989 latin5 l5 csISOLatin5 8859_9 ISO8859-9 +greek-ccitt 91 iso-ir-150 csISO150 csISO150GreekCCITT +NC_NC00-10:81 92 cuba iso-ir-151 ISO646-CU csISO151Cuba +ISO_6937-2-25 93 iso-ir-152 csISO6937Add +GOST_19768-74 94 ST_SEV_358-88 iso-ir-153 csISO153GOST1976874 +ISO_8859-supp 95 iso-ir-154 latin1-2-5 csISO8859Supp +ISO_10367-box 96 iso-ir-155 csISO10367Box +ISO-8859-10 13 iso-ir-157 l6 ISO_8859-10:1992 csISOLatin6 latin6 8859_10 ISO8859-10 +latin-lap 97 lap iso-ir-158 csISO158Lap +JIS_X0212-1990 98 x0212 iso-ir-159 csISO159JISX02121990 +DS_2089 99 DS2089 ISO646-DK dk csISO646Danish +us-dk 100 csUSDK +dk-us 101 csDKUS +JIS_X0201 15 X0201 csHalfWidthKatakana +KSC5636 102 ISO646-KR csKSC5636 +ISO-10646-UCS-2 1000 csUnicode UCS-2 UCS2 +ISO-10646-UCS-4 1001 csUCS4 UCS-4 UCS4 +DEC-MCS 2008 dec csDECMCS 
+hp-roman8 2004 roman8 r8 csHPRoman8 +macintosh 2027 mac csMacintosh MACROMAN MAC-ROMAN X-MAC-ROMAN +IBM037 2028 cp037 ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037 +IBM038 2029 EBCDIC-INT cp038 csIBM038 +IBM273 2030 CP273 csIBM273 +IBM274 2031 EBCDIC-BE CP274 csIBM274 +IBM275 2032 EBCDIC-BR cp275 csIBM275 +IBM277 2033 EBCDIC-CP-DK EBCDIC-CP-NO csIBM277 +IBM278 2034 CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278 +IBM280 2035 CP280 ebcdic-cp-it csIBM280 +IBM281 2036 EBCDIC-JP-E cp281 csIBM281 +IBM284 2037 CP284 ebcdic-cp-es csIBM284 +IBM285 2038 CP285 ebcdic-cp-gb csIBM285 +IBM290 2039 cp290 EBCDIC-JP-kana csIBM290 +IBM297 2040 cp297 ebcdic-cp-fr csIBM297 +IBM420 2041 cp420 ebcdic-cp-ar1 csIBM420 +IBM423 2042 cp423 ebcdic-cp-gr csIBM423 +IBM424 2043 cp424 ebcdic-cp-he csIBM424 +IBM437 2011 cp437 437 csPC8CodePage437 +IBM500 2044 CP500 ebcdic-cp-be ebcdic-cp-ch csIBM500 +IBM775 2087 cp775 csPC775Baltic +IBM850 2009 cp850 850 csPC850Multilingual +IBM851 2045 cp851 851 csIBM851 +IBM852 2010 cp852 852 csPCp852 +IBM855 2046 cp855 855 csIBM855 +IBM857 2047 cp857 857 csIBM857 +IBM860 2048 cp860 860 csIBM860 +IBM861 2049 cp861 861 cp-is csIBM861 +IBM862 2013 cp862 862 csPC862LatinHebrew +IBM863 2050 cp863 863 csIBM863 +IBM864 2051 cp864 csIBM864 +IBM865 2052 cp865 865 csIBM865 +IBM866 2086 cp866 866 csIBM866 +IBM868 2053 CP868 cp-ar csIBM868 +IBM869 2054 cp869 869 cp-gr csIBM869 +IBM870 2055 CP870 ebcdic-cp-roece ebcdic-cp-yu csIBM870 +IBM871 2056 CP871 ebcdic-cp-is csIBM871 +IBM880 2057 cp880 EBCDIC-Cyrillic csIBM880 +IBM891 2058 cp891 csIBM891 +IBM903 2059 cp903 csIBM903 +IBM904 2060 cp904 904 csIBBM904 +IBM905 2061 CP905 ebcdic-cp-tr csIBM905 +IBM918 2062 CP918 ebcdic-cp-ar2 csIBM918 +IBM1026 2063 CP1026 csIBM1026 +EBCDIC-AT-DE 2064 csIBMEBCDICATDE +EBCDIC-AT-DE-A 2065 csEBCDICATDEA +EBCDIC-CA-FR 2066 csEBCDICCAFR +EBCDIC-DK-NO 2067 csEBCDICDKNO +EBCDIC-DK-NO-A 2068 csEBCDICDKNOA +EBCDIC-FI-SE 2069 csEBCDICFISE +EBCDIC-FI-SE-A 2070 csEBCDICFISEA +EBCDIC-FR 2071 
csEBCDICFR +EBCDIC-IT 2072 csEBCDICIT +EBCDIC-PT 2073 csEBCDICPT +EBCDIC-ES 2074 csEBCDICES +EBCDIC-ES-A 2075 csEBCDICESA +EBCDIC-ES-S 2076 csEBCDICESS +EBCDIC-UK 2077 csEBCDICUK +EBCDIC-US 2078 csEBCDICUS +UNKNOWN-8BIT 2079 csUnknown8BiT +MNEMONIC 2080 csMnemonic +MNEM 2081 csMnem +VISCII 2082 csVISCII +VIQR 2083 csVIQR +KOI8-R 2084 csKOI8R +KOI8-U 2088 +IBM00858 2089 CCSID00858 CP00858 PC-Multilingual-850+euro +IBM00924 2090 CCSID00924 CP00924 ebcdic-Latin9--euro +IBM01140 2091 CCSID01140 CP01140 ebcdic-us-37+euro +IBM01141 2092 CCSID01141 CP01141 ebcdic-de-273+euro +IBM01142 2093 CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro +IBM01143 2094 CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro +IBM01144 2095 CCSID01144 CP01144 ebcdic-it-280+euro +IBM01145 2096 CCSID01145 CP01145 ebcdic-es-284+euro +IBM01146 2097 CCSID01146 CP01146 ebcdic-gb-285+euro +IBM01147 2098 CCSID01147 CP01147 ebcdic-fr-297+euro +IBM01148 2099 CCSID01148 CP01148 ebcdic-international-500+euro +IBM01149 2100 CCSID01149 CP01149 ebcdic-is-871+euro +Big5-HKSCS 2101 +IBM1047 2102 IBM-1047 +PTCP154 2103 csPTCP154 PT154 CP154 Cyrillic-Asian +Amiga-1251 2104 Ami1251 Amiga1251 Ami-1251 +KOI7-switched 2105 +UNICODE-1-1 1010 csUnicode11 +SCSU 1011 +UTF-7 1012 +UTF-16BE 1013 +UTF-16LE 1014 +UTF-16 1015 +CESU-8 1016 csCESU-8 +UTF-32 1017 +UTF-32BE 1018 +UTF-32LE 1019 +BOCU-1 1020 csBOCU-1 +UNICODE-1-1-UTF-7 103 csUnicode11UTF7 +UTF-8 106 UNICODE-1-1-UTF-8 UNICODE-2-0-UTF-8 utf8 +ISO-8859-13 109 8859_13 ISO8859-13 +ISO-8859-14 110 iso-ir-199 ISO_8859-14:1998 ISO_8859-14 latin8 iso-celtic l8 8859_14 ISO8859-14 +ISO-8859-15 111 ISO_8859-15 Latin-9 8859_15 ISO8859-15 +ISO-8859-16 112 iso-ir-226 ISO_8859-16:2001 ISO_8859-16 latin10 l10 +GBK 113 CP936 MS936 windows-936 +GB18030 114 +OSD_EBCDIC_DF04_15 115 +OSD_EBCDIC_DF03_IRV 116 +OSD_EBCDIC_DF04_1 117 +JIS_Encoding 16 csJISEncoding +Shift_JIS 17 MS_Kanji csShiftJIS X-SJIS Shift-JIS +EUC-JP 18 csEUCPkdFmtJapanese 
Extended_UNIX_Code_Packed_Format_for_Japanese EUCJP +Extended_UNIX_Code_Fixed_Width_for_Japanese 19 csEUCFixWidJapanese +ISO-10646-UCS-Basic 1002 csUnicodeASCII +ISO-10646-Unicode-Latin1 1003 csUnicodeLatin1 ISO-10646 +ISO-Unicode-IBM-1261 1005 csUnicodeIBM1261 +ISO-Unicode-IBM-1268 1006 csUnicodeIBM1268 +ISO-Unicode-IBM-1276 1007 csUnicodeIBM1276 +ISO-Unicode-IBM-1264 1008 csUnicodeIBM1264 +ISO-Unicode-IBM-1265 1009 csUnicodeIBM1265 +ISO-8859-1-Windows-3.0-Latin-1 2000 csWindows30Latin1 +ISO-8859-1-Windows-3.1-Latin-1 2001 csWindows31Latin1 +ISO-8859-2-Windows-Latin-2 2002 csWindows31Latin2 +ISO-8859-9-Windows-Latin-5 2003 csWindows31Latin5 +Adobe-Standard-Encoding 2005 csAdobeStandardEncoding +Ventura-US 2006 csVenturaUS +Ventura-International 2007 csVenturaInternational +PC8-Danish-Norwegian 2012 csPC8DanishNorwegian +PC8-Turkish 2014 csPC8Turkish +IBM-Symbols 2015 csIBMSymbols +IBM-Thai 2016 csIBMThai +HP-Legal 2017 csHPLegal +HP-Pi-font 2018 csHPPiFont +HP-Math8 2019 csHPMath8 +Adobe-Symbol-Encoding 2020 csHPPSMath +HP-DeskTop 2021 csHPDesktop +Ventura-Math 2022 csVenturaMath +Microsoft-Publishing 2023 csMicrosoftPublishing +Windows-31J 2024 csWindows31J +GB2312 2025 csGB2312 EUC-CN EUCCN CN-GB +Big5 2026 csBig5 BIG-FIVE BIG-5 CN-BIG5 BIG_FIVE +windows-1250 2250 CP1250 MS-EE +windows-1251 2251 CP1251 MS-CYRL +windows-1252 2252 CP1252 MS-ANSI +windows-1253 2253 CP1253 MS-GREEK +windows-1254 2254 CP1254 MS-TURK +windows-1255 2255 +windows-1256 2256 CP1256 MS-ARAB +windows-1257 2257 CP1257 WINBALTRIM +windows-1258 2258 +TIS-620 2259 +HZ-GB-2312 2085 + +# Additional encodings not defined by IANA + +# Arbitrary allocations +#CP737 3001 +#CP853 3002 +#CP856 3003 +CP874 3004 WINDOWS-874 +#CP922 3005 +#CP1046 3006 +#CP1124 3007 +#CP1125 3008 WINDOWS-1125 +#CP1129 3009 +#CP1133 3010 IBM-CP1133 +#CP1161 3011 IBM-1161 IBM1161 CSIBM1161 +#CP1162 3012 IBM-1162 IBM1162 CSIBM1162 +#CP1163 3013 IBM-1163 IBM1163 CSIBM1163 +#GEORGIAN-ACADEMY 3014 +#GEORGIAN-PS 3015 +#KOI8-RU 
3016 +#KOI8-T 3017 +#MACARABIC 3018 X-MAC-ARABIC MAC-ARABIC +#MACCROATIAN 3019 X-MAC-CROATIAN MAC-CROATIAN +#MACGREEK 3020 X-MAC-GREEK MAC-GREEK +#MACHEBREW 3021 X-MAC-HEBREW MAC-HEBREW +#MACICELAND 3022 X-MAC-ICELAND MAC-ICELAND +#MACROMANIA 3023 X-MAC-ROMANIA MAC-ROMANIA +#MACTHAI 3024 X-MAC-THAI MAC-THAI +#MACTURKISH 3025 X-MAC-TURKISH MAC-TURKISH +#MULELAO-1 3026 + +# From Unicode Lib +ISO-IR-182 4000 +ISO-IR-197 4002 +ISO-2022-JP-1 4008 +MACCYRILLIC 4009 X-MAC-CYRILLIC MAC-CYRILLIC +MACUKRAINE 4010 X-MAC-UKRAINIAN MAC-UKRAINIAN +MACCENTRALEUROPE 4011 X-MAC-CENTRALEURROMAN MAC-CENTRALEURROMAN +JOHAB 4012 +ISO-8859-11 4014 iso-ir-166 ISO_8859-11 ISO8859-11 8859_11 +X-CURRENT 4999 X-SYSTEM +X-ACORN-LATIN1 5001 +X-ACORN-FUZZY 5002 diff --git a/test/data/cscodec/INDEX b/test/data/cscodec/INDEX new file mode 100644 index 0000000..326cff5 --- /dev/null +++ b/test/data/cscodec/INDEX @@ -0,0 +1,5 @@ +# Index file for charset codec tests +# +# Test Description + +simple.dat Simple tests, designed to validate testdriver
\ No newline at end of file diff --git a/test/data/cscodec/simple.dat b/test/data/cscodec/simple.dat Binary files differnew file mode 100644 index 0000000..6a3cad1 --- /dev/null +++ b/test/data/cscodec/simple.dat diff --git a/test/data/csdetect/INDEX b/test/data/csdetect/INDEX new file mode 100644 index 0000000..e292063 --- /dev/null +++ b/test/data/csdetect/INDEX @@ -0,0 +1,9 @@ +# Index file for charset detection tests +# +# Test Description + +bom.dat UTF Byte Order Mark detection tests +non-ascii-meta.dat Tests for meta charsets claiming to be non-ASCII +test-yahoo-jp.dat Yahoo! Japan, from html5lib testcases +tests1.dat Assorted tests, including edge cases, from html5lib +tests2.dat Further tests from html5lib diff --git a/test/data/csdetect/bom.dat b/test/data/csdetect/bom.dat Binary files differnew file mode 100644 index 0000000..9a2f719 --- /dev/null +++ b/test/data/csdetect/bom.dat diff --git a/test/data/csdetect/non-ascii-meta.dat b/test/data/csdetect/non-ascii-meta.dat new file mode 100644 index 0000000..ea2a707 --- /dev/null +++ b/test/data/csdetect/non-ascii-meta.dat @@ -0,0 +1,129 @@ +#data +<html> +<head> +<meta charset="utf-16"> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset="utf-16le"> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset="utf-16be"> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset='utf-16'> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset='utf-16le'> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset='utf-16be'> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset=utf-16> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset=utf-16le> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset=utf-16be> +#encoding +windows-1252 + + + +#data +<html> +<head> +<meta charset="utf-32"> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset="utf-32le"> +#encoding +windows-1252 + +#data +<html> +<head> +<meta 
charset="utf-32be"> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset='utf-32'> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset='utf-32le'> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset='utf-32be'> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset=utf-32> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset=utf-32le> +#encoding +windows-1252 + +#data +<html> +<head> +<meta charset=utf-32be> +#encoding +windows-1252 + + diff --git a/test/data/csdetect/test-yahoo-jp.dat b/test/data/csdetect/test-yahoo-jp.dat new file mode 100644 index 0000000..daf6125 --- /dev/null +++ b/test/data/csdetect/test-yahoo-jp.dat @@ -0,0 +1,10 @@ +#data +<html> +<head> +<meta http-equiv="Content-Type" content="text/html; charset=euc-jp"> +<!--京--> +<title>Yahoo! JAPAN</title> +<meta name="description" content="日本最大級ã®ãƒãƒ¼ã‚¿ãƒ«ã‚µã‚¤ãƒˆã€‚検索ã€ã‚ªãƒ¼ã‚¯ã‚·ãƒ§ãƒ³ã€ãƒ‹ãƒ¥ãƒ¼ã‚¹ã€ãƒ¡ãƒ¼ãƒ«ã€ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã€ã‚·ãƒ§ãƒƒãƒ”ングã€ãªã©80以上ã®ã‚µãƒ¼ãƒ“スを展開。ã‚ãªãŸã®ç”Ÿæ´»ã‚’より豊ã‹ã«ã™ã‚‹ã€Œãƒ©ã‚¤ãƒ•・エンジンã€ã‚’目指ã—ã¦ã„ãã¾ã™ã€‚"> +<style type="text/css" media="all"> +#encoding +euc-jp
\ No newline at end of file diff --git a/test/data/csdetect/tests1.dat b/test/data/csdetect/tests1.dat new file mode 100644 index 0000000..8a62676 --- /dev/null +++ b/test/data/csdetect/tests1.dat @@ -0,0 +1,392 @@ +#data +<!DOCTYPE HTML> +<!-- (control test - for the other tests to work, this should pass - you may have to set your defaults appropriately) --> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<meta charset="ISO-8859-1"> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<meta charset="ISO-8859-9"> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta charset='ISO-8859-9'> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta charset=ISO-8859-9> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta +charset=ISO-8859-9> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<metacharset=ISO-8859-9> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-9"> +<!-- XXX this is a tough one, not sure how to do this one, unless we explictly do content= processing --> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta content="text/html; charset=ISO-8859-9" http-equiv="Content-Type"> +<!-- XXX this is a tough one, not sure how to do this one, unless we explictly do content= processing --> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta http-equiv="Content-Type" content=text/html; charset=ISO-8859-9> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta http-equiv="Content-Type content="text/html; charset=ISO-8859-9"> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<meta http-equiv="Content-Type " content="text/html; charset=ISO-8859-9"> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta content="text/html; charset=ISO-8859-9" http-equiv="Content-Type "> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta http-equiv="Content-Type>" content="text/html; charset=ISO-8859-9"> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta content="text/html; 
charset=ISO-8859-9" http-equiv="Content-Type>"> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta http-equiv="Content-Style-Type" content="text/html; charset=ISO-8859-9"> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta content="text/html; charset=ISO-8859-9" http-equiv="Content-Style-Type"> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta name="Content-Style-Type" content="text/html; charset=ISO-8859-9"> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta content="text/html; charset=ISO-8859-9" name="Content-Style-Type"> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta content="text/html; charset=ISO-8859-9"> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta content=" text/html; charset = ISO-8859-9 "> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta content=" +text/html; charset=ISO-8859-9 +"> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta charset=" +ISO-8859-9 +"> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta charset= +ISO-8859-9 +> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta charset="ISO-8859-9> +<p>"</p> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<meta charset=ISO-8859-9"> +<p>"</p> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<meta " charset=ISO-8859-9> +<p>"</p> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta test" charset=ISO-8859-9> +<p>"</p> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta test=" charset=ISO-8859-9> +<p>"</p> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<meta test="' charset=ISO-8859-9> +<p>"'</p> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<meta test='" charset=ISO-8859-9> +<p>'"</p> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<meta test="" charset=ISO-8859-9> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta test=x" charset=ISO-8859-9> +<p>"</p> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<head></head><p title="x> +<meta test=x" charset=ISO-8859-9> +<p>"</p> +#encoding +Windows-1252 + +#data 
+<!DOCTYPE HTML> +<head></head><p title="x> +<meta test=x charset=ISO-8859-9> +<p>"</p> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<head></head><p title="x> +<meta charset=ISO-8859-9> +<p>"</p> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<head></head><p title="x>"> +<meta charset=ISO-8859-9> +<p>"</p> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta charset="ISO-8859-1"> +<meta charset="ISO-8859-9"> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<meta charset="ISO-8859-9"> +<meta charset="ISO-8859-1"> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<!--<meta charset="ISO-8859-1">--> +<meta charset="ISO-8859-9"> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<!--<meta charset="ISO-8859-9">--> +<meta charset="ISO-8859-1"> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<!-- Starts with UTF-8 BOM --> +#encoding +UTF-8 + +#data +<!DOCTYPE HTML> +<meta charset="ISO-8859-1"> +<!-- Starts with UTF-8 BOM --> +#encoding +UTF-8 + +#data +<!-- 511 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx--> +<meta charset="ISO-8859-9"> +#encoding +ISO-8859-9 + +#data +<!-- 512 characters 
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx--> +<meta charset="ISO-8859-9"> +#encoding +ISO-8859-9 + +#data +<!-- 1024 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx--> +<meta charset="ISO-8859-9"> +#encoding +Windows-1252 + +#data +<!-- 1025 characters 
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz--> +<meta charset="ISO-8859-9"> +#encoding +Windows-1252 + +#data +<!-- 2048 characters 
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx--> 
+<meta charset="ISO-8859-9"> +#encoding +Windows-1252 + +#data +<!-- 2049 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz--> +<meta charset="ISO-8859-9"> +#encoding +Windows-1252 + +#data <!-- 4096 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx--> +<meta charset="ISO-8859-9"> +#encoding +Windows-1252 + +#data <!-- 4097 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz--> +<meta charset="ISO-8859-9"> +#encoding +Windows-1252 + +#data +<!-- 8192 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx--> +<meta charset="ISO-8859-9"> +#encoding +Windows-1252 + +#data +<!-- 8193 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz--> +<meta charset="ISO-8859-9"> +#encoding +Windows-1252 + +#data +<!-- multi-script test --> +<script>alert('step 1 of 3 ("þ")')</script> +<!-- +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ --> +<script>alert('step 2 of 3 ("þ")')</script> +<meta charset="ISO-8859-9"> +<script>alert('step 3 of 3 ("þ")')</script> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<script>document.write('<meta charset="ISO-8859-' + '9">')</script> +#encoding +Windows-1252 + +#data +<!DOCTYPE HTML> +<script>document.write('<meta charset="ISO-8859-9">')</script> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<script type="text/plain"><meta charset="ISO-8859-9"></script> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<style type="text/plain"><meta charset="ISO-8859-9"></style> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<p><meta charset="ISO-8859-9"></p> +#encoding +ISO-8859-9 + +#data +<!DOCTYPE HTML> +<meta charset="bogus"> +<meta charset="ISO-8859-9"> +#encoding +ISO-8859-9 diff --git a/test/data/csdetect/tests2.dat 
b/test/data/csdetect/tests2.dat new file mode 100644 index 0000000..dd43f85 --- /dev/null +++ b/test/data/csdetect/tests2.dat @@ -0,0 +1,82 @@ +#data +<meta +#encoding +windows-1252 + +#data +< +#encoding +windows-1252 + +#data +<! +#encoding +windows-1252 + +#data +<meta charset = " +#encoding +windows-1252 + +#data +<meta charset=EUC-jp +#encoding +windows-1252 + +#data +<meta <meta charset='EUC-jp'> +#encoding +EUC-jp + +#data +<meta charset = 'EUC-jp'> +#encoding +EUC-jp + + +#data +<!-- --> +<meta http-equiv="Content-Type" content="text/html; charset=utf-8"> +#encoding +utf-8 + +#data +<!-- --> +<meta http-equiv="Content-Type" content="text/html; charset=utf +#encoding +windows-1252 + +#data +<meta http-equiv="Content-Type<meta charset="utf-8"> +#encoding +windows-1252 + +#data +<meta http-equiv="Content-Type" content="text/html; charset='utf-8'"> +#encoding +utf-8 + +#data +<meta http-equiv="Content-Type" content="text/html; charset='utf-8"> +#encoding +windows-1252 + +#data +<meta +#encoding +windows-1252 + +#data +<meta charset = +#encoding +windows-1252 + +#data +<meta charset= utf-8 +#encoding +windows-1252 + +#data +<meta content = "text/html; +#encoding +windows-1252 diff --git a/test/data/html/INDEX b/test/data/html/INDEX new file mode 100644 index 0000000..03d6e04 --- /dev/null +++ b/test/data/html/INDEX @@ -0,0 +1,6 @@ +# Index file for generic HTML content +# +# Test Description + +section-tree-construction.html HTML5 tree construction algorithm +web-apps.html HTML5 specification diff --git a/test/data/html/section-tree-construction.html b/test/data/html/section-tree-construction.html new file mode 100644 index 0000000..45ce9ab --- /dev/null +++ b/test/data/html/section-tree-construction.html @@ -0,0 +1,2783 @@ +<!DOCTYPE HTML> + + +<html lang="en-GB-hixie"> + <head> + <title>HTML 5</title> + <link href="/style/specification" type="text/css" rel="stylesheet"> + <link href="/images/icon" rel="icon"> + + <style type="text/css"> + h4 + .element { 
margin-top: -2.5em; padding-top: 2em; } + h4 + p + .element { margin-top: -5em; padding-top: 4em; } + .element { background: #EEFFEE; color: black; margin: 0 0 1em -1em; padding: 0 1em 0.25em 0.75em; border-left: solid #99FF99 0.25em; -padding: 0; /* that last decl is for IE6. Try removing it, it's hilarious! */ } + .proposal { border: blue solid; padding: 1em; } + table.matrix, table.matrix td { border: none; text-align: right; } + table.matrix { margin-left: 2em; } + </style> + + <link href="section-tokenisation.html#nav-bar" rel="prev" title="8.2.3. Tokenisation"><link href="index.html#contents" rel="index" title="Table of contents"><link href="section-namespaces.html#nav-bar" rel="next" title="8.3. Namespaces"></head><body class="draft"><div class="head"> + <p><a href="http://www.whatwg.org/" class="logo" rel="home"><img src="/images/logo" alt="WHATWG"></a></p> + + <h1 id="html-5">HTML 5</h1> + + <h2 id="working" class="no-num no-toc">Working Draft — 12 June 2007</h2></div><nav id="nav-bar"><a href="section-tokenisation.html#nav-bar">< 8.2.3. Tokenisation</a> – <a href="index.html#contents">Table of contents</a> – <a href="section-namespaces.html#nav-bar">8.3. Namespaces ></a></nav><h4 id="tree-construction"><span class="secno">8.2.4. </span><dfn id="tree-construction0">Tree construction</dfn></h4> + + <p>The input to the tree construction stage is a sequence of tokens from + the <a href="section-tokenisation.html#tokenisation0">tokenisation</a> stage. The tree construction + stage is associated with a DOM <code>Document</code> object when a parser + is created. The "output" of this stage consists of dynamically modifying + or extending that document's DOM tree. + + </p><p>Tree construction passes through several phases. Initially, UAs must act + according to the steps described as being those of <a href="#the-initial0">the initial phase</a>. 
+ + </p><p>This specification does not define when an interactive user agent has to + render the <code>Document</code> available to the user, or when it has to + begin accepting user input. + + </p><p>When the steps below require the UA to <dfn id="append">append a + character</dfn> to a node, the UA must collect it and all subsequent + consecutive characters that would be appended to that node, and insert one + <code>Text</code> node whose data is the concatenation of all those + characters. + + </p><p id="mutation-during-parsing">DOM mutation events must not fire for changes + caused by the UA parsing the document. (Conceptually, the parser is not + mutating the DOM, it is constructing it.) This includes the parsing of any + content inserted using <code title="dom-document-write-HTML"><a href="section-dynamic.html#document.write0">document.write()</a></code> and <code title="dom-document-writeln"><a href="section-dynamic.html#document.writeln">document.writeln()</a></code> calls.<!-- + XXX xref --> + <a href="#refsDOM3EVENTS">[DOM3EVENTS]</a></p> + <!-- XXX + what about innerHTML? --> + + <p class="note">Not all of the tag names mentioned below are conformant tag + names in this specification; many are included to handle legacy content. + They still form part of the algorithm that implementations are required to + implement to claim conformance. + + </p><p class="note">The algorithm described below places no limit on the depth of + the DOM tree generated, or on the length of tag names, attribute names, + attribute values, text nodes, etc. While implementors are encouraged to + avoid arbitrary limits, it is recognised that <a href="section-conformance.html#hardwareLimitations">practical concerns</a> will likely force user + agents to impose nesting depth limits. + + </p><h5 id="the-initial"><span class="secno">8.2.4.1. 
</span><dfn id="the-initial0">The initial phase</dfn></h5> + + <p>Initially, the tree construction stage must handle each token emitted + from the <a href="section-tokenisation.html#tokenisation0">tokenisation</a> stage as follows: + + </p><dl class="switch"> + <dt>A DOCTYPE token that is marked as being in error + + </dt><dt>A comment token + + </dt><dt>A start tag token + + </dt><dt>An end tag token + + </dt><dt>A character token that is not one of U+0009 CHARACTER + TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM + FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dt>An end-of-file token + + </dt><dd> + <p>This specification does not define how to handle this case. In + particular, user agents may ignore the entirety of this specification + altogether for such documents, and instead invoke special parse modes + with a greater emphasis on backwards compatibility.</p> + + <div class="note"> + <p>Browsers in particular have generally used DOCTYPE-based sniffing to + invoke an "alternative conformance mode" known as <em>quirks mode</em> + on certain documents. In this mode, emphasis is put on legacy + compatibility rather than on standards compliance. This specification + takes no position on this behaviour; documents without DOCTYPEs or with + DOCTYPEs that do not conform to the syntax allowed by this + specification are considered to be out of scope of this specification.</p> + </div> + + <div class="big-issue"> + <p>As far as parsing goes, the quirks I know of are:</p> + + <ul> + <li>Comment parsing is different. + + </li><li>The following is considered one script block (!): + <pre><script><!-- document.write('</script>'); --></script></pre> + + </li><li><code title=""></br></code> and <code title=""></p></code> do + magical things. 
+ + </li><li><code><a href="section-prose.html#p">p</a></code> can contain <code><a href="section-tabular.html#table">table</a></code> + + </li><li>Safari and IE have special parsing rules for <% ... %> (even + in standards mode, though clearly this should be quirks-only). + </li></ul> + + <p>Maybe we should just adopt all those and be done with it. One parsing + mode to rule them all. Or legitimise/codify the quirks mode parsing in + some way.</p> + + <p>Would be interesting to do a search to see how many pages hit each of + the above.</p> + <!-- biased by page rank? --></div> + + </dd><dt>A DOCTYPE token marked as being correct + + </dt><dd> + <p>Append a <code>DocumentType</code> node to the <code>Document</code> + node, with the <code title="">name</code> attribute set to the name + given in the DOCTYPE token (which will be "HTML"), and the other + attributes specific to <code>DocumentType</code> objects set to null, + empty lists, or the empty string as appropriate.</p> + + <p>Then, switch to <a href="#the-root1">the root element phase</a> of the + tree construction stage.</p> + <!-- XXX should set doctype on the Document object, too, unless + spec is defined to already point to it if you append --> + + + </dd><dt>A character token that <em>is</em> one of U+0009 CHARACTER + TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM + FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dd> + <p><a href="#append" title="append a character">Append that character</a> + to the <code>Document</code> node.</p> + </dd></dl> + + <h5 id="the-root0"><span class="secno">8.2.4.2. </span><dfn id="the-root1">The + root element phase</dfn></h5> + + <p>After <a href="#the-initial0">the initial phase</a>, as each token is + emitted from the <a href="section-tokenisation.html#tokenisation0">tokenisation</a> stage, it must + be processed as described in this section. 
+ + </p><dl class="switch"> + <dt>A DOCTYPE token + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p> + + </dd><dt>A comment token + + </dt><dd> + <p>Append a <code>Comment</code> node to the <code>Document</code> object + with the <code title="">data</code> attribute set to the data given in + the comment token.</p> + + </dd><dt>A character token that is one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dd> + <p><a href="#append" title="append a character">Append that character</a> + to the <code>Document</code> node.</p> + + </dd><dt>A character token that is <em>not</em> one of U+0009 CHARACTER + TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM + FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dt>A start tag token + + </dt><dt>An end tag token + + </dt><dt>An end-of-file token + + </dt><dd> + <p>Create an <code><a href="section-elements.html#htmlelement">HTMLElement</a></code> node + with the tag name <code><a href="section-the-root.html#html">html</a></code>, in the <a href="section-namespaces.html#html-namespace0">HTML namespace</a>. Append it to the + <code>Document</code> object. Switch to <a href="#the-main0">the main + phase</a> and reprocess the current token.</p> + + <p class="big-issue">Should probably make end tags be ignored, so that + "</head><!-- --><html>" puts the comment before the root node + (or should we?)</p> + </dd></dl> + + <p>The root element can end up being removed from the <code>Document</code> + object, e.g. by scripts; nothing in particular happens in such cases, + content continues being appended to the nodes as described in the next + section. + + </p><h5 id="the-main"><span class="secno">8.2.4.3. 
</span><dfn id="the-main0">The + main phase</dfn></h5> + + <p>After <a href="#the-root1">the root element phase</a>, each token + emitted from the <a href="section-tokenisation.html#tokenisation0">tokenisation</a> stage must be + processed as described in <em>this</em> section. This is by far the most + involved part of parsing an HTML document. + + </p><p>The tree construction stage in this phase has several pieces of state: a + <a href="#stack">stack of open elements</a>, a <a href="#list-of4">list of + active formatting elements</a>, a <a href="#head-element"><code title="">head</code> element pointer</a>, a <a href="#form-element"><code title="">form</code> element pointer</a>, and an <a href="#insertion0">insertion mode</a>. + + </p><p class="big-issue">We could just fold insertion modes and phases into one + concept (and duplicate the two rules common to all insertion modes into + all of them). + + </p><h6 id="the-stack"><span class="secno">8.2.4.3.1. </span>The stack of open + elements</h6> + + <p>Initially the <dfn id="stack">stack of open elements</dfn> contains just + the <code><a href="section-the-root.html#html">html</a></code> root element node created in the + <a href="#the-root1" title="the root element phase">last phase</a> before + switching to <em>this</em> phase (or, in the <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>, the <code><a href="section-the-root.html#html">html</a></code> element created to represent the element + whose <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code> attribute is being set). That's + the topmost node of the stack. It never gets popped off the stack. (This + stack grows downwards.) + + </p><p>The <dfn id="current4">current node</dfn> is the bottommost node in this + stack. 
+ + </p><p>Elements in the stack fall into the following categories: + + </p><dl> + <dt><dfn id="special">Special</dfn> + + </dt><dd> + <p>The following HTML elements have varying levels of special parsing + rules: <code><a href="section-sections.html#address">address</a></code>, <code><a href="section-the-canvas.html#area">area</a></code>, <code><a href="section-document.html#base">base</a></code>, + <code>basefont</code>, <code>bgsound</code>, <code><a href="section-sections.html#blockquote">blockquote</a></code>, <code><a href="section-sections.html#body0">body</a></code>, <code><a href="section-prose.html#br">br</a></code>, + <code>center</code>, <code><a href="section-tabular.html#col">col</a></code>, <code><a href="section-tabular.html#colgroup">colgroup</a></code>, <code><a href="section-lists0.html#dd">dd</a></code>, + <code>dir</code>, <code><a href="section-miscellaneous.html#div">div</a></code>, <code><a href="section-lists0.html#dl">dl</a></code>, <code><a href="section-lists0.html#dt">dt</a></code>, <code><a href="section-embedded.html#embed">embed</a></code>, <code>fieldset</code>, + <code>form</code>, <code>frame</code>, <code>frameset</code>, <code><a href="section-sections.html#h1">h1</a></code>, <code><a href="section-sections.html#h2">h2</a></code>, <code><a href="section-sections.html#h3">h3</a></code>, <code><a href="section-sections.html#h4">h4</a></code>, <code><a href="section-sections.html#h5">h5</a></code>, <code><a href="section-sections.html#h6">h6</a></code>, <code><a href="section-document.html#head">head</a></code>, <code><a href="section-prose.html#hr">hr</a></code>, + <code><a href="section-embedded.html#iframe">iframe</a></code>, + <code>image</code><!-- XXX ? 
this isn't an element that can end up + on the stack-->, + <code><a href="section-embedded.html#img">img</a></code>, <code>input</code>, + <code>isindex</code>, <code><a href="section-lists0.html#li">li</a></code>, <code><a href="section-document.html#link">link</a></code>, <code>listing</code>, <code><a href="section-the-command.html#menu">menu</a></code>, <code><a href="section-document.html#meta0">meta</a></code>, + <code>noembed</code>, <code>noframes</code>, <code><a href="section-scripting0.html#noscript">noscript</a></code>, <code><a href="section-lists0.html#ol">ol</a></code>, + <code>optgroup</code>, <code>option</code>, <code><a href="section-prose.html#p">p</a></code>, <code><a href="section-embedded.html#param">param</a></code>, + <code>plaintext</code>, <code><a href="section-preformatted.html#pre">pre</a></code>, <code><a href="section-scripting0.html#script0">script</a></code>, <code>select</code>, + <code>spacer</code>, <code><a href="section-document.html#style">style</a></code>, <code><a href="section-tabular.html#tbody">tbody</a></code>, <code>textarea</code>, <code><a href="section-tabular.html#tfoot0">tfoot</a></code>, <code><a href="section-tabular.html#thead0">thead</a></code>, <code><a href="section-document.html#title1">title</a></code>, <code><a href="section-tabular.html#tr">tr</a></code>, + <code><a href="section-lists0.html#ul">ul</a></code>, and <code>wbr</code>. 
+ + </p></dd><dt><dfn id="scoping">Scoping</dfn> + + </dt><dd> + <p>The following HTML elements introduce new <a href="#have-an" title="has an element in scope">scopes</a> for various parts of the + parsing: <code>button</code>, <code><a href="section-tabular.html#caption0">caption</a></code>, <code><a href="section-the-root.html#html">html</a></code>, <code>marquee</code>, <code><a href="section-embedded.html#object">object</a></code>, <code><a href="section-tabular.html#table">table</a></code>, <code><a href="section-tabular.html#td">td</a></code> and + <code><a href="section-tabular.html#th">th</a></code>. + + </p></dd><dt><dfn id="formatting">Formatting</dfn> + + </dt><dd> + <p>The following HTML elements are those that end up in the <a href="#list-of4">list of active formatting elements</a>: <code><a href="section-phrase.html#a">a</a></code>, <code><a href="section-phrase.html#b">b</a></code>, + <code>big</code>, <code><a href="section-phrase.html#em">em</a></code>, <code><a href="section-presentational.html#font">font</a></code>, <code><a href="section-phrase.html#i">i</a></code>, + <code>nobr</code>, <code>s</code>, <code><a href="section-phrase.html#small">small</a></code>, <code>strike</code>, <code><a href="section-phrase.html#strong">strong</a></code>, <code>tt</code>, and <code>u</code>. + + </p></dd><dt><dfn id="phrasing">Phrasing</dfn> + + </dt><dd> + <p>All other elements found while parsing an HTML document. 
+ </p></dd></dl> + + <p class="big-issue">Still need to add these new elements to the lists: + <code><a href="section-scripting0.html#event-source">event-source</a></code>, <code><a href="section-sections.html#section">section</a></code>, <code><a href="section-sections.html#nav">nav</a></code>, + <code><a href="section-sections.html#article">article</a></code>, <code><a href="section-sections.html#aside">aside</a></code>, <code><a href="section-sections.html#header">header</a></code>, + <code><a href="section-sections.html#footer">footer</a></code>, <code><a href="section-interactive.html#datagrid0">datagrid</a></code>, <code><a href="section-the-command.html#command0">command</a></code> + + </p><p>The <a href="#stack">stack of open elements</a> is said to <dfn id="have-an" title="has an element in scope">have an element in scope</dfn> + or <dfn id="have-an0" title="has an element in table scope">have an element + in <em>table scope</em></dfn> when the following algorithm terminates in a + match state: + + </p><ol> + <li> + <p>Initialise <var title="">node</var> to be the <a href="#current4">current node</a> (the bottommost node of the stack). + + </p></li><li> + <p>If <var title="">node</var> is the target node, terminate in a match + state. + + </p></li><li> + <p>Otherwise, if <var title="">node</var> is a <code><a href="section-tabular.html#table">table</a></code> element, terminate in a failure state. 
+ + </p></li><li> + <p>Otherwise, if the algorithm is the "has an element in scope" variant + (rather than the "has an element in table scope" variant), and <var title="">node</var> is one of the following, terminate in a failure + state:</p> + + <ul class="brief"> + <li><code><a href="section-tabular.html#caption0">caption</a></code> + + </li><li><code><a href="section-tabular.html#td">td</a></code> + + </li><li><code><a href="section-tabular.html#th">th</a></code> + + </li><li><code>button</code> + + </li><li><code>marquee</code> + + </li><li><code><a href="section-embedded.html#object">object</a></code> + </li></ul> + + </li><li> + <p>Otherwise, if <var title="">node</var> is an <code><a href="section-the-root.html#html">html</a></code> element, terminate in a failure state. + (This can only happen if the <var title="">node</var> is the topmost + node of the <a href="#stack">stack of open elements</a>, and prevents + the next step from being invoked if there are no more elements in the + stack.) + + </p></li><li> + <p>Otherwise, set <var title="">node</var> to the previous entry in the + <a href="#stack">stack of open elements</a> and return to step 2. (This + will never fail, since the loop will always terminate in the previous + step if the top of the stack is reached.) + </p></li></ol> + + <p>Nothing happens if at any time any of the elements in the <a href="#stack">stack of open elements</a> are moved to a new location in, + or removed from, the <code>Document</code> tree. In particular, the stack + is not changed in this situation. This can cause, amongst other strange + effects, content to be appended to nodes that are no longer in the DOM. + + </p><p class="note">In some cases (namely, when <a href="#adoptionAgency">closing + misnested formatting elements</a>), the stack is manipulated in a + random-access fashion. + + </p><h6 id="the-list"><span class="secno">8.2.4.3.2. 
</span>The list of active + formatting elements</h6> + + <p>Initially the <dfn id="list-of4">list of active formatting elements</dfn> + is empty. It is used to handle mis-nested <a href="#formatting" title="formatting">formatting element tags</a>. + + </p><p>The list contains elements in the <a href="#formatting">formatting</a> + category, and scope markers. The scope markers are inserted when entering + buttons, <code><a href="section-embedded.html#object">object</a></code> elements, marquees, + table cells, and table captions, and are used to prevent formatting from + "leaking" into tables, buttons, <code><a href="section-embedded.html#object">object</a></code> + elements, and marquees. + + </p><p>When the steps below require the UA to <dfn id="reconstruct">reconstruct + the active formatting elements</dfn>, the UA must perform the following + steps: + + </p><ol> + <li>If there are no entries in the <a href="#list-of4">list of active + formatting elements</a>, then there is nothing to reconstruct; stop this + algorithm. + + </li><li>If the last (most recently added) entry in the <a href="#list-of4">list of active formatting elements</a> is a marker, or + if it is an element that is in the <a href="#stack">stack of open + elements</a>, then there is nothing to reconstruct; stop this algorithm. + + </li><li>Let <var title="">entry</var> be the last (most recently added) + element in the <a href="#list-of4">list of active formatting + elements</a>. + + </li><li>If there are no entries before <var title="">entry</var> in the <a href="#list-of4">list of active formatting elements</a>, then jump to + step 8. + + </li><li>Let <var title="">entry</var> be the entry one earlier than <var title="">entry</var> in the <a href="#list-of4">list of active formatting + elements</a>. + + </li><li>If <var title="">entry</var> is neither a marker nor an element that + is also in the <a href="#stack">stack of open elements</a>, go to step 4. 
+ + </li><li>Let <var title="">entry</var> be the element one later than <var title="">entry</var> in the <a href="#list-of4">list of active formatting + elements</a>. + + </li><li>Perform a shallow clone of the element <var title="">entry</var> to + obtain <var title="">clone</var>. <a href="#refsDOM3CORE">[DOM3CORE]</a> + + </li><li>Append <var title="">clone</var> to the <a href="#current4">current + node</a> and push it onto the <a href="#stack">stack of open elements</a> + so that it is the new <a href="#current4">current node</a>. + + </li><li>Replace the entry for <var title="">entry</var> in the list with an + entry for <var title="">clone</var>. + + </li><li>If the entry for <var title="">clone</var> in the <a href="#list-of4">list of active formatting elements</a> is not the last + entry in the list, return to step 7. + </li></ol> + + <p>This has the effect of reopening all the formatting elements that were + opened in the current body, cell, or caption (whichever is youngest) that + haven't been explicitly closed. + + </p><p class="note">The way this specification is written, the <a href="#list-of4">list of active formatting elements</a> always consists of + elements in chronological order with the least recently added element + first and the most recently added element last (except for while steps 8 + to 11 of the above algorithm are being executed, of course). + + </p><p>When the steps below require the UA to <dfn id="clear0">clear the list of + active formatting elements up to the last marker</dfn>, the UA must + perform the following steps: + + </p><ol> + <li>Let <var title="">entry</var> be the last (most recently added) entry + in the <a href="#list-of4">list of active formatting elements</a>. + + </li><li>Remove <var title="">entry</var> from the <a href="#list-of4">list of + active formatting elements</a>. + + </li><li>If <var title="">entry</var> was a marker, then stop the algorithm at + this point. 
The list has been cleared up to the last marker. + + </li><li>Go to step 1. + </li></ol> + + <h6 id="creating"><span class="secno">8.2.4.3.3. </span>Creating and inserting + HTML elements</h6> + + <p>When the steps below require the UA to <dfn id="create" title="create an + element for the token">create an element for a token</dfn>, the UA must + create a node implementing the interface appropriate for the element type + corresponding to the tag name of the token (as given in the section of + this specification that defines that element, e.g. for an <code><a href="section-phrase.html#a">a</a></code> element it would be the <code><a href="section-phrase.html#htmlanchorelement">HTMLAnchorElement</a></code> interface), with + the tag name being the name of that element, with the node being in the <a href="section-namespaces.html#html-namespace0">HTML namespace</a>, and with the attributes on the + node being those given in the given token. + + </p><p>When the steps below require the UA to <dfn id="insert">insert an HTML + element</dfn> for a token, the UA must first <a href="#create">create an + element for the token</a>, and then append this node to the <a href="#current4">current node</a>, and push it onto the <a href="#stack">stack of open elements</a> so that it is the new <a href="#current4">current node</a>. + + </p><p>The steps below may also require that the UA insert an HTML element in a + particular place, in which case the UA must <a href="#create">create an + element for the token</a> and then insert or append the new node in the + location specified. (This happens in particular during the parsing of + tables with invalid content.) + + </p><p>The interface appropriate for an element that is not defined in this + specification is <code><a href="section-elements.html#htmlelement">HTMLElement</a></code>. + + </p><h6 id="closing"><span class="secno">8.2.4.3.4. 
</span>Closing elements that + have implied end tags</h6> + + <p>When the steps below require the UA to <dfn id="generate">generate implied + end tags</dfn>, then, if the <a href="#current4">current node</a> is a + <code><a href="section-lists0.html#dd">dd</a></code> element, a <code><a href="section-lists0.html#dt">dt</a></code> element, an <code><a href="section-lists0.html#li">li</a></code> + element, a <code><a href="section-prose.html#p">p</a></code> element, a <code><a href="section-tabular.html#td">td</a></code> element, a <code><a href="section-tabular.html#th">th</a></code> + element, or a <code><a href="section-tabular.html#tr">tr</a></code> element, the UA must act + as if an end tag with the respective tag name had been seen and then <a href="#generate">generate implied end tags</a> again. + + </p><p>If a step requires the UA to generate implied end tags but lists an + element to exclude from the process, then the UA must perform the above + steps as if that element was not in the above list. + + </p><h6 id="the-element"><span class="secno">8.2.4.3.5. </span>The element pointers</h6> + + <p>Initially the <dfn id="head-element"><code title="">head</code> element + pointer</dfn> and the <dfn id="form-element"><code title="">form</code> + element pointer</dfn> are both null. + + </p><p>Once a <code><a href="section-document.html#head">head</a></code> element has been parsed + (whether implicitly or explicitly) the <a href="#head-element"><code title="">head</code> element pointer</a> gets set to point to this node. + + </p><p>The <a href="#form-element"><code title="">form</code> element + pointer</a> points to the last <code>form</code> element that was opened + and whose end tag has not yet been seen. It is used to make form controls + associate with forms in the face of dramatically bad markup, for + historical reasons. + + </p><h6 id="the-insertion"><span class="secno">8.2.4.3.6. 
</span>The insertion mode</h6> + + <p>Initially the <dfn id="insertion0">insertion mode</dfn> is "<a href="#before2" title="insertion mode: before head">before head</a>". It + can change to "<a href="#in-head" title="insertion mode: in head">in + head</a>", "<a href="#after1" title="insertion mode: after head">after + head</a>", "<a href="#in-body" title="insertion mode: in body">in + body</a>", "<a href="#in-table" title="insertion mode: in table">in + table</a>", "<a href="#in-caption" title="insertion mode: in caption">in + caption</a>", "<a href="#in-column" title="insertion mode: in column + group">in column group</a>", "<a href="#in-table0" title="insertion mode: + in table body">in table body</a>", "<a href="#in-row" title="insertion + mode: in row">in row</a>", "<a href="#in-cell" title="insertion mode: in + cell">in cell</a>", "<a href="#in-select" title="insertion mode: in + select">in select</a>", "<a href="#after2" title="insertion mode: after + body">after body</a>", "<a href="#in-frameset" title="insertion mode: in + frameset">in frameset</a>", and "<a href="#after3" title="insertion mode: + after frameset">after frameset</a>" during the course of the parsing, as + described below. It affects how certain tokens are processed. + + </p><p>If the tree construction stage is switched from <a href="#the-main0">the + main phase</a> to <a href="#the-trailing0">the trailing end phase</a> and + back again, the various pieces of state are not reset; the UA must act as + if the state was maintained. + + </p><p>When the steps below require the UA to <dfn id="reset">reset the insertion + mode appropriately</dfn>, it means the UA must follow these steps: + + </p><ol> + <li>Let <var title="">last</var> be false. + + </li><li>Let <var title="">node</var> be the last node in the <a href="#stack">stack of open elements</a>. + + </li><li>If <var title="">node</var> is the first node in the stack of open + elements, then set <var title="">last</var> to true. 
If the element whose + <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code> + attribute is being set is neither a <code><a href="section-tabular.html#td">td</a></code> + element nor a <code><a href="section-tabular.html#th">th</a></code> element, then set <var title="">node</var> to the element whose <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code> + attribute is being set. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> + case</a>) + + </li><li>If <var title="">node</var> is a <code>select</code> element, then + switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-select" title="insertion mode: in select">in select</a>" and + abort these steps. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> + case</a>) + + </li><li>If <var title="">node</var> is a <code><a href="section-tabular.html#td">td</a></code> or + <code><a href="section-tabular.html#th">th</a></code> element, then switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-cell" title="insertion mode: in cell">in cell</a>" and abort these steps. + + </li><li>If <var title="">node</var> is a <code><a href="section-tabular.html#tr">tr</a></code> + element, then switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-row" title="insertion mode: in row">in row</a>" and abort these + steps. + + </li><li>If <var title="">node</var> is a <code><a href="section-tabular.html#tbody">tbody</a></code>, <code><a href="section-tabular.html#thead0">thead</a></code>, + or <code><a href="section-tabular.html#tfoot0">tfoot</a></code> element, then switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-table0" title="insertion mode: in table body">in table body</a>" and abort these + steps. 
+ + </li><li>If <var title="">node</var> is a <code><a href="section-tabular.html#caption0">caption</a></code> element, then switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-caption" title="insertion mode: in caption">in caption</a>" and abort these steps. + + </li><li>If <var title="">node</var> is a <code><a href="section-tabular.html#colgroup">colgroup</a></code> element, then switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-column" title="insertion mode: in column group">in column group</a>" and abort + these steps. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>) + + </li><li>If <var title="">node</var> is a <code><a href="section-tabular.html#table">table</a></code> element, then switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-table" title="insertion mode: in table">in table</a>" and abort these steps. + + </li><li>If <var title="">node</var> is a <code><a href="section-document.html#head">head</a></code> + element, then switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-body" title="insertion mode: in body">in body</a>" ("<a href="#in-body" title="insertion mode: in body">in body</a>"! <em> not + "<a href="#in-head" title="insertion mode: in head">in head</a>"</em>!) + and abort these steps. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> + case</a>) + + </li><li>If <var title="">node</var> is a <code><a href="section-sections.html#body0">body</a></code> element, then switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-body" title="insertion mode: in body">in body</a>" and abort these steps. + + </li><li>If <var title="">node</var> is a <code>frameset</code> element, then + switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-frameset" title="insertion mode: in frameset">in frameset</a>" + and abort these steps. 
(<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> + case</a>) + + </li><li>If <var title="">node</var> is an <code><a href="section-the-root.html#html">html</a></code> element, then: if the <a href="#head-element"><code title="">head</code> element pointer</a> is + null, switch the <a href="#insertion0">insertion mode</a> to "<a href="#before2" title="insertion mode: before head">before head</a>", + otherwise, switch the <a href="#insertion0">insertion mode</a> to "<a href="#after1" title="insertion mode: after head">after head</a>". In + either case, abort these steps. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</li> + <!-- XXX can the head element pointer ever be + non-null when we're going through these steps? --> + + <li>If <var title="">last</var> is true, then set the <a href="#insertion0">insertion mode</a> to "<a href="#in-body" title="insertion mode: in body">in body</a>" and abort these steps. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>) + + </li><li>Let <var title="">node</var> now be the node before <var title="">node</var> in the <a href="#stack">stack of open elements</a>. + + </li><li>Return to step 3. 
+ </li></ol> + <!--When you don't have to handle innerHTML, you can use this +simplified explanation instead: + + <ol> + + <li><p>If the <span>stack of open elements</span> <span title="has + an element in table scope">has a <code>td</code> or <code>th</code> + element in table scope</span>, then switch the <span>insertion + mode</span> to "<span title="insertion mode: in cell">in + cell</span>".</p></li> + + <li><p>Otherwise, if the <span>stack of open elements</span> <span + title="has an element in table scope">has a <code>tr</code> element + in table scope</span>, then switch the <span>insertion mode</span> + to "<span title="insertion mode: in row">in row</span>".</p></li> + + <li><p>Otherwise, if the <span>stack of open elements</span> <span + title="has an element in table scope">has a <code>tbody</code>, + <code>tfoot</code>, or <code>thead</code> element in table + scope</span>, then switch the <span>insertion mode</span> to "<span + title="insertion mode: in table body">in table + body</span>".</p></li> + + <li><p>Otherwise, if the <span>stack of open elements</span> <span + title="has an element in table scope">has a <code>caption</code> + element in table scope</span>, then switch the <span>insertion + mode</span> to "<span title="insertion mode: in caption">in + caption</span>".</p></li> + + ( you can't reach this point with a colgroup element on the + stack ) + + <li><p>Otherwise, if the <span>stack of open elements</span> <span + title="has an element in table scope">has a <code>table</code> + element in table scope</span>, then switch the <span>insertion + mode</span> to "<span title="insertion mode: in table">in + table</span>".</p></li> + + <li><p>Otherwise, switch the <span>insertion mode</span> to "<span + title="insertion mode: in body">in body</span>".</p></li> + + </ol> +--> + + <h6 id="how-to0"><span class="secno">8.2.4.3.7. 
</span>How to handle tokens in + the main phase</h6> + + <p>Tokens in the main phase must be handled as follows: + + </p><dl class="switch"> + <dt>A DOCTYPE token + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p> + + </dd><dt>A start tag token with the tag name "html" + + </dt><dd> + <p>If this start tag token was not the first start tag token, then it is + a <a href="section-parsing.html#parse">parse error</a>.</p> + + <p>For each attribute on the token, check to see if the attribute is + already present on the top element of the <a href="#stack">stack of open + elements</a>. If it is not, add the attribute and its corresponding + value to that element.</p> + + </dd><dt>An end-of-file token + + </dt><dd> + <p><a href="#generate">Generate implied end tags.</a></p> + + <p>If there are more than two nodes on the <a href="#stack">stack of open + elements</a>, or if there are two nodes but the second node is not a + <code><a href="section-sections.html#body0">body</a></code> node, this is a <a href="section-parsing.html#parse">parse error</a>.</p> + + <p>Otherwise, if the parser was originally created in order to handle the + setting of an element's <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code> attribute, and there's more than + one element in the <a href="#stack">stack of open elements</a>, and the + second node on the <a href="#stack">stack of open elements</a> is not a + <code><a href="section-sections.html#body0">body</a></code> node, then this is a <a href="section-parsing.html#parse">parse error</a>. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p> + + <p><a href="#stops">Stop parsing.</a></p> + + <p class="big-issue">This fails because it doesn't imply HEAD and BODY + tags. 
We should probably expand out the insertion modes and merge them + with phases and then put the three things here into each insertion mode + instead of trying to factor them out so carefully.</p> + + </dd><dt>Anything else + + </dt><dd> + <p>Depends on the <a href="#insertion0">insertion mode</a>:</p> + + <dl class="switch"> + <dt>If the <a href="#insertion0">insertion mode</a> is "<dfn id="before2" title="insertion mode: before head">before head</dfn>" + + </dt><dd> + <p>Handle the token as follows:</p> + + <dl class="switch"> + <dt>A character token that is one of U+0009 CHARACTER + TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C + FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dd> + <p><a href="#append" title="append a character">Append the + character</a> to the <a href="#current4">current node</a>.</p> + + </dd><dt>A comment token + + </dt><dd> + <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment + token.</p> + + </dd><dt>A start tag token with the tag name "head" + + </dt><dd> + <p><a href="#create">Create an element for the token</a>.</p> + + <p>Set the <a href="#head-element"><code title="">head</code> element + pointer</a> to this new element node.</p> + + <p>Append the new element to the <a href="#current4">current node</a> + and push it onto the <a href="#stack">stack of open elements</a>.</p> + + <p>Change the <a href="#insertion0">insertion mode</a> to "<a href="#in-head" title="insertion mode: in head">in head</a>".</p> + + </dd><dt>A start tag token whose tag name is one of: "base", "link", + "meta", "script", "style", "title" + + </dt><dd> + <p>Act as if a start tag token with the tag name "head" and no + attributes had been seen, then reprocess the current token.</p> + + <p class="note">This will result in a <code><a href="section-document.html#head">head</a></code> element being 
generated, and with the + current token being reprocessed in the "<a href="#in-head" title="insertion mode: in head">in head</a>" <a href="#insertion0">insertion mode</a>.</p> + + </dd><dt>An end tag with the tag name "html" + + </dt><dd> + <p>Act as if a start tag token with the tag name "head" and no + attributes had been seen, then reprocess the current token.</p> + + </dd><dt>Any other end tag + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p> + + </dd><dt>A character token that is <em>not</em> one of U+0009 CHARACTER + TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C + FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dt>Any other start tag token + + </dt><dd> + <p>Act as if a start tag token with the tag name "head" and no + attributes had been seen, then reprocess the current token.</p> + + <p class="note">This will result in an empty <code><a href="section-document.html#head">head</a></code> element being generated, with the + current token being reprocessed in the "<a href="#after1" title="insertion mode: after head">after head</a>" <a href="#insertion0">insertion mode</a>.</p> + </dd></dl> + + </dd><dt id="parsing-main-inhead">If the <a href="#insertion0">insertion + mode</a> is "<dfn id="in-head" title="insertion mode: in head">in + head</dfn>" + + </dt><dd> + <p>Handle the token as follows.</p> + + <p class="note">The rules for handling "title", "style", and "script" + start tags are similar, but not identical.</p> + + <p class="note">It is possible for the <a href="#tree-construction0">tree + construction</a> stage's <a href="#the-main0" title="the main + phase">main phase</a> to be in the "<a href="#in-head" title="insertion mode: in head">in head</a>" <a href="#insertion0">insertion mode</a> without the <a href="#current4">current node</a> being a <code><a href="section-document.html#head">head</a></code> element, e.g. 
if a <code><a href="section-document.html#head">head</a></code> end tag is immediately followed by a + <code><a href="section-document.html#meta0">meta</a></code> start tag.</p> + + <dl class="switch"> + <dt>A character token that is one of U+0009 CHARACTER + TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C + FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dd> + <p><a href="#append" title="append a character">Append the + character</a> to the <a href="#current4">current node</a>.</p> + + </dd><dt>A comment token + + </dt><dd> + <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment + token.</p> + + </dd><dt>A start tag with the tag name "title" + + </dt><dd> + <p><a href="#create">Create an element for the token</a>.</p> + + <p>Append the new element to the node pointed to by the <a href="#head-element"><code title="">head</code> element pointer</a>, + or, if that is null (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> + case</a>), to the <a href="#current4">current node</a>.</p> + + <p>Switch the tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> + to the RCDATA state.</p> + + <p>Then, collect all the character tokens that the tokeniser returns + until it returns a token that is not a character token.</p> + + <p>If this process resulted in a collection of character tokens, + append a single <code>Text</code> node to the <code><a href="section-document.html#title1">title</a></code> element node whose contents is the + concatenation of all those tokens' characters.</p> + + <p>The tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> will + have switched back to the PCDATA state.</p> + + <p>If the next token is an end tag token with the tag name "title", + ignore it. 
Otherwise, this is a <a href="section-parsing.html#parse">parse error</a>.</p> + + </dd><dt>A start tag with the tag name "style" + + </dt><dd> + <p><a href="#create">Create an element for the token</a>.</p> + + <p>Append the new element to the <a href="#current4">current + node</a>, unless the <a href="#insertion0">insertion mode</a> is "<a href="#in-head" title="insertion mode: in head">in head</a>" and the + <a href="#head-element"><code title="">head</code> element + pointer</a> is not null, in which case append it to the node pointed + to by the <a href="#head-element"><code title="">head</code> element + pointer</a>. <!-- + <head></head><style><body> should put the style block in the + head, and does so by switching back to in head, but the head + isn't the current node at that point (comments should go + between the head and the body) -->.</p> + + <p>Switch the tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> + to the CDATA state.</p> + + <p>Then, collect all the character tokens that the tokeniser returns + until it returns a token that is not a character token, or until it + stops tokenising.</p> + + <p>If this process resulted in a collection of character tokens, + append a single <code>Text</code> node to the <code><a href="section-document.html#style">style</a></code> element node whose contents is the + concatenation of all those tokens' characters.</p> + + <p>The tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> will + have switched back to the PCDATA state.</p> + + <p>If the next token is an end tag token with the tag name "style", + ignore it. Otherwise, this is a <a href="section-parsing.html#parse">parse error</a>.</p> + + </dd><dt id="scriptTag">A start tag with the tag name "script" + + </dt><dd> + <p><a href="#create">Create an element for the token</a>.</p> + + <p>Mark the element as being <a href="section-scripting0.html#parser-inserted">"parser-inserted"</a>. 
This ensures that, if + the script is external, any <code title="dom-document-write-HTML"><a href="section-dynamic.html#document.write0">document.write()</a></code> calls in the + script will execute in-line, instead of blowing the document away, + as would happen in most other cases.</p> + + <p>Switch the tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> + to the CDATA state.</p> + + <p>Then, collect all the character tokens that the tokeniser returns + until it returns a token that is not a character token, or until it + stops tokenising.</p> + + <p>If this process resulted in a collection of character tokens, + append a single <code>Text</code> node to the <code><a href="section-scripting0.html#script0">script</a></code> element node whose contents is the + concatenation of all those tokens' characters.</p> + + <p>The tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> will + have switched back to the PCDATA state.</p> + + <p>If the next token is not an end tag token with the tag name + "script", then this is a <a href="section-parsing.html#parse">parse error</a>; mark the + <code><a href="section-scripting0.html#script0">script</a></code> element as <a href="section-scripting0.html#already">"already executed"</a>. 
Otherwise, the token is the + <code><a href="section-scripting0.html#script0">script</a></code> element's end tag, so + ignore it.</p> + + <p>If the parser was originally created in order to handle the + setting of a node's <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code> attribute, then mark the + <code><a href="section-scripting0.html#script0">script</a></code> element as <a href="section-scripting0.html#already">"already executed"</a>, and skip the rest of the + processing described for this token (including the part below where + "<a href="section-scripting0.html#the-script" title="the script that will execute as soon + as the parser resumes">scripts that will execute as soon as the + parser resumes</a>" are executed). (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p> + + <p class="note">Marking the <code><a href="section-scripting0.html#script0">script</a></code> + element as "already executed" prevents it from executing when it is + inserted into the document a few paragraphs below. Scripts missing + their end tags and scripts that were inserted using <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code> + aren't executed.</p> + + <p>Let the <var title="">old insertion point</var> have the same + value as the current <a href="section-parsing.html#insertion">insertion point</a>. 
Let + the <a href="section-parsing.html#insertion">insertion point</a> be just before the <a href="section-parsing.html#next-input">next input character</a>.</p> + + <p>Append the new element to the <a href="#current4">current + node</a>, unless the <a href="#insertion0">insertion mode</a> is "<a href="#in-head" title="insertion mode: in head">in head</a>" and the + <a href="#head-element"><code title="">head</code> element + pointer</a> is not null, in which case append it to the node pointed + to by the <a href="#head-element"><code title="">head</code> element + pointer</a>. <!-- + <head></head><script><body> should put the script in the head, + and does so by switching back to in head, but the head isn't + the current node at that point (comments should go between the + head and the body) --> + <a href="section-scripting0.html#running0" title="running a script">Special processing + occurs when a <code>script</code> element is inserted into a + document</a> that might cause some script to execute, which might + cause <a href="section-dynamic.html#document.write0" title="dom-document-write-HTML">new + characters to be inserted into the tokeniser</a>.</p> + + <p>Let the <a href="section-parsing.html#insertion">insertion point</a> have the value of + the <var title="">old insertion point</var>. (In other words, + restore the <a href="section-parsing.html#insertion">insertion point</a> to the value it + had before the previous paragraph. 
This value might be the + "undefined" value.)</p> + + <p id="scriptTagParserResumes">At this stage, if there is <a href="section-scripting0.html#the-script" title="the script that will execute as soon as + the parser resumes">a script that will execute as soon as the parser + resumes</a>, then:</p> + + <dl class="switch"> + <dt>If the tree construction stage is <a href="section-parsing.html#nestedParsing">being + called reentrantly</a>, say from a call to <code title="dom-document-write-HTML"><a href="section-dynamic.html#document.write0">document.write()</a></code>: + + </dt><dd> + <p>Abort the processing of any nested invocations of the tokeniser, + yielding control back to the caller. (Tokenisation will resume + when the caller returns to the "outer" tree construction stage.) + + </p></dd><dt>Otherwise: + + </dt><dd> + <p>Follow these steps:</p> + + <ol> + <li> + <p>Let <var title="">the script</var> be <a href="section-scripting0.html#the-script">the script that will execute as soon as the + parser resumes</a>. There is no longer <a href="section-scripting0.html#the-script" title="the script that will execute as soon as the parser + resumes">a script that will execute as soon as the parser + resumes</a>. + + </p></li><li> + <p><a href="section-terminology.html#pause">Pause</a> until the script has + <span>completed loading</span><!-- XXX xref -->. + + </p></li><li> + <p>Let the <a href="section-parsing.html#insertion">insertion point</a> be just + before the <a href="section-parsing.html#next-input">next input character</a>. + + </p></li><li> + <p><a href="section-scripting0.html#executing0" title="executing a script block">Execute + the script</a>. + + </p></li><li> + <p>Let the <a href="section-parsing.html#insertion">insertion point</a> be undefined + again. 
+ + </p></li><li> + <p>If there is once again <a href="section-scripting0.html#the-script" title="the script + that will execute as soon as the parser resumes">a script that + will execute as soon as the parser resumes</a>, then repeat + these steps from step 1. + </p></li></ol> + </dd></dl> + + </dd><dt>A start tag with the tag name "base", "link", or "meta" + + </dt><dd> + <p><a href="#create">Create an element for the token</a>.</p> + + <p>Append the new element to the node pointed to by the <a href="#head-element"><code title="">head</code> element pointer</a>, + or, if that is null (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> + case</a>), to the <a href="#current4">current node</a>.</p> + + </dd><dt>An end tag with the tag name "head" + + </dt><dd> + <p>If the <a href="#current4">current node</a> is a <code><a href="section-document.html#head">head</a></code> element, pop the <a href="#current4">current node</a> off the <a href="#stack">stack of + open elements</a>. Otherwise, this is a <a href="section-parsing.html#parse">parse + error</a>.</p> + <!-- might happen if you see two </head>s + and something in between the two sends you from "after head" + back to "in head" --> + + <p>Change the <a href="#insertion0">insertion mode</a> to "<a href="#after1" title="insertion mode: after head">after head</a>".</p> + + </dd><dt>An end tag with the tag name "html" + + </dt><dd> + <p>Act as described in the "anything else" entry below.</p> + + </dd><dt>A start tag with the tag name "head" + + </dt><dt>Any other end tag + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. 
Ignore the token.</p> + + </dd><dt>Anything else + + </dt><dd> + <p>If the <a href="#current4">current node</a> is a <code><a href="section-document.html#head">head</a></code> element, act as if an end tag token + with the tag name "head" had been seen.</p> + + <p>Otherwise, change the <a href="#insertion0">insertion mode</a> to + "<a href="#after1" title="insertion mode: after head">after + head</a>".</p> + + <p>Then, reprocess the current token.</p> + + <p class="big-issue">In certain UAs, <a href="https://bugzilla.mozilla.org/attachment.cgi?id=180157&action=view">some + elements</a> don't trigger the "in body" mode straight away, but + instead get put into the head. Do we want to copy that?</p> + </dd></dl> + + </dd><dt>If the <a href="#insertion0">insertion mode</a> is "<dfn id="after1" title="insertion mode: after head">after head</dfn>" + + </dt><dd> + <p>Handle the token as follows:</p> + + <dl class="switch"> + <dt>A character token that is one of U+0009 CHARACTER + TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C + FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dd> + <p><a href="#append" title="append a character">Append the + character</a> to the <a href="#current4">current node</a>.</p> + + </dd><dt>A comment token + + </dt><dd> + <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment + token.</p> + + </dd><dt>A start tag token with the tag name "body" + + </dt><dd> + <p><a href="#insert" title="insert an HTML element">Insert a + <code>body</code> element</a> for the token.</p> + + <p>Change the <a href="#insertion0">insertion mode</a> to "<a href="#in-body" title="insertion mode: in body">in body</a>".</p> + + </dd><dt>A start tag token with the tag name "frameset" + + </dt><dd> + <p><a href="#insert" title="insert an HTML element">Insert a + <code>frameset</code> element</a> for the token.</p> + + 
<p>Change the <a href="#insertion0">insertion mode</a> to "<a href="#in-frameset" title="insertion mode: in frameset">in + frameset</a>".</p> + + </dd><dt>A start tag token whose tag name is one of: "base", "link", + "meta", "script", "style", "title" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Switch the <a href="#insertion0">insertion mode</a> back to "<a href="#in-head" title="insertion mode: in head">in head</a>" and reprocess the + token.</p> + + </dd><dt>Anything else + + </dt><dd> + <p>Act as if a start tag token with the tag name "body" and no + attributes had been seen, and then reprocess the current token.</p> + </dd></dl> + + </dd><dt id="parsing-main-inbody">If the <a href="#insertion0">insertion + mode</a> is "<dfn id="in-body" title="insertion mode: in body">in + body</dfn>" + + </dt><dd> + <p>Handle the token as follows:</p> + + <dl class="switch"> + <dt>A character token + + </dt><dd> + <p><a href="#reconstruct">Reconstruct the active formatting + elements</a>, if any.</p> + + <p><a href="#append" title="append a character">Append the token's + character</a> to the <a href="#current4">current node</a>.</p> + + </dd><dt>A comment token + + </dt><dd> + <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment + token.</p> + + </dd><dt>A start tag token whose tag name is one of: "script", "style" + + </dt><dd> + <p>Process the token as if the <a href="#insertion0">insertion + mode</a> had been "<a href="#in-head" title="insertion mode: in + head">in head</a>".</p> + + </dd><dt>A start tag token whose tag name is one of: "base", "link", + "meta", "title" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. 
Process the token as if the <a href="#insertion0">insertion mode</a> had been "<a href="#in-head" title="insertion mode: in head">in head</a>".</p> + + </dd><dt>A start tag token with the tag name "body" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>.</p> + + <p>If the second element on the <a href="#stack">stack of open + elements</a> is not a <code><a href="section-sections.html#body0">body</a></code> + element, or, if the <a href="#stack">stack of open elements</a> has + only one node on it, then ignore the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p> + + <p>Otherwise, for each attribute on the token, check to see if the + attribute is already present on the <code><a href="section-sections.html#body0">body</a></code> element (the second element) on the <a href="#stack">stack of open elements</a>. If it is not, add the + attribute and its corresponding value to that element.</p> + + </dd><dt>An end tag with the tag name "body" + + </dt><dd> + <p>If the second element in the <a href="#stack">stack of open + elements</a> is not a <code><a href="section-sections.html#body0">body</a></code> + element, this is a <a href="section-parsing.html#parse">parse error</a>. Ignore the + token. 
(<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p> + + <p>Otherwise:</p> + + <p class="big-issue">this needs to handle closing of implied elements, + but without closing them</p> + + <p>If the <a href="#current4">current node</a> is not the <code><a href="section-sections.html#body0">body</a></code> element, then this is a <a href="section-parsing.html#parse">parse error</a>.</p> + + <p>Change the <a href="#insertion0">insertion mode</a> to "<a href="#after2" title="insertion mode: after body">after body</a>".</p> + + </dd><dt>An end tag with the tag name "html" + + </dt><dd> + <p>Act as if an end tag with tag name "body" had been seen, then, if + that token wasn't ignored, reprocess the current token.</p> + + <p class="note">The fake end tag token here can only be ignored in the + <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p> + + </dd><dt>A start tag whose tag name is one of: "address", "blockquote", + "center", "dir", "div", "dl", "fieldset", "listing", "menu", "ol", + "p", "ul" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code> + element in scope</a>, then act as if an end tag with the tag name + <code><a href="section-prose.html#p">p</a></code> had been seen.</p> + + <p><a href="#insert" title="insert an html element">Insert an HTML + element</a> for the token.</p> + + </dd><dt>A start tag whose tag name is "pre" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code> + element in scope</a>, then act as if an end tag with the tag name + <code><a href="section-prose.html#p">p</a></code> had been seen.</p> + + <p><a href="#insert" title="insert an html element">Insert an HTML + element</a> for the token.</p> + + <p>If the next token is a U+000A LINE FEED (LF) character token, then + ignore that token and move on to the next one. 
(Newlines at the + start of <code><a href="section-preformatted.html#pre">pre</a></code> blocks are ignored as + an authoring convenience.)</p> + + </dd><dt>A start tag whose tag name is "form" + + </dt><dd> + <p>If the <a href="#form-element"><code title="form">form</code> + element pointer</a> is not null, ignore the token with a <a href="section-parsing.html#parse">parse error</a>.</p> + + <p>Otherwise:</p> + + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code> + element in scope</a>, then act as if an end tag with the tag name + <code><a href="section-prose.html#p">p</a></code> had been seen.</p> + + <p><a href="#insert" title="insert an html Element">Insert an HTML + element</a> for the token, and set the <code title="form">form</code> + element pointer to point to the element created.</p> + + </dd><dt>A start tag whose tag name is "li" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code> + element in scope</a>, then act as if an end tag with the tag name + <code><a href="section-prose.html#p">p</a></code> had been seen.</p> + + <p>Run the following algorithm:</p> + + <ol> + <li> + <p>Initialise <var title="">node</var> to be the <a href="#current4">current node</a> (the bottommost node of the + stack). + + </p></li><li> + <p>If <var title="">node</var> is an <code><a href="section-lists0.html#li">li</a></code> element, then pop all the nodes from the + <a href="#current4">current node</a> up to <var title="">node</var>, including <var title="">node</var>, then stop + this algorithm. If more than one node is popped, then this is a <a href="section-parsing.html#parse">parse error</a>. 
+ + </p></li><li> + <p>If <var title="">node</var> is not in the <a href="#formatting">formatting</a> category, and is not in the <a href="#phrasing">phrasing</a> category, and is not an <code><a href="section-sections.html#address">address</a></code> or <code><a href="section-miscellaneous.html#div">div</a></code> element, then stop this algorithm. + </p></li> + <!-- an element <foo> is in this + list if the following markup: + + <!DOCTYPE html><body><ol><li><foo><li> + + ...results in the second <li> not being (in any way) a + descendant of the first <li>, or if <foo> is a formatting + element that gets reopened later. --> + + <li> + <p>Otherwise, set <var title="">node</var> to the previous entry in + the <a href="#stack">stack of open elements</a> and return to step + 2. + </p></li></ol> + + <p>Finally, <a href="#insert" title="insert an html element">insert + an <code>li</code> element</a>.</p> + + </dd><dt>A start tag whose tag name is "dd" or "dt" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code> + element in scope</a>, then act as if an end tag with the tag name + <code><a href="section-prose.html#p">p</a></code> had been seen.</p> + + <p>Run the following algorithm:</p> + + <ol> + <li> + <p>Initialise <var title="">node</var> to be the <a href="#current4">current node</a> (the bottommost node of the + stack). + + </p></li><li> + <p>If <var title="">node</var> is a <code><a href="section-lists0.html#dd">dd</a></code> or <code><a href="section-lists0.html#dt">dt</a></code> + element, then pop all the nodes from the <a href="#current4">current node</a> up to <var title="">node</var>, + including <var title="">node</var>, then stop this algorithm. If + more than one node is popped, then this is a <a href="section-parsing.html#parse">parse error</a>. 
+ + </p></li><li> + <p>If <var title="">node</var> is not in the <a href="#formatting">formatting</a> category, and is not in the <a href="#phrasing">phrasing</a> category, and is not an <code><a href="section-sections.html#address">address</a></code> or <code><a href="section-miscellaneous.html#div">div</a></code> element, then stop this algorithm. + </p></li> + <!-- an element <foo> is in this + list if the following markup: + + <!DOCTYPE html><body><ol><dt><foo><dt> + + ...results in the second <dt> not being (in any way) a + descendant of the first <dt>, or if <foo> is a formatting + element that gets reopened later. --> + + <li> + <p>Otherwise, set <var title="">node</var> to the previous entry in + the <a href="#stack">stack of open elements</a> and return to step + 2. + </p></li></ol> + + <p>Finally, <a href="#insert" title="insert an html element">insert + an HTML element</a> with the same tag name as the token's.</p> + + </dd><dt>A start tag token whose tag name is "plaintext" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code> + element in scope</a>, then act as if an end tag with the tag name + <code><a href="section-prose.html#p">p</a></code> had been seen.</p> + + <p><a href="#insert" title="insert an html element">Insert an HTML + element</a> for the token.</p> + + <p>Switch the <a href="section-tokenisation.html#content2">content model flag</a> to the + PLAINTEXT state.</p> + + <p class="note">Once a start tag with the tag name "plaintext" has been + seen, that will be the last token ever seen other than character + tokens (and the end-of-file token), because there is no way to + switch the <a href="section-tokenisation.html#content2">content model flag</a> out of the + PLAINTEXT state.</p> + + </dd><dt>An end tag whose tag name is one of: "address", "blockquote", + "center", "dir", "div", "dl", "fieldset", "listing", "menu", "ol", + "pre", "ul" + + </dt><dd> + 
<p>If the <a href="#stack">stack of open elements</a> <a href="#have-an">has an element in scope</a> with the same tag name + as that of the token, then <a href="#generate">generate implied end + tags</a>.</p> + + <p>Now, if the <a href="#current4">current node</a> is not an element + with the same tag name as that of the token, then this is a <a href="section-parsing.html#parse">parse error</a>.</p> + + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an">has an element in scope</a> with the same tag name + as that of the token, then pop elements from this stack until an + element with that tag name has been popped from the stack.</p> + <!-- XXX quirk (except for in certain cases?): + <p>Otherwise, act as if a start tag with the tag name given in + the token had been seen, then reprocess the current token.</p> + --> + + + </dd><dt>An end tag whose tag name is "form" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an">has an element in scope</a> with the same tag name + as that of the token, then <a href="#generate">generate implied end + tags</a>.</p> + + <p>Now, if the <a href="#current4">current node</a> is not an element + with the same tag name as that of the token, then this is a <a href="section-parsing.html#parse">parse error</a>.</p> + + <p>Otherwise, if the <a href="#current4">current node</a> is an + element with the same tag name as that of the token pop that element + from the stack.</p> + + <p>In any case, set the <a href="#form-element"><code title="">form</code> element pointer</a> to null.</p> + + </dd><dt>An end tag whose tag name is "p" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code> + element in scope</a>, then <a href="#generate">generate implied end + tags</a>, except for <code><a href="section-prose.html#p">p</a></code> elements.</p> + + <p>If the <a href="#current4">current node</a> is not a 
<code><a href="section-prose.html#p">p</a></code> element, then this is a <a href="section-parsing.html#parse">parse error</a>.</p> + + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code> + element in scope</a>, then pop elements from this stack until the + stack no longer <a href="#have-an" title="has an element in + scope">has a <code>p</code> element in scope</a>.</p> + <!-- XXX quirk: + <p>Otherwise, act as if a start tag with the tag name + <code>p</code> had been seen, then reprocess the current + token.</p> + --> + + + </dd><dt>An end tag whose tag name is "dd", "dt", or "li" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an">has an element in scope</a> whose tag name matches + the tag name of the token, then <a href="#generate">generate implied + end tags</a>, except for elements with the same tag name as the + token.</p> + + <p>If the <a href="#current4">current node</a> is not an element with + the same tag name as the token, then this is a <a href="section-parsing.html#parse">parse error</a>.</p> + + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an">has an element in scope</a> whose tag name matches + the tag name of the token, then pop elements from this stack until + an element with that tag name has been popped from the stack.</p> + + </dd><dt>A start tag whose tag name is one of: "h1", "h2", "h3", "h4", + "h5", "h6" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code> + element in scope</a>, then act as if an end tag with the tag name + <code><a href="section-prose.html#p">p</a></code> had been seen.</p> + + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has in scope</a> an + element whose tag name is one of "h1", "h2", "h3", "h4", "h5", or + "h6", then this is a <a 
href="section-parsing.html#parse">parse error</a>; pop elements + from the stack until an element with one of those tag names has been + popped from the stack.</p> + + <p><a href="#insert" title="insert an html element">Insert an HTML + element</a> for the token.</p> + + </dd><dt>An end tag whose tag name is one of: "h1", "h2", "h3", "h4", "h5", + "h6" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has in scope</a> an + element whose tag name is one of "h1", "h2", "h3", "h4", "h5", or + "h6", then <a href="#generate">generate implied end tags</a>.</p> + + <p>Now, if the <a href="#current4">current node</a> is not an element + with the same tag name as that of the token, then this is a <a href="section-parsing.html#parse">parse error</a>.</p> + + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has in scope</a> an + element whose tag name is one of "h1", "h2", "h3", "h4", "h5", or + "h6", then pop elements from the stack until an element with one of + those tag names has been popped from the stack.</p> + <!-- XXX quirk: + <p>Otherwise, act as if a start tag with the tag name given in + the token had been seen, then reprocess the current token.</p> + --> + </dd> + <!-- ADOPTION AGENCY ELEMENTS + Mozilla-only: bdo blink del ins sub sup q + Safari-only: code dfn kbd nobr samp var wbr + Both: a b big em font i s small strike strong tt u --> + + <dt>A start tag whose tag name is "a" + + </dt><dd> + <p>If the <a href="#list-of4">list of active formatting elements</a> + contains an element whose tag name is "a" between the end of the + list and the last marker on the list (or the start of the list if + there is no marker on the list), then this is a <a href="section-parsing.html#parse">parse error</a>; act as if an end tag with the tag + name "a" had been seen, then remove that element from the <a href="#list-of4">list of active formatting 
elements</a> and the <a href="#stack">stack of open elements</a> if the end tag didn't + already remove it (it might not have if the element is not <a href="#have-an0" title="has an element in table scope">in table + scope</a>).</p> + + <p class="example">In the non-conforming stream + <code><a href="a">a<table><a href="b">b</table>x</code>, + the first <code><a href="section-phrase.html#a">a</a></code> element would be closed + upon seeing the second one, and the "x" character would be inside a + link to "b", not to "a". This is despite the fact that the outer + <code><a href="section-phrase.html#a">a</a></code> element is not in table scope + (meaning that a regular <code></a></code> end tag at the start of + the table wouldn't close the outer <code><a href="section-phrase.html#a">a</a></code> + element).</p> + + <p><a href="#reconstruct">Reconstruct the active formatting + elements</a>, if any.</p> + + <p><a href="#insert" title="insert an html element">Insert an HTML + element</a> for the token. Add that element to the <a href="#list-of4">list of active formatting elements</a>.</p> + + </dd><dt>A start tag whose tag name is one of: "b", "big", "em", "font", + "i", "nobr", "s", "small", "strike", "strong", "tt", "u" + + </dt><dd> + <p><a href="#reconstruct">Reconstruct the active formatting + elements</a>, if any.</p> + + <p><a href="#insert" title="insert an html element">Insert an HTML + element</a> for the token. 
Add that element to the <a href="#list-of4">list of active formatting elements</a>.</p> + + </dd><dt id="adoptionAgency">An end tag whose tag name is one of: "a", "b", + "big", "em", "font", "i", "nobr", "s", "small", "strike", "strong", + "tt", "u" + + </dt><dd> + <p>Follow these steps:</p> + + <ol> + <li> + <p>Let the <var title="">formatting element</var> be the last + element in the <a href="#list-of4">list of active formatting + elements</a> that:</p> + + <ul> + <li>is between the end of the list and the last scope marker in + the list, if any, or the start of the list otherwise, and + + </li><li>has the same tag name as the token. + </li></ul> + + <p>If there is no such node, or, if that node is also in the <a href="#stack">stack of open elements</a> but the element is not <a href="#have-an" title="has an element in scope">in scope</a>, then + this is a <a href="section-parsing.html#parse">parse error</a>. Abort these steps. The + token is ignored.</p> + + <p>Otherwise, if there is such a node, but that node is not in the + <a href="#stack">stack of open elements</a>, then this is a <a href="section-parsing.html#parse">parse error</a>; remove the element from the list, + and abort these steps.</p> + + <p>Otherwise, there is a <var title="">formatting element</var> and + that element is in <a href="#stack" title="stack of open + elements">the stack</a> and is <a href="#have-an" title="has an + element in scope">in scope</a>. If the element is not the <a href="#current4">current node</a>, this is a <a href="section-parsing.html#parse">parse error</a>. In any case, proceed with the + algorithm as written in the following steps.</p> + + </li><li> + <p>Let the <var title="">furthest block</var> be the topmost node + in the <a href="#stack">stack of open elements</a> that is lower + in the stack than the <var title="">formatting element</var>, and + is not an element in the <a href="#phrasing">phrasing</a> or <a href="#formatting">formatting</a> categories. 
There might not be + one. + + </p></li><li> + <p>If there is no <var title="">furthest block</var>, then the UA + must skip the subsequent steps and instead just pop all the nodes + from the bottom of the <a href="#stack">stack of open + elements</a>, from the <a href="#current4">current node</a> up to + the <var title="">formatting element</var>, and remove the <var title="">formatting element</var> from the <a href="#list-of4">list of active formatting elements</a>. + + </p></li><li> + <p>Let the <var title="">common ancestor</var> be the element + immediately above the <var title="">formatting element</var> in + the <a href="#stack">stack of open elements</a>. + + </p></li><li> + <p>If the <var title="">furthest block</var> has a parent node, + then remove the <var title="">furthest block</var> from its parent + node. + + </p></li><li> + <p>Let a bookmark note the position of the <var title="">formatting + element</var> in the <a href="#list-of4">list of active formatting + elements</a> relative to the elements on either side of it in the + list. + + </p></li><li> + <p>Let <var title="">node</var> and <var title="">last node</var> + be the <var title="">furthest block</var>. Follow these steps:</p> + + <ol> + <li>Let <var title="">node</var> be the element immediately prior + to <var title="">node</var> in the <a href="#stack">stack of open + elements</a>. + + </li><li>If <var title="">node</var> is not in the <a href="#list-of4">list of active formatting elements</a>, then + remove <var title="">node</var> from the <a href="#stack">stack + of open elements</a> and then go back to step 1. + + </li><li>Otherwise, if <var title="">node</var> is the <var title="">formatting element</var>, then go to the next step in + the overall algorithm. 
+ + </li><li>Otherwise, if <var title="">last node</var> is the <var title="">furthest block</var>, then move the aforementioned + bookmark to be immediately after the <var title="">node</var> in + the <a href="#list-of4">list of active formatting elements</a>. + + </li><li>If <var title="">node</var> has any children, perform a + shallow clone of <var title="">node</var>, replace the entry for + <var title="">node</var> in the <a href="#list-of4">list of + active formatting elements</a> with an entry for the clone, + replace the entry for <var title="">node</var> in the <a href="#stack">stack of open elements</a> with an entry for the + clone, and let <var title="">node</var> be the clone. + + </li><li>Insert <var title="">last node</var> into <var title="">node</var>, first removing it from its previous parent + node if any. + + </li><li>Let <var title="">last node</var> be <var title="">node</var>. + + </li><li>Return to step 1 of this inner set of steps. + </li></ol> + + </li><li> + <p>Insert whatever <var title="">last node</var> ended up being in + the previous step into the <var title="">common ancestor</var> + node, first removing it from its previous parent node if any. + + </p></li><li> + <p>Perform a shallow clone of the <var title="">formatting + element</var>. + + </p></li><li> + <p>Take all of the child nodes of the <var title="">furthest + block</var> and append them to the clone created in the last step. + + </p></li><li> + <p>Append that clone to the <var title="">furthest block</var>. + + </p></li><li> + <p>Remove the <var title="">formatting element</var> from the <a href="#list-of4">list of active formatting elements</a>, and + insert the clone into the <a href="#list-of4">list of active + formatting elements</a> at the position of the aforementioned + bookmark. 
+ + </p></li><li> + <p>Remove the <var title="">formatting element</var> from the <a href="#stack">stack of open elements</a>, and insert the clone + into the <a href="#stack">stack of open elements</a> immediately + after (i.e. in a more deeply nested position than) the position of + the <var title="">furthest block</var> in that stack. + + </p></li><li> + <p>Jump back to step 1 in this series of steps. + </p></li></ol> + + <p class="note">The way these steps are defined, only elements in the + <a href="#formatting">formatting</a> category ever get cloned by + this algorithm.</p> + <!--XXX + <div class="example"> + <p class="big-issue">Need an example.</p> + </div> +--> + + <p class="note">Because of the way this algorithm causes elements to + change parents, it has been dubbed the "adoption agency algorithm" + (in contrast with other possible algorithms for dealing with + misnested content, which included the "incest algorithm", the + "secret affair algorithm", and the "Heisenberg algorithm").</p> + + </dd><dt>A start tag token whose tag name is "button" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a + <code>button</code> element in scope</a>, then this is a <a href="section-parsing.html#parse">parse error</a>; act as if an end tag with the tag + name "button" had been seen, then reprocess the token.</p> + + <p>Otherwise:</p> + + <p><a href="#reconstruct">Reconstruct the active formatting + elements</a>, if any.</p> + + <p><a href="#insert">Insert an HTML element</a> for the token.</p> + + <p>Insert a marker at the end of the <a href="#list-of4">list of + active formatting elements</a>.</p> + + </dd><dt>A start tag token whose tag name is one of: "marquee", "object" + + </dt><dd> + <p><a href="#reconstruct">Reconstruct the active formatting + elements</a>, if any.</p> + + <p><a href="#insert">Insert an HTML element</a> for the token.</p> + + <p>Insert a marker at the end of the <a 
href="#list-of4">list of + active formatting elements</a>.</p> + + </dd><dt>An end tag token whose tag name is one of: "button", "marquee", + "object" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has in scope</a> an + element whose tag name is the same as the tag name of the token, + then <a href="#generate">generate implied end tags</a>.</p> + + <p>Now, if the <a href="#current4">current node</a> is not an element + with the same tag name as the token, then this is a <a href="section-parsing.html#parse">parse error</a>.</p> + + <p>Now, if the <a href="#stack">stack of open elements</a> <a href="#have-an">has an element in scope</a> whose tag name matches + the tag name of the token, then pop elements from the stack until + that element has been popped from the stack, and <a href="#clear0">clear the list of active formatting elements up to + the last marker</a>.</p> + + </dd><dt>A start tag token whose tag name is "xmp" + + </dt><dd> + <p><a href="#reconstruct">Reconstruct the active formatting + elements</a>, if any.</p> + + <p><a href="#insert">Insert an HTML element</a> for the token.</p> + + <p>Switch the <a href="section-tokenisation.html#content2">content model flag</a> to the CDATA + state.</p> + + </dd><dt>A start tag whose tag name is "table" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code> + element in scope</a>, then act as if an end tag with the tag name + <code><a href="section-prose.html#p">p</a></code> had been seen.</p> + <!-- XXX quirks: don't do this --> + <p><a href="#insert">Insert an HTML element</a> for the token.</p> + + <p>Change the <a href="#insertion0">insertion mode</a> to "<a href="#in-table" title="insertion mode: in table">in table</a>".</p> + + </dd><dt>A start tag whose tag name is one of: "area", "basefont", + "bgsound", "br", "embed", "img", "param", "spacer", 
"wbr" + + </dt><dd> + <p><a href="#reconstruct">Reconstruct the active formatting + elements</a>, if any.</p> + + <p><a href="#insert" title="insert an html element">Insert an HTML + element</a> for the token. Immediately pop the <a href="#current4">current node</a> off the <a href="#stack">stack of + open elements</a>.</p> + + </dd><dt>A start tag whose tag name is "hr" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code> + element in scope</a>, then act as if an end tag with the tag name + <code><a href="section-prose.html#p">p</a></code> had been seen.</p> + <!-- XXX quirks: don't do this --> + <p><a href="#insert" title="insert an html element">Insert an HTML + element</a> for the token. Immediately pop the <a href="#current4">current node</a> off the <a href="#stack">stack of + open elements</a>.</p> + + </dd><dt>A start tag whose tag name is "image" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Change the token's tag name to + "img" and reprocess it. (Don't ask.)</p> + <!-- As of + 2005-12, studies showed that around 0.2% of pages used the + <image> element. --> + + + </dd><dt>A start tag whose tag name is "input" + + </dt><dd> + <p><a href="#reconstruct">Reconstruct the active formatting + elements</a>, if any.</p> + + <p><a href="#insert" title="insert an html element">Insert an + <code>input</code> element</a> for the token.</p> + + <p>If the <a href="#form-element"><code title="">form</code> element + pointer</a> is not null, then <span>associate</span><!--XXX + xref! 
--> + the <code>input</code> element with the <code>form</code> element + pointed to by the <a href="#form-element"><code title="">form</code> + element pointer</a>.</p> + + <p>Pop that <code>input</code> element off the <a href="#stack">stack + of open elements</a>.</p> + + </dd><dt id="isindex">A start tag whose tag name is "isindex" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>.</p> + + <p>If the <a href="#form-element"><code title="">form</code> element + pointer</a> is not null, then ignore the token.</p> + + <p>Otherwise:</p> + + <p>Act as if a start tag token with the tag name "form" had been + seen.</p> + + <p>Act as if a start tag token with the tag name "hr" had been seen.</p> + + <p>Act as if a start tag token with the tag name "p" had been seen.</p> + + <p>Act as if a start tag token with the tag name "label" had been + seen.</p> + + <p>Act as if a stream of character tokens had been seen (see below + for what they should say).</p> + + <p>Act as if a start tag token with the tag name "input" had been + seen, with all the attributes from the "isindex" token, except with + the "name" attribute set to the value "isindex" (ignoring any + explicit "name" attribute).</p> + + <p>Act as if a stream of character tokens had been seen (see below + for what they should say).</p> + + <p>Act as if an end tag token with the tag name "label" had been + seen.</p> + + <p>Act as if an end tag token with the tag name "p" had been seen.</p> + + <p>Act as if a start tag token with the tag name "hr" had been seen.</p> + + <p>Act as if an end tag token with the tag name "form" had been seen.</p> + + <p>The two streams of character tokens together should, together with + the <code>input</code> element, express the equivalent of "This is a + searchable index. 
Insert your search keywords here: (input field)" + in the user's preferred language.</p> + + <p class="big-issue"> We then need to specify that if the form submission + causes just a single form control, whose name is "isindex", to be + submitted, then we submit just the value part, not the "isindex=" + part.</p> + </dd> + <!-- XXX keygen support; don't forget form element pointer! + + <dt>A start tag whose tag name is "keygen"</dt> + <dd> + ... + </dd> +--> + + <dt>A start tag whose tag name is "textarea" + + </dt><dd> + <p><a href="#create">Create an element for the token</a>.</p> + + <p>If the <a href="#form-element"><code title="">form</code> element + pointer</a> is not null, then <span>associate</span><!--XXX + xref! --> + the <code>textarea</code> element with the <code>form</code> element + pointed to by the <a href="#form-element"><code title="">form</code> + element pointer</a>.</p> + + <p>Append the new element to the <a href="#current4">current + node</a>.</p> + + <p>Switch the tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> + to the RCDATA state.</p> + + <p>If the next token is a U+000A LINE FEED (LF) character token, then + ignore that token and move on to the next one. (Newlines at the + start of <code>textarea</code> elements are ignored as an authoring + convenience.)</p> + + <p>Then, collect all the character tokens that the tokeniser returns + until it returns a token that is not a character token, or until it + stops tokenising.</p> + + <p>If this process resulted in a collection of character tokens, + append a single <code>Text</code> node, whose contents is the + concatenation of all those tokens' characters, to the new element + node.</p> + + <p>The tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> will + have switched back to the PCDATA state.</p> + + <p>If the next token is an end tag token with the tag name + "textarea", ignore it. 
Otherwise, this is a <a href="section-parsing.html#parse">parse + error</a>.</p> + + </dd><dt>A start tag whose tag name is one of: "iframe", "noembed", + "noframes" + + </dt><dt>A start tag whose tag name is "noscript", if <a href="section-scripting.html#scripting2">scripting is enabled</a>: + + </dt><dd> + <p><a href="#create">Create an element for the token</a>.</p> + + <p>For "iframe" tags, the node must be an <code><a href="section-embedded.html#htmliframeelement">HTMLIFrameElement</a></code> object, for + the other tags it must be an <code><a href="section-elements.html#htmlelement">HTMLElement</a></code> object.</p> + + <p>Append the new element to the <a href="#current4">current + node</a>.</p> + + <p>Switch the tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> + to the CDATA state.</p> + + <p>Then, collect all the character tokens that the tokeniser returns + until it returns a token that is not a character token, or until it + stops tokenising.</p> + + <p>If this process resulted in a collection of character tokens, + append a single <code>Text</code> node, whose contents is the + concatenation of all those tokens' characters, to the new element + node.</p> + + <p>The tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> will + have switched back to the PCDATA state.</p> + + <p>If the next token is an end tag token with the same tag name as + the start tag token, ignore it. 
Otherwise, this is a <a href="section-parsing.html#parse">parse error</a>.</p> + + </dd><dt>A start tag whose tag name is "select" + + </dt><dd> + <p><a href="#reconstruct">Reconstruct the active formatting + elements</a>, if any.</p> + + <p><a href="#insert">Insert an HTML element</a> for the token.</p> + + <p>Change the <a href="#insertion0">insertion mode</a> to "<a href="#in-select" title="insertion mode: in select">in select</a>".</p> + </dd> + <!-- XXX quirks: + <dt>An end tag whose tag name is "br"</dt> + <dd> + <p>Act as if a start tag token with the tag name "br" had been + seen. Ignore the end tag token.</p> + </dd> +--> + + <dt>A start or end tag whose tag name is one of: "caption", "col", + "colgroup", "frame", "frameset", "head", "option", "optgroup", + "tbody", "td", "tfoot", "th", "thead", "tr" + + </dt><dt>An end tag whose tag name is one of: "area", "basefont", + "bgsound", <!--XXX quirks: remove br-->"br", "embed", "hr", "iframe", + "image", "img", "input", "isindex", "noembed", "noframes", "param", + "select", "spacer", "table", "textarea", "wbr"</dt> + <!-- add keygen if we add the start tag --> + + <dt>An end tag whose tag name is "noscript", if <a href="section-scripting.html#scripting2">scripting is enabled</a>: + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. 
Ignore the token.</p> + + </dd><dt>A start or end tag whose tag name is one of: "event-source", + "section", "nav", "article", "aside", "header", "footer", "datagrid", + "command" + + </dt><dd> <!-- XXXX --> + <p class="big-issue">Work in progress!</p> + + </dd><dt>A start tag token not covered by the previous entries + + </dt><dd> + <p><a href="#reconstruct">Reconstruct the active formatting + elements</a>, if any.</p> + + <p><a href="#insert">Insert an HTML element</a> for the token.</p> + + <p class="note">This element will be a <a href="#phrasing">phrasing</a> + element.</p> + <!-- +Put the following into the MathML namespace if parsed: + math, mrow, mfrac, msqrt, mroot, mstyle, merror, mpadded, + mphantom, mfenced, menclose, msub, msup, msubsup, munder, + mover, munderover, mmultiscripts, mtable, mlabeledtr, mtr, + mtd, maction +--> + + + </dd><dt>An end tag token not covered by the previous entries + + </dt><dd> + <p>Run the following algorithm:</p> + + <ol> + <li> + <p>Initialise <var title="">node</var> to be the <a href="#current4">current node</a> (the bottommost node of the + stack). + + </p></li><li> + <p>If <var title="">node</var> has the same tag name as the end tag + token, then:</p> + + <ol> + <li> + <p><a href="#generate">Generate implied end tags</a>. + + </p></li><li> + <p>If the tag name of the end tag token does not match the tag + name of the <a href="#current4">current node</a>, this is a <a href="section-parsing.html#parse">parse error</a>. + + </p></li><li> + <p>Pop all the nodes from the <a href="#current4">current + node</a> up to <var title="">node</var>, including <var title="">node</var>, then stop this algorithm. + </p></li></ol> + + </li><li> + <p>Otherwise, if <var title="">node</var> is in neither the <a href="#formatting">formatting</a> category nor the <a href="#phrasing">phrasing</a> category, then this is a <a href="section-parsing.html#parse">parse error</a>. Stop this algorithm. The end tag + token is ignored. 
+ + </p></li><li> + <p>Set <var title="">node</var> to the previous entry in the <a href="#stack">stack of open elements</a>. + + </p></li><li> + <p>Return to step 2. + </p></li></ol> + </dd></dl> + + </dd><dt id="parsing-main-intable">If the <a href="#insertion0">insertion + mode</a> is "<dfn id="in-table" title="insertion mode: in table">in + table</dfn>" + + </dt><dd> + <dl class="switch"> + <dt>A character token that is one of U+0009 CHARACTER + TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C + FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dd> + <p><a href="#append" title="append a character">Append the + character</a> to the <a href="#current4">current node</a>.</p> + + </dd><dt>A comment token + + </dt><dd> + <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment + token.</p> + + </dd><dt>A start tag whose tag name is "caption" + + </dt><dd> + <p><a href="#clear1">Clear the stack back to a table context</a>. + (See below.)</p> + + <p>Insert a marker at the end of the <a href="#list-of4">list of + active formatting elements</a>.</p> + + <p><a href="#insert">Insert an HTML element</a> for the token, then + switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-caption" title="insertion mode: in caption">in + caption</a>".</p> + + </dd><dt>A start tag whose tag name is "colgroup" + + </dt><dd> + <p><a href="#clear1">Clear the stack back to a table context</a>. 
+ (See below.)</p> + + <p><a href="#insert">Insert an HTML element</a> for the token, then + switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-column" title="insertion mode: in column group">in column + group</a>".</p> + + </dd><dt>A start tag whose tag name is "col" + + </dt><dd> + <p>Act as if a start tag token with the tag name "colgroup" had been + seen, then reprocess the current token.</p> + + </dd><dt>A start tag whose tag name is one of: "tbody", "tfoot", "thead" + + </dt><dd> + <p><a href="#clear1">Clear the stack back to a table context</a>. + (See below.)</p> + + <p><a href="#insert">Insert an HTML element</a> for the token, then + switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-table0" title="insertion mode: in table body">in table + body</a>".</p> + + </dd><dt>A start tag whose tag name is one of: "td", "th", "tr" + + </dt><dd> + <p>Act as if a start tag token with the tag name "tbody" had been + seen, then reprocess the current token.</p> + + </dd><dt>A start tag whose tag name is "table" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Act as if an end tag token with + the tag name "table" had been seen, then, if that token wasn't + ignored, reprocess the current token.</p> + + <p class="note">The fake end tag token here can only be ignored in the + <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p> + + </dd><dt>An end tag whose tag name is "table" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an + element in table scope</a> with the same tag name as the token, this + is a <a href="section-parsing.html#parse">parse error</a>. Ignore the token. 
(<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p> + + <p>Otherwise:</p> + + <p><a href="#generate">Generate implied end tags</a>.</p> + + <p>Now, if the <a href="#current4">current node</a> is not a <code><a href="section-tabular.html#table">table</a></code> element, then this is a <a href="section-parsing.html#parse">parse error</a>.</p> + + <p>Pop elements from this stack until a <code><a href="section-tabular.html#table">table</a></code> element has been popped from the + stack.</p> + + <p><a href="#reset">Reset the insertion mode appropriately</a>.</p> + + </dd><dt>An end tag whose tag name is one of: "body", "caption", "col", + "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p> + + </dd><dt>Anything else + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Process the token as if the <a href="#insertion0">insertion mode</a> was "<a href="#in-body" title="insertion mode: in body">in body</a>", with the following + exception:</p> + + <p>If the <a href="#current4">current node</a> is a <code><a href="section-tabular.html#table">table</a></code>, <code><a href="section-tabular.html#tbody">tbody</a></code>, <code><a href="section-tabular.html#tfoot0">tfoot</a></code>, <code><a href="section-tabular.html#thead0">thead</a></code>, or <code><a href="section-tabular.html#tr">tr</a></code> element, then, whenever a node would be + inserted into the <a href="#current4">current node</a>, it must + instead be inserted into the <em><a href="#foster">foster parent + element</a></em>.</p> + + <p>The <dfn id="foster">foster parent element</dfn> is the parent + element of the last <code><a href="section-tabular.html#table">table</a></code> element + in the <a href="#stack">stack of open elements</a>, if there is a + <code><a href="section-tabular.html#table">table</a></code> element and it has such a + parent element. 
If there is no <code><a href="section-tabular.html#table">table</a></code> element in the <a href="#stack">stack + of open elements</a> (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> + case</a>), then the <em><a href="#foster">foster parent + element</a></em> is the first element in the <a href="#stack">stack + of open elements</a> (the <code><a href="section-the-root.html#html">html</a></code> + element). Otherwise, if there is a <code><a href="section-tabular.html#table">table</a></code> element in the <a href="#stack">stack + of open elements</a>, but the last <code><a href="section-tabular.html#table">table</a></code> element in the <a href="#stack">stack + of open elements</a> has no parent, or its parent node is not an + element, then the <em><a href="#foster">foster parent + element</a></em> is the element before the last <code><a href="section-tabular.html#table">table</a></code> element in the <a href="#stack">stack + of open elements</a>.</p> + + <p>If the <em><a href="#foster">foster parent element</a></em> is the + parent element of the last <code><a href="section-tabular.html#table">table</a></code> + element in the <a href="#stack">stack of open elements</a>, then the + new node must be inserted immediately <em>before</em> the last + <code><a href="section-tabular.html#table">table</a></code> element in the <a href="#stack">stack of open elements</a> in the <a href="#foster">foster parent element</a>; otherwise, the new node + must be <em>appended</em> to the <a href="#foster">foster parent + element</a>.</p> + </dd></dl> + + <p>When the steps above require the UA to <dfn id="clear1">clear the + stack back to a table context</dfn>, it means that the UA must, while + the <a href="#current4">current node</a> is not a <code><a href="section-tabular.html#table">table</a></code> element or an <code><a href="section-the-root.html#html">html</a></code> element, pop elements from the <a href="#stack">stack of open elements</a>. 
If this causes any elements + to be popped from the stack, then this is a <a href="section-parsing.html#parse">parse + error</a>.</p> + + <p class="note">The <a href="#current4">current node</a> being an + <code><a href="section-the-root.html#html">html</a></code> element after this process is an + <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p> + + </dd><dt id="parsing-main-incaption">If the <a href="#insertion0">insertion + mode</a> is "<dfn id="in-caption" title="insertion mode: in caption">in + caption</dfn>" + + </dt><dd> + <dl class="switch"> + <dt>An end tag whose tag name is "caption" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an + element in table scope</a> with the same tag name as the token, this + is a <a href="section-parsing.html#parse">parse error</a>. Ignore the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p> + + <p>Otherwise:</p> + + <p><a href="#generate">Generate implied end tags</a>.</p> + + <p>Now, if the <a href="#current4">current node</a> is not a <code><a href="section-tabular.html#caption0">caption</a></code> element, then this is a <a href="section-parsing.html#parse">parse error</a>.</p> + + <p>Pop elements from this stack until a <code><a href="section-tabular.html#caption0">caption</a></code> element has been popped from the + stack.</p> + + <p><a href="#clear0">Clear the list of active formatting elements up + to the last marker</a>.</p> + + <p>Switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-table" title="insertion mode: in table">in table</a>".</p> + + </dd><dt>A start tag whose tag name is one of: "caption", "col", + "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr" + + </dt><dt>An end tag whose tag name is "table" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. 
Act as if an end tag with the + tag name "caption" had been seen, then, if that token wasn't + ignored, reprocess the current token.</p> + + <p class="note">The fake end tag token here can only be ignored in the + <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p> + + </dd><dt>An end tag whose tag name is one of: "body", "col", "colgroup", + "html", "tbody", "td", "tfoot", "th", "thead", "tr" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p> + + </dd><dt>Anything else + + </dt><dd> + <p>Process the token as if the <a href="#insertion0">insertion + mode</a> was "<a href="#in-body" title="insertion mode: in body">in + body</a>".</p> + </dd></dl> + + </dd><dt id="parsing-main-incolgroup">If the <a href="#insertion0">insertion + mode</a> is "<dfn id="in-column" title="insertion mode: in column + group">in column group</dfn>" + + </dt><dd> + <dl class="switch"> + <dt>A character token that is one of U+0009 CHARACTER + TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C + FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dd> + <p><a href="#append" title="append a character">Append the + character</a> to the <a href="#current4">current node</a>.</p> + + </dd><dt>A comment token + + </dt><dd> + <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment + token.</p> + + </dd><dt>A start tag whose tag name is "col" + + </dt><dd> + <p><a href="#insert" title="insert an HTML element">Insert a + <code>col</code> element</a> for the token. 
Immediately pop the <a href="#current4">current node</a> off the <a href="#stack">stack of + open elements</a>.</p> + + </dd><dt>An end tag whose tag name is "colgroup" + + </dt><dd> + <p>If the <a href="#current4">current node</a> is the root <code><a href="section-the-root.html#html">html</a></code> element, then this is a <a href="section-parsing.html#parse">parse error</a>, ignore the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p> + + <p>Otherwise, pop the <a href="#current4">current node</a> (which + will be a <code><a href="section-tabular.html#colgroup">colgroup</a></code> element) + from the <a href="#stack">stack of open elements</a>. Switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-table" title="insertion mode: in table">in table</a>".</p> + + </dd><dt>An end tag whose tag name is "col" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p> + + </dd><dt>Anything else + + </dt><dd> + <p>Act as if an end tag with the tag name "colgroup" had been seen, + and then, if that token wasn't ignored, reprocess the current token.</p> + + <p class="note">The fake end tag token here can only be ignored in the + <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p> + </dd></dl> + + </dd><dt id="parsing-main-intbody">If the <a href="#insertion0">insertion + mode</a> is "<dfn id="in-table0" title="insertion mode: in table body">in + table body</dfn>" + + </dt><dd> + <dl class="switch"> + <dt>A start tag whose tag name is "tr" + + </dt><dd> + <p><a href="#clear2">Clear the stack back to a table body + context</a>. 
(See below.)</p> + + <p><a href="#insert" title="insert an HTML element">Insert a + <code>tr</code> element</a> for the token, then switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-row" title="insertion mode: in row">in row</a>".</p> + + </dd><dt>A start tag whose tag name is one of: "th", "td" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Act as if a start tag with the + tag name "tr" had been seen, then reprocess the current token.</p> + + </dd><dt>An end tag whose tag name is one of: "tbody", "tfoot", "thead" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an + element in table scope</a> with the same tag name as the token, this + is a <a href="section-parsing.html#parse">parse error</a>. Ignore the token.</p> + + <p>Otherwise:</p> + + <p><a href="#clear2">Clear the stack back to a table body + context</a>. (See below.)</p> + + <p>Pop the <a href="#current4">current node</a> from the <a href="#stack">stack of open elements</a>. Switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-table" title="insertion mode: in table">in table</a>".</p> + + </dd><dt>A start tag whose tag name is one of: "caption", "col", + "colgroup", "tbody", "tfoot", "thead" + + </dt><dt>An end tag whose tag name is "table" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have a + <code>tbody</code>, <code>thead</code>, or <code>tfoot</code> + element in table scope</a>, this is a <a href="section-parsing.html#parse">parse + error</a>. Ignore the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p> + + <p>Otherwise:</p> + + <p><a href="#clear2">Clear the stack back to a table body + context</a>. 
(See below.)</p> + + <p>Act as if an end tag with the same tag name as the <a href="#current4">current node</a> ("tbody", "tfoot", or "thead") had + been seen, then reprocess the current token.</p> + + </dd><dt>An end tag whose tag name is one of: "body", "caption", "col", + "colgroup", "html", "td", "th", "tr" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p> + + </dd><dt>Anything else + + </dt><dd> + <p>Process the token as if the <a href="#insertion0">insertion + mode</a> was "<a href="#in-table" title="insertion mode: in + table">in table</a>".</p> + </dd></dl> + + <p>When the steps above require the UA to <dfn id="clear2">clear the + stack back to a table body context</dfn>, it means that the UA must, + while the <a href="#current4">current node</a> is not a <code><a href="section-tabular.html#tbody">tbody</a></code>, <code><a href="section-tabular.html#tfoot0">tfoot</a></code>, <code><a href="section-tabular.html#thead0">thead</a></code>, or <code><a href="section-the-root.html#html">html</a></code> element, pop elements from the <a href="#stack">stack of open elements</a>. If this causes any elements + to be popped from the stack, then this is a <a href="section-parsing.html#parse">parse + error</a>.</p> + + <p class="note">The <a href="#current4">current node</a> being an + <code><a href="section-the-root.html#html">html</a></code> element after this process is an + <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p> + + </dd><dt id="parsing-main-intr">If the <a href="#insertion0">insertion mode</a> + is "<dfn id="in-row" title="insertion mode: in row">in row</dfn>" + + </dt><dd> + <dl class="switch"> + <dt>A start tag whose tag name is one of: "th", "td" + + </dt><dd> + <p><a href="#clear3">Clear the stack back to a table row context</a>. 
+ (See below.)</p> + + <p><a href="#insert" title="insert an HTML element">Insert an HTML + element</a> for the token, then switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-cell" title="insertion mode: in cell">in cell</a>".</p> + + <p>Insert a marker at the end of the <a href="#list-of4">list of + active formatting elements</a>.</p> + + </dd><dt>An end tag whose tag name is "tr" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an + element in table scope</a> with the same tag name as the token, this + is a <a href="section-parsing.html#parse">parse error</a>. Ignore the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p> + + <p>Otherwise:</p> + + <p><a href="#clear3">Clear the stack back to a table row context</a>. + (See below.)</p> + + <p>Pop the <a href="#current4">current node</a> (which will be a + <code><a href="section-tabular.html#tr">tr</a></code> element) from the <a href="#stack">stack of open elements</a>. 
Switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-table0" title="insertion mode: in table body">in table body</a>".</p> + + </dd><dt>A start tag whose tag name is one of: "caption", "col", + "colgroup", "tbody", "tfoot", "thead", "tr" + + </dt><dt>An end tag whose tag name is "table" + + </dt><dd> + <p>Act as if an end tag with the tag name "tr" had been seen, then, + if that token wasn't ignored, reprocess the current token.</p> + + <p class="note">The fake end tag token here can only be ignored in the + <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p> + + </dd><dt>An end tag whose tag name is one of: "tbody", "tfoot", "thead" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an + element in table scope</a> with the same tag name as the token, this + is a <a href="section-parsing.html#parse">parse error</a>. Ignore the token.</p> + + <p>Otherwise, act as if an end tag with the tag name "tr" had been + seen, then reprocess the current token.</p> + + </dd><dt>An end tag whose tag name is one of: "body", "caption", "col", + "colgroup", "html", "td", "th" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p> + + </dd><dt>Anything else + + </dt><dd> + <p>Process the token as if the <a href="#insertion0">insertion + mode</a> was "<a href="#in-table" title="insertion mode: in + table">in table</a>".</p> + </dd></dl> + + <p>When the steps above require the UA to <dfn id="clear3">clear the + stack back to a table row context</dfn>, it means that the UA must, + while the <a href="#current4">current node</a> is not a <code><a href="section-tabular.html#tr">tr</a></code> element or an <code><a href="section-the-root.html#html">html</a></code> element, pop elements from the <a href="#stack">stack of open elements</a>. 
If this causes any elements + to be popped from the stack, then this is a <a href="section-parsing.html#parse">parse + error</a>.</p> + + <p class="note">The <a href="#current4">current node</a> being an + <code><a href="section-the-root.html#html">html</a></code> element after this process is an + <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p> + + </dd><dt id="parsing-main-intd">If the <a href="#insertion0">insertion mode</a> + is "<dfn id="in-cell" title="insertion mode: in cell">in cell</dfn>" + + </dt><dd> + <dl class="switch"> + <dt>An end tag whose tag name is one of: "td", "th" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an + element in table scope</a> with the same tag name as that of the + token, then this is a <a href="section-parsing.html#parse">parse error</a> and the token + must be ignored.</p> + + <p>Otherwise:</p> + + <p><a href="#generate">Generate implied end tags</a>, except for + elements with the same tag name as the token.</p> + + <p>Now, if the <a href="#current4">current node</a> is not an element + with the same tag name as the token, then this is a <a href="section-parsing.html#parse">parse error</a>.</p> + + <p>Pop elements from this stack until an element with the same tag + name as the token has been popped from the stack.</p> + + <p><a href="#clear0">Clear the list of active formatting elements up + to the last marker</a>.</p> + + <p>Switch the <a href="#insertion0">insertion mode</a> to "<a href="#in-row" title="insertion mode: in row">in row</a>". 
(The <a href="#current4">current node</a> will be a <code><a href="section-tabular.html#tr">tr</a></code> element at this point.)</p> + + </dd><dt>A start tag whose tag name is one of: "caption", "col", + "colgroup", "tbody", "td", "tfoot", "th", "thead", "tr" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> does + <em>not</em> <a href="#have-an0" title="has an element in table + scope">have a <code>td</code> or <code>th</code> element in table + scope</a>, then this is a <a href="section-parsing.html#parse">parse error</a>; ignore + the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p> + + <p>Otherwise, <a href="#close2">close the cell</a> (see below) and + reprocess the current token.</p> + + </dd><dt>An end tag whose tag name is one of: "body", "caption", "col", + "colgroup", "html" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p> + + </dd><dt>An end tag whose tag name is one of: "table", "tbody", "tfoot", + "thead", "tr" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an + element in table scope</a> with the same tag name as that of the + token (which can only happen for "tbody", "tfoot" and "thead", or, + in the <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>), then + this is a <a href="section-parsing.html#parse">parse error</a> and the token must be + ignored.</p> + + <p>Otherwise, <a href="#close2">close the cell</a> (see below) and + reprocess the current token.</p> + + </dd><dt>Anything else + + </dt><dd> + <p>Process the token as if the <a href="#insertion0">insertion + mode</a> was "<a href="#in-body" title="insertion mode: in body">in + body</a>".</p> + </dd></dl> + + <p>Where the steps above say to <dfn id="close2">close the cell</dfn>, + they mean to follow the following algorithm:</p> + + <ol> + <li> + <p>If the <a 
href="#stack">stack of open elements</a> <a href="#have-an0" title="has an element in table scope">has a + <code>td</code> element in table scope</a>, then act as if an end + tag token with the tag name "td" had been seen. + + </p></li><li> + <p>Otherwise, the <a href="#stack">stack of open elements</a> will <a href="#have-an0" title="has an element in table scope">have a + <code>th</code> element in table scope</a>; act as if an end tag + token with the tag name "th" had been seen. + </p></li></ol> + + <p class="note">The <a href="#stack">stack of open elements</a> cannot + have both a <code><a href="section-tabular.html#td">td</a></code> and a <code><a href="section-tabular.html#th">th</a></code> element <a href="#have-an0" title="has an + element in table scope">in table scope</a> at the same time, nor can + it have neither when the <a href="#insertion0">insertion mode</a> is + "<a href="#in-cell" title="insertion mode: in cell">in cell</a>".</p> + + </dd><dt id="parsing-main-inselect">If the <a href="#insertion0">insertion + mode</a> is "<dfn id="in-select" title="insertion mode: in select">in + select</dfn>" + + </dt><dd> + <p>Handle the token as follows:</p> + + <dl class="switch"> + <dt>A character token + + </dt><dd> + <p><a href="#append" title="append a character">Append the token's + character</a> to the <a href="#current4">current node</a>.</p> + + </dd><dt>A comment token + + </dt><dd> + <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment + token.</p> + + </dd><dt>A start tag token whose tag name is "option" + + </dt><dd> + <p>If the <a href="#current4">current node</a> is an + <code>option</code> element, act as if an end tag with the tag name + "option" had been seen.</p> + + <p><a href="#insert">Insert an HTML element</a> for the token.</p> + + </dd><dt>A start tag token whose tag name is "optgroup" + + </dt><dd> + <p>If the <a 
href="#current4">current node</a> is an + <code>option</code> element, act as if an end tag with the tag name + "option" had been seen.</p> + + <p>If the <a href="#current4">current node</a> is an + <code>optgroup</code> element, act as if an end tag with the tag + name "optgroup" had been seen.</p> + + <p><a href="#insert">Insert an HTML element</a> for the token.</p> + + </dd><dt>An end tag token whose tag name is "optgroup" + + </dt><dd> + <p>First, if the <a href="#current4">current node</a> is an + <code>option</code> element, and the node immediately before it in + the <a href="#stack">stack of open elements</a> is an + <code>optgroup</code> element, then act as if an end tag with the + tag name "option" had been seen.</p> + + <p>If the <a href="#current4">current node</a> is an + <code>optgroup</code> element, then pop that node from the <a href="#stack">stack of open elements</a>. Otherwise, this is a <a href="section-parsing.html#parse">parse error</a>, ignore the token.</p> + + </dd><dt>An end tag token whose tag name is "option" + + </dt><dd> + <p>If the <a href="#current4">current node</a> is an + <code>option</code> element, then pop that node from the <a href="#stack">stack of open elements</a>. Otherwise, this is a <a href="section-parsing.html#parse">parse error</a>, ignore the token.</p> + + </dd><dt>An end tag whose tag name is "select" + + </dt><dd> + <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an + element in table scope</a> with the same tag name as the token, this + is a <a href="section-parsing.html#parse">parse error</a>. Ignore the token. 
(<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p> + + <p>Otherwise:</p> + + <p>Pop elements from the <a href="#stack">stack of open elements</a> + until a <code>select</code> element has been popped from the stack.</p> + + <p><a href="#reset">Reset the insertion mode appropriately</a>.</p> + + </dd><dt>A start tag whose tag name is "select" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Act as if the token had been an + end tag with the tag name "select" instead.</p> + + </dd><dt>An end tag whose tag name is one of: "caption", "table", "tbody", + "tfoot", "thead", "tr", "td", "th" + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>.</p> + + <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an0">has an element in table scope</a> with the same tag + name as that of the token, then act as if an end tag with the tag + name "select" had been seen, and reprocess the token. Otherwise, + ignore the token.</p> + + </dd><dt>Anything else + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. 
Ignore the token.</p> + </dd></dl> + + </dd><dt id="parsing-main-afterbody">If the <a href="#insertion0">insertion + mode</a> is "<dfn id="after2" title="insertion mode: after body">after + body</dfn>" + + </dt><dd> + <p>Handle the token as follows:</p> + + <dl class="switch"> + <dt>A character token that is one of U+0009 CHARACTER + TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C + FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dd> + <p>Process the token as it would be processed if the <a href="#insertion0">insertion mode</a> was "<a href="#in-body" title="insertion mode: in body">in body</a>".</p> + + </dd><dt>A comment token + + </dt><dd> + <p>Append a <code>Comment</code> node to the first element in the <a href="#stack">stack of open elements</a> (the <code><a href="section-the-root.html#html">html</a></code> element), with the <code title="">data</code> attribute set to the data given in the comment + token.</p> + + </dd><dt>An end tag with the tag name "html" + + </dt><dd> + <p>If the parser was originally created in order to handle the + setting of <em>an element</em>'s <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code> attribute, this is a <a href="section-parsing.html#parse">parse error</a>; ignore the token. (The element will + be an <code><a href="section-the-root.html#html">html</a></code> element in this case.) + (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p> + + <p>Otherwise, switch to <a href="#the-trailing0">the trailing end + phase</a>.</p> + + </dd><dt>Anything else + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. 
Set the <a href="#insertion0">insertion mode</a> to "<a href="#in-body" title="insertion mode: in body">in body</a>" and reprocess the + token.</p> + </dd></dl> + + </dd><dt id="parsing-main-inframeset">If the <a href="#insertion0">insertion + mode</a> is "<dfn id="in-frameset" title="insertion mode: in frameset">in + frameset</dfn>" + + </dt><dd> + <p>Handle the token as follows:</p> + + <dl class="switch"> + <dt>A character token that is one of U+0009 CHARACTER + TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C + FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dd> + <p><a href="#append" title="append a character">Append the + character</a> to the <a href="#current4">current node</a>.</p> + + </dd><dt>A comment token + + </dt><dd> + <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment + token.</p> + + </dd><dt>A start tag with the tag name "frameset" + + </dt><dd> + <p><a href="#insert" title="Insert an HTML element">Insert a + <code>frameset</code> element</a> for the token.</p> + + </dd><dt>An end tag with the tag name "frameset" + + </dt><dd> + <p>If the <a href="#current4">current node</a> is the root <code><a href="section-the-root.html#html">html</a></code> element, then this is a <a href="section-parsing.html#parse">parse error</a>; ignore the token. 
(<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p> + + <p>Otherwise, pop the <a href="#current4">current node</a> from the + <a href="#stack">stack of open elements</a>.</p> + + <p>If the parser was <em>not</em> originally created in order to + handle the setting of an element's <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code> attribute (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>), and the <a href="#current4">current node</a> is no longer a + <code>frameset</code> element, then change the <a href="#insertion0">insertion mode</a> to "<a href="#after3" title="insertion mode: after frameset">after frameset</a>".</p> + + </dd><dt>A start tag with the tag name "frame" + + </dt><dd> + <p><a href="#insert">Insert an HTML element</a> for the token. + Immediately pop the <a href="#current4">current node</a> off the <a href="#stack">stack of open elements</a>.</p> + + </dd><dt>A start tag with the tag name "noframes" + + </dt><dd> + <p>Process the token as if the <a href="#insertion0">insertion + mode</a> had been "<a href="#in-body" title="insertion mode: in + body">in body</a>".</p> + + </dd><dt>Anything else + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. 
Ignore the token.</p> + </dd></dl> + + </dd><dt id="parsing-main-afterframeset">If the <a href="#insertion0">insertion + mode</a> is "<dfn id="after3" title="insertion mode: after + frameset">after frameset</dfn>" + + </dt><dd> + <p>Handle the token as follows:</p> + + <dl class="switch"> + <dt>A character token that is one of U+0009 CHARACTER + TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C + FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dd> + <p><a href="#append" title="append a character">Append the + character</a> to the <a href="#current4">current node</a>.</p> + + </dd><dt>A comment token + + </dt><dd> + <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment + token.</p> + + </dd><dt>An end tag with the tag name "html" + + </dt><dd> + <p>Switch to <a href="#the-trailing0">the trailing end phase</a>.</p> + + </dd><dt>A start tag with the tag name "noframes" + + </dt><dd> + <p>Process the token as if the <a href="#insertion0">insertion + mode</a> had been "<a href="#in-body" title="insertion mode: in + body">in body</a>".</p> + + </dd><dt>Anything else + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p> + </dd></dl> + </dd></dl> + </dd></dl> + + <p class="big-issue">This doesn't handle UAs that don't support frames, or + that do support frames but want to show the NOFRAMES content. Supporting + the former is easy; supporting the latter is harder. + + </p><h5 id="the-trailing"><span class="secno">8.2.4.4. </span><dfn id="the-trailing0">The trailing end phase</dfn></h5> + + <p>After <a href="#the-main0">the main phase</a>, as each token is emitted + from the <a href="section-tokenisation.html#tokenisation0">tokenisation</a> stage, it must be + processed as described in this section. 
+ + </p><dl class="switch"> + <dt>A DOCTYPE token + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p> + + </dd><dt>A comment token + + </dt><dd> + <p>Append a <code>Comment</code> node to the <code>Document</code> object + with the <code title="">data</code> attribute set to the data given in + the comment token.</p> + + </dd><dt>A character token that is one of U+0009 CHARACTER TABULATION, + U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), + U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dd> + <p>Process the token as it would be processed in <a href="#the-main0">the + main phase</a>.</p> + + </dd><dt>A character token that is <em>not</em> one of U+0009 CHARACTER + TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM + FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE + + </dt><dt>A start tag token + + </dt><dt>An end tag token + + </dt><dd> + <p><a href="section-parsing.html#parse">Parse error</a>. Switch back to <a href="#the-main0">the main phase</a> and reprocess the token.</p> + + </dd><dt>An end-of-file token + + </dt><dd> + <p><a href="#stops">Stop parsing</a>.</p> + </dd></dl> + + <h4 id="the-end"><span class="secno">8.2.5. </span>The End</h4> + + <p>Once the user agent <dfn id="stops" title="stop parsing">stops + parsing</dfn> the document, the user agent must follow the steps in this + section. + + </p><p>First, <!--the user agent must <span title="fire a DOMContentLoaded + event">fire a <code + title="event-DOMContentLoaded">DOMContentLoaded</code> event</span> + at <span>the <code>body</code> element</span>.</p> + + <p>Then, -->the + rules for <a href="section-scripting0.html#when-a">when a script completes loading</a> start + applying (script execution is no longer managed by the parser). 
+ + </p><p>If any of the scripts in the <a href="section-scripting0.html#list-of1">list of scripts that + will execute as soon as possible</a> have <span>completed + loading</span><!-- XXX xref -->, or if the <a href="section-scripting0.html#list-of0">list of + scripts that will execute asynchronously</a> is not empty and the first + script in that list has <span>completed loading</span><!-- XXX xref + -->, + then the user agent must act as if those scripts just completed loading, + following the rules given for that in the <code><a href="section-scripting0.html#script0">script</a></code> element definition. + + </p><p>Then, if the <a href="section-scripting0.html#list-of">list of scripts that will execute when + the document has finished parsing</a> is not empty, and the first item in + this list has already <span>completed loading</span><!--XXX + xref -->, + then the user agent must act as if that script just finished loading. + + </p><p>By this point, there will be no scripts that have loaded but have not + yet been executed. + + </p><p>The user agent must then <a href="section-scripting.html#firing2">fire a simple event</a> + called <code title="event-DOMContentLoaded">DOMContentLoaded</code> at the + <code>Document</code>. + + </p><p>Once everything that <dfn id="delays" title="delay the load event">delays + the load event</dfn> has completed, the user agent must <a href="section-scripting.html#firing4" title="fire a load event">fire a <code title="event-load">load</code> + event</a> at <a href="section-dom-tree.html#the-body0">the <code>body</code> element</a>.</p> + <!-- XXX make sure things "delay the load event" --> + <!--XXX need to handle +http://lxr.mozilla.org/mozilla/source/parser/htmlparser/src/CNavDTD.cpp#2354 +2354 // Don't open transient styles if it makes the stack deep, bug 58917. 
+--> + <!--XXX +http://lxr.mozilla.org/mozilla/source/parser/htmlparser/src/nsHTMLTokenizer.cpp#749 +--> + <!-- +see also CTextToken::ConsumeCharacterData() for CDATA parsing? + +1212 1 Here's a tricky case from bug 22596: <h5><li><h5> +1213 How do we know that the 2nd <h5> should close the <LI> rather than nest inside the <LI>? +1214 (Afterall, the <h5> is a legal child of the <LI>). +1215 +1216 The way you know is that there is no root between the two, so the <h5> binds more +1217 tightly to the 1st <h5> than to the <LI>. +1218 2. Also, bug 6148 shows this case: <SPAN><DIV><SPAN> +1219 From this case we learned not to execute this logic if the parent is a block. +1220 +1221 3. Fix for 26583 +1222 Ex. <A href=foo.html><B>foo<A href-bar.html>bar</A></B></A> <- A legal HTML +1223 In the above example clicking on "foo" or "bar" should link to +1224 foo.html or bar.html respectively. That is, the inner <A> should be informed +1225 about the presence of an open <A> above <B>..so that the inner <A> can close out +1226 the outer <A>. The following code does it for us. +1227 +1228 4. Fix for 27865 [ similer to 22596 ]. Ex: <DL><DD><LI>one<DD><LI>two + - http://lxr.mozilla.org/mozilla/source/parser/htmlparser/src/CNavDTD.cpp#1211 + +815 // Here's a problem. If theTag is legal in here, we don't move it +816 // out. So if we're moving stuff out of here, the parent of theTag +817 // gets closed at this point. But some things are legal +818 // _everywhere_ and hence would effectively close out misplaced +819 // content in tables. This is undesirable, so treat them as +820 // illegal here so they'll be shipped out with their parents and +821 // siblings. See bug 40855 for an explanation (that bug was for +822 // comments, but the same issues arise with whitespace, newlines, +823 // noscript, etc). Script is special, though. Shipping it out +824 // breaks document.write stuff. See bug 243064. 
+ - http://lxr.mozilla.org/mozilla/source/parser/htmlparser/src/CNavDTD.cpp#825 + + +1326 /************************************************************************************** +1327 * +1328 * Now a little code to deal with bug #49687 (crash when layout stack gets too deep) +1329 * I've also opened this up to any container (not just inlines): re bug 55095 +1330 * Improved to handle bug 55980 (infinite loop caused when DEPTH is exceeded and +1331 * </P> is encountered by itself (<P>) is continuously produced. +1332 * +1333 **************************************************************************************/ + +1912 // Oh boy!! we found a "stray" tag. Nav4.x and IE introduce line break in +1913 // such cases. So, let's simulate that effect for compatibility. +1914 // Ex. <html><body>Hello</P>There</body></html> +http://lxr.mozilla.org/mozilla/source/parser/htmlparser/src/CNavDTD.cpp#1912 + +http://lxr.mozilla.org/seamonkey/search?string=nested +/parser/htmlparser/src/CNavDTD.cpp, line 791 - * 2. <CENTER><DL><DT><A><CENTER> allow nested <CENTER> +/parser/htmlparser/src/CNavDTD.cpp, line 792 - * 3. <TABLE><TR><TD><TABLE>... allow nested <TABLE> +/parser/htmlparser/src/CNavDTD.cpp, line 2562 - // Discard nested forms - bug 72639 +/parser/htmlparser/src/nsElementTable.cpp, line 1453 - * 2. <CENTER><DL><DT><A><CENTER> allow nested <CENTER> +/parser/htmlparser/src/nsElementTable.cpp, line 1454 - * 3. <TABLE><TR><TD><TABLE>... allow nested <TABLE> +/parser/htmlparser/src/nsElementTable.cpp, line 1901 - // Ex: <H1><LI><H1><LI>. Inner LI has the potential of getting nested +--> + + <script src="http://status.whatwg.org/annotate-web-apps.js" type="text/javascript"></script></body></html>
\ No newline at end of file diff --git a/test/data/html/web-apps.html b/test/data/html/web-apps.html new file mode 100644 index 0000000..d685320 --- /dev/null +++ b/test/data/html/web-apps.html @@ -0,0 +1,41271 @@ +<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN"> + +<html lang=en-GB-hixie> + <head> + <title>HTML 5</title> + <link href="/style/specification" rel=stylesheet type="text/css"> + <link href="/images/icon" rel=icon> + + <style type="text/css"> + h4 + .element { margin-top: -2.5em; padding-top: 2em; } + h4 + p + .element { margin-top: -5em; padding-top: 4em; } + .element { background: #EEFFEE; color: black; margin: 0 0 1em -1em; padding: 0 1em 0.25em 0.75em; border-left: solid #99FF99 0.25em; -padding: 0; /* that last decl is for IE6. Try removing it, it's hilarious! */ } + .proposal { border: blue solid; padding: 1em; } + table.matrix, table.matrix td { border: none; text-align: right; } + table.matrix { margin-left: 2em; } + </style> + + <body class=draft> + <div class=head> + <p><a class=logo href="http://www.whatwg.org/" rel=home><img alt=WHATWG + src="/images/logo"></a></p> + + <h1 id=html-5>HTML 5</h1> + + <h2 class="no-num no-toc" id=working>Working Draft — 14 June 2007</h2> + + <p>You can take part in this work. 
<a + href="http://www.whatwg.org/mailing-list">Join the working group's + discussion list.</a></p> + + <p><strong>Web designers!</strong> We have a <a + href="http://blog.whatwg.org/faq/">FAQ</a>, a <a + href="http://forums.whatwg.org/">forum</a>, and a <a + href="http://www.whatwg.org/mailing-list#help">help mailing list</a> for + you!</p> + + <dl> + <dt>One-page version: + + <dd><a + href="http://www.whatwg.org/specs/web-apps/current-work/">http://www.whatwg.org/specs/web-apps/current-work/</a> + + <dt>Multiple-page version: + + <dd><a + href="http://www.whatwg.org/specs/web-apps/current-work/multipage/">http://www.whatwg.org/specs/web-apps/current-work/multipage/</a> + + <dt>Version history: + + <dd>Twitter messages (non-editorial changes only): <a + href="http://twitter.com/WHATWG">http://twitter.com/WHATWG</a> + + <dd>Commit-Watchers mailing list: <a + href="http://lists.whatwg.org/listinfo.cgi/commit-watchers-whatwg.org">http://lists.whatwg.org/listinfo.cgi/commit-watchers-whatwg.org</a> + + <dd>Interactive Web interface: <a + href="http://html5.org/tools/web-apps-tracker">http://html5.org/tools/web-apps-tracker</a> + + <dd>Subversion interface: <a + href="http://svn.whatwg.org/">http://svn.whatwg.org/</a> + + <dt>Editor: + + <dd>Ian Hickson, Google, ian@hixie.ch + </dl> + + <p class=copyright>© Copyright 2004-2007 Apple Computer, Inc., + Mozilla Foundation, and Opera Software ASA.</p> + + <p class=copyright>You are granted a license to use, reproduce and create + derivative works of this document.</p> + </div> + + <hr> + + <h2 class="no-num no-toc" id=abstract>Abstract</h2> + + <p>This specification introduces features to HTML and the DOM that ease the + authoring of Web-based applications. Additions include the context menus, + a direct-mode graphics canvas, inline popup windows, and server-sent + events. 
+ + <h2 class="no-num no-toc" id=status>Status of this document</h2> + + <p><strong>This is a work in progress!</strong> This document is changing + on a daily if not hourly basis in response to comments and as a general + part of its development process. Comments are very welcome, please send + them to <a href="mailto:whatwg@whatwg.org">whatwg@whatwg.org</a>. Thank + you. + + <p>Implementors should be aware that this specification is not stable. + <strong>Implementors who are not taking part in the discussions are likely + to find the specification changing out from under them in incompatible + ways.</strong> Vendors interested in implementing this specification + before it eventually reaches the call for implementations should join the + <a href="/mailing-list">WHATWG mailing list</a> and take part in the + discussions. + + <p>This specification is also being produced by the <a + href="http://www.w3.org/html/wg">W3C HTML WG</a>. The two specifications + are identical from the table of contents onwards. + + <p>This specification is intended to replace (be the new version of) what + was previously the HTML4, XHTML 1.x, and DOM2 HTML specifications. + + <h3 class="no-num no-toc" id=stability0>Stability</h3> + + <p>Different parts of this specification are at different levels of + maturity. + + <div id=stability></div> + + <p class=big-issue>Known issues are usually marked like this. There are + some spec-wide issues that have not yet been addressed: case-sensitivity + is a very poorly handled topic right now, and the firing of events needs + to be unified (right now some bubble, some don't, they all use different + text to fire events, etc). + + <h2 class="no-num no-toc" id=contents>Table of contents</h2> + <!--begin-toc--> + + <ul class=toc> + <li><a href="#introduction"><span class=secno>1. </span>Introduction</a> + <ul class=toc> + <li><a href="#scope"><span class=secno>1.1. </span>Scope</a> + <ul class=toc> + <li><a href="#relationship"><span class=secno>1.1.1. 
+ </span>Relationship to HTML 4.01, XHTML 1.1, DOM2 HTML</a> + + <li><a href="#relationship0"><span class=secno>1.1.2. + </span>Relationship to XHTML2</a> + + <li><a href="#relationship1"><span class=secno>1.1.3. + </span>Relationship to XUL, Flash, Silverlight, and other proprietary + UI languages</a> + </ul> + + <li><a href="#structure"><span class=secno>1.2. </span>Structure of this + specification</a> + <ul class=toc> + <li><a href="#how-to"><span class=secno>1.2.1. </span>How to read this + specification</a> + </ul> + + <li><a href="#conformance"><span class=secno>1.3. </span>Conformance + requirements</a> + <ul class=toc> + <li><a href="#common"><span class=secno>1.3.1. </span>Common + conformance requirements for APIs exposed to JavaScript</a> + + <li><a href="#dependencies"><span class=secno>1.3.2. + </span>Dependencies</a> + + <li><a href="#features"><span class=secno>1.3.3. </span>Features + defined in other specifications</a> + </ul> + + <li><a href="#terminology"><span class=secno>1.4. </span>Terminology</a> + + <ul class=toc> + <li><a href="#html-vs"><span class=secno>1.4.1. </span>HTML vs + XHTML</a> + </ul> + </ul> + + <li><a href="#dom"><span class=secno>2. </span>The Document Object + Model</a> + <ul class=toc> + <li><a href="#documents"><span class=secno>2.1. </span>Documents</a> + <ul class=toc> + <li><a href="#security"><span class=secno>2.1.1. </span>Security</a> + + <li><a href="#resource"><span class=secno>2.1.2. </span>Resource + metadata management</a> + </ul> + + <li><a href="#elements"><span class=secno>2.2. </span>Elements</a> + <ul class=toc> + <li><a href="#reflecting"><span class=secno>2.2.1. </span>Reflecting + content attributes in DOM attributes</a> + </ul> + + <li><a href="#common0"><span class=secno>2.3. </span>Common DOM + interfaces</a> + <ul class=toc> + <li><a href="#collections"><span class=secno>2.3.1. + </span>Collections</a> + <ul class=toc> + <li><a href="#htmlcollection"><span class=secno>2.3.1.1. 
+ </span>HTMLCollection</a> + + <li><a href="#htmlformcontrolscollection"><span class=secno>2.3.1.2. + </span>HTMLFormControlsCollection</a> + + <li><a href="#htmloptionscollection"><span class=secno>2.3.1.3. + </span>HTMLOptionsCollection</a> + </ul> + + <li><a href="#domtokenlist"><span class=secno>2.3.2. + </span>DOMTokenList</a> + + <li><a href="#dom-feature"><span class=secno>2.3.3. </span>DOM feature + strings</a> + </ul> + + <li><a href="#dom-tree"><span class=secno>2.4. </span>DOM tree + accessors</a> + + <li><a href="#dynamic"><span class=secno>2.5. </span>Dynamic markup + insertion</a> + <ul class=toc> + <li><a href="#controlling"><span class=secno>2.5.1. </span>Controlling + the input stream</a> + + <li><a href="#dynamic0"><span class=secno>2.5.2. </span>Dynamic markup + insertion in HTML</a> + + <li><a href="#dynamic1"><span class=secno>2.5.3. </span>Dynamic markup + insertion in XML</a> + </ul> + + <li><a href="#apis-in"><span class=secno>2.6. </span>APIs in HTML + documents</a> + </ul> + + <li><a href="#semantics"><span class=secno>3. </span>Semantics and + structure of HTML elements</a> + <ul class=toc> + <li><a href="#semantics-intro"><span class=secno>3.1. + </span>Introduction</a> + + <li><a href="#common1"><span class=secno>3.2. </span>Common + microsyntaxes</a> + <ul class=toc> + <li><a href="#common2"><span class=secno>3.2.1. </span>Common parser + idioms</a> + + <li><a href="#boolean"><span class=secno>3.2.2. </span>Boolean + attributes</a> + + <li><a href="#numbers"><span class=secno>3.2.3. </span>Numbers</a> + <ul class=toc> + <li><a href="#unsigned"><span class=secno>3.2.3.1. </span>Unsigned + integers</a> + + <li><a href="#signed"><span class=secno>3.2.3.2. </span>Signed + integers</a> + + <li><a href="#real-numbers"><span class=secno>3.2.3.3. </span>Real + numbers</a> + + <li><a href="#ratios"><span class=secno>3.2.3.4. </span>Ratios</a> + + <li><a href="#percentages-and-dimensions"><span class=secno>3.2.3.5. 
+ </span>Percentages and dimensions</a> + + <li><a href="#lists"><span class=secno>3.2.3.6. </span>Lists of + integers</a> + </ul> + + <li><a href="#dates"><span class=secno>3.2.4. </span>Dates and + times</a> + <ul class=toc> + <li><a href="#specific"><span class=secno>3.2.4.1. </span>Specific + moments in time</a> + + <li><a href="#vaguer"><span class=secno>3.2.4.2. </span>Vaguer + moments in time</a> + </ul> + + <li><a href="#time-offsets"><span class=secno>3.2.5. </span>Time + offsets</a> + + <li><a href="#tokens"><span class=secno>3.2.6. </span>Tokens</a> + + <li><a href="#keywords"><span class=secno>3.2.7. </span>Keywords and + enumerated attributes</a> + + <li><a href="#syntax-references"><span class=secno>3.2.8. + </span>References</a> + </ul> + + <li><a href="#documents0"><span class=secno>3.3. </span>Documents and + document fragments</a> + <ul class=toc> + <li><a href="#semantics0"><span class=secno>3.3.1. + </span>Semantics</a> + + <li><a href="#structure0"><span class=secno>3.3.2. + </span>Structure</a> + + <li><a href="#kinds"><span class=secno>3.3.3. </span>Kinds of + elements</a> + <ul class=toc> + <li><a href="#block-level"><span class=secno>3.3.3.1. + </span>Block-level elements</a> + + <li><a href="#inline-level"><span class=secno>3.3.3.2. + </span>Inline-level content</a> + + <li><a href="#transparent"><span class=secno>3.3.3.3. + </span>Transparent content models</a> + + <li><a href="#determining"><span class=secno>3.3.3.4. + </span>Determining if a particular element contains block-level + elements or inline-level content</a> + + <li><a href="#interactive0"><span class=secno>3.3.3.5. + </span>Interactive elements</a> + + <li><a href="#paragraphs"><span class=secno>3.3.3.6. + </span>Paragraphs</a> + </ul> + </ul> + + <li><a href="#global"><span class=secno>3.4. </span>Global + attributes</a> + <ul class=toc> + <li><a href="#the-id"><span class=secno>3.4.1. 
</span>The + <code>id</code> attribute</a> + + <li><a href="#the-title"><span class=secno>3.4.2. </span>The + <code>title</code> attribute</a> + + <li><a href="#the-lang"><span class=secno>3.4.3. </span>The + <code>lang</code> (HTML only) and <code>xml:lang</code> (XML only) + attributes</a> + + <li><a href="#the-dir"><span class=secno>3.4.4. </span>The + <code>dir</code> attribute</a> + + <li><a href="#classes"><span class=secno>3.4.5. </span>The + <code>class</code> attribute</a> + + <li><a href="#the-irrelevant"><span class=secno>3.4.6. </span>The + <code>irrelevant</code> attribute</a> + </ul> + + <li><a href="#interaction"><span class=secno>3.5. </span>Interaction</a> + + <ul class=toc> + <li><a href="#activation"><span class=secno>3.5.1. + </span>Activation</a> + + <li><a href="#focus"><span class=secno>3.5.2. </span>Focus</a> + <ul class=toc> + <li><a href="#focus-management"><span class=secno>3.5.2.1. + </span>Focus management</a> + + <li><a href="#sequential"><span class=secno>3.5.2.2. + </span>Sequential focus navigation</a> + </ul> + + <li><a href="#scrolling"><span class=secno>3.5.3. </span>Scrolling + elements into view</a> + </ul> + + <li><a href="#the-root"><span class=secno>3.6. </span>The root + element</a> + <ul class=toc> + <li><a href="#the-html"><span class=secno>3.6.1. </span>The + <code>html</code> element</a> + </ul> + + <li><a href="#document"><span class=secno>3.7. </span>Document + metadata</a> + <ul class=toc> + <li><a href="#the-head"><span class=secno>3.7.1. </span>The + <code>head</code> element</a> + + <li><a href="#the-title0"><span class=secno>3.7.2. </span>The + <code>title</code> element</a> + + <li><a href="#the-base"><span class=secno>3.7.3. </span>The + <code>base</code> element</a> + + <li><a href="#the-link"><span class=secno>3.7.4. </span>The + <code>link</code> element</a> + + <li><a href="#meta"><span class=secno>3.7.5. 
</span>The + <code>meta</code> element</a> + <ul class=toc> + <li><a href="#standard"><span class=secno>3.7.5.1. </span>Standard + metadata names</a> + + <li><a href="#other"><span class=secno>3.7.5.2. </span>Other + metadata names</a> + + <li><a href="#pragma"><span class=secno>3.7.5.3. </span>Pragma + directives</a> + + <li><a href="#charset"><span class=secno>3.7.5.4. </span>Specifying + and establishing the document's character encoding</a> + </ul> + + <li><a href="#the-style"><span class=secno>3.7.6. </span>The + <code>style</code> element</a> + + <li><a href="#styling"><span class=secno>3.7.7. </span>Styling</a> + </ul> + + <li><a href="#sections"><span class=secno>3.8. </span>Sections</a> + <ul class=toc> + <li><a href="#the-body"><span class=secno>3.8.1. </span>The + <code>body</code> element</a> + + <li><a href="#the-section"><span class=secno>3.8.2. </span>The + <code>section</code> element</a> + + <li><a href="#the-nav"><span class=secno>3.8.3. </span>The + <code>nav</code> element</a> + + <li><a href="#the-article"><span class=secno>3.8.4. </span>The + <code>article</code> element</a> + + <li><a href="#the-blockquote"><span class=secno>3.8.5. </span>The + <code>blockquote</code> element</a> + + <li><a href="#the-aside"><span class=secno>3.8.6. </span>The + <code>aside</code> element</a> + + <li><a href="#the-h1"><span class=secno>3.8.7. </span>The + <code>h1</code>, <code>h2</code>, <code>h3</code>, <code>h4</code>, + <code>h5</code>, and <code>h6</code> elements</a> + + <li><a href="#the-header"><span class=secno>3.8.8. </span>The + <code>header</code> element</a> + + <li><a href="#the-footer"><span class=secno>3.8.9. </span>The + <code>footer</code> element</a> + + <li><a href="#the-address"><span class=secno>3.8.10. </span>The + <code>address</code> element</a> + + <li><a href="#headings"><span class=secno>3.8.11. </span>Headings and + sections</a> + <ul class=toc> + <li><a href="#outlines"><span class=secno>3.8.11.1. 
</span>Creating + an outline</a> + + <li><a href="#associatedSection"><span class=secno>3.8.11.2. + </span>Determining which heading and section applies to a + particular node</a> + + <li><a href="#distinguishing"><span class=secno>3.8.11.3. + </span>Distinguishing site-wide headers from page headers</a> + </ul> + </ul> + + <li><a href="#prose"><span class=secno>3.9. </span>Prose</a> + <ul class=toc> + <li><a href="#the-p"><span class=secno>3.9.1. </span>The + <code>p</code> element</a> + + <li><a href="#the-hr"><span class=secno>3.9.2. </span>The + <code>hr</code> element</a> + + <li><a href="#the-br"><span class=secno>3.9.3. </span>The + <code>br</code> element</a> + + <li><a href="#the-dialog"><span class=secno>3.9.4. </span>The + <code>dialog</code> element</a> + </ul> + + <li><a href="#preformatted"><span class=secno>3.10. </span>Preformatted + text</a> + <ul class=toc> + <li><a href="#the-pre"><span class=secno>3.10.1. </span>The + <code>pre</code> element</a> + </ul> + + <li><a href="#lists0"><span class=secno>3.11. </span>Lists</a> + <ul class=toc> + <li><a href="#the-ol"><span class=secno>3.11.1. </span>The + <code>ol</code> element</a> + + <li><a href="#the-ul"><span class=secno>3.11.2. </span>The + <code>ul</code> element</a> + + <li><a href="#the-li"><span class=secno>3.11.3. </span>The + <code>li</code> element</a> + + <li><a href="#the-dl"><span class=secno>3.11.4. </span>The + <code>dl</code> element</a> + + <li><a href="#the-dt"><span class=secno>3.11.5. </span>The + <code>dt</code> element</a> + + <li><a href="#the-dd"><span class=secno>3.11.6. </span>The + <code>dd</code> element</a> + </ul> + + <li><a href="#phrase"><span class=secno>3.12. </span>Phrase elements</a> + + <ul class=toc> + <li><a href="#the-a"><span class=secno>3.12.1. </span>The + <code>a</code> element</a> + + <li><a href="#the-q"><span class=secno>3.12.2. </span>The + <code>q</code> element</a> + + <li><a href="#the-cite"><span class=secno>3.12.3. 
</span>The + <code>cite</code> element</a> + + <li><a href="#the-em"><span class=secno>3.12.4. </span>The + <code>em</code> element</a> + + <li><a href="#the-strong"><span class=secno>3.12.5. </span>The + <code>strong</code> element</a> + + <li><a href="#the-small"><span class=secno>3.12.6. </span>The + <code>small</code> element</a> + + <li><a href="#the-m"><span class=secno>3.12.7. </span>The + <code>m</code> element</a> + + <li><a href="#the-dfn"><span class=secno>3.12.8. </span>The + <code>dfn</code> element</a> + + <li><a href="#the-abbr"><span class=secno>3.12.9. </span>The + <code>abbr</code> element</a> + + <li><a href="#the-time"><span class=secno>3.12.10. </span>The + <code>time</code> element</a> + + <li><a href="#the-meter"><span class=secno>3.12.11. </span>The + <code>meter</code> element</a> + + <li><a href="#the-progress"><span class=secno>3.12.12. </span>The + <code>progress</code> element</a> + + <li><a href="#the-code"><span class=secno>3.12.13. </span>The + <code>code</code> element</a> + + <li><a href="#the-var"><span class=secno>3.12.14. </span>The + <code>var</code> element</a> + + <li><a href="#the-samp"><span class=secno>3.12.15. </span>The + <code>samp</code> element</a> + + <li><a href="#the-kbd"><span class=secno>3.12.16. </span>The + <code>kbd</code> element</a> + + <li><a href="#the-sup"><span class=secno>3.12.17. </span>The + <code>sup</code> and <code>sub</code> elements</a> + + <li><a href="#the-span"><span class=secno>3.12.18. </span>The + <code>span</code> element</a> + + <li><a href="#the-i"><span class=secno>3.12.19. </span>The + <code>i</code> element</a> + + <li><a href="#the-b"><span class=secno>3.12.20. </span>The + <code>b</code> element</a> + + <li><a href="#the-bdo"><span class=secno>3.12.21. </span>The + <code>bdo</code> element</a> + </ul> + + <li><a href="#edits"><span class=secno>3.13. </span>Edits</a> + <ul class=toc> + <li><a href="#the-ins"><span class=secno>3.13.1. 
</span>The + <code>ins</code> element</a> + + <li><a href="#the-del"><span class=secno>3.13.2. </span>The + <code>del</code> element</a> + + <li><a href="#attributes"><span class=secno>3.13.3. </span>Attributes + common to <code>ins</code> and <code>del</code> elements</a> + </ul> + + <li><a href="#embedded"><span class=secno>3.14. </span>Embedded + content</a> + <ul class=toc> + <li><a href="#the-figure"><span class=secno>3.14.1. </span>The + <code>figure</code> element</a> + + <li><a href="#the-img"><span class=secno>3.14.2. </span>The + <code>img</code> element</a> + + <li><a href="#the-iframe"><span class=secno>3.14.3. </span>The + <code>iframe</code> element</a> + + <li><a href="#the-embed"><span class=secno>3.14.4. </span>The + <code>embed</code> element</a> + + <li><a href="#the-object"><span class=secno>3.14.5. </span>The + <code>object</code> element</a> + + <li><a href="#the-param"><span class=secno>3.14.6. </span>The + <code>param</code> element</a> + + <li><a href="#video"><span class=secno>3.14.7. </span>The + <code>video</code> element</a> + <ul class=toc> + <li><a href="#video0"><span class=secno>3.14.7.1. </span>Video and + audio codecs for <code>video</code> elements</a> + </ul> + + <li><a href="#audio"><span class=secno>3.14.8. </span>The + <code>audio</code> element</a> + <ul class=toc> + <li><a href="#audio0"><span class=secno>3.14.8.1. </span>Audio + codecs for <code>audio</code> elements</a> + </ul> + + <li><a href="#media"><span class=secno>3.14.9. </span>Media + elements</a> + <ul class=toc> + <li><a href="#error"><span class=secno>3.14.9.1. </span>Error + codes</a> + + <li><a href="#location"><span class=secno>3.14.9.2. </span>Location + of the media resource</a> + + <li><a href="#network0"><span class=secno>3.14.9.3. </span>Network + states</a> + + <li><a href="#loading"><span class=secno>3.14.9.4. </span>Loading + the media resource</a> + + <li><a href="#offsets"><span class=secno>3.14.9.5. 
</span>Offsets + into the media resource</a> + + <li><a href="#the-ready"><span class=secno>3.14.9.6. </span>The + ready states</a> + + <li><a href="#playing"><span class=secno>3.14.9.7. </span>Playing + the media resource</a> + + <li><a href="#seeking"><span class=secno>3.14.9.8. + </span>Seeking</a> + + <li><a href="#cue-points"><span class=secno>3.14.9.9. </span>Cue + points</a> + + <li><a href="#user-interface"><span class=secno>3.14.9.10. + </span>User interface</a> + + <li><a href="#time-range"><span class=secno>3.14.9.11. </span>Time + range</a> + + <li><a href="#mediaevents"><span class=secno>3.14.9.12. </span>Event + summary</a> + + <li><a href="#security0"><span class=secno>3.14.9.13. + </span>Security and privacy considerations</a> + </ul> + + <li><a href="#the-source"><span class=secno>3.14.10. </span>The + <code>source</code> element</a> + + <li><a href="#the-canvas"><span class=secno>3.14.11. </span>The + <code>canvas</code> element</a> + <ul class=toc> + <li><a href="#the-2d"><span class=secno>3.14.11.1. </span>The 2D + context</a> + <ul class=toc> + <li><a href="#the-canvas0"><span class=secno>3.14.11.1.1. + </span>The canvas state</a> + + <li><a href="#transformations"><span class=secno>3.14.11.1.2. + </span>Transformations</a> + + <li><a href="#compositing"><span class=secno>3.14.11.1.3. + </span>Compositing</a> + + <li><a href="#colors"><span class=secno>3.14.11.1.4. </span>Colors + and styles</a> + + <li><a href="#line-styles"><span class=secno>3.14.11.1.5. + </span>Line styles</a> + + <li><a href="#shadows"><span class=secno>3.14.11.1.6. + </span>Shadows</a> + + <li><a href="#simple"><span class=secno>3.14.11.1.7. </span>Simple + shapes (rectangles)</a> + + <li><a href="#complex"><span class=secno>3.14.11.1.8. + </span>Complex shapes (paths)</a> + + <li><a href="#images"><span class=secno>3.14.11.1.9. + </span>Images</a> + + <li><a href="#pixel"><span class=secno>3.14.11.1.10. 
</span>Pixel + manipulation</a> + + <li><a href="#drawing"><span class=secno>3.14.11.1.11. + </span>Drawing model</a> + </ul> + </ul> + + <li><a href="#the-map"><span class=secno>3.14.12. </span>The + <code>map</code> element</a> + + <li><a href="#the-area"><span class=secno>3.14.13. </span>The + <code>area</code> element</a> + + <li><a href="#image-maps"><span class=secno>3.14.14. </span>Image + maps</a> + </ul> + + <li><a href="#tabular"><span class=secno>3.15. </span>Tabular data</a> + <ul class=toc> + <li><a href="#the-table"><span class=secno>3.15.1. </span>The + <code>table</code> element</a> + + <li><a href="#the-caption"><span class=secno>3.15.2. </span>The + <code>caption</code> element</a> + + <li><a href="#the-colgroup"><span class=secno>3.15.3. </span>The + <code>colgroup</code> element</a> + + <li><a href="#the-col"><span class=secno>3.15.4. </span>The + <code>col</code> element</a> + + <li><a href="#the-tbody"><span class=secno>3.15.5. </span>The + <code>tbody</code> element</a> + + <li><a href="#the-thead"><span class=secno>3.15.6. </span>The + <code>thead</code> element</a> + + <li><a href="#the-tfoot"><span class=secno>3.15.7. </span>The + <code>tfoot</code> element</a> + + <li><a href="#the-tr"><span class=secno>3.15.8. </span>The + <code>tr</code> element</a> + + <li><a href="#the-td"><span class=secno>3.15.9. </span>The + <code>td</code> element</a> + + <li><a href="#the-th"><span class=secno>3.15.10. </span>The + <code>th</code> element</a> + + <li><a href="#processing"><span class=secno>3.15.11. </span>Processing + model</a> + <ul class=toc> + <li><a href="#forming"><span class=secno>3.15.11.1. </span>Forming a + table</a> + + <li><a href="#header-and-data-cell-semantics"><span + class=secno>3.15.11.2. </span>Forming relationships between data + cells and header cells</a> + </ul> + </ul> + + <li><a href="#forms"><span class=secno>3.16. </span>Forms</a> + <ul class=toc> + <li><a href="#the-form"><span class=secno>3.16.1. 
</span>The + <code>form</code> element</a> + + <li><a href="#the-fieldset"><span class=secno>3.16.2. </span>The + <code>fieldset</code> element</a> + + <li><a href="#the-input"><span class=secno>3.16.3. </span>The + <code>input</code> element</a> + + <li><a href="#the-button"><span class=secno>3.16.4. </span>The + <code>button</code> element</a> + + <li><a href="#the-label"><span class=secno>3.16.5. </span>The + <code>label</code> element</a> + + <li><a href="#the-select"><span class=secno>3.16.6. </span>The + <code>select</code> element</a> + + <li><a href="#the-datalist"><span class=secno>3.16.7. </span>The + <code>datalist</code> element</a> + + <li><a href="#the-optgroup"><span class=secno>3.16.8. </span>The + <code>optgroup</code> element</a> + + <li><a href="#the-option"><span class=secno>3.16.9. </span>The + <code>option</code> element</a> + + <li><a href="#the-textarea"><span class=secno>3.16.10. </span>The + <code>textarea</code> element</a> + + <li><a href="#the-output"><span class=secno>3.16.11. </span>The + <code>output</code> element</a> + + <li><a href="#processing0"><span class=secno>3.16.12. + </span>Processing model</a> + <ul class=toc> + <li><a href="#form-submission"><span class=secno>3.16.12.1. + </span>Form submission</a> + </ul> + </ul> + + <li><a href="#scripting0"><span class=secno>3.17. </span>Scripting</a> + <ul class=toc> + <li><a href="#script"><span class=secno>3.17.1. </span>The + <code>script</code> element</a> + <ul class=toc> + <li><a href="#scriptingLanguages"><span class=secno>3.17.1.1. + </span>Scripting languages</a> + </ul> + + <li><a href="#the-noscript"><span class=secno>3.17.2. </span>The + <code>noscript</code> element</a> + + <li><a href="#the-event-source"><span class=secno>3.17.3. </span>The + <code>event-source</code> element</a> + </ul> + + <li><a href="#interactive"><span class=secno>3.18. </span>Interactive + elements</a> + <ul class=toc> + <li><a href="#the-details"><span class=secno>3.18.1. 
</span>The + <code>details</code> element</a> + + <li><a href="#datagrid"><span class=secno>3.18.2. </span>The + <code>datagrid</code> element</a> + <ul class=toc> + <li><a href="#the-datagrid"><span class=secno>3.18.2.1. </span>The + <code>datagrid</code> data model</a> + + <li><a href="#how-rows"><span class=secno>3.18.2.2. </span>How rows + are identified</a> + + <li><a href="#the-data"><span class=secno>3.18.2.3. </span>The data + provider interface</a> + + <li><a href="#the-default"><span class=secno>3.18.2.4. </span>The + default data provider</a> + <ul class=toc> + <li><a href="#commonDefaultDataGridMethodDefinitions"><span + class=secno>3.18.2.4.1. </span>Common default data provider + method definitions for cells</a> + </ul> + + <li><a href="#populating"><span class=secno>3.18.2.5. + </span>Populating the <code>datagrid</code> element</a> + + <li><a href="#updating"><span class=secno>3.18.2.6. </span>Updating + the <code>datagrid</code></a> + + <li><a href="#requirements"><span class=secno>3.18.2.7. + </span>Requirements for interactive user agents</a> + + <li><a href="#the-selection"><span class=secno>3.18.2.8. </span>The + selection</a> + + <li><a href="#columns"><span class=secno>3.18.2.9. </span>Columns + and captions</a> + </ul> + + <li><a href="#the-command"><span class=secno>3.18.3. </span>The + <code>command</code> element</a> + + <li><a href="#menus"><span class=secno>3.18.4. </span>The + <code>menu</code> element</a> + <ul class=toc> + <li><a href="#menus-intro"><span class=secno>3.18.4.1. + </span>Introduction</a> + + <li><a href="#building"><span class=secno>3.18.4.2. </span>Building + menus</a> + + <li><a href="#context"><span class=secno>3.18.4.3. </span>Context + menus</a> + + <li><a href="#toolbars"><span class=secno>3.18.4.4. + </span>Toolbars</a> + </ul> + + <li><a href="#commands"><span class=secno>3.18.5. </span>Commands</a> + <ul class=toc> + <li><a href="#using"><span class=secno>3.18.5.1. 
</span>Using the + <code>a</code> element to define a command</a> + + <li><a href="#using0"><span class=secno>3.18.5.2. </span>Using the + <code>button</code> element to define a command</a> + + <li><a href="#using1"><span class=secno>3.18.5.3. </span>Using the + <code>input</code> element to define a command</a> + + <li><a href="#using2"><span class=secno>3.18.5.4. </span>Using the + <code>option</code> element to define a command</a> + + <li><a href="#using3"><span class=secno>3.18.5.5. </span>Using the + <code>command</code> element to define a command</a> + </ul> + </ul> + + <li><a href="#miscellaneous"><span class=secno>3.19. + </span>Miscellaneous elements</a> + <ul class=toc> + <li><a href="#the-legend"><span class=secno>3.19.1. </span>The + <code>legend</code> element</a> + + <li><a href="#the-div"><span class=secno>3.19.2. </span>The + <code>div</code> element</a> + </ul> + </ul> + + <li><a href="#web-browsers"><span class=secno>4. </span>Web browsers</a> + <ul class=toc> + <li><a href="#windows"><span class=secno>4.1. </span>Browsing + contexts</a> + <ul class=toc> + <li><a href="#nested"><span class=secno>4.1.1. </span>Nested browsing + contexts</a> + + <li><a href="#auxiliary"><span class=secno>4.1.2. </span>Auxiliary + browsing contexts</a> + + <li><a href="#secondary"><span class=secno>4.1.3. </span>Secondary + browsing contexts</a> + + <li><a href="#threads"><span class=secno>4.1.4. </span>Threads</a> + + <li><a href="#browsing"><span class=secno>4.1.5. </span>Browsing + context names</a> + </ul> + + <li><a href="#the-default0"><span class=secno>4.2. </span>The default + view</a> + <ul class=toc> + <li><a href="#security1"><span class=secno>4.2.1. </span>Security</a> + + <li><a href="#constructors"><span class=secno>4.2.2. + </span>Constructors</a> + + <li><a href="#apis-for"><span class=secno>4.2.3. </span>APIs for + creating and navigating browsing contexts by name</a> + + <li><a href="#accessing"><span class=secno>4.2.4. 
</span>Accessing + other browsing contexts</a> + </ul> + + <li><a href="#history"><span class=secno>4.3. </span>Session history and + navigation</a> + <ul class=toc> + <li><a href="#the-session"><span class=secno>4.3.1. </span>The session + history of browsing contexts</a> + + <li><a href="#the-history"><span class=secno>4.3.2. </span>The + <code>History</code> interface</a> + + <li><a href="#activating"><span class=secno>4.3.3. </span>Activating + state objects</a> + + <li><a href="#the-location"><span class=secno>4.3.4. </span>The + <code>Location</code> interface</a> + <ul class=toc> + <li><a href="#security2"><span class=secno>4.3.4.1. + </span>Security</a> + </ul> + + <li><a href="#history-notes"><span class=secno>4.3.5. + </span>Implementation notes for session history</a> + </ul> + + <li><a href="#links"><span class=secno>4.4. </span>Links</a> + <ul class=toc> + <li><a href="#hyperlink"><span class=secno>4.4.1. </span>Hyperlink + elements</a> + + <li><a href="#following"><span class=secno>4.4.2. </span>Following + hyperlinks</a> + <ul class=toc> + <li><a href="#hyperlink0"><span class=secno>4.4.2.1. + </span>Hyperlink auditing</a> + </ul> + + <li><a href="#linkTypes"><span class=secno>4.4.3. </span>Link + types</a> + <ul class=toc> + <li><a href="#link-type"><span class=secno>4.4.3.1. </span>Link type + "<code>alternate</code>"</a> + + <li><a href="#link-type0"><span class=secno>4.4.3.2. </span>Link + type "<code>archives</code>"</a> + + <li><a href="#link-type1"><span class=secno>4.4.3.3. </span>Link + type "<code>author</code>"</a> + + <li><a href="#link-type2"><span class=secno>4.4.3.4. </span>Link + type "<code>bookmark</code>"</a> + + <li><a href="#link-type3"><span class=secno>4.4.3.5. </span>Link + type "<code>contact</code>"</a> + + <li><a href="#link-type4"><span class=secno>4.4.3.6. </span>Link + type "<code>external</code>"</a> + + <li><a href="#link-type5"><span class=secno>4.4.3.7. 
</span>Link + type "<code>feed</code>"</a> + + <li><a href="#link-type6"><span class=secno>4.4.3.8. </span>Link + type "<code>help</code>"</a> + + <li><a href="#link-type7"><span class=secno>4.4.3.9. </span>Link + type "<code>icon</code>"</a> + + <li><a href="#link-type8"><span class=secno>4.4.3.10. </span>Link + type "<code>license</code>"</a> + + <li><a href="#link-type9"><span class=secno>4.4.3.11. </span>Link + type "<code>nofollow</code>"</a> + + <li><a href="#link-type10"><span class=secno>4.4.3.12. </span>Link + type "<code>pingback</code>"</a> + + <li><a href="#link-type11"><span class=secno>4.4.3.13. </span>Link + type "<code>prefetch</code>"</a> + + <li><a href="#link-type12"><span class=secno>4.4.3.14. </span>Link + type "<code>search</code>"</a> + + <li><a href="#link-type13"><span class=secno>4.4.3.15. </span>Link + type "<code>stylesheet</code>"</a> + + <li><a href="#link-type14"><span class=secno>4.4.3.16. </span>Link + type "<code>sidebar</code>"</a> + + <li><a href="#link-type15"><span class=secno>4.4.3.17. </span>Link + type "<code>tag</code>"</a> + + <li><a href="#hierarchical"><span class=secno>4.4.3.18. + </span>Hierarchical link types</a> + <ul class=toc> + <li><a href="#link-type16"><span class=secno>4.4.3.18.1. + </span>Link type "<code>first</code>"</a> + + <li><a href="#link-type17"><span class=secno>4.4.3.18.2. + </span>Link type "<code>index</code>"</a> + + <li><a href="#link-type18"><span class=secno>4.4.3.18.3. + </span>Link type "<code>last</code>"</a> + + <li><a href="#link-type19"><span class=secno>4.4.3.18.4. + </span>Link type "<code>next</code>"</a> + + <li><a href="#link-type20"><span class=secno>4.4.3.18.5. + </span>Link type "<code>prev</code>"</a> + + <li><a href="#link-type21"><span class=secno>4.4.3.18.6. + </span>Link type "<code>up</code>"</a> + </ul> + + <li><a href="#other0"><span class=secno>4.4.3.19. </span>Other link + types</a> + </ul> + </ul> + + <li><a href="#interfaces"><span class=secno>4.5. 
</span>Interfaces for + URI manipulation</a> + + <li><a href="#navigating"><span class=secno>4.6. </span>Navigating + across documents</a> + <ul class=toc> + <li><a href="#read-html"><span class=secno>4.6.1. </span>Page load + processing model for HTML files</a> + + <li><a href="#read-xml"><span class=secno>4.6.2. </span>Page load + processing model for XML files</a> + + <li><a href="#read-text"><span class=secno>4.6.3. </span>Page load + processing model for text files</a> + + <li><a href="#read-image"><span class=secno>4.6.4. </span>Page load + processing model for images</a> + + <li><a href="#read-plugin"><span class=secno>4.6.5. </span>Page load + processing model for content that uses plugins</a> + + <li><a href="#non-DOM-inline-content"><span class=secno>4.6.6. + </span>Page load processing model for inline content that doesn't + have a DOM</a> + + <li><a href="#scroll-to-fragid"><span class=secno>4.6.7. + </span>Scrolling to a fragment identifier</a> + </ul> + + <li><a href="#content-type-sniffing"><span class=secno>4.7. + </span>Determining the type of a new resource in a browsing context</a> + + <ul class=toc> + <li><a href="#content-type0"><span class=secno>4.7.1. + </span>Content-Type sniffing: text or binary</a> + + <li><a href="#content-type1"><span class=secno>4.7.2. + </span>Content-Type sniffing: unknown type</a> + + <li><a href="#content-type2"><span class=secno>4.7.3. + </span>Content-Type sniffing: image</a> + + <li><a href="#content-type3"><span class=secno>4.7.4. + </span>Content-Type sniffing: feed or HTML</a> + + <li><a href="#content-type"><span class=secno>4.7.5. + </span>Content-Type metadata</a> + </ul> + + <li><a href="#user-prompts"><span class=secno>4.8. </span>User + prompts</a> + + <li><a href="#scripting"><span class=secno>4.9. </span>Scripting</a> + <ul class=toc> + <li><a href="#running"><span class=secno>4.9.1. </span>Running + executable code</a> + + <li><a href="#origin"><span class=secno>4.9.2. 
</span>Origin</a> + + <li><a href="#security3"><span class=secno>4.9.3. </span>Security + exceptions</a> + + <li><a href="#javascript-protocol"><span class=secno>4.9.4. </span>The + <code title="">javascript:</code> protocol</a> + + <li><a href="#events"><span class=secno>4.9.5. </span>Events</a> + <ul class=toc> + <li><a href="#event-handler-attributes"><span class=secno>4.9.5.1. + </span>Event handler attributes</a> + + <li><a href="#event"><span class=secno>4.9.5.2. </span>Event + firing</a> + + <li><a href="#events0"><span class=secno>4.9.5.3. </span>Events and + the <code>Window</code> object</a> + + <li><a href="#runtime-script-errors"><span class=secno>4.9.5.4. + </span>Runtime script errors</a> + </ul> + </ul> + + <li><a href="#browser"><span class=secno>4.10. </span>Browser state</a> + <ul class=toc> + <li><a href="#offline"><span class=secno>4.10.1. </span>Offline Web + applications</a> + + <li><a href="#custom-handlers"><span class=secno>4.10.2. </span>Custom + protocol and content handlers</a> + <ul class=toc> + <li><a href="#security4"><span class=secno>4.10.2.1. </span>Security + and privacy</a> + + <li><a href="#sample-handler-impl"><span class=secno>4.10.2.2. + </span>Sample user interface</a> + </ul> + </ul> + + <li><a href="#storage"><span class=secno>4.11. </span>Client-side + session and persistent storage of name/value pairs</a> + <ul class=toc> + <li><a href="#introduction0"><span class=secno>4.11.1. + </span>Introduction</a> + + <li><a href="#the-storage"><span class=secno>4.11.2. </span>The + <code>Storage</code> interface</a> + + <li><a href="#the-storageitem"><span class=secno>4.11.3. </span>The + <code>StorageItem</code> interface</a> + + <li><a href="#the-sessionstorage"><span class=secno>4.11.4. </span>The + <code title=dom-sessionStorage>sessionStorage</code> attribute</a> + + <li><a href="#the-globalstorage"><span class=secno>4.11.5. 
</span>The + <code title=dom-globalStorage>globalStorage</code> attribute</a> + + <li><a href="#the-storage0"><span class=secno>4.11.6. </span>The <code + title=event-storage>storage</code> event</a> + + <li><a href="#miscellaneous0"><span class=secno>4.11.7. + </span>Miscellaneous implementation requirements for storage + areas</a> + <ul class=toc> + <li><a href="#disk-space"><span class=secno>4.11.7.1. </span>Disk + space</a> + + <li><a href="#threads0"><span class=secno>4.11.7.2. + </span>Threads</a> + </ul> + + <li><a href="#security5"><span class=secno>4.11.8. </span>Security and + privacy</a> + <ul class=toc> + <li><a href="#user-tracking"><span class=secno>4.11.8.1. </span>User + tracking</a> + + <li><a href="#cookie"><span class=secno>4.11.8.2. </span>Cookie + resurrection</a> + + <li><a href="#integrity"><span class=secno>4.11.8.3. + </span>Integrity of "public" storage areas</a> + + <li><a href="#cross-protocol"><span class=secno>4.11.8.4. + </span>Cross-protocol and cross-port attacks</a> + + <li><a href="#dns-spoofing"><span class=secno>4.11.8.5. </span>DNS + spoofing attacks</a> + + <li><a href="#cross-directory"><span class=secno>4.11.8.6. + </span>Cross-directory attacks</a> + + <li><a href="#public"><span class=secno>4.11.8.7. </span>Public + storage areas corresponding to hosts</a> + + <li><a href="#storage0"><span class=secno>4.11.8.8. </span>Storage + areas in the face of untrusted higher-level domains that do not + correspond to public storage areas</a> + + <li><a href="#storage1"><span class=secno>4.11.8.9. </span>Storage + areas in the face of untrusted subdomains</a> + + <li><a href="#implementation"><span class=secno>4.11.8.10. + </span>Implementation risks</a> + </ul> + </ul> + + <li><a href="#sql"><span class=secno>4.12. </span>Client-side database + storage</a> + <ul class=toc> + <li><a href="#introduction1"><span class=secno>4.12.1. + </span>Introduction</a> + + <li><a href="#executing"><span class=secno>4.12.2. 
</span>Executing + SQL statements</a> + + <li><a href="#database"><span class=secno>4.12.3. </span>Database + query results</a> + + <li><a href="#privacy"><span class=secno>4.12.4. </span>Privacy</a> + + <li><a href="#security6"><span class=secno>4.12.5. </span>Security</a> + + <ul class=toc> + <li><a href="#user-agents"><span class=secno>4.12.5.1. </span>User + agents</a> + + <li><a href="#sql-injection"><span class=secno>4.12.5.2. </span>SQL + injection</a> + </ul> + </ul> + </ul> + + <li><a href="#editing"><span class=secno>5. </span>Editing</a> + <ul class=toc> + <li><a href="#editing-intro"><span class=secno>5.1. + </span>Introduction</a> + + <li><a href="#contenteditable"><span class=secno>5.2. </span>The <code + title=attr-contenteditable>contenteditable</code> attribute</a> + <ul class=toc> + <li><a href="#user-editing"><span class=secno>5.2.1. </span>User + editing actions</a> + + <li><a href="#making"><span class=secno>5.2.2. </span>Making entire + documents editable</a> + </ul> + + <li><a href="#dnd"><span class=secno>5.3. </span>Drag and drop</a> + <ul class=toc> + <li><a href="#the-dragevent"><span class=secno>5.3.1. </span>The + <code>DragEvent</code> and <code>DataTransfer</code> interfaces</a> + + <li><a href="#events1"><span class=secno>5.3.2. </span>Events fired + during a drag-and-drop action</a> + + <li><a href="#drag-and-drop"><span class=secno>5.3.3. + </span>Drag-and-drop processing model</a> + <ul class=toc> + <li><a href="#when-the"><span class=secno>5.3.3.1. </span>When the + drag-and-drop operation starts or ends in another document</a> + + <li><a href="#when-the0"><span class=secno>5.3.3.2. </span>When the + drag-and-drop operation starts or ends in another application</a> + </ul> + + <li><a href="#the-draggable"><span class=secno>5.3.4. </span>The + <code>draggable</code> attribute</a> + + <li><a href="#copy-and"><span class=secno>5.3.5. </span>Copy and + paste</a> + <ul class=toc> + <li><a href="#copy-to"><span class=secno>5.3.5.1. 
</span>Copy to + clipboard</a> + + <li><a href="#cut-to"><span class=secno>5.3.5.2. </span>Cut to + clipboard</a> + + <li><a href="#paste"><span class=secno>5.3.5.3. </span>Paste from + clipboard</a> + + <li><a href="#paste0"><span class=secno>5.3.5.4. </span>Paste from + selection</a> + </ul> + + <li><a href="#security7"><span class=secno>5.3.6. </span>Security + risks in the drag-and-drop model</a> + </ul> + + <li><a href="#undo"><span class=secno>5.4. </span>Undo history</a> + <ul class=toc> + <li><a href="#the-undomanager"><span class=secno>5.4.1. </span>The + <code>UndoManager</code> interface</a> + + <li><a href="#undo-moving"><span class=secno>5.4.2. </span>Undo: + moving back in the undo transaction history</a> + + <li><a href="#redo-moving"><span class=secno>5.4.3. </span>Redo: + moving forward in the undo transaction history</a> + + <li><a href="#the-undomanagerevent"><span class=secno>5.4.4. + </span>The <code>UndoManagerEvent</code> interface and the <code + title=event-undo>undo</code> and <code title=event-redo>redo</code> + events</a> + + <li><a href="#implementation0"><span class=secno>5.4.5. + </span>Implementation notes</a> + </ul> + + <li><a href="#command"><span class=secno>5.5. </span>Command APIs</a> + + <li><a href="#selection"><span class=secno>5.6. </span>The text + selection APIs</a> + <ul class=toc> + <li><a href="#documentSelection"><span class=secno>5.6.1. </span>APIs + for the browsing context selection</a> + + <li><a href="#textFieldSelection"><span class=secno>5.6.2. </span>APIs + for the text field selections</a> + </ul> + </ul> + + <li><a href="#comms"><span class=secno>6. </span>Communication</a> + <ul class=toc> + <li><a href="#event0"><span class=secno>6.1. </span>Event + definitions</a> + + <li><a href="#server-sent-events"><span class=secno>6.2. + </span>Server-sent DOM events</a> + <ul class=toc> + <li><a href="#the-remoteeventtarget"><span class=secno>6.2.1. 
+ </span>The <code>RemoteEventTarget</code> interface</a> + + <li><a href="#connecting"><span class=secno>6.2.2. </span>Connecting + to an event stream</a> + + <li><a href="#parsing0"><span class=secno>6.2.3. </span>Parsing an + event stream</a> + + <li><a href="#event-stream-interpretation"><span class=secno>6.2.4. + </span>Interpreting an event stream</a> + + <li><a href="#notes"><span class=secno>6.2.5. </span>Notes</a> + </ul> + + <li><a href="#network"><span class=secno>6.3. </span>Network + connections</a> + <ul class=toc> + <li><a href="#network-intro"><span class=secno>6.3.1. + </span>Introduction</a> + + <li><a href="#the-connection"><span class=secno>6.3.2. </span>The + <code>Connection</code> interface</a> + + <li><a href="#connection"><span class=secno>6.3.3. </span>Connection + Events</a> + + <li><a href="#tcp-connections"><span class=secno>6.3.4. </span>TCP + connections</a> + + <li><a href="#broadcast"><span class=secno>6.3.5. </span>Broadcast + connections</a> + <ul class=toc> + <li><a href="#broadcasting"><span class=secno>6.3.5.1. + </span>Broadcasting over TCP/IP</a> + + <li><a href="#bluetooth-broadcast"><span class=secno>6.3.5.2. + </span>Broadcasting over Bluetooth</a> + + <li><a href="#irda-broadcast"><span class=secno>6.3.5.3. + </span>Broadcasting over IrDA</a> + </ul> + + <li><a href="#peer-to-peer"><span class=secno>6.3.6. + </span>Peer-to-peer connections</a> + <ul class=toc> + <li><a href="#peer-to-peer0"><span class=secno>6.3.6.1. + </span>Peer-to-peer connections over TCP/IP</a> + + <li><a href="#bluetooth-peer"><span class=secno>6.3.6.2. + </span>Peer-to-peer connections over Bluetooth</a> + + <li><a href="#irda-peer"><span class=secno>6.3.6.3. + </span>Peer-to-peer connections over IrDA</a> + </ul> + + <li><a href="#the-common"><span class=secno>6.3.7. </span>The common + protocol for TCP-based connections</a> + <ul class=toc> + <li><a href="#clients"><span class=secno>6.3.7.1. 
</span>Clients + connecting over TCP</a> + + <li><a href="#servers"><span class=secno>6.3.7.2. </span>Servers + accepting connections over TCP</a> + + <li><a href="#sending"><span class=secno>6.3.7.3. </span>Sending and + receiving data over TCP</a> + </ul> + + <li><a href="#network-security"><span class=secno>6.3.8. + </span>Security</a> + + <li><a href="#network-other-specs"><span class=secno>6.3.9. + </span>Relationship to other standards</a> + </ul> + + <li><a href="#crossDocumentMessages"><span class=secno>6.4. + </span>Cross-document messaging</a> + <ul class=toc> + <li><a href="#processing1"><span class=secno>6.4.1. </span>Processing + model</a> + </ul> + </ul> + + <li><a href="#repetition"><span class=secno>7. </span>Repetition + templates</a> + + <li><a href="#syntax"><span class=secno>8. </span>The HTML syntax</a> + <ul class=toc> + <li><a href="#writing"><span class=secno>8.1. </span>Writing HTML + documents</a> + <ul class=toc> + <li><a href="#the-doctype"><span class=secno>8.1.1. </span>The + DOCTYPE</a> + + <li><a href="#elements0"><span class=secno>8.1.2. </span>Elements</a> + <ul class=toc> + <li><a href="#start"><span class=secno>8.1.2.1. </span>Start + tags</a> + + <li><a href="#end-tags"><span class=secno>8.1.2.2. </span>End + tags</a> + + <li><a href="#attributes0"><span class=secno>8.1.2.3. + </span>Attributes</a> + + <li><a href="#optional"><span class=secno>8.1.2.4. </span>Optional + tags</a> + + <li><a href="#restrictions"><span class=secno>8.1.2.5. + </span>Restrictions on content models</a> + </ul> + + <li><a href="#text"><span class=secno>8.1.3. </span>Text</a> + <ul class=toc> + <li><a href="#newlines"><span class=secno>8.1.3.1. + </span>Newlines</a> + </ul> + + <li><a href="#character"><span class=secno>8.1.4. </span>Character + entity references</a> + + <li><a href="#comments"><span class=secno>8.1.5. </span>Comments</a> + </ul> + + <li><a href="#parsing"><span class=secno>8.2. 
</span>Parsing HTML + documents</a> + <ul class=toc> + <li><a href="#overview"><span class=secno>8.2.1. </span>Overview of + the parsing model</a> + + <li><a href="#the-input0"><span class=secno>8.2.2. </span>The input + stream</a> + + <li><a href="#tokenisation"><span class=secno>8.2.3. + </span>Tokenisation</a> + <ul class=toc> + <li><a href="#tokenising"><span class=secno>8.2.3.1. + </span>Tokenising entities</a> + </ul> + + <li><a href="#tree-construction"><span class=secno>8.2.4. </span>Tree + construction</a> + <ul class=toc> + <li><a href="#the-initial"><span class=secno>8.2.4.1. </span>The + initial phase</a> + + <li><a href="#the-root0"><span class=secno>8.2.4.2. </span>The root + element phase</a> + + <li><a href="#the-main"><span class=secno>8.2.4.3. </span>The main + phase</a> + <ul class=toc> + <li><a href="#the-stack"><span class=secno>8.2.4.3.1. </span>The + stack of open elements</a> + + <li><a href="#the-list"><span class=secno>8.2.4.3.2. </span>The + list of active formatting elements</a> + + <li><a href="#creating"><span class=secno>8.2.4.3.3. + </span>Creating and inserting HTML elements</a> + + <li><a href="#closing"><span class=secno>8.2.4.3.4. </span>Closing + elements that have implied end tags</a> + + <li><a href="#the-element"><span class=secno>8.2.4.3.5. </span>The + element pointers</a> + + <li><a href="#the-insertion"><span class=secno>8.2.4.3.6. + </span>The insertion mode</a> + + <li><a href="#how-to0"><span class=secno>8.2.4.3.7. </span>How to + handle tokens in the main phase</a> + </ul> + + <li><a href="#the-trailing"><span class=secno>8.2.4.4. </span>The + trailing end phase</a> + </ul> + + <li><a href="#the-end"><span class=secno>8.2.5. </span>The End</a> + </ul> + + <li><a href="#namespaces"><span class=secno>8.3. </span>Namespaces</a> + + <li><a href="#entities"><span class=secno>8.4. </span>Entities</a> + </ul> + + <li><a href="#wysiwyg"><span class=secno>9. 
</span>WYSIWYG editors</a> + <ul class=toc> + <li><a href="#presentational"><span class=secno>9.1. + </span>Presentational markup</a> + <ul class=toc> + <li><a href="#wysiwyg0"><span class=secno>9.1.1. </span>WYSIWYG + signature</a> + + <li><a href="#the-font"><span class=secno>9.1.2. </span>The + <code>font</code> element</a> + </ul> + </ul> + + <li><a href="#rendering"><span class=secno>10. </span>Rendering</a> + <ul class=toc> + <li><a href="#rendering0"><span class=secno>10.1. </span>Rendering and + the DOM</a> + </ul> + + <li><a href="#no"><span class=secno>11. </span>Things that you can't do + with this specification because they are better handled using other + technologies that are further described herein</a> + <ul class=toc> + <li><a href="#localisation"><span class=secno>11.1. + </span>Localisation</a> + + <li><a href="#declarative"><span class=secno>11.2. </span>Declarative 2D + vector graphics and animation</a> + + <li><a href="#declarative0"><span class=secno>11.3. </span>Declarative + 3D scenes</a> + + <li><a href="#timers"><span class=secno>11.4. </span>Timers</a> + + <li><a href="#events2"><span class=secno>11.5. </span>Events</a> + </ul> + + <li class=no-num><a href="#references">References</a> + + <li class=no-num><a href="#acknowledgements">Acknowledgements</a> + </ul> + <!--end-toc--> + + <hr> + + <h2 id=introduction><span class=secno>1. </span>Introduction</h2> + + <p><em>This section is non-normative.</em> + + <p>The World Wide Web's markup language has always been HTML. HTML was + primarily designed as a language for semantically describing scientific + documents, although its general design and adaptations over the years have + enabled it to be used to describe a number of other types of documents. + + <p>The main area that has not been adequately addressed by HTML is a vague + subject referred to as Web Applications. 
This specification attempts to + rectify this, while at the same time updating the HTML specifications to + address issues raised in the past few years. + + <h3 id=scope><span class=secno>1.1. </span>Scope</h3> + + <p><em>This section is non-normative.</em> + + <p>This specification is limited to providing a semantic-level markup + language and associated semantic-level scripting APIs for authoring + accessible pages on the Web ranging from static documents to dynamic + applications. + + <p>The scope of this specification does not include addressing presentation + concerns (although default rendering rules for Web browsers are included + at the end of this specification). + + <p>The scope of this specification does not include documenting every HTML + or DOM feature supported by Web browsers. Browsers support many features + that are considered to be very bad for accessibility or that are otherwise + inappropriate. For example, the <code>blink</code> element is clearly + presentational and authors wishing to cause text to blink should instead + use CSS. + + <p>The scope of this specification is not to describe an entire operating + system. In particular, hardware configuration software, image manipulation + tools, and applications that users would be expected to use with high-end + workstations on a daily basis are out of scope. In terms of applications, + this specification is targeted specifically at applications that would be + expected to be used by users on an occasional basis, or regularly but from + disparate locations, with low CPU requirements. For instance online + purchasing systems, searching systems, games (especially multiplayer + online games), public telephone books or address books, communications + software (e-mail clients, instant messaging clients, discussion software), + document editing software, etc. 
+ + <p>For sophisticated cross-platform applications, there already exist + several proprietary solutions (such as Mozilla's XUL and Macromedia's + Flash). These solutions are evolving faster than any standards process + could follow, and the requirements are evolving even faster. These systems + are also significantly more complicated to specify, and are orders of + magnitude more difficult to achieve interoperability with, than the + solutions described in this document. Platform-specific solutions for such + sophisticated applications (for example the MacOS X Core APIs) are even + further ahead. + + <h4 id=relationship><span class=secno>1.1.1. </span>Relationship to HTML + 4.01, XHTML 1.1, DOM2 HTML</h4> + + <p><em>This section is non-normative.</em> + + <p>This specification represents a new version of HTML4 and XHTML1, along + with a new version of the associated DOM2 HTML API. Migration from HTML4 + or XHTML1 to the format and APIs described in this specification should in + most cases be straightforward, as care has been taken to ensure that + backwards-compatibility is retained.</p> + <!-- XXX refs --> + + <p>This specification will eventually supplant Web Forms 2.0 as well. <a + href="#refsWF2">[WF2]</a> + + <h4 id=relationship0><span class=secno>1.1.2. </span>Relationship to XHTML2</h4> + + <p><em>This section is non-normative.</em> + + <p>XHTML2 <a href="#refsXHTML2">[XHTML2]</a> defines a new HTML vocabulary + with better features for hyperlinks, multimedia content, annotating + document edits, rich metadata, declarative interactive forms, and + describing the semantics of human literary works such as poems and + scientific papers. + + <p>However, it lacks elements to express the semantics of many of the + non-document types of content often seen on the Web. For instance, forum + sites, auction sites, search engines, online shops, and the like, do not + fit the document metaphor well, and are not covered by XHTML2. 
+ + <p><em>This</em> specification aims to extend HTML so that it is also + suitable in these contexts. + + <p>XHTML2 and this specification use different namespaces and therefore can + both be implemented in the same XML processor. + + <h4 id=relationship1><span class=secno>1.1.3. </span>Relationship to XUL, + Flash, Silverlight, and other proprietary UI languages</h4> + + <p><em>This section is non-normative.</em> + + <p>This specification is independent of the various proprietary UI + languages that various vendors provide. As an open, vendor-neutral + language, HTML provides for a solution to the same problems without the + risk of vendor lock-in. + + <h3 id=structure><span class=secno>1.2. </span>Structure of this + specification</h3> + + <p><em>This section is non-normative.</em> + + <p>This specification is divided into the following important sections: + + <dl> + <dt><a href="#dom">The DOM</a> + + <dd>The DOM, or Document Object Model, provides a base for the rest of the + specification. + + <dt><a href="#semantics">The Semantics</a> + + <dd>Documents are built from elements. These elements form a tree using + the DOM. Each element also has a predefined meaning, which is explained + in this section. User agent requirements for how to handle each element + are also given, along with rules for authors on how to use the element. + + <dt><a href="#windows">Browsing Contexts</a> + + <dd>HTML documents do not exist in a vacuum — this section defines + many of the features that affect environments that deal with multiple + pages, links between pages, and running scripts. + + <dt>APIs + + <dd><a href="#editing">The Editing APIs</a>: HTML documents can provide a + number of mechanisms for users to modify content, which are described in + this section. 
+ + <dd><a href="#comms">The Communication APIs</a>: Applications written in + HTML often require mechanisms to communicate with remote servers, as well + as communicating with other applications from different domains running + on the same client. + + <dd><a href="#repetition">Repetition Templates</a>: A mechanism to support + repeating sections in forms. + + <dt><a href="#syntax">The Language Syntax</a> + + <dd>All of these features would be for naught if they couldn't be + represented in a serialised form and sent to other people, and so this + section defines the syntax of HTML, along with rules for how to parse + HTML. + </dl> + + <p>There are also a couple of appendices, defining <a href="#wysiwyg">shims + for WYSIWYG editors</a>, <a href="#rendering">rendering rules</a> for Web + browsers, and listing <a href="#no">areas that are out of scope</a> for + this specification. + + <h4 id=how-to><span class=secno>1.2.1. </span>How to read this + specification</h4> + + <p>This specification should be read like all other specifications. First, + it should be read cover-to-cover, multiple times. Then, it should be read + backwards at least once. Then it should be read by picking random sections + from the contents list and following all the cross-references. + + <h3 id=conformance><span class=secno>1.3. </span>Conformance requirements</h3> + + <p>All diagrams, examples, and notes in this specification are + non-normative, as are all sections explicitly marked non-normative. + Everything else in this specification is normative. + + <p>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", + "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in the + normative parts of this document are to be interpreted as described in + RFC2119. For readability, these words do not appear in all uppercase + letters in this specification. 
<a href="#refsRFC2119">[RFC2119]</a></p> + <!-- XXX but they should be marked up --> + + <p>This specification describes the conformance criteria for user agents + (relevant to implementors) and documents (relevant to authors and + authoring tool implementors). + + <p class=note>There is no implied relationship between document conformance + requirements and implementation conformance requirements. User agents are + not free to handle non-conformant documents as they please; the processing + model described in this specification applies to implementations + regardless of the conformity of the input documents.</p> + <!--XXX quite possible that + this is stated twice. check for whether this is a dupe. --> + + <p>User agents fall into several (overlapping) categories with different + conformance requirements. + + <dl> + <dt id=interactive>Web browsers and other interactive user agents + + <dd> + <p>Web browsers that support <a href="#xhtml5">XHTML</a> must process + elements and attributes from the <a href="#html-namespace0">HTML + namespace</a> found in <a href="#xml-documents">XML documents</a> as + described in this specification, so that users can interact with them, + unless the semantics of those elements have been overridden by other + specifications.</p> + + <p class=example>A conforming XHTML processor would, upon finding an + XHTML <code><a href="#script0">script</a></code> element in an XML + document, execute the script contained in that element. 
However, if the + element is found within an XSLT transformation sheet (assuming the UA + also supports XSLT), then the processor would instead treat the <code><a + href="#script0">script</a></code> element as an opaque element that + forms part of the transform.</p> + + <p>Web browsers that support <a href="#html5" title=HTML5>HTML</a> must + process documents labelled as <code>text/html</code> as described in + this specification, so that users can interact with them.</p> + + <dt id=non-interactive>Non-interactive presentation user agents + + <dd> + <p>User agents that process HTML and XHTML documents purely to render + non-interactive versions of them must comply to the same conformance + criteria as Web browsers, except that they are exempt from requirements + regarding user interaction.</p> + + <p class=note>Typical examples of non-interactive presentation user + agents are printers (static UAs) and overhead displays (dynamic UAs). It + is expected that most static non-interactive presentation user agents + will also opt to <a href="#non-scripted">lack scripting support</a>.</p> + + <p class=example>A non-interactive but dynamic presentation UA would + still execute scripts, allowing forms to be dynamically submitted, and + so forth. However, since the concept of "focus" is irrelevant when the + user cannot interact with the document, the UA would not need to support + any of the focus-related DOM APIs.</p> + + <dt><dfn id=non-scripted>User agents with no scripting support</dfn> + + <dd> + <p>Implementations that do not support scripting (or which have their + scripting features <a href="#scripting1" title="scripting is + disabled">disabled</a>) are exempt from supporting the events and DOM + interfaces mentioned in this specification. 
For the parts of this + specification that are defined in terms of an events model or in terms + of the DOM, such user agents must still act as if events and the DOM + were supported.</p> + + <p class=note>Scripting can form an integral part of an application. Web + browsers that do not support scripting, or that have scripting disabled, + might be unable to fully convey the author's intent.</p> + + <dt>Conformance checkers + + <dd id=conformance-checkers> + <p>Conformance checkers must verify that a document conforms to the + applicable conformance criteria described in this specification. + Conformance checkers are exempt from detecting errors that require + interpretation of the author's intent (for example, while a document is + non-conforming if the content of a <code><a + href="#blockquote">blockquote</a></code> element is not a quote, + conformance checkers do not have to check that <code><a + href="#blockquote">blockquote</a></code> elements only contain quoted + material).</p> + + <p>Conformance checkers must check that the input document conforms when + <a href="#scripting1">scripting is disabled</a>, and should also check + that the input document conforms when <a href="#scripting2">scripting is + enabled</a>. (This is only a "SHOULD" and not a "MUST" requirement + because it has been proven to be impossible. <a + href="#refsHALTINGPROBLEM">[HALTINGPROBLEM]</a>)</p> + <!-- XXX + [Computable] On computable numbers, with an application to the + Entscheidungsproblem. Alan M. Turing. In Proceedings of the London + Mathematical Society, series 2, volume 42, pages 230-265. London + Mathematical Society, + 1937. http://www.turingarchive.org/browse.php/B/12 (referenced: + 2007-03-03) + --> + + <p>The term "HTML5 validator" can be used to refer to a conformance + checker that itself conforms to the applicable requirements of this + specification.</p> + + <div class=note> + <p>XML DTDs cannot express all the conformance requirements of this + specification. 
Therefore, a validating XML processor and a DTD cannot + constitute a conformance checker. Also, since neither of the two + authoring formats defined in this specification are applications of + SGML, a validating SGML system cannot constitute a conformance checker + either.</p> + + <p>To put it another way, there are three types of conformance criteria:</p> + + <ol> + <li>Criteria that can be expressed in a DTD. + + <li>Criteria that cannot be expressed by a DTD, but can still be + checked by a machine. + + <li>Criteria that can only be checked by a human. + </ol> + + <p>A conformance checker must check for the first two. A simple + DTD-based validator only checks for the first class of errors and is + therefore not a conforming conformance checker according to this + specification.</p> + </div> + + <dt>Data mining tools + + <dd id=data-mining> + <p>Applications and tools that process HTML and XHTML documents for + reasons other than to either render the documents or check them for + conformance should act in accordance to the semantics of the documents + that they process.</p> + + <p class=example>A tool that generates <span title="sections and + headings">document outlines</span> but increases the nesting level for + each paragraph and does not increase the nesting level for each section + would not be conforming.</p> + + <dt id=editors>Authoring tools and markup generators + + <dd> + <p>Authoring tools and markup generators must generate conforming + documents. 
Conformance criteria that apply to authors also apply to + authoring tools, where appropriate.</p> + + <p>Authoring tools are exempt from the strict requirements of using + elements only for their specified purpose, but only to the extent that + authoring tools are not yet able to determine author intent.</p> + + <p class=example>For example, it is not conforming to use an <code><a + href="#address">address</a></code> element for arbitrary contact + information; that element can only be used for marking up contact + information for the author of the document or section. However, since an + authoring tool is likely unable to determine the difference, an + authoring tool is exempt from that requirement.</p> + + <p class=note>In terms of conformance checking, an editor is therefore + required to output documents that conform to the same extent that a + conformance checker will verify.</p> + + <p>When an authoring tool is used to edit a non-conforming document, it + may preserve the conformance errors in sections of the document that + were not edited during the editing session (i.e. an editing tool is + allowed to round-trip erroneous content).
However, an authoring tool + must not claim that the output is conformant if errors have been so + preserved.</p> + + <p>Authoring tools are expected to come in two broad varieties: tools + that work from structure or semantic data, and tools that work on a + What-You-See-Is-What-You-Get media-specific editing basis (WYSIWYG).</p> + + <p>The former is the preferred mechanism for tools that author HTML, + since the structure in the source information can be used to make + informed choices regarding which HTML elements and attributes are most + appropriate.</p> + + <p>However, WYSIWYG tools are legitimate, and this specification <a + href="#wysiwyg1" title="WYSIWYG editors">makes certain concessions to + WYSIWYG editors</a>.</p> + + <p>All authoring tools, whether WYSIWYG or not, should make a best effort + attempt at enabling users to create well-structured, semantically rich, + media-independent content.</p> + </dl> + + <p>Some conformance requirements are phrased as requirements on elements, + attributes, methods or objects. Such requirements fall into two + categories; those describing content model restrictions, and those + describing implementation behaviour. The former category of requirements + are requirements on documents and authoring tools. The second category are + requirements on user agents. + + <p>Conformance requirements phrased as algorithms or specific steps may be + implemented in any manner, so long as the end result is equivalent. (In + particular, the algorithms defined in this specification are intended to + be easy to follow, and not intended to be performant.) + + <p id=hardwareLimitations>User agents may impose implementation-specific + limits on otherwise unconstrained inputs, e.g. to prevent denial of + service attacks, to guard against running out of memory, or to work around + platform-specific limitations. 
+ + <p>For compatibility with existing content and prior specifications, this + specification describes two authoring formats: one based on XML (referred + to as <dfn id=xhtml5 title=XHTML>XHTML5</dfn>), and one using a <a + href="#parsing">custom format</a> inspired by SGML (referred to as <dfn + id=html5>HTML5</dfn>). Implementations may support only one of these two + formats, although supporting both is encouraged. + + <p id=authors-using-xhtml><a href="#xhtml5">XHTML</a> documents (<a + href="#xml-documents">XML documents</a> using elements from the <a + href="#html-namespace0">HTML namespace</a>) that use the new features + described in this specification and that are served over the wire (e.g. by + HTTP) must be sent using an XML MIME type such as + <code>application/xml</code> or <code>application/xhtml+xml</code> and + must not be served as <code>text/html</code>. <a + href="#refsRFC3023">[RFC3023]</a> + + <p>Such XML documents may contain a <code>DOCTYPE</code> if desired, but + this is not required to conform to this specification. + + <p class=note>According to the XML specification, XML processors are not + guaranteed to process the external DTD subset referenced in the DOCTYPE. + This means, for example, that using entities for characters in XHTML + documents is unsafe (except for &lt;, &gt;, &amp;, &quot; + and &apos;). For interoperability, authors are advised to avoid + optional features of XML. + + <p id=authors-using-html><a href="#html5" title=HTML5>HTML documents</a>, + if they are served over the wire (e.g. by HTTP) must be labelled with the + <code>text/html</code> MIME type.</p> + <!-- + XXX update RFC 2854 --> + + <p id=entity-references>The language in this specification assumes that the + user agent expands all entity references, and therefore does not include + entity reference nodes in the DOM. 
If user agents do include entity + reference nodes in the DOM, then user agents must handle them as if they + were fully expanded when implementing this specification. For example, if + a requirement talks about an element's child text nodes, then any text + nodes that are children of an entity reference that is a child of that + element would be used as well.</p> + <!-- XXX unexpandable entities? --> + + <h4 id=common><span class=secno>1.3.1. </span>Common conformance + requirements for APIs exposed to JavaScript</h4> + + <p class=big-issue>A lot of arrays/lists/<span>collection</span>s in this + spec assume zero-based indexes but use the term "<var + title="">index</var>th" liberally. We should define those to be zero-based + and be clearer about this. + + <p>Unless otherwise specified, if a DOM attribute that is a floating point + number type (<code title="">float</code>) is assigned an Infinity or + Not-a-Number value, a <code title=big-issue>NOT_SUPPORTED_ERR</code> + exception must be raised. + + <p>Unless otherwise specified, if a DOM attribute that is a signed numeric + type is assigned a negative value, a <code + title=big-issue>NOT_SUPPORTED_ERR</code> exception must be raised. + + <p>Unless otherwise specified, if a method with an argument that is a floating + point number type (<code title="">float</code>) is passed an Infinity or + Not-a-Number value, a <code title=big-issue>NOT_SUPPORTED_ERR</code> + exception must be raised. + + <p>Unless otherwise specified, if a method is passed fewer arguments than is + defined for that method in its IDL definition, a <code + title=big-issue>NOT_SUPPORTED_ERR</code> exception must be raised. + + <p>Unless otherwise specified, if a method is passed more arguments than is + defined for that method in its IDL definition, the excess arguments must + be ignored.
+ + <p>Unless otherwise specified, if a method is expecting, as one of its + arguments, as defined by its IDL definition, an object implementing a + particular interface <var title="">X</var>, and the argument passed is an + object whose [[Class]] property is neither that interface <var + title="">X</var>, nor the name of an interface <var title="">Y</var> where + this specification requires that all objects implementing interface <var + title="">Y</var> also implement interface <var title="">X</var>, nor the + name of an interface that inherits from the expected interface <var + title="">X</var>, then a <code title="">TYPE_MISMATCH_ERR</code> exception + must be raised. + + <p class=big-issue>Anything else? Passing the wrong type of object, maybe? + Implied conversions to int/float? + + <h4 id=dependencies><span class=secno>1.3.2. </span>Dependencies</h4> + + <p>This specification relies on several other underlying specifications. + + <dl> + <dt>XML + + <dd> + <p>Implementations that support XHTML5 must support some version of XML, + as well as its corresponding namespaces specification, because XHTML5 + uses an XML serialisation with namespaces. <a href="#refsXML">[XML]</a> + <a href="#refsXMLNAMES">[XMLNAMES]</a></p> + + <dt>XML Base + + <dd> + <p id=xmlBase>User agents must follow the rules given by XML Base to + resolve relative URIs in HTML and XHTML fragments. That is the mechanism + used in this specification for resolving relative URIs in DOM trees. <a + href="#refsXMLBASE">[XMLBASE]</a></p> + + <p class=note>It is possible for <code + title=attr-xml-base>xml:base</code> attributes to be present even in + HTML fragments, as such attributes can be added dynamically using + script.</p> + + <dt>DOM + + <dd> + <p>Implementations must support some version of DOM Core and DOM Events, + because this specification is defined in terms of the DOM, and some of + the features are defined as extensions to the DOM Core interfaces.
<a + href="#refsDOM3CORE">[DOM3CORE]</a> <a + href="#refsDOM3EVENTS">[DOM3EVENTS]</a></p> + + <dt>ECMAScript + + <dd> + <p>Implementations that use ECMAScript to implement the APIs defined in + this specification must implement them in a manner consistent with the + ECMAScript Bindings for DOM Specifications specification, as this + specification uses that specification's terminology. <a + href="#refsEBFD">[EBFD]</a></p> + </dl> + + <p>This specification does not require support of any particular network + transport protocols, image formats, audio formats, video formats, style + sheet language, scripting language, or any of the DOM and WebAPI + specifications beyond those described above. However, the language + described by this specification is biased towards CSS as the styling + language, ECMAScript as the scripting language, and HTTP as the network + protocol, and several features assume that those languages and protocols + are in use. + + <h4 id=features><span class=secno>1.3.3. </span>Features defined in other + specifications</h4> + + <p>Some elements are defined in terms of their DOM <dfn + id=textcontent><code>textContent</code></dfn> attribute. This is an + attribute defined on the <code>Node</code> interface in DOM3 Core. <a + href="#refsDOM3CORE">[DOM3CORE]</a> + + <p class=big-issue>Should textContent be defined differently for dir="" and + <bdo>? Should we come up with an alternative to textContent that + handles those and other things, like alt=""?</p> + <!-- This section is currently here exclusively so that we crossref + to textContent. XXX also add event-click, event-change, + event-DOMActivate, etc, here, and just have the section be a general + "defined in other specifications" section --> + + <p>The term <dfn id=activation0>activation behavior</dfn> is used as + defined in the DOM3 Events specification.
<a + href="#refsDOM3EVENTS">[DOM3EVENTS]</a> <span class=big-issue>At the time + of writing, DOM3 Events hadn't yet been updated to define that + phrase.</span> + + <p id=alternate-style-sheets>The rules for handling alternative style + sheets are defined in the CSS object model specification. <a + href="#CSSOM">[CSSOM]</a> + + <p class=big-issue>See <a + href="http://dev.w3.org/cvsweb/~checkout~/csswg/cssom/Overview.html?rev=1.35&content-type=text/html;%20charset=utf-8">http://dev.w3.org/cvsweb/~checkout~/csswg/cssom/Overview.html?rev=1.35&content-type=text/html;%20charset=utf-8</a> + + <p>Certain features are defined in terms of CSS <color> values. When + the CSS value <code title="">currentColor</code> is specified in this + context, the "computed value of the 'color' property" for the purposes of + determining the computed value of the <code title="">currentColor</code> + keyword is the computed value of the 'color' property on the element in + question. <a href="#refsCSS3COLOR">[CSS3COLOR]</a> + + <p class=example>If a canvas gradient's <code + title=dom-canvasgradient-addColorStop><a + href="#addcolorstop">addColorStop()</a></code> method is called with the + <code title="">currentColor</code> keyword as the color, then the computed + value of the 'color' property on the <code><a + href="#canvas">canvas</a></code> element is the one that is used. + + <h3 id=terminology><span class=secno>1.4. </span>Terminology</h3> + + <p>This specification refers to both HTML and XML attributes and DOM + attributes, often in the same context. When it is not clear which is being + referred to, they are referred to as <dfn id=content>content + attributes</dfn> for HTML and XML attributes, and <dfn + id=dom-attributes>DOM attributes</dfn> for those from the DOM. Similarly, + the term "properties" is used for both ECMAScript object properties and + CSS properties. When these are ambiguous they are qualified as object + properties and CSS properties respectively. 
+ + <p id=html-namespace>To ease migration from HTML to XHTML, UAs conforming + to this specification will place elements in HTML in the + <code>http://www.w3.org/1999/xhtml</code> namespace, at least for the + purposes of the DOM and CSS. The term "<dfn id=elements1>elements in the + HTML namespace</dfn>", or "<dfn id=html-elements>HTML elements</dfn>" for + short, when used in this specification, thus refers to both HTML and XHTML + elements. + + <p>Unless otherwise stated, all elements defined or mentioned in this + specification are in the <code>http://www.w3.org/1999/xhtml</code> + namespace, and all attributes defined or mentioned in this specification + have no namespace (they are in the per-element partition). + + <p>The term <a href="#html-">HTML documents</a> is sometimes used in + contrast with <a href="#xml-documents">XML documents</a> to mean + specifically documents that were parsed using an <a href="#html-0">HTML + parser</a> (as opposed to using an XML parser or created purely through + the DOM). + + <p>Generally, when the specification states that a feature applies to HTML + or XHTML, it also includes the other. When a feature specifically only + applies to one of the two languages, it is called out by explicitly + stating that it does not apply to the other format, as in "for HTML, ... + (this does not apply to XHTML)". + + <p>This specification uses the term <em>document</em> to refer to any use + of HTML, ranging from short static documents to long essays or reports + with rich multimedia, as well as to fully-fledged interactive + applications. + + <p>For readability, the term URI is used to refer to both ASCII URIs and + Unicode IRIs, as those terms are defined by <a + href="#refsRFC3986">[RFC3986]</a> and <a href="#refsRFC3987">[RFC3987]</a> + respectively. On the rare occasions where IRIs are not allowed but ASCII + URIs are, this is called out explicitly. + + <p>T |