summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Mark Bell <jmb@netsurf-browser.org>2007-06-23 22:40:25 +0000
committerJohn Mark Bell <jmb@netsurf-browser.org>2007-06-23 22:40:25 +0000
commit7b30a5520cfb56e651f0eb4da85a3e07747da7dc (patch)
tree5d6281c071c089e1e7a8ae6f8044cecaf6a7db16
downloadlibhubbub-7b30a5520cfb56e651f0eb4da85a3e07747da7dc.tar.gz
libhubbub-7b30a5520cfb56e651f0eb4da85a3e07747da7dc.tar.bz2
Import hubbub -- an HTML parsing library.
Plenty of work still to do (like tree generation ;) svn path=/trunk/hubbub/; revision=3359
-rw-r--r--COPYING19
-rw-r--r--Makefile34
-rw-r--r--Makefile-riscos38
-rw-r--r--README46
-rw-r--r--build/Makefile.common39
-rw-r--r--docs/Architecture83
-rw-r--r--docs/Todo12
-rw-r--r--include/hubbub/errors.h29
-rw-r--r--include/hubbub/functypes.h37
-rw-r--r--include/hubbub/hubbub.h23
-rw-r--r--include/hubbub/parser.h84
-rw-r--r--include/hubbub/types.h97
-rw-r--r--json/README26
-rw-r--r--json/hex-chars.jmb1.p12
-rw-r--r--json/void-prototypes.jmb1.p45
-rw-r--r--src/Makefile79
-rw-r--r--src/charset/Makefile53
-rw-r--r--src/charset/aliases.c361
-rw-r--r--src/charset/aliases.h42
-rw-r--r--src/charset/codec.c186
-rw-r--r--src/charset/codec.h153
-rw-r--r--src/charset/codec_iconv.c837
-rw-r--r--src/charset/codec_impl.h51
-rw-r--r--src/charset/codec_utf8.c620
-rw-r--r--src/charset/detect.c673
-rw-r--r--src/charset/detect.h22
-rw-r--r--src/hubbub.c63
-rw-r--r--src/input/Makefile53
-rw-r--r--src/input/filter.c380
-rw-r--r--src/input/filter.h57
-rw-r--r--src/input/inputstream.c479
-rw-r--r--src/input/inputstream.h98
-rw-r--r--src/input/streamimpl.h77
-rw-r--r--src/input/utf8_stream.c567
-rw-r--r--src/parser.c237
-rw-r--r--src/tokeniser/Makefile53
-rw-r--r--src/tokeniser/entities.c363
-rw-r--r--src/tokeniser/entities.h25
-rw-r--r--src/tokeniser/tokeniser.c2282
-rw-r--r--src/tokeniser/tokeniser.h71
-rw-r--r--src/utils/Makefile53
-rw-r--r--src/utils/dict.c219
-rw-r--r--src/utils/dict.h31
-rw-r--r--src/utils/errors.c70
-rw-r--r--src/utils/utf8.c368
-rw-r--r--src/utils/utf8.h38
-rw-r--r--src/utils/utils.h28
-rw-r--r--test/INDEX15
-rw-r--r--test/Makefile63
-rw-r--r--test/README84
-rw-r--r--test/aliases.c61
-rw-r--r--test/cscodec.c247
-rw-r--r--test/csdetect.c132
-rw-r--r--test/data/Aliases302
-rw-r--r--test/data/cscodec/INDEX5
-rw-r--r--test/data/cscodec/simple.datbin0 -> 1193 bytes
-rw-r--r--test/data/csdetect/INDEX9
-rw-r--r--test/data/csdetect/bom.datbin0 -> 639 bytes
-rw-r--r--test/data/csdetect/non-ascii-meta.dat129
-rw-r--r--test/data/csdetect/test-yahoo-jp.dat10
-rw-r--r--test/data/csdetect/tests1.dat392
-rw-r--r--test/data/csdetect/tests2.dat82
-rw-r--r--test/data/html/INDEX6
-rw-r--r--test/data/html/section-tree-construction.html2783
-rw-r--r--test/data/html/web-apps.html41271
-rw-r--r--test/data/tokeniser2/INDEX7
-rw-r--r--test/data/tokeniser2/contentModelFlags.test36
-rw-r--r--test/data/tokeniser2/test1.test136
-rw-r--r--test/data/tokeniser2/test2.test108
-rw-r--r--test/dict.c53
-rw-r--r--test/entities.c42
-rw-r--r--test/filter.c355
-rw-r--r--test/hubbub.c29
-rw-r--r--test/inputstream.c126
-rw-r--r--test/parser.c175
-rw-r--r--test/regression/cscodec-segv.c37
-rw-r--r--test/regression/filter-segv.c38
-rw-r--r--test/testrunner.pl147
-rw-r--r--test/testutils.h123
-rw-r--r--test/tokeniser.c174
-rw-r--r--test/tokeniser2.c418
81 files changed, 56908 insertions, 0 deletions
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..7646f4c
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,19 @@
+Copyright (C) 2007 J-M Bell
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+ * The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..db5a35b
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,34 @@
+# Toolchain definitions for building on the destination platform
+#
+# Everything here is exported so that the sub-makes started from
+# build/Makefile.common see the same toolchain.
+export CC = gcc
+export AR = ar
+export LD = gcc
+
+export CP = cp
+export RM = rm
+export MKDIR = mkdir
+export MV = mv
+export ECHO = echo
+# MAKE is not set here: make defines it as the program it was invoked with
+export PERL = perl
+export PKGCONFIG = pkg-config
+
+# Toolchain flags
+# CFLAGS and LDFLAGS must stay recursively expanded ("="): TOP is only
+# defined by build/Makefile.common, which is included at the end of this file.
+WARNFLAGS = -Wall -Wextra -Wundef -Wpointer-arith -Wcast-align \
+	-Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes \
+	-Wmissing-declarations -Wnested-externs -Werror -pedantic
+export CFLAGS = -std=c99 -D_BSD_SOURCE -I$(TOP)/include/ $(WARNFLAGS)
+export ARFLAGS = -cru
+export LDFLAGS = -L$(TOP)/
+
+export CPFLAGS =
+export RMFLAGS =
+export MKDIRFLAGS = -p
+export MVFLAGS =
+export ECHOFLAGS =
+# MAKEFLAGS is not set here: make maintains it and passes it to sub-makes
+export PKGCONFIGFLAGS =
+
+# Suffix appended to executable names (none on this platform)
+export EXEEXT =
+
+
+include build/Makefile.common
diff --git a/Makefile-riscos b/Makefile-riscos
new file mode 100644
index 0000000..f1d8cf0
--- /dev/null
+++ b/Makefile-riscos
@@ -0,0 +1,38 @@
+# Toolchain definitions for building for RISC OS using the GCCSDK cross-compiler
+#
+# The two GCCSDK_* paths may be overridden from the environment or the
+# command line; the values below are only defaults.
+GCCSDK_INSTALL_CROSSBIN ?= /home/riscos/cross/bin
+GCCSDK_INSTALL_ENV ?= /home/riscos/env
+
+export CC = $(GCCSDK_INSTALL_CROSSBIN)/gcc
+export AR = $(GCCSDK_INSTALL_CROSSBIN)/ar
+export LD = $(GCCSDK_INSTALL_CROSSBIN)/gcc
+
+export CP = cp
+export RM = rm
+export MKDIR = mkdir
+export MV = mv
+export ECHO = echo
+# MAKE is not set here: make defines it as the program it was invoked with
+export PERL = perl
+export PKGCONFIG = pkg-config
+
+# Toolchain flags
+# CFLAGS and LDFLAGS must stay recursively expanded ("="): TOP is only
+# defined by build/Makefile.common, which is included at the end of this file.
+WARNFLAGS = -Wall -Wextra -Wundef -Wpointer-arith -Wcast-align \
+	-Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes \
+	-Wmissing-declarations -Wnested-externs -Werror -pedantic
+export CFLAGS = -std=c99 -D_BSD_SOURCE -I$(TOP)/include/ $(WARNFLAGS) \
+	-mpoke-function-name
+export ARFLAGS = -cru
+export LDFLAGS = -L$(TOP)/
+
+export CPFLAGS =
+export RMFLAGS =
+export MKDIRFLAGS = -p
+export MVFLAGS =
+export ECHOFLAGS =
+# MAKEFLAGS is not set here: make maintains it and passes it to sub-makes
+export PKGCONFIGFLAGS =
+
+# Suffix appended to executable names
+export EXEEXT = ,ff8
+
+
+include build/Makefile.common
diff --git a/README b/README
new file mode 100644
index 0000000..e8b92cb
--- /dev/null
+++ b/README
@@ -0,0 +1,46 @@
+Hubbub -- an HTML parser
+========================
+
+Overview
+--------
+
+ Hubbub is a flexible HTML parser. It aims to comply with the HTML5
+ specification.
+
+Requirements
+------------
+
+ Hubbub requires the following tools:
+
+ + A C99 capable C compiler
+ + GNU make or compatible
+ + Perl (for the testcases)
+ + Pkg-config (for the testcases)
+
+ Hubbub also requires the following libraries to be installed:
+
+ + An iconv implementation (e.g. libiconv)
+ + JSON-C (for the testcases) -- see json/README for further information
+
+Compilation
+-----------
+
+ If necessary, modify the toolchain settings in the Makefile.
+ Invoke make:
+ $ make
+
+Verification
+------------
+
+ To verify that the parser is working, it is necessary to specify a
+ different makefile target than that used for normal compilation, thus:
+
+ $ make test
+
+API documentation
+-----------------
+
+ Currently, there is none. However, the code is well commented and the
+ public API may be found in the "include" directory. The testcase sources
+ may also be of use in working out how to use it.
+
diff --git a/build/Makefile.common b/build/Makefile.common
new file mode 100644
index 0000000..21c319a
--- /dev/null
+++ b/build/Makefile.common
@@ -0,0 +1,39 @@
+# Top-level Makefile fragment for Hubbub
+#
+# Included by the platform makefiles, which export the toolchain variables
+# (MKDIR, CP, RM and their *FLAGS companions) used by the rules below.
+
+# Name of component
+export COMPONENT = libhubbub
+
+# Environment
+export EXPORT = $(CURDIR)/dist
+export TOP = $(CURDIR)
+
+.PHONY: release debug test clean setup export distclean
+
+# Rules
+#
+# Sub-makes are started as plain "$(MAKE) -C dir". make already hands its
+# options to them through the environment, and $(MAKEFLAGS) holds option
+# letters without a leading dash, so expanding it on the command line would
+# turn e.g. "make -k" into "make k -C dir ..." and request a goal named "k".
+release: setup
+	@$(MAKE) -C src release
+
+debug: setup
+	@$(MAKE) -C src debug
+
+test: debug
+	@$(MAKE) -C test test
+
+clean:
+	@$(MAKE) -C src clean
+	@$(MAKE) -C test clean
+
+setup:
+	@$(MAKE) -C src setup
+	@$(MAKE) -C test setup
+
+# Populate $(EXPORT) with the public headers and whatever src/ and test/
+# choose to export.
+export: release
+	@$(MKDIR) $(MKDIRFLAGS) $(EXPORT)/lib
+	@$(CP) $(CPFLAGS) -r include $(EXPORT)/
+	@$(MAKE) -C src export
+	@$(MAKE) -C test export
+
+# The leading "-" lets distclean carry on when $(EXPORT) does not exist.
+distclean: clean
+	-@$(RM) $(RMFLAGS) -r $(EXPORT)
+	@$(MAKE) -C src distclean
+	@$(MAKE) -C test distclean
diff --git a/docs/Architecture b/docs/Architecture
new file mode 100644
index 0000000..73966eb
--- /dev/null
+++ b/docs/Architecture
@@ -0,0 +1,83 @@
+Hubbub parser architecture
+==========================
+
+Introduction
+------------
+
+ Hubbub is a flexible HTML parser. It offers two interfaces:
+
+ * a SAX-style event interface
+ * a DOM-style tree-based interface
+
+Overview
+--------
+
+ Hubbub comprises four parts:
+
+ * a charset handler
+ * an input stream veneer
+ * a tokeniser
+ * a tree builder
+
+ Charset handler
+ ---------------
+
+ The charset handler converts the raw data input into a requested encoding.
+
+ Input stream veneer
+ -------------------
+
+ The input stream veneer provides an abstract stream-like interface over
+ the document buffer. This is used by the tokeniser. The document buffer
+ will be encoded in either UTF-8 or UTF-16 (this is client-selectable).
+
+ Tokeniser
+ ---------
+
+ The tokeniser divides the data held in the document buffer into chunks.
+ It sends SAX-style events for each chunk. The tokeniser is agnostic to
+ the charset the document buffer is stored in.
+
+ Tree builder
+ ------------
+
+ The tree builder constructs a DOM tree from the SAX events emitted by the
+ tokeniser. The tree builder is tied to the document buffer charset.
+
+Memory usage and ownership
+--------------------------
+
+ Memory usage within the library is well defined, as is ownership of allocated
+ memory.
+
+ Raw input data provided by the library client is owned by the client.
+
+ The document buffer is allocated on the fly by the library.
+
+ The document buffer is created and resized by the charset handler. Its
+ location is passed to the tree builder through a dedicated event. While
+ parsing is occurring, the ownership of the document buffer lies with the
+ charset handler. Upon parse completion, the tree builder may request
+ ownership of the buffer. If it does not, the buffer will be freed on parser
+ destruction.
+
+ SAX events which refer to document segments contain direct references into
+ the document buffer (i.e. no copying of data held in the document buffer
+ occurs).
+
+ The tree builder will allocate memory for use as DOM nodes. References to
+ strings in the document buffer will be direct and will operate a
+ copy-on-write strategy. All strings (excepting those which comprise part of
+ the document buffer) and nodes within the DOM are reference counted. Upon a
+ reference count reaching 0, the item is freed.
+
+ The above strategy permits data copying to be kept to a minimum, hence
+ minimising memory usage.
+
+Parse errors
+------------
+
+ Notification of parse errors is made through a dedicated event similar to
+ that used for notification of movement of the document buffer. This event
+ contains the line/column offset of the error location, along with a message
+ detailing the error.
diff --git a/docs/Todo b/docs/Todo
new file mode 100644
index 0000000..2abce2b
--- /dev/null
+++ b/docs/Todo
@@ -0,0 +1,12 @@
+TODO list
+=========
+
+ + Update tokeniser to comply with latest spec draft (currently complies
+ with 2007-06-12 draft)
+ + Implement one or more tree builders
+ + More charset convertors (or make the iconv codec significantly faster)
+ + Parse error reporting from the tokeniser
+ + Implement extraneous chunk insertion/tokenisation
+ + Statistical charset autodetection
+ + Shared library, for those platforms that support such things
+ + Optimise it
diff --git a/include/hubbub/errors.h b/include/hubbub/errors.h
new file mode 100644
index 0000000..c3b1f5d
--- /dev/null
+++ b/include/hubbub/errors.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_errors_h_
+#define hubbub_errors_h_
+
+#include <stddef.h>
+
+typedef enum hubbub_error {
+ HUBBUB_OK = 0,
+
+ HUBBUB_NOMEM = 1,
+ HUBBUB_BADPARM = 2,
+ HUBBUB_INVALID = 3,
+ HUBBUB_FILENOTFOUND = 4,
+ HUBBUB_NEEDDATA = 5,
+} hubbub_error;
+
+/* Convert a hubbub error value to a string */
+const char *hubbub_error_to_string(hubbub_error error);
+/* Convert a string to a hubbub error value */
+hubbub_error hubbub_error_from_string(const char *str, size_t len);
+
+#endif
+
diff --git a/include/hubbub/functypes.h b/include/hubbub/functypes.h
new file mode 100644
index 0000000..aa3e649
--- /dev/null
+++ b/include/hubbub/functypes.h
@@ -0,0 +1,37 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_functypes_h_
+#define hubbub_functypes_h_
+
+#include <stdlib.h>
+
+#include <hubbub/types.h>
+
+/* Type of allocation function for hubbub */
+typedef void *(*hubbub_alloc)(void *ptr, size_t size, void *pw);
+
+/**
+ * Type of token handling function
+ */
+typedef void (*hubbub_token_handler)(const hubbub_token *token, void *pw);
+
+/**
+ * Type of document buffer handling function
+ */
+typedef void (*hubbub_buffer_handler)(const uint8_t *data,
+ size_t len, void *pw);
+
+/**
+ * Type of parse error handling function
+ */
+typedef void (*hubbub_error_handler)(uint32_t line, uint32_t col,
+ const char *message, void *pw);
+
+
+#endif
+
diff --git a/include/hubbub/hubbub.h b/include/hubbub/hubbub.h
new file mode 100644
index 0000000..8a15eca
--- /dev/null
+++ b/include/hubbub/hubbub.h
@@ -0,0 +1,23 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_h_
+#define hubbub_h_
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+/* Initialise the Hubbub library for use */
+hubbub_error hubbub_initialise(const char *aliases_file,
+ hubbub_alloc alloc, void *pw);
+
+/* Clean up after Hubbub */
+hubbub_error hubbub_finalise(hubbub_alloc alloc, void *pw);
+
+#endif
+
diff --git a/include/hubbub/parser.h b/include/hubbub/parser.h
new file mode 100644
index 0000000..cdf8664
--- /dev/null
+++ b/include/hubbub/parser.h
@@ -0,0 +1,84 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_parser_h_
+#define hubbub_parser_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+typedef struct hubbub_parser hubbub_parser;
+
+/**
+ * Hubbub parser option types
+ */
+typedef enum hubbub_parser_opttype {
+ HUBBUB_PARSER_TOKEN_HANDLER,
+ HUBBUB_PARSER_BUFFER_HANDLER,
+ HUBBUB_PARSER_ERROR_HANDLER,
+ HUBBUB_PARSER_CONTENT_MODEL,
+} hubbub_parser_opttype;
+
+/**
+ * Hubbub parser option parameters
+ */
+typedef union hubbub_parser_optparams {
+ struct {
+ hubbub_token_handler handler;
+ void *pw;
+ } token_handler;
+
+ struct {
+ hubbub_buffer_handler handler;
+ void *pw;
+ } buffer_handler;
+
+ struct {
+ hubbub_error_handler handler;
+ void *pw;
+ } error_handler;
+
+ struct {
+ hubbub_content_model model;
+ } content_model;
+} hubbub_parser_optparams;
+
+/* Create a hubbub parser */
+hubbub_parser *hubbub_parser_create(const char *enc, const char *int_enc,
+ hubbub_alloc alloc, void *pw);
+/* Destroy a hubbub parser */
+void hubbub_parser_destroy(hubbub_parser *parser);
+
+/* Configure a hubbub parser */
+hubbub_error hubbub_parser_setopt(hubbub_parser *parser,
+ hubbub_parser_opttype type,
+ hubbub_parser_optparams *params);
+
+/* Pass a chunk of data to a hubbub parser for parsing */
+/* This data is encoded in the input charset */
+hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
+ uint8_t *data, size_t len);
+/* Pass a chunk of extraneous data to a hubbub parser for parsing */
+/* This data is UTF-8 encoded */
+hubbub_error hubbub_parser_parse_extraneous_chunk(hubbub_parser *parser,
+ uint8_t *data, size_t len);
+/* Inform the parser that the last chunk of data has been parsed */
+hubbub_error hubbub_parser_completed(hubbub_parser *parser);
+
+/* Read the document charset */
+const char *hubbub_parser_read_charset(hubbub_parser *parser,
+ hubbub_charset_source *source);
+
+/* Claim ownership of the document buffer */
+hubbub_error hubbub_parser_claim_buffer(hubbub_parser *parser,
+ uint8_t **buffer, size_t *len);
+
+#endif
+
diff --git a/include/hubbub/types.h b/include/hubbub/types.h
new file mode 100644
index 0000000..57518ae
--- /dev/null
+++ b/include/hubbub/types.h
@@ -0,0 +1,97 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_types_h_
+#define hubbub_types_h_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <inttypes.h>
+
+/** Source of charset information, in order of importance
+ * A client-dictated charset will override all others.
+ * A document-specified charset will override autodetection or the default */
+typedef enum hubbub_charset_source {
+ HUBBUB_CHARSET_UNKNOWN = 0, /**< Unknown */
+ HUBBUB_CHARSET_DEFAULT = 1, /**< Default setting */
+ HUBBUB_CHARSET_DETECTED = 2, /**< Autodetected */
+ HUBBUB_CHARSET_DOCUMENT = 3, /**< Defined in document */
+ HUBBUB_CHARSET_DICTATED = 4, /**< Dictated by client */
+} hubbub_charset_source;
+
+/**
+ * Content model flag
+ */
+typedef enum hubbub_content_model {
+ HUBBUB_CONTENT_MODEL_PCDATA,
+ HUBBUB_CONTENT_MODEL_RCDATA,
+ HUBBUB_CONTENT_MODEL_CDATA,
+ HUBBUB_CONTENT_MODEL_PLAINTEXT
+} hubbub_content_model;
+
+/**
+ * Type of an emitted token
+ */
+typedef enum hubbub_token_type {
+ HUBBUB_TOKEN_DOCTYPE,
+ HUBBUB_TOKEN_START_TAG,
+ HUBBUB_TOKEN_END_TAG,
+ HUBBUB_TOKEN_COMMENT,
+ HUBBUB_TOKEN_CHARACTER,
+ HUBBUB_TOKEN_EOF
+} hubbub_token_type;
+
+/**
+ * Tokeniser string type
+ */
+typedef struct hubbub_string {
+ uint32_t data_off; /**< Byte offset of string start */
+ size_t len; /**< Byte length of string */
+} hubbub_string;
+
+/**
+ * Tag attribute data
+ */
+typedef struct hubbub_attribute {
+ hubbub_string name; /**< Attribute name */
+ hubbub_string value; /**< Attribute value */
+} hubbub_attribute;
+
+/**
+ * Data for doctype token
+ */
+typedef struct hubbub_doctype {
+ hubbub_string name; /**< Doctype name */
+ bool correct; /**< Doctype validity flag */
+} hubbub_doctype;
+
+/**
+ * Data for a tag
+ */
+typedef struct hubbub_tag {
+ hubbub_string name; /**< Tag name */
+ uint32_t n_attributes; /**< Count of attributes */
+ hubbub_attribute *attributes; /**< Array of attribute data */
+} hubbub_tag;
+
+/**
+ * Token data
+ */
+typedef struct hubbub_token {
+ hubbub_token_type type;
+
+ union {
+ hubbub_doctype doctype;
+
+ hubbub_tag tag;
+
+ hubbub_string comment;
+
+ hubbub_string character;
+ } data;
+} hubbub_token;
+
+#endif
diff --git a/json/README b/json/README
new file mode 100644
index 0000000..50dcf79
--- /dev/null
+++ b/json/README
@@ -0,0 +1,26 @@
+JSON-C patches
+==============
+
+This directory contains a couple of patches to JSON-C 0.7.
+Upstream sources may be found at http://oss.metaparadigm.com/json-c/
+
+hex-chars.jmb1.p:
+
+ Fix handling of upper case hex digits.
+ The previous behaviour resulted in the likes of \uFFFD causing a parse
+ error.
+
+void-prototypes.jmb1.p:
+
+ Fix compiler warnings about function prototypes in header files when
+ compiling client code in standards mode with pedantic warnings switched
+ on.
+
+Apply them as follows:
+
+ $ cd json-c-0.7
+ $ patch -p 1 -i ../hex-chars.jmb1.p
+ $ patch -p 1 -i ../void-prototypes.jmb1.p
+
+They have been submitted upstream, so will probably disappear in due
+course.
diff --git a/json/hex-chars.jmb1.p b/json/hex-chars.jmb1.p
new file mode 100644
index 0000000..10ea30a
--- /dev/null
+++ b/json/hex-chars.jmb1.p
@@ -0,0 +1,12 @@
+diff -urw json-c-0.7/json_object.c json-c-0.7-jmb/json_object.c
+--- json-c-0.7/json_object.c 2007-03-13 08:25:39.000000000 +0000
++++ json-c-0.7-jmb/json_object.c 2007-06-23 13:33:20.000000000 +0100
+@@ -30,7 +30,7 @@
+ /* #define REFCOUNT_DEBUG 1 */
+
+ char *json_number_chars = "0123456789.+-e";
+-char *json_hex_chars = "0123456789abcdef";
++char *json_hex_chars = "0123456789abcdefABCDEF";
+
+ #ifdef REFCOUNT_DEBUG
+ static char* json_type_name[] = {
diff --git a/json/void-prototypes.jmb1.p b/json/void-prototypes.jmb1.p
new file mode 100644
index 0000000..db71ffe
--- /dev/null
+++ b/json/void-prototypes.jmb1.p
@@ -0,0 +1,45 @@
+diff -urw json-c-0.7/debug.h json-c-0.7-jmb/debug.h
+--- json-c-0.7/debug.h 2007-03-13 08:25:39.000000000 +0000
++++ json-c-0.7-jmb/debug.h 2007-06-22 23:52:37.000000000 +0100
+@@ -13,7 +13,7 @@
+ #define _DEBUG_H_
+
+ extern void mc_set_debug(int debug);
+-extern int mc_get_debug();
++extern int mc_get_debug(void);
+
+ extern void mc_set_syslog(int syslog);
+ extern void mc_abort(const char *msg, ...);
+diff -urw json-c-0.7/json_object.h json-c-0.7-jmb/json_object.h
+--- json-c-0.7/json_object.h 2007-03-13 08:25:39.000000000 +0000
++++ json-c-0.7-jmb/json_object.h 2007-06-22 23:53:10.000000000 +0100
+@@ -98,7 +98,7 @@
+ /** Create a new empty object
+ * @returns a json_object of type json_type_object
+ */
+-extern struct json_object* json_object_new_object();
++extern struct json_object* json_object_new_object(void);
+
+ /** Get the hashtable of a json_object of type json_type_object
+ * @param obj the json_object instance
+@@ -167,7 +167,7 @@
+ /** Create a new empty json_object of type json_type_array
+ * @returns a json_object of type json_type_array
+ */
+-extern struct json_object* json_object_new_array();
++extern struct json_object* json_object_new_array(void);
+
+ /** Get the arraylist of a json_object of type json_type_array
+ * @param obj the json_object instance
+diff -urw json-c-0.7/json_tokener.h json-c-0.7-jmb/json_tokener.h
+--- json-c-0.7/json_tokener.h 2007-03-13 08:25:39.000000000 +0000
++++ json-c-0.7-jmb/json_tokener.h 2007-06-22 23:53:26.000000000 +0100
+@@ -79,7 +79,7 @@
+
+ extern const char* json_tokener_errors[];
+
+-extern struct json_tokener* json_tokener_new();
++extern struct json_tokener* json_tokener_new(void);
+ extern void json_tokener_free(struct json_tokener *tok);
+ extern void json_tokener_reset(struct json_tokener *tok);
+ extern struct json_object* json_tokener_parse(char *str);
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..b72a9e0
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,79 @@
+# Makefile for libhubbub
+#
+# The toolchain is inherited from the top-level makefile, which also
+# exports these variables:
+#
+#	COMPONENT	Name of component
+#	EXPORT		Absolute path of export directory
+#	TOP		Absolute path of source tree root
+#
+# Targets the top-level makefile expects to find here:
+#
+#	clean		Clean source tree
+#	debug		Create a debug binary
+#	distclean	Fully clean source tree, back to pristine condition
+#	export		Export distributable components to $(EXPORT)
+#	release		Create a release binary
+#	setup		Perform any setup required prior to compilation
+#	test		Execute any test cases
+
+# Add this directory to the include path
+CFLAGS += -I$(CURDIR)
+
+# Library archives: release and debug flavours
+RELEASE = $(TOP)/$(COMPONENT).a
+DEBUG = $(TOP)/$(COMPONENT)-debug.a
+
+# Objects built from this directory (names without extension), and the
+# object files they map to in each output directory
+OBJS = hubbub parser
+RELEASE_OBJS = $(patsubst %,Release/%.o,$(OBJS))
+DEBUG_OBJS = $(patsubst %,Debug/%.o,$(OBJS))
+
+.PHONY: clean debug distclean export release setup test
+
+# Targets
+#
+# release/debug: build the local objects, let each subdirectory build its
+# own, then archive everything found in the output directory.
+release: $(RELEASE_OBJS)
+	@$(MAKE) -C charset release
+	@$(MAKE) -C input release
+	@$(MAKE) -C tokeniser release
+	@$(MAKE) -C utils release
+	@$(AR) $(ARFLAGS) $(RELEASE) Release/*
+
+debug: $(DEBUG_OBJS)
+	@$(MAKE) -C charset debug
+	@$(MAKE) -C input debug
+	@$(MAKE) -C tokeniser debug
+	@$(MAKE) -C utils debug
+	@$(AR) $(ARFLAGS) $(DEBUG) Debug/*
+
+# Removal failures are ignored (leading "-")
+clean:
+	@$(MAKE) -C charset clean
+	@$(MAKE) -C input clean
+	@$(MAKE) -C tokeniser clean
+	@$(MAKE) -C utils clean
+	-@$(RM) $(RMFLAGS) $(RELEASE_OBJS)
+	-@$(RM) $(RMFLAGS) $(DEBUG_OBJS)
+	-@$(RM) $(RMFLAGS) $(RELEASE) $(DEBUG)
+
+distclean:
+	-@$(RM) $(RMFLAGS) -r Release
+	-@$(RM) $(RMFLAGS) -r Debug
+
+# Create the output directories used by the pattern rules below
+setup:
+	@$(MKDIR) $(MKDIRFLAGS) Release
+	@$(MKDIR) $(MKDIRFLAGS) Debug
+
+export:
+	@$(CP) $(CPFLAGS) $(RELEASE) $(EXPORT)/lib/
+
+# Nothing to test in this directory
+test:
+
+# Pattern rules
+#
+# Release objects are compiled with NDEBUG defined, debug objects with -g.
+Release/%.o: %.c
+	@$(ECHO) $(ECHOFLAGS) "==> $<"
+	@$(CC) -c $(CFLAGS) -DNDEBUG -o $@ $<
+
+Debug/%.o: %.c
+	@$(ECHO) $(ECHOFLAGS) "==> $<"
+	@$(CC) -c -g $(CFLAGS) -o $@ $<
diff --git a/src/charset/Makefile b/src/charset/Makefile
new file mode 100644
index 0000000..62817b3
--- /dev/null
+++ b/src/charset/Makefile
@@ -0,0 +1,53 @@
+# Makefile for libhubbub
+#
+# The toolchain is inherited from the top-level makefile, which also
+# exports these variables:
+#
+#	COMPONENT	Name of component
+#	EXPORT		Absolute path of export directory
+#	TOP		Absolute path of source tree root
+#
+# Targets the top-level makefile expects to find here:
+#
+#	clean		Clean source tree
+#	debug		Create a debug binary
+#	distclean	Fully clean source tree, back to pristine condition
+#	export		Export distributable components to $(EXPORT)
+#	release		Create a release binary
+#	setup		Perform any setup required prior to compilation
+#	test		Execute any test cases
+
+# Add this directory to the include path
+CFLAGS += -I$(CURDIR)
+
+# Objects built from this directory (names without extension), and the
+# object files they map to in the parent's output directories
+OBJS = aliases codec codec_iconv codec_utf8 detect
+RELEASE_OBJS = $(patsubst %,../Release/%.o,$(OBJS))
+DEBUG_OBJS = $(patsubst %,../Debug/%.o,$(OBJS))
+
+.PHONY: clean debug distclean export release setup test
+
+# Targets
+release: $(RELEASE_OBJS)
+
+debug: $(DEBUG_OBJS)
+
+# Removal failures are ignored (leading "-")
+clean:
+	-@$(RM) $(RMFLAGS) $(RELEASE_OBJS)
+	-@$(RM) $(RMFLAGS) $(DEBUG_OBJS)
+
+# The remaining required targets have nothing to do in this directory
+distclean:
+
+setup:
+
+export:
+
+test:
+
+# Pattern rules
+#
+# Release objects are compiled with NDEBUG defined, debug objects with -g.
+../Release/%.o: %.c
+	@$(ECHO) $(ECHOFLAGS) "==> $<"
+	@$(CC) -c $(CFLAGS) -DNDEBUG -o $@ $<
+
+../Debug/%.o: %.c
+	@$(ECHO) $(ECHOFLAGS) "==> $<"
+	@$(CC) -c -g $(CFLAGS) -o $@ $<
diff --git a/src/charset/aliases.c b/src/charset/aliases.c
new file mode 100644
index 0000000..dcf6de2
--- /dev/null
+++ b/src/charset/aliases.c
@@ -0,0 +1,361 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+
+struct alias {
+ struct alias *next;
+ hubbub_aliases_canon *canon;
+ uint16_t name_len;
+ char name[1];
+};
+
+#define HASH_SIZE (43)
+static hubbub_aliases_canon *canon_tab[HASH_SIZE];
+static struct alias *alias_tab[HASH_SIZE];
+
+static hubbub_error hubbub_create_alias(const char *alias,
+ hubbub_aliases_canon *c, hubbub_alloc alloc, void *pw);
+static hubbub_aliases_canon *hubbub_create_canon(const char *canon,
+ uint16_t mibenum, hubbub_alloc alloc, void *pw);
+static uint32_t hubbub_hash_val(const char *alias, size_t len);
+
+/**
+ * Create alias data from Aliases file
+ *
+ * Lines that are empty or start with '#' are skipped. Every other line is
+ * split on whitespace/control characters into a canonical name, a MIB enum
+ * value and zero or more aliases of that canonical name. A line that cannot
+ * be parsed or stored is abandoned without causing an error return.
+ *
+ * \param filename  The path to the Aliases file
+ * \param alloc     Memory (de)allocation function
+ * \param pw        Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success, HUBBUB_BADPARM if filename or alloc is NULL,
+ *         HUBBUB_FILENOTFOUND if the file could not be opened.
+ */
+hubbub_error hubbub_aliases_create(const char *filename,
+		hubbub_alloc alloc, void *pw)
+{
+	char buf[300];
+	FILE *fp;
+
+	if (filename == NULL || alloc == NULL)
+		return HUBBUB_BADPARM;
+
+	fp = fopen(filename, "r");
+	if (fp == NULL)
+		return HUBBUB_FILENOTFOUND;
+
+	while (fgets(buf, sizeof buf, fp)) {
+		char *p, *aliases = 0, *mib, *end;
+		hubbub_aliases_canon *cf;
+
+		if (buf[0] == 0 || buf[0] == '#')
+			/* skip blank lines or comments */
+			continue;
+
+		/* Lose the terminating newline, if there is one. The last
+		 * line of the file, or a line longer than the buffer, has
+		 * none and must not lose its final character. */
+		buf[strcspn(buf, "\n")] = 0;
+		end = buf + strlen(buf);
+
+		/* The <ctype.h> classifiers are only defined for values
+		 * representable as unsigned char, hence the casts below. */
+
+		/* find end of canonical form */
+		for (p = buf; *p && !isspace((unsigned char) *p) &&
+				!iscntrl((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		*p++ = '\0'; /* terminate canonical form */
+
+		/* skip whitespace */
+		for (; *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		mib = p;
+
+		/* find end of mibenum */
+		for (; *p && !isspace((unsigned char) *p) &&
+				!iscntrl((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p < end)
+			*p++ = '\0'; /* terminate mibenum */
+
+		cf = hubbub_create_canon(buf, atoi(mib), alloc, pw);
+		if (cf == NULL)
+			continue;
+
+		/* skip whitespace */
+		for (; p < end && *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		aliases = p;
+
+		while (p < end) {
+			/* find end of alias */
+			for (; *p && !isspace((unsigned char) *p) &&
+					!iscntrl((unsigned char) *p); p++)
+				; /* do nothing */
+			if (p > end)
+				/* stop if we've gone past the end */
+				break;
+			/* terminate current alias */
+			*p++ = '\0';
+
+			if (hubbub_create_alias(aliases, cf,
+					alloc, pw) != HUBBUB_OK)
+				break;
+
+			/* in terminating, we may have advanced
+			 * past the end - check this here */
+			if (p >= end)
+				break;
+
+			/* skip whitespace */
+			for (; *p && isspace((unsigned char) *p); p++)
+				; /* do nothing */
+
+			if (p >= end)
+				/* gone past end => stop */
+				break;
+
+			/* update pointer to current alias */
+			aliases = p;
+		}
+	}
+
+	fclose(fp);
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Free all alias data
+ *
+ * Every canonical form and alias held in the two hash tables is released
+ * by calling the allocation function with a size of 0, and each bucket is
+ * then reset to NULL.
+ *
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data
+ */
+void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw)
+{
+	int bucket;
+
+	for (bucket = 0; bucket != HASH_SIZE; bucket++) {
+		hubbub_aliases_canon *canon = canon_tab[bucket];
+		struct alias *entry;
+
+		/* Fetch the successor before the node is released */
+		while (canon != NULL) {
+			hubbub_aliases_canon *following = canon->next;
+
+			alloc(canon, 0, pw);
+			canon = following;
+		}
+		canon_tab[bucket] = NULL;
+
+		entry = alias_tab[bucket];
+		while (entry != NULL) {
+			struct alias *following = entry->next;
+
+			alloc(entry, 0, pw);
+			entry = following;
+		}
+		alias_tab[bucket] = NULL;
+	}
+}
+
+/**
+ * Retrieve the MIB enum value assigned to an encoding name
+ *
+ * The name is resolved through hubbub_alias_canonicalise(), so either a
+ * canonical name or one of its aliases may be supplied.
+ *
+ * \param alias  The alias to lookup
+ * \param len    The length of the alias string
+ * \return The MIB enum value, or 0 if not found
+ */
+uint16_t hubbub_mibenum_from_name(const char *alias, size_t len)
+{
+	hubbub_aliases_canon *canon = NULL;
+
+	if (alias != NULL)
+		canon = hubbub_alias_canonicalise(alias, len);
+
+	return (canon != NULL) ? canon->mib_enum : 0;
+}
+
+/**
+ * Retrieve the canonical name of an encoding from the MIB enum
+ *
+ * Every bucket of the canonical-form table is searched in turn and the
+ * first entry carrying the requested value wins.
+ *
+ * \param mibenum  The MIB enum value
+ * \return Pointer to canonical name, or NULL if not found
+ */
+const char *hubbub_mibenum_to_name(uint16_t mibenum)
+{
+	int bucket;
+
+	for (bucket = 0; bucket != HASH_SIZE; bucket++) {
+		hubbub_aliases_canon *canon = canon_tab[bucket];
+
+		while (canon != NULL) {
+			if (canon->mib_enum == mibenum)
+				return canon->name;
+			canon = canon->next;
+		}
+	}
+
+	return NULL;
+}
+
+
+/**
+ * Retrieve the canonical form of an alias name
+ *
+ * The name is compared case-insensitively, first against the canonical
+ * names and then against the aliases stored in the matching hash bucket.
+ *
+ * \param alias  The alias name
+ * \param len    The length of the alias name
+ * \return Pointer to canonical form or NULL if not found
+ */
+hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias,
+		size_t len)
+{
+	hubbub_aliases_canon *canon;
+	struct alias *entry;
+	uint32_t bucket;
+
+	if (alias == NULL)
+		return NULL;
+
+	bucket = hubbub_hash_val(alias, len);
+
+	/* A canonical name takes precedence over an alias */
+	for (canon = canon_tab[bucket]; canon != NULL; canon = canon->next) {
+		if (canon->name_len == len &&
+				strncasecmp(canon->name, alias, len) == 0)
+			return canon;
+	}
+
+	for (entry = alias_tab[bucket]; entry != NULL; entry = entry->next) {
+		if (entry->name_len == len &&
+				strncasecmp(entry->name, alias, len) == 0)
+			return entry->canon;
+	}
+
+	return NULL;
+}
+
+
+/**
+ * Create an alias
+ *
+ * A new alias record is allocated, given a copy of the name, linked to its
+ * canonical form and pushed onto the head of its hash bucket.
+ *
+ * \param alias  The alias name
+ * \param c      The canonical form
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success, HUBBUB_BADPARM if alias, c or alloc is
+ *         NULL, HUBBUB_NOMEM if the allocation fails
+ */
+hubbub_error hubbub_create_alias(const char *alias, hubbub_aliases_canon *c,
+		hubbub_alloc alloc, void *pw)
+{
+	struct alias *entry;
+	uint32_t bucket;
+
+	if (alias == NULL || c == NULL || alloc == NULL)
+		return HUBBUB_BADPARM;
+
+	entry = alloc(NULL, sizeof(struct alias) + strlen(alias) + 1, pw);
+	if (entry == NULL)
+		return HUBBUB_NOMEM;
+
+	/* Fill in the record */
+	strcpy(entry->name, alias);
+	entry->name_len = strlen(alias);
+	entry->name[entry->name_len] = '\0';
+	entry->canon = c;
+
+	/* Push it onto the front of its bucket */
+	bucket = hubbub_hash_val(alias, entry->name_len);
+	entry->next = alias_tab[bucket];
+	alias_tab[bucket] = entry;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Create a canonical form
+ *
+ * A new canonical-form record is allocated, given a copy of the name and
+ * the MIB enum value, and pushed onto the head of its hash bucket.
+ *
+ * \param canon    The canonical name
+ * \param mibenum  The MIB enum value
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to canonical form or NULL on error
+ */
+hubbub_aliases_canon *hubbub_create_canon(const char *canon,
+		uint16_t mibenum, hubbub_alloc alloc, void *pw)
+{
+	hubbub_aliases_canon *entry;
+	uint32_t bucket, name_len;
+
+	if (canon == NULL || alloc == NULL)
+		return NULL;
+
+	name_len = strlen(canon);
+
+	entry = alloc(NULL, sizeof(hubbub_aliases_canon) + name_len + 1, pw);
+	if (entry == NULL)
+		return NULL;
+
+	/* Fill in the record */
+	strcpy(entry->name, canon);
+	entry->name[name_len] = '\0';
+	entry->name_len = name_len;
+	entry->mib_enum = mibenum;
+
+	/* Push it onto the front of its bucket */
+	bucket = hubbub_hash_val(canon, name_len);
+	entry->next = canon_tab[bucket];
+	canon_tab[bucket] = entry;
+
+	return entry;
+}
+
+/**
+ * Hash function
+ *
+ * Bit 0x20 is cleared from every byte before it is mixed in, so names
+ * that differ only in ASCII letter case hash to the same bucket.
+ *
+ * \param alias  String to hash
+ * \param len    Number of bytes of the string to hash
+ * \return The hashed value, in the range [0, HASH_SIZE); 0 if alias is NULL
+ */
+uint32_t hubbub_hash_val(const char *alias, size_t len)
+{
+	uint32_t hash = 5381;
+	size_t i;
+
+	if (alias == NULL)
+		return 0;
+
+	for (i = 0; i < len; i++)
+		hash = (hash * 33) ^ (alias[i] & ~0x20); /* case insensitive */
+
+	return hash % HASH_SIZE;
+}
+
+
+#ifndef NDEBUG
+/**
+ * Dump all alias data to stdout
+ *
+ * Prints one "<bucket> <name>" line for every canonical form and alias,
+ * followed by a final line holding the size total accumulated on the way.
+ */
+void hubbub_aliases_dump(void)
+{
+	size_t total = 0;
+	int bucket;
+
+	for (bucket = 0; bucket != HASH_SIZE; bucket++) {
+		hubbub_aliases_canon *canon;
+		struct alias *entry;
+
+		for (canon = canon_tab[bucket]; canon != NULL;
+				canon = canon->next) {
+			printf("%d %s\n", bucket, canon->name);
+			total += offsetof(hubbub_aliases_canon, name);
+			total += canon->name_len;
+		}
+
+		for (entry = alias_tab[bucket]; entry != NULL;
+				entry = entry->next) {
+			printf("%d %s\n", bucket, entry->name);
+			total += offsetof(struct alias, name);
+			total += entry->name_len;
+		}
+	}
+
+	/* Add the number of buckets in each table */
+	total += sizeof(canon_tab) / sizeof(canon_tab[0]);
+	total += sizeof(alias_tab) / sizeof(alias_tab[0]);
+
+	printf("%u\n", (unsigned int) total);
+}
+#endif
diff --git a/src/charset/aliases.h b/src/charset/aliases.h
new file mode 100644
index 0000000..e0505d0
--- /dev/null
+++ b/src/charset/aliases.h
@@ -0,0 +1,42 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_charset_aliases_h_
+#define hubbub_charset_aliases_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+
+/**
+ * Canonical charset name entry.
+ *
+ * Entries are allocated with extra space after the struct so that ::name
+ * can hold the whole NUL-terminated name (see hubbub_create_canon).
+ */
+typedef struct hubbub_aliases_canon {
+ struct hubbub_aliases_canon *next; /**< Next entry in the same hash chain */
+ uint16_t mib_enum; /**< MIB enum value for this charset */
+ uint16_t name_len; /**< Length of ::name, excluding the terminator */
+ char name[1]; /**< Canonical name; storage extends past the struct */
+} hubbub_aliases_canon;
+
+/* Load encoding aliases from file */
+hubbub_error hubbub_aliases_create(const char *filename,
+ hubbub_alloc alloc, void *pw);
+/* Destroy encoding aliases */
+void hubbub_aliases_destroy(hubbub_alloc alloc, void *pw);
+
+/* Convert an encoding alias to a MIB enum value */
+uint16_t hubbub_mibenum_from_name(const char *alias, size_t len);
+/* Convert a MIB enum value into an encoding alias */
+const char *hubbub_mibenum_to_name(uint16_t mibenum);
+
+/* Canonicalise an alias name */
+hubbub_aliases_canon *hubbub_alias_canonicalise(const char *alias,
+ size_t len);
+
+#ifndef NDEBUG
+void hubbub_aliases_dump(void);
+#endif
+
+#endif
diff --git a/src/charset/codec.c b/src/charset/codec.c
new file mode 100644
index 0000000..12a1bdc
--- /dev/null
+++ b/src/charset/codec.c
@@ -0,0 +1,186 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <string.h>
+
+#include "charset/aliases.h"
+
+#include "codec_impl.h"
+
+extern hubbub_charsethandler hubbub_iconv_codec_handler;
+extern hubbub_charsethandler hubbub_utf8_codec_handler;
+
+/**
+ * Known codec handler classes, terminated by a NULL entry.
+ *
+ * hubbub_charsetcodec_create() walks this table from the top and uses the
+ * first handler whose handles_charset() accepts the canonical charset name,
+ * so the UTF-8 codec is tried before the iconv one.
+ */
+static hubbub_charsethandler *handler_table[] = {
+ &hubbub_utf8_codec_handler,
+ &hubbub_iconv_codec_handler,
+ NULL,
+};
+
+/**
+ * Create a charset codec
+ *
+ * \param charset  Target charset (any alias known to the alias table)
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec instance, or NULL on failure
+ *
+ * Failure covers: a NULL ::charset or ::alloc, a charset name that cannot
+ * be canonicalised, no handler claiming the charset, and the handler's own
+ * constructor failing.
+ */
+hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset,
+		hubbub_alloc alloc, void *pw)
+{
+	const hubbub_aliases_canon *canon;
+	hubbub_charsethandler **entry = handler_table;
+	hubbub_charsetcodec *codec;
+
+	if (charset == NULL || alloc == NULL)
+		return NULL;
+
+	/* Resolve whatever alias we were given to its canonical name */
+	canon = hubbub_alias_canonicalise(charset, strlen(charset));
+	if (canon == NULL)
+		return NULL;
+
+	/* Advance until a handler claims the charset or we reach the
+	 * table's NULL terminator */
+	while (*entry != NULL && !(*entry)->handles_charset(canon->name))
+		entry++;
+
+	if (*entry == NULL)
+		return NULL;
+
+	/* Let the handler build its concrete codec */
+	codec = (*entry)->create(canon->name, alloc, pw);
+	if (codec == NULL)
+		return NULL;
+
+	/* Fill in the generic part of the codec */
+	codec->mibenum = canon->mib_enum;
+	codec->errormode = HUBBUB_CHARSETCODEC_ERROR_LOOSE;
+	codec->filter = NULL;
+	codec->filter_pw = NULL;
+	codec->alloc = alloc;
+	codec->alloc_pw = pw;
+
+	return codec;
+}
+
+/**
+ * Destroy a charset codec
+ *
+ * \param codec  The codec to destroy (NULL is accepted and ignored)
+ */
+void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec)
+{
+	if (codec != NULL) {
+		/* Handler-specific teardown first ... */
+		codec->handler.destroy(codec);
+
+		/* ... then hand the codec back to the allocator it was
+		 * created with (size 0 requests a free) */
+		codec->alloc(codec, 0, codec->alloc_pw);
+	}
+}
+
+/**
+ * Configure a charset codec
+ *
+ * \param codec   The codec to configure
+ * \param type    The codec option type to configure
+ * \param params  Option-specific parameters
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if ::codec or ::params is NULL, or if ::type is
+ *         not a recognised option type.
+ */
+hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec,
+		hubbub_charsetcodec_opttype type,
+		hubbub_charsetcodec_optparams *params)
+{
+	if (codec == NULL || params == NULL)
+		return HUBBUB_BADPARM;
+
+	switch (type) {
+	case HUBBUB_CHARSETCODEC_FILTER_FUNC:
+		codec->filter = params->filter_func.filter;
+		codec->filter_pw = params->filter_func.pw;
+		break;
+
+	case HUBBUB_CHARSETCODEC_ERROR_MODE:
+		codec->errormode = params->error_mode.mode;
+		break;
+
+	default:
+		/* Unknown option: nothing was configured, so say so instead
+		 * of reporting success */
+		return HUBBUB_BADPARM;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Encode a chunk of UCS4 data into a codec's charset
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *         (HUBBUB_BADPARM if any argument, *source or *dest is NULL).
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ */
+hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	/* Validate the arguments, then defer to the codec's handler */
+	if (codec == NULL)
+		return HUBBUB_BADPARM;
+
+	if (source == NULL || *source == NULL || sourcelen == NULL)
+		return HUBBUB_BADPARM;
+
+	if (dest == NULL || *dest == NULL || destlen == NULL)
+		return HUBBUB_BADPARM;
+
+	return codec->handler.encode(codec, source, sourcelen, dest, destlen);
+}
+
+/**
+ * Decode a chunk of data in a codec's charset into UCS4
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *         (HUBBUB_BADPARM if any argument, *source or *dest is NULL).
+ *
+ * source, sourcelen, dest and destlen will be updated appropriately on exit
+ *
+ * Call this with a source length of 0 to flush any buffers.
+ */
+hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	/* Validate the arguments, then defer to the codec's handler */
+	if (codec == NULL)
+		return HUBBUB_BADPARM;
+
+	if (source == NULL || *source == NULL || sourcelen == NULL)
+		return HUBBUB_BADPARM;
+
+	if (dest == NULL || *dest == NULL || destlen == NULL)
+		return HUBBUB_BADPARM;
+
+	return codec->handler.decode(codec, source, sourcelen, dest, destlen);
+}
+
+/**
+ * Clear a charset codec's encoding state
+ *
+ * \param codec  The codec to reset
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * If a filter is registered it is told to reset first, by calling it with
+ * HUBBUB_CHARSETCODEC_NULL as the character and NULL output pointers. The
+ * filter's return value from that call is not examined.
+ */
+hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec)
+{
+	if (codec == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Reset filter. Pass the client's private data, as on every other
+	 * filter call, so the filter can find the state it has to clear. */
+	if (codec->filter)
+		codec->filter(HUBBUB_CHARSETCODEC_NULL, NULL, NULL,
+				codec->filter_pw);
+
+	return codec->handler.reset(codec);
+}
+
diff --git a/src/charset/codec.h b/src/charset/codec.h
new file mode 100644
index 0000000..4cd94d8
--- /dev/null
+++ b/src/charset/codec.h
@@ -0,0 +1,153 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_charset_codec_h_
+#define hubbub_charset_codec_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+
+typedef struct hubbub_charsetcodec hubbub_charsetcodec;
+
+#define HUBBUB_CHARSETCODEC_NULL (0xffffffffU)
+
+/**
+ * Type of charset codec filter function
+ *
+ * \param c UCS4 character (in host byte order) or
+ * HUBBUB_CHARSETCODEC_NULL to reset
+ * \param output Pointer to location to store output buffer location
+ * \param outputlen Pointer to location to store output buffer length
+ * \param pw Pointer to client-specific private data
+ * \return HUBBUB_OK on success, or appropriate error otherwise.
+ *
+ * The output buffer is owned by the filter code and will not be freed by
+ * any charset codec. It should contain the replacement UCS4 character(s)
+ * for the input. The replacement characters should be in host byte order.
+ * The contents of *output and *outputlen on entry are ignored and these
+ * will be filled in by the filter code.
+ *
+ * Filters may elect to replace the input character with no output. In this
+ * case, *output should be set to NULL and *outputlen should be set to 0 and
+ * HUBBUB_OK should be returned.
+ *
+ * The output length is in terms of the number of UCS4 characters in the
+ * output buffer. i.e.:
+ *
+ * for (size_t i = 0; i < outputlen; i++) {
+ * dest[curchar++] = output[i];
+ * }
+ *
+ * would copy the contents of the filter output buffer to the codec's output
+ * buffer.
+ *
+ * When the filter is asked to reset (c == HUBBUB_CHARSETCODEC_NULL, as
+ * issued by hubbub_charsetcodec_reset), ::output and ::outputlen are passed
+ * as NULL, so a filter must not dereference them in that case.
+ */
+typedef hubbub_error (*hubbub_charsetcodec_filter)(uint32_t c,
+ uint32_t **output, size_t *outputlen, void *pw);
+
+/**
+ * Charset codec error mode
+ *
+ * A codec's error mode determines its behaviour in the face of:
+ *
+ * + characters which are unrepresentable in the destination charset (if
+ * encoding data) or which cannot be converted to UCS4 (if decoding data).
+ * + invalid byte sequences (both encoding and decoding)
+ *
+ * The options provide a choice between the following approaches:
+ *
+ * + draconian, "stop processing" ("strict")
+ * + "replace the unrepresentable character with something else" ("loose")
+ * + "attempt to transliterate, or replace if unable" ("translit")
+ *
+ * The default error mode is "loose".
+ *
+ *
+ * In the "loose" case, the replacement character will depend upon:
+ *
+ * + Whether the operation was encoding or decoding
+ * + If encoding, what the destination charset is.
+ *
+ * If decoding, the replacement character will be:
+ *
+ * U+FFFD (REPLACEMENT CHARACTER)
+ *
+ * If encoding, the replacement character will be:
+ *
+ * U+003F (QUESTION MARK) if the destination charset is not UTF-(8|16|32)
+ * U+FFFD (REPLACEMENT CHARACTER) otherwise.
+ *
+ *
+ * In the "translit" case, the codec will attempt to transliterate into
+ * the destination charset, if encoding. If decoding, or if transliteration
+ * fails, this option is identical to "loose".
+ */
+typedef enum hubbub_charsetcodec_errormode {
+ /** Abort processing if unrepresentable character encountered
+ * (the iconv decoder reports this as HUBBUB_INVALID) */
+ HUBBUB_CHARSETCODEC_ERROR_STRICT = 0,
+ /** Replace unrepresentable characters with single alternate;
+ * this is the mode a newly created codec starts in */
+ HUBBUB_CHARSETCODEC_ERROR_LOOSE = 1,
+ /** Transliterate unrepresentable characters, if possible */
+ HUBBUB_CHARSETCODEC_ERROR_TRANSLIT = 2,
+} hubbub_charsetcodec_errormode;
+
+/**
+ * Charset codec option types
+ *
+ * Passed as the ::type argument of hubbub_charsetcodec_setopt(); the value
+ * selects which member of hubbub_charsetcodec_optparams is read.
+ */
+typedef enum hubbub_charsetcodec_opttype {
+ /** Register codec filter function (reads params->filter_func) */
+ HUBBUB_CHARSETCODEC_FILTER_FUNC = 0,
+ /** Set codec error mode (reads params->error_mode) */
+ HUBBUB_CHARSETCODEC_ERROR_MODE = 1,
+} hubbub_charsetcodec_opttype;
+
+/**
+ * Charset codec option parameters
+ *
+ * This is a union: fill in only the member that corresponds to the option
+ * type being passed to hubbub_charsetcodec_setopt().
+ */
+typedef union hubbub_charsetcodec_optparams {
+ /** Parameters for filter function setting
+ * (HUBBUB_CHARSETCODEC_FILTER_FUNC) */
+ struct {
+ /** Filter function */
+ hubbub_charsetcodec_filter filter;
+ /** Client-specific private data */
+ void *pw;
+ } filter_func;
+
+ /** Parameters for error mode setting
+ * (HUBBUB_CHARSETCODEC_ERROR_MODE) */
+ struct {
+ /** The desired error handling mode */
+ hubbub_charsetcodec_errormode mode;
+ } error_mode;
+} hubbub_charsetcodec_optparams;
+
+
+/* Create a charset codec */
+hubbub_charsetcodec *hubbub_charsetcodec_create(const char *charset,
+ hubbub_alloc alloc, void *pw);
+/* Destroy a charset codec */
+void hubbub_charsetcodec_destroy(hubbub_charsetcodec *codec);
+
+/* Configure a charset codec */
+hubbub_error hubbub_charsetcodec_setopt(hubbub_charsetcodec *codec,
+ hubbub_charsetcodec_opttype type,
+ hubbub_charsetcodec_optparams *params);
+
+/* Encode a chunk of UCS4 data into a codec's charset */
+hubbub_error hubbub_charsetcodec_encode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+
+/* Decode a chunk of data in a codec's charset into UCS4 */
+hubbub_error hubbub_charsetcodec_decode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+
+/* Reset a charset codec */
+hubbub_error hubbub_charsetcodec_reset(hubbub_charsetcodec *codec);
+
+#endif
diff --git a/src/charset/codec_iconv.c b/src/charset/codec_iconv.c
new file mode 100644
index 0000000..097e82a
--- /dev/null
+++ b/src/charset/codec_iconv.c
@@ -0,0 +1,837 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/* This codec is hideously slow. Only use it as a last resort */
+
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <iconv.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include "charset/aliases.h"
+#include "utils/utils.h"
+
+#include "codec_impl.h"
+
+/**
+ * A note on endianness:
+ *
+ * UCS4 is big-endian by default. Therefore, this codec reads and writes
+ * big-endian values. This is fine, and causes no problems. However, to
+ * make life easier for client-supplied filter code, character values passed
+ * to a filter and those read back from a filter are in host-endian.
+ * Therefore, we need to convert from big-endian to host-endian when passing
+ * characters to a filter and perform the reverse translation when reading
+ * characters back.
+ */
+
+/**
+ * Iconv-based charset codec
+ *
+ * Besides the two iconv descriptors, the codec carries three small carry-over
+ * buffers so that work interrupted at the end of one call can be resumed by
+ * the next: raw input bytes of an incomplete sequence (inval_buf), decoded
+ * output that did not fit (read_buf) and characters still to be encoded
+ * (write_buf). The UCS4 values held in read_buf and write_buf are big endian.
+ */
+typedef struct hubbub_iconv_codec {
+ hubbub_charsetcodec base; /**< Base class */
+
+ iconv_t read_cd; /**< Iconv handle for reading
+ * (charset -> UCS-4) */
+#define INVAL_BUFSIZE (32)
+ uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up
+ * incomplete input
+ * sequences */
+ size_t inval_len; /**< Number of bytes in inval_buf */
+
+#define READ_BUFSIZE (8)
+ uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
+ * output sequences (decode)
+ */
+ size_t read_len; /**< Number of characters in
+ * read_buf */
+
+ iconv_t write_cd; /**< Iconv handle for writing
+ * (UCS-4 -> charset) */
+#define WRITE_BUFSIZE (8)
+ uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
+ * output sequences (encode)
+ */
+ size_t write_len; /**< Number of characters in
+ * write_buf */
+} hubbub_iconv_codec;
+
+
+static bool hubbub_iconv_codec_handles_charset(const char *charset);
+static hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset,
+ hubbub_alloc alloc, void *pw);
+static void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec);
+static hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec);
+static hubbub_error hubbub_iconv_codec_filter_decoded_char(
+ hubbub_iconv_codec *c, uint32_t ucs4, uint8_t **dest,
+ size_t *destlen);
+static bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c);
+static hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c,
+ uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Determine whether this codec handles a specific charset
+ *
+ * \param charset  Charset to test
+ * \return true if handleable, false otherwise
+ *
+ * A charset counts as handleable when iconv can open a conversion from it
+ * to UCS-4; the probe descriptor is closed again straight away.
+ */
+bool hubbub_iconv_codec_handles_charset(const char *charset)
+{
+	iconv_t probe = iconv_open("UCS-4", charset);
+
+	if (probe == (iconv_t) -1)
+		return false;
+
+	iconv_close(probe);
+
+	return true;
+}
+
+/**
+ * Create an iconv-based codec
+ *
+ * \param charset  The charset to read from / write to
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ *
+ * Two iconv descriptors are opened: charset -> UCS-4 for decoding and
+ * UCS-4 -> charset for encoding. If either cannot be opened, everything
+ * acquired so far is released and NULL is returned. Only the iconv-specific
+ * state and the handler vtable are initialised here.
+ */
+hubbub_charsetcodec *hubbub_iconv_codec_create(const char *charset,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_iconv_codec *self;
+
+	self = alloc(NULL, sizeof(hubbub_iconv_codec), pw);
+	if (self == NULL)
+		return NULL;
+
+	/* Decoding direction */
+	self->read_cd = iconv_open("UCS-4", charset);
+	if (self->read_cd == (iconv_t) -1) {
+		alloc(self, 0, pw);
+		return NULL;
+	}
+
+	/* Encoding direction */
+	self->write_cd = iconv_open(charset, "UCS-4");
+	if (self->write_cd == (iconv_t) -1) {
+		iconv_close(self->read_cd);
+		alloc(self, 0, pw);
+		return NULL;
+	}
+
+	/* All carry-over buffers start out empty */
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	/* Finally, populate vtable */
+	self->base.handler.destroy = hubbub_iconv_codec_destroy;
+	self->base.handler.encode = hubbub_iconv_codec_encode;
+	self->base.handler.decode = hubbub_iconv_codec_decode;
+	self->base.handler.reset = hubbub_iconv_codec_reset;
+
+	return (hubbub_charsetcodec *) self;
+}
+
+/**
+ * Destroy an iconv-based codec
+ *
+ * \param codec  The codec to destroy
+ *
+ * Only the two iconv descriptors are closed here; the codec's memory is
+ * not released by this function.
+ */
+void hubbub_iconv_codec_destroy (hubbub_charsetcodec *codec)
+{
+	hubbub_iconv_codec *self = (hubbub_iconv_codec *) codec;
+
+	iconv_close(self->read_cd);
+	iconv_close(self->write_cd);
+}
+
+/**
+ * Encode a chunk of UCS4 data into an iconv-based codec's charset
+ *
+ * \param codec      The codec to use
+ * \param source     Pointer to pointer to source data
+ * \param sourcelen  Pointer to length (in bytes) of source data
+ * \param dest       Pointer to pointer to output buffer
+ * \param destlen    Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_NOMEM if output buffer is too small,
+ *         HUBBUB_INVALID if a character cannot be represented and the
+ *         codec's error handling mode is set to STRICT,
+ *         <any_other_error> as a result of the failure of the
+ *         client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call. This buffered data is post-filtering,
+ * so will not be refiltered on the next call.
+ *
+ * In the case of the filter function failing, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the encoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately before the location pointed to by
+ * ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * The filter's replacement buffer is only read, never modified: conversion
+ * of its host-endian contents to big endian happens as each character is
+ * written or buffered.
+ *
+ * NOTE(review): the input is consumed four bytes at a time, so this assumes
+ * ::sourcelen is a multiple of 4 -- confirm with callers.
+ */
+hubbub_error hubbub_iconv_codec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec;
+	hubbub_error error;
+	size_t i;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+
+		while (c->write_len > 0) {
+			error = hubbub_iconv_codec_write_char(c, pwrite[0],
+					dest, destlen);
+			if (error != HUBBUB_OK) {
+				/* Drop the invalid character, if that is
+				 * what stopped us, so that it is not
+				 * reprocessed on the next call */
+				if (error == HUBBUB_INVALID) {
+					pwrite++;
+					c->write_len--;
+				}
+
+				/* Move whatever is still pending to the
+				 * front of the save area, whichever error
+				 * occurred: the characters before pwrite
+				 * have already been written. Ascending
+				 * copy is safe as pwrite >= write_buf. */
+				for (i = 0; i < c->write_len; i++) {
+					c->write_buf[i] = pwrite[i];
+				}
+
+				return error;
+			}
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call */
+	while (*sourcelen > 0) {
+		/* By default the (big endian) source character is written
+		 * unchanged */
+		const uint32_t *towrite =
+				(const uint32_t *) (const void *) *source;
+		size_t towritelen = 1;
+		/* true when towrite refers to host-endian filter output */
+		bool filtered = false;
+
+		/* Run character we're about to output through the
+		 * registered filter, so it can replace it, if it sees
+		 * fit to do so */
+		if (c->base.filter != NULL) {
+			uint32_t *replacement;
+
+			error = c->base.filter(ntohl(*towrite),
+					&replacement, &towritelen,
+					c->base.filter_pw);
+			if (error != HUBBUB_OK) {
+				/* Don't eat character -- filter failed,
+				 * so nothing gets written or buffered.
+				 * It's up to the client to ensure that
+				 * the filter works in the case where it
+				 * reprocesses this character after the
+				 * fault is fixed up. */
+
+				return error;
+			}
+
+			towrite = (const uint32_t *) replacement;
+			filtered = true;
+		}
+
+		/* Output current character(s) */
+		while (towritelen > 0) {
+			error = hubbub_iconv_codec_write_char(c,
+					filtered ? htonl(towrite[0])
+						 : towrite[0],
+					dest, destlen);
+
+			if (error != HUBBUB_OK) {
+				/* Skip the invalid character, if present,
+				 * so it's not reprocessed. */
+				size_t skip = (error == HUBBUB_INVALID) ? 1 : 0;
+
+				if (towritelen - skip > WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen - skip;
+
+				/* Copy pending chars to the start of the
+				 * save area (as big endian), for
+				 * processing next call. */
+				for (i = 0; i < c->write_len; i++) {
+					uint32_t ch = towrite[skip + i];
+
+					c->write_buf[i] = filtered ?
+							htonl(ch) : ch;
+				}
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return error;
+			}
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Decode a chunk of data in an iconv-based codec's charset into UCS4
+ *
+ * \param codec The codec to use
+ * \param source Pointer to pointer to source data
+ * \param sourcelen Pointer to length (in bytes) of source data
+ * \param dest Pointer to pointer to output buffer
+ * \param destlen Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success,
+ * HUBBUB_NOMEM if output buffer is too small,
+ * HUBBUB_INVALID if a character cannot be represented and the
+ * codec's error handling mode is set to STRICT,
+ * <any_other_error> as a result of the failure of the
+ * client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ * This buffered data is post-filtering, so will not be refiltered on the
+ * next call.
+ *
+ * In the case of the result being _INVALID or the filter function failing,
+ * ::source will point _at_ the last input character read; nothing will be
+ * written or buffered for the failed character. It is up to the client to
+ * fix the cause of the failure and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately at or before the location pointed
+ * to by ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ *
+ * The work is done in three stages: (1) flush output left buffered by the
+ * previous call, (2) complete any input sequence the previous call left
+ * unfinished, (3) convert the rest of the input one character at a time.
+ */
+hubbub_error hubbub_iconv_codec_decode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen)
+{
+ hubbub_iconv_codec *c = (hubbub_iconv_codec *) codec;
+ hubbub_error error;
+
+ if (c->read_len > 0) {
+ /* Output left over from last decode
+ * Attempt to finish this here */
+ uint32_t *pread = c->read_buf;
+
+ /* The loop only runs while there is room for everything
+ * still pending, so the flush is all-or-nothing. */
+ while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+ *((uint32_t *) (void *) *dest) = pread[0];
+
+ *dest += 4;
+ *destlen -= 4;
+
+ pread++;
+ c->read_len--;
+ }
+
+ if (*destlen < c->read_len * 4) {
+ /* Run out of output buffer */
+ size_t i;
+
+ /* Shuffle remaining output down */
+ for (i = 0; i < c->read_len; i++) {
+ c->read_buf[i] = pread[i];
+ }
+
+ return HUBBUB_NOMEM;
+ }
+ }
+
+ if (c->inval_len > 0) {
+ /* The last decode ended in an incomplete sequence.
+ * Fill up inval_buf with data from the start of the
+ * new chunk and process it. */
+ uint8_t *in = c->inval_buf;
+ /* ol: bytes already held over; l: bytes borrowed from the new
+ * chunk (one byte of inval_buf is kept free for a terminator) */
+ size_t ol = c->inval_len;
+ size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+ size_t orig_l = l;
+
+ memcpy(c->inval_buf + ol, *source, l);
+
+ /* l now counts every byte in inval_buf; read_char reduces it
+ * to the number of bytes it left unconsumed */
+ l += c->inval_len;
+
+ error = hubbub_iconv_codec_read_char(c,
+ (const uint8_t **) &in, &l, dest, destlen);
+ if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
+ return error;
+ }
+
+
+ /* And now, fix everything up so the normal processing
+ * does the right thing. */
+ /* orig_l - l is how many of the borrowed bytes were used up;
+ * it is clamped at 0 in case fewer bytes than were held over
+ * got consumed */
+ *source += max((signed) (orig_l - l), 0);
+ *sourcelen -= max((signed) (orig_l - l), 0);
+
+ /* Failed to resolve an incomplete character and
+ * ran out of buffer space. No recovery strategy
+ * possible, so explode everywhere. */
+ if ((orig_l + ol) - l == 0)
+ abort();
+
+ /* Handle memory exhaustion case from above */
+ if (error != HUBBUB_OK)
+ return error;
+ }
+
+ /* Convert the remaining input, one character per iteration; read_char
+ * advances *source and reduces *sourcelen as it goes */
+ while (*sourcelen > 0) {
+ error = hubbub_iconv_codec_read_char(c,
+ source, sourcelen, dest, destlen);
+ if (error != HUBBUB_OK) {
+ return error;
+ }
+ }
+
+ return HUBBUB_OK;
+}
+
+/**
+ * Clear an iconv-based codec's encoding state
+ *
+ * \param codec  The codec to reset
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * Both iconv descriptors are reset and all three carry-over buffers
+ * (incomplete input, pending decode output, pending encode output) are
+ * emptied.
+ */
+hubbub_error hubbub_iconv_codec_reset(hubbub_charsetcodec *codec)
+{
+	hubbub_iconv_codec *self = (hubbub_iconv_codec *) codec;
+
+	/* Forget anything carried over from earlier calls */
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	/* A call with no buffers resets a descriptor's conversion state */
+	iconv(self->read_cd, NULL, NULL, NULL, NULL);
+	iconv(self->write_cd, NULL, NULL, NULL, NULL);
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Feed a UCS4 character through the registered filter and output the result
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (big endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_NOMEM if output buffer is too small,
+ *         <any_other_error> as a result of the failure of the
+ *         client-provided filter function.
+ *
+ * If no filter is registered the character is output as it stands.
+ *
+ * Output is all-or-nothing: when ::dest cannot hold everything produced for
+ * this character, nothing is written and the whole result is parked in the
+ * codec's read_buf for the next decode call (HUBBUB_NOMEM is returned).
+ * A filter replacement too long to be parked (more than READ_BUFSIZE
+ * characters) aborts, matching how the encoder treats its save area.
+ */
+hubbub_error hubbub_iconv_codec_filter_decoded_char(hubbub_iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	if (c->base.filter != NULL) {
+		uint32_t *rep;
+		size_t replen;
+		hubbub_error error;
+
+		/* Filters work in host byte order */
+		error = c->base.filter(ntohl(ucs4), &rep, &replen,
+				c->base.filter_pw);
+		if (error != HUBBUB_OK) {
+			return error;
+		}
+
+		/* Only entered when the entire replacement fits */
+		while (replen > 0 && *destlen >= replen * 4) {
+			*((uint32_t *) (void *) *dest) = htonl(*rep);
+
+			*dest += 4;
+			*destlen -= 4;
+
+			rep++;
+			replen--;
+		}
+
+		if (*destlen < replen * 4) {
+			/* Run out of output buffer */
+			size_t i;
+
+			/* The save area is fixed-size; a longer replacement
+			 * cannot be kept, and writing it anyway would
+			 * overrun read_buf. No recovery is possible. */
+			if (replen > READ_BUFSIZE)
+				abort();
+
+			/* Buffer remaining output */
+			c->read_len = replen;
+
+			for (i = 0; i < replen; i++) {
+				c->read_buf[i] = htonl(rep[i]);
+			}
+
+			return HUBBUB_NOMEM;
+		}
+
+	} else {
+		if (*destlen < 4) {
+			/* Run out of output buffer */
+
+			c->read_len = 1;
+			c->read_buf[0] = ucs4;
+
+			return HUBBUB_NOMEM;
+		}
+
+		*((uint32_t *) (void *) *dest) = ucs4;
+		*dest += 4;
+		*destlen -= 4;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Detect if a codec's charset is Unicode capable
+ *
+ * \param c  Codec to consider
+ * \return true if a Unicode variant, false otherwise
+ *
+ * The MIB enum values of the known Unicode charsets are looked up on first
+ * use and kept in function-local statics; the codec's own MIB enum is then
+ * compared against each of them.
+ */
+bool hubbub_iconv_codec_is_unicode(hubbub_iconv_codec *c)
+{
+	static uint16_t ucs4, ucs2;
+	static uint16_t utf8;
+	static uint16_t utf16, utf16be, utf16le;
+	static uint16_t utf32, utf32be, utf32le;
+
+	/* A zero ucs4 entry means the lookups have not been done yet */
+	if (ucs4 == 0) {
+		ucs4 = hubbub_mibenum_from_name("UCS-4", SLEN("UCS-4"));
+		ucs2 = hubbub_mibenum_from_name("UCS-2", SLEN("UCS-2"));
+		utf8 = hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"));
+		utf16 = hubbub_mibenum_from_name("UTF-16", SLEN("UTF-16"));
+		utf16be = hubbub_mibenum_from_name("UTF-16BE",
+				SLEN("UTF-16BE"));
+		utf16le = hubbub_mibenum_from_name("UTF-16LE",
+				SLEN("UTF-16LE"));
+		utf32 = hubbub_mibenum_from_name("UTF-32", SLEN("UTF-32"));
+		utf32be = hubbub_mibenum_from_name("UTF-32BE",
+				SLEN("UTF-32BE"));
+		utf32le = hubbub_mibenum_from_name("UTF-32LE",
+				SLEN("UTF-32LE"));
+	}
+
+	/* UCS family */
+	if (c->base.mibenum == ucs4 || c->base.mibenum == ucs2)
+		return true;
+
+	/* UTF-8 and the UTF-16 family */
+	if (c->base.mibenum == utf8 || c->base.mibenum == utf16 ||
+			c->base.mibenum == utf16be ||
+			c->base.mibenum == utf16le)
+		return true;
+
+	/* UTF-32 family */
+	if (c->base.mibenum == utf32 || c->base.mibenum == utf32be ||
+			c->base.mibenum == utf32le)
+		return true;
+
+	return false;
+}
+
+/**
+ * Read a character from the codec's native charset to UCS4 (big endian)
+ *
+ * \param c          The codec
+ * \param source     Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen  Pointer to length of source buffer (updated on exit)
+ * \param dest       Pointer to pointer to output buffer (updated on exit)
+ * \param destlen    Pointer to length of output buffer (updated on exit)
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_NOMEM if output buffer is too small,
+ *         HUBBUB_INVALID if a character cannot be represented and the
+ *         codec's error handling mode is set to STRICT,
+ *         <any_other_error> as a result of the failure of the
+ *         client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ * This buffered data is post-filtering, so will not be refiltered on the
+ * next call.
+ *
+ * In the case of the result being _INVALID or the filter function failing,
+ * ::source will point _at_ the last input character read; nothing will be
+ * written or buffered for the failed character. It is up to the client to
+ * fix the cause of the failure and retry the decoding process.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * If the input ends part-way through a sequence, the trailing bytes are
+ * copied into the codec's inval_buf, the input is reported as fully
+ * consumed and HUBBUB_OK is returned; the next decode call completes the
+ * sequence. An incomplete tail that would not fit inval_buf together with
+ * its terminator aborts.
+ */
+hubbub_error hubbub_iconv_codec_read_char(hubbub_iconv_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	size_t iconv_ret;
+	const uint8_t *origsrc = *source;
+	size_t origsrclen = *sourcelen;
+	uint32_t ucs4;
+	uint8_t *pucs4 = (uint8_t *) &ucs4;
+	size_t sucs4 = 4;
+	hubbub_error error;
+
+	/* Use iconv to convert a single character
+	 * Side effect: Updates *source to point at next input
+	 * character and *sourcelen to reflect reduced input length
+	 */
+	iconv_ret = iconv(c->read_cd, (char **) source, sourcelen,
+			(char **) (void *) &pucs4, &sucs4);
+
+	/* The four-byte output area holds exactly one character, so iconv
+	 * reporting an error after consuming input and filling the output
+	 * also counts as having read a character. */
+	if (iconv_ret != (size_t) -1 ||
+			(*source != origsrc && sucs4 == 0)) {
+		/* Read a character */
+		error = hubbub_iconv_codec_filter_decoded_char(c,
+				ucs4, dest, destlen);
+		if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
+			/* filter function failed; restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (errno == E2BIG) {
+		/* Should never happen */
+		abort();
+	} else if (errno == EINVAL) {
+		/* Incomplete input sequence */
+
+		/* The tail is stored with a terminator after it, so it
+		 * must be strictly shorter than the buffer */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		/* memmove, as *source may itself lie within inval_buf */
+		memmove(c->inval_buf, (const char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return HUBBUB_OK;
+	} else if (errno == EILSEQ) {
+		/* Illegal input sequence */
+		bool found = false;
+		const uint8_t *oldsrc;
+		size_t oldsrclen;
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) {
+			/* restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+
+			return HUBBUB_INVALID;
+		}
+
+		/* Ok, this becomes problematic. The iconv API here
+		 * is particularly unhelpful; *source will point at
+		 * the _start_ of the illegal sequence. This means
+		 * that we must find the end of the sequence */
+
+		/* Search for the start of the next valid input
+		 * sequence (or the end of the input stream) */
+		while (*sourcelen > 1) {
+			pucs4 = (uint8_t *) &ucs4;
+			sucs4 = 4;
+
+			(*source)++;
+			(*sourcelen)--;
+
+			/* Remember where this probe started: iconv moves
+			 * the pointers if the probe converts something */
+			oldsrc = *source;
+			oldsrclen = *sourcelen;
+
+			iconv_ret = iconv(c->read_cd,
+					(char **) source, sourcelen,
+					(char **) (void *) &pucs4, &sucs4);
+			if (iconv_ret != (size_t) -1 || errno != EILSEQ) {
+				found = true;
+				break;
+			}
+		}
+
+		if (found) {
+			/* Found start of next valid sequence */
+			*source = oldsrc;
+			*sourcelen = oldsrclen;
+		} else {
+			/* Not found - skip last byte in buffer */
+			(*source)++;
+			(*sourcelen)--;
+
+			if (*sourcelen != 0)
+				abort();
+		}
+
+		/* output U+FFFD and continue processing. */
+		error = hubbub_iconv_codec_filter_decoded_char(c,
+				htonl(0xFFFD), dest, destlen);
+		if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
+			/* filter function failed; restore source pointers */
+			*source = origsrc;
+			*sourcelen = origsrclen;
+		}
+
+		return error;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Emit a single UCS4 character in the codec's native charset
+ *
+ * \param c        Codec to write with
+ * \param ucs4     Character to emit (UCS4, big endian)
+ * \param dest     Pointer to pointer to output buffer (updated on exit)
+ * \param destlen  Pointer to length of output buffer (updated on exit)
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_NOMEM if output buffer is too small,
+ *         HUBBUB_INVALID if character cannot be represented and the
+ *         codec's error handling mode is set to STRICT.
+ *
+ * When iconv produces no output for the character and the error mode is
+ * LOOSE or TRANSLIT, a replacement is written instead: U+FFFD if the
+ * target charset is a Unicode one, '?' (0x3F) otherwise.
+ */
+hubbub_error hubbub_iconv_codec_write_char(hubbub_iconv_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	uint8_t *const start = *dest;
+	uint8_t *in = (uint8_t *) &ucs4;
+	size_t inlen = 4;
+	size_t ret;
+
+	ret = iconv(c->write_cd, (char **) (void *) &in,
+			&inlen, (char **) dest, destlen);
+
+	if (ret == (size_t) -1) {
+		/* Output buffer is too small */
+		if (errno == E2BIG)
+			return HUBBUB_NOMEM;
+
+		/* Illegal multibyte sequence / incomplete input
+		 * character. Neither should ever happen. */
+		if (errno == EILSEQ || errno == EINVAL)
+			abort();
+	}
+
+	/* Something was written, so there is nothing left to do */
+	if (*dest != start)
+		return HUBBUB_OK;
+
+	/* Nothing was output; behaviour depends upon the error mode */
+	if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT)
+		return HUBBUB_INVALID;
+
+	/** \todo transliteration (TRANSLIT is currently treated as LOOSE) */
+	if (c->base.errormode != HUBBUB_CHARSETCODEC_ERROR_TRANSLIT &&
+			c->base.errormode != HUBBUB_CHARSETCODEC_ERROR_LOOSE)
+		return HUBBUB_OK;
+
+	/* Substitute a replacement character and convert that instead */
+	if (hubbub_iconv_codec_is_unicode(c))
+		ucs4 = htonl(0xFFFD);
+	else
+		ucs4 = htonl(0x3F);
+
+	in = (uint8_t *) &ucs4;
+	inlen = 4;
+
+	ret = iconv(c->write_cd, (char **) (void *) &in, &inlen,
+			(char **) dest, destlen);
+
+	if (ret == (size_t) -1) {
+		if (errno == E2BIG)
+			return HUBBUB_NOMEM;
+
+		/* Illegal multibyte sequence / incomplete input
+		 * character. Neither should ever happen. */
+		if (errno == EILSEQ || errno == EINVAL)
+			abort();
+	}
+
+	return HUBBUB_OK;
+}
+
+/** Factory entry points for the iconv-based codec */
+const hubbub_charsethandler hubbub_iconv_codec_handler = {
+	.handles_charset = hubbub_iconv_codec_handles_charset,
+	.create = hubbub_iconv_codec_create
+};
diff --git a/src/charset/codec_impl.h b/src/charset/codec_impl.h
new file mode 100644
index 0000000..eb5116b
--- /dev/null
+++ b/src/charset/codec_impl.h
@@ -0,0 +1,51 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_charset_codecimpl_h_
+#define hubbub_charset_codecimpl_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include "codec.h"
+
+/**
+ * Core charset codec definition; implementations extend this
+ *
+ * Concrete codecs embed this structure as their first member (named
+ * "base") and cast between the two pointer types, so it must stay first.
+ */
+struct hubbub_charsetcodec {
+ uint16_t mibenum; /**< MIB enum for charset */
+
+ hubbub_charsetcodec_filter filter; /**< filter function (may be NULL,
+ * in which case characters are
+ * passed through unfiltered) */
+ void *filter_pw; /**< filter private word, handed
+ * back to the filter on each call */
+
+ hubbub_charsetcodec_errormode errormode; /**< error mode */
+
+ hubbub_alloc alloc; /**< allocation function */
+ void *alloc_pw; /**< private word */
+
+ struct {
+ /* Tear down a codec instance */
+ void (*destroy)(hubbub_charsetcodec *codec);
+ /* Convert UCS4 input into the codec's charset */
+ hubbub_error (*encode)(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+ /* Convert input in the codec's charset into UCS4 */
+ hubbub_error (*decode)(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+ /* Discard any state buffered between calls */
+ hubbub_error (*reset)(hubbub_charsetcodec *codec);
+ } handler; /**< Vtable for handler code */
+};
+
+/**
+ * Codec factory component definition
+ *
+ * Each codec implementation exports one constant of this type.
+ */
+typedef struct hubbub_charsethandler {
+ /* Report whether the implementation can handle the named charset */
+ bool (*handles_charset)(const char *charset);
+ /* Construct a codec for the named charset; NULL on failure */
+ hubbub_charsetcodec *(*create)(const char *charset,
+ hubbub_alloc alloc, void *pw);
+} hubbub_charsethandler;
+
+#endif
diff --git a/src/charset/codec_utf8.c b/src/charset/codec_utf8.c
new file mode 100644
index 0000000..86d667f
--- /dev/null
+++ b/src/charset/codec_utf8.c
@@ -0,0 +1,620 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdlib.h>
+#include <string.h>
+
+/* These two are for htonl / ntohl */
+#include <arpa/inet.h>
+#include <netinet/in.h>
+
+#include "charset/aliases.h"
+#include "utils/utf8.h"
+#include "utils/utils.h"
+
+#include "codec_impl.h"
+
+/**
+ * UTF-8 charset codec
+ *
+ * State carried between calls: an incomplete input sequence (inval_buf),
+ * decoded output that did not fit in the caller's buffer (read_buf) and
+ * encoder output that did not fit (write_buf).
+ */
+typedef struct hubbub_utf8_codec {
+ hubbub_charsetcodec base; /**< Base class */
+
+#define INVAL_BUFSIZE (32)
+ uint8_t inval_buf[INVAL_BUFSIZE]; /**< Buffer for fixing up
+ * incomplete input
+ * sequences */
+ size_t inval_len; /**< Byte length of inval_buf */
+
+#define READ_BUFSIZE (8)
+ uint32_t read_buf[READ_BUFSIZE]; /**< Buffer for partial
+ * output sequences (decode)
+ * (host-endian) */
+ size_t read_len; /**< Character length of read_buf */
+
+#define WRITE_BUFSIZE (8)
+ uint32_t write_buf[WRITE_BUFSIZE]; /**< Buffer for partial
+ * output sequences (encode)
+ * (host-endian) */
+ size_t write_len; /**< Character length of write_buf */
+
+} hubbub_utf8_codec;
+
+/* Forward declarations. All of these have internal linkage; they are
+ * reached from outside this file only through hubbub_utf8_codec_handler
+ * and the vtable populated by hubbub_utf8_codec_create. */
+static bool hubbub_utf8_codec_handles_charset(const char *charset);
+static hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset,
+ hubbub_alloc alloc, void *pw);
+static void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec);
+static hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec);
+static hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen);
+static hubbub_error hubbub_utf8_codec_filter_decoded_char(
+ hubbub_utf8_codec *c,
+ uint32_t ucs4, uint8_t **dest, size_t *destlen);
+
+/**
+ * Test whether a charset name refers to UTF-8
+ *
+ * \param charset  NUL-terminated name of the charset to test
+ * \return true if this codec can handle the charset, false otherwise
+ *
+ * The comparison is made on MIB enum values, not on the name itself.
+ */
+bool hubbub_utf8_codec_handles_charset(const char *charset)
+{
+	if (hubbub_mibenum_from_name(charset, strlen(charset)) ==
+			hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")))
+		return true;
+
+	return false;
+}
+
+/**
+ * Construct a UTF-8 codec instance
+ *
+ * \param charset  The charset to read from / write to (ignored)
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to codec, or NULL on failure
+ *
+ * Only the UTF-8 specific state and the vtable are initialised here; the
+ * remaining members of the base structure are left untouched.
+ */
+hubbub_charsetcodec *hubbub_utf8_codec_create(const char *charset,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_utf8_codec *self;
+
+	UNUSED(charset);
+
+	self = alloc(NULL, sizeof(*self), pw);
+	if (self == NULL)
+		return NULL;
+
+	/* Hook up the implementation */
+	self->base.handler.reset = hubbub_utf8_codec_reset;
+	self->base.handler.decode = hubbub_utf8_codec_decode;
+	self->base.handler.encode = hubbub_utf8_codec_encode;
+	self->base.handler.destroy = hubbub_utf8_codec_destroy;
+
+	/* Nothing is pending in any of the carry-over buffers */
+	self->write_len = 0;
+	self->write_buf[0] = 0;
+
+	self->read_len = 0;
+	self->read_buf[0] = 0;
+
+	self->inval_len = 0;
+	self->inval_buf[0] = '\0';
+
+	/* The base structure is the first member */
+	return &self->base;
+}
+
+/**
+ * Destroy a utf8 codec
+ *
+ * \param codec The codec to destroy
+ *
+ * The UTF-8 codec holds no resources of its own, so there is nothing to
+ * release here. Note that the codec structure itself is not freed by this
+ * function; presumably the caller does that -- TODO confirm.
+ */
+void hubbub_utf8_codec_destroy (hubbub_charsetcodec *codec)
+{
+ UNUSED(codec);
+}
+
+/**
+ * Encode a chunk of UCS4 data into utf8
+ *
+ * \param codec The codec to use
+ * \param source Pointer to pointer to source data
+ * \param sourcelen Pointer to length (in bytes) of source data
+ * \param dest Pointer to pointer to output buffer
+ * \param destlen Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success,
+ * HUBBUB_NOMEM if output buffer is too small,
+ * HUBBUB_INVALID if a character cannot be represented and the
+ * codec's error handling mode is set to STRICT,
+ * <any_other_error> as a result of the failure of the
+ * client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read. Any remaining output for the character will be buffered by the
+ * codec for writing on the next call. This buffered data is post-filtering,
+ * so will not be refiltered on the next call.
+ *
+ * In the case of the filter function failing, ::source will point _at_ the
+ * last input character read; nothing will be written or buffered for the
+ * failed character. It is up to the client to fix the cause of the failure
+ * and retry the encoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately before the location pointed to by
+ * ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+hubbub_error hubbub_utf8_codec_encode(hubbub_charsetcodec *codec,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec;
+	uint32_t ucs4;
+	uint32_t *towrite;
+	size_t towritelen;
+	hubbub_error error;
+
+	/* Process any outstanding characters from the previous call */
+	if (c->write_len > 0) {
+		uint32_t *pwrite = c->write_buf;
+		uint8_t buf[6];
+		size_t len;
+
+		while (c->write_len > 0) {
+			error = hubbub_utf8_from_ucs4(pwrite[0], buf, &len);
+			if (error != HUBBUB_OK)
+				abort();
+
+			if (*destlen < len) {
+				/* Insufficient output buffer space.
+				 * Shuffle what is still unwritten down to
+				 * the start of the save area. */
+				for (len = 0; len < c->write_len; len++)
+					c->write_buf[len] = pwrite[len];
+
+				return HUBBUB_NOMEM;
+			}
+
+			memcpy(*dest, buf, len);
+
+			*dest += len;
+			*destlen -= len;
+
+			pwrite++;
+			c->write_len--;
+		}
+	}
+
+	/* Now process the characters for this call.
+	 *
+	 * Input is consumed in whole 4 byte UCS4 units. Should the source
+	 * length not be a multiple of 4, the trailing 1-3 bytes are left
+	 * unread (and ::sourcelen left non-zero); reading them as a full
+	 * character would run off the end of the buffer and wrap the
+	 * unsigned length. */
+	while (*sourcelen >= 4) {
+		uint32_t raw;
+
+		/* Copy the character out rather than dereferencing the
+		 * source pointer as a uint32_t, as the source buffer need
+		 * not be suitably aligned. */
+		memcpy(&raw, *source, sizeof(raw));
+
+		ucs4 = ntohl(raw);
+		towrite = &ucs4;
+		towritelen = 1;
+
+		/* Run character we're about to output through the
+		 * registered filter, so it can replace it. */
+		if (c->base.filter != NULL) {
+			error = c->base.filter(ucs4,
+					&towrite, &towritelen,
+					c->base.filter_pw);
+			if (error != HUBBUB_OK)
+				return error;
+		}
+
+		/* Output current characters */
+		while (towritelen > 0) {
+			uint8_t buf[6];
+			size_t len;
+
+			error = hubbub_utf8_from_ucs4(towrite[0], buf, &len);
+			if (error != HUBBUB_OK)
+				abort();
+
+			if (*destlen < len) {
+				/* Insufficient output space */
+				if (towritelen >= WRITE_BUFSIZE)
+					abort();
+
+				c->write_len = towritelen;
+
+				/* Copy pending chars to save area, for
+				 * processing next call. */
+				for (len = 0; len < towritelen; len++)
+					c->write_buf[len] = towrite[len];
+
+				/* Claim character we've just buffered,
+				 * so it's not reprocessed */
+				*source += 4;
+				*sourcelen -= 4;
+
+				return HUBBUB_NOMEM;
+			}
+
+			memcpy(*dest, buf, len);
+
+			*dest += len;
+			*destlen -= len;
+
+			towrite++;
+			towritelen--;
+		}
+
+		*source += 4;
+		*sourcelen -= 4;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Decode a chunk of utf8 data into UCS4
+ *
+ * \param codec The codec to use
+ * \param source Pointer to pointer to source data
+ * \param sourcelen Pointer to length (in bytes) of source data
+ * \param dest Pointer to pointer to output buffer
+ * \param destlen Pointer to length (in bytes) of output buffer
+ * \return HUBBUB_OK on success,
+ * HUBBUB_NOMEM if output buffer is too small,
+ * HUBBUB_INVALID if a character cannot be represented and the
+ * codec's error handling mode is set to STRICT,
+ * <any_other_error> as a result of the failure of the
+ * client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ * This buffered data is post-filtering, so will not be refiltered on the
+ * next call.
+ *
+ * In the case of the result being _INVALID or the filter function failing,
+ * ::source will point _at_ the last input character read; nothing will be
+ * written or buffered for the failed character. It is up to the client to
+ * fix the cause of the failure and retry the decoding process.
+ *
+ * Note that, if failure occurs whilst attempting to write any output
+ * buffered by the last call, then ::source and ::sourcelen will remain
+ * unchanged (as nothing more has been read).
+ *
+ * There is no way to determine the output character which caused a
+ * failure (as it may be one in a filter-injected replacement sequence).
+ * It is, however, possible to determine which source character caused it
+ * (this being the character immediately at or before the location pointed
+ * to by ::source on exit).
+ *
+ * [I.e. the process of filtering results in a potential one-to-many mapping
+ * between source characters and output characters, and identification of
+ * individual output characters is impossible.]
+ *
+ * If STRICT error handling is configured and an illegal sequence is split
+ * over two calls, then _INVALID will be returned from the second call,
+ * but ::source will point mid-way through the invalid sequence (i.e. it
+ * will be unmodified over the second call). In addition, the internal
+ * incomplete-sequence buffer will be emptied, such that subsequent calls
+ * will progress, rather than re-evaluating the same invalid sequence.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ *
+ * Call this with a source length of 0 to flush the output buffer.
+ */
+hubbub_error hubbub_utf8_codec_decode(hubbub_charsetcodec *codec,
+ const uint8_t **source, size_t *sourcelen,
+ uint8_t **dest, size_t *destlen)
+{
+ hubbub_utf8_codec *c = (hubbub_utf8_codec *) codec;
+ hubbub_error error;
+
+ /* Stage 1: flush output which did not fit during the last call */
+ if (c->read_len > 0) {
+ /* Output left over from last decode */
+ uint32_t *pread = c->read_buf;
+
+ /* The condition compares against the whole remaining backlog,
+ * and both sides shrink by 4 bytes per iteration, so either
+ * every buffered character is written or none is. */
+ while (c->read_len > 0 && *destlen >= c->read_len * 4) {
+ *((uint32_t *) (void *) *dest) = htonl(pread[0]);
+
+ *dest += 4;
+ *destlen -= 4;
+
+ pread++;
+ c->read_len--;
+ }
+
+ if (*destlen < c->read_len * 4) {
+ /* Ran out of output buffer */
+ size_t i;
+
+ /* Shuffle remaining output down */
+ for (i = 0; i < c->read_len; i++)
+ c->read_buf[i] = pread[i];
+
+ return HUBBUB_NOMEM;
+ }
+ }
+
+ /* Stage 2: complete a sequence left unfinished by the last call */
+ if (c->inval_len > 0) {
+ /* The last decode ended in an incomplete sequence.
+ * Fill up inval_buf with data from the start of the
+ * new chunk and process it. */
+ uint8_t *in = c->inval_buf;
+ size_t ol = c->inval_len;
+ /* One byte is held back: read_char NUL-terminates whatever
+ * it stores back into inval_buf. */
+ size_t l = min(INVAL_BUFSIZE - ol - 1, *sourcelen);
+ size_t orig_l = l;
+
+ memcpy(c->inval_buf + ol, *source, l);
+
+ /* l now counts every byte pending in inval_buf; read_char
+ * reduces it by the number of bytes it consumes. */
+ l += c->inval_len;
+
+ error = hubbub_utf8_codec_read_char(c,
+ (const uint8_t **) &in, &l, dest, destlen);
+ if (error != HUBBUB_OK && error != HUBBUB_NOMEM) {
+ return error;
+ }
+
+ /* And now, fix up source pointers */
+ /* orig_l - l is the number of bytes consumed less the ol
+ * bytes which were already buffered, i.e. how much of the
+ * new chunk was used. It is clamped at zero in case fewer
+ * than ol bytes were consumed. */
+ *source += max((signed) (orig_l - l), 0);
+ *sourcelen -= max((signed) (orig_l - l), 0);
+
+ /* Failed to resolve an incomplete character and
+ * ran out of buffer space. No recovery strategy
+ * possible, so explode everywhere. */
+ if ((orig_l + ol) - l == 0)
+ abort();
+
+ /* Report memory exhaustion case from above */
+ if (error != HUBBUB_OK)
+ return error;
+ }
+
+ /* Finally, the "normal" case; process all outstanding characters */
+ while (*sourcelen > 0) {
+ error = hubbub_utf8_codec_read_char(c,
+ source, sourcelen, dest, destlen);
+ if (error != HUBBUB_OK) {
+ return error;
+ }
+ }
+
+ return HUBBUB_OK;
+}
+
+/**
+ * Discard all state a utf8 codec carries between calls
+ *
+ * \param codec  The codec to reset
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * Any buffered incomplete input sequence and any pending decoder or
+ * encoder output is forgotten.
+ */
+hubbub_error hubbub_utf8_codec_reset(hubbub_charsetcodec *codec)
+{
+	hubbub_utf8_codec *self = (hubbub_utf8_codec *) codec;
+
+	/* Mark every carry-over buffer as empty */
+	self->write_len = 0;
+	self->read_len = 0;
+	self->inval_len = 0;
+
+	/* and clear the first slot of each */
+	self->write_buf[0] = 0;
+	self->read_buf[0] = 0;
+	self->inval_buf[0] = '\0';
+
+	return HUBBUB_OK;
+}
+
+
+/**
+ * Read a single UTF-8 character and output it as UCS4 (big endian)
+ *
+ * \param c The codec
+ * \param source Pointer to pointer to source buffer (updated on exit)
+ * \param sourcelen Pointer to length of source buffer (updated on exit)
+ * \param dest Pointer to pointer to output buffer (updated on exit)
+ * \param destlen Pointer to length of output buffer (updated on exit)
+ * \return HUBBUB_OK on success,
+ * HUBBUB_NOMEM if output buffer is too small,
+ * HUBBUB_INVALID if a character cannot be represented and the
+ * codec's error handling mode is set to STRICT,
+ * <any_other_error> as a result of the failure of the
+ * client-provided filter function.
+ *
+ * On exit, ::source will point immediately _after_ the last input character
+ * read, if the result is _OK or _NOMEM. Any remaining output for the
+ * character will be buffered by the codec for writing on the next call.
+ * This buffered data is post-filtering, so will not be refiltered on the
+ * next call.
+ *
+ * In the case of the result being _INVALID or the filter function failing,
+ * ::source will point _at_ the last input character read; nothing will be
+ * written or buffered for the failed character. It is up to the client to
+ * fix the cause of the failure and retry the decoding process.
+ *
+ * ::sourcelen will be reduced appropriately on exit.
+ *
+ * ::dest will point immediately _after_ the last character written.
+ *
+ * ::destlen will be reduced appropriately on exit.
+ */
+hubbub_error hubbub_utf8_codec_read_char(hubbub_utf8_codec *c,
+		const uint8_t **source, size_t *sourcelen,
+		uint8_t **dest, size_t *destlen)
+{
+	uint32_t ucs4;
+	size_t sucs4;
+	hubbub_error error;
+
+	/* Convert a single character */
+	error = hubbub_utf8_to_ucs4(*source, *sourcelen, &ucs4, &sucs4);
+	if (error == HUBBUB_OK) {
+		/* Read a character */
+		error = hubbub_utf8_codec_filter_decoded_char(c,
+				ucs4, dest, destlen);
+		if (error == HUBBUB_OK || error == HUBBUB_NOMEM) {
+			/* filter function succeeded; update source pointers */
+			*source += sucs4;
+			*sourcelen -= sucs4;
+		}
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		return error;
+	} else if (error == HUBBUB_NEEDDATA) {
+		/* Incomplete input sequence */
+
+		/* The sequence is stored with a trailing NUL, so it must
+		 * be strictly shorter than the buffer; a length equal to
+		 * INVAL_BUFSIZE would put the NUL one past the end. */
+		if (*sourcelen >= INVAL_BUFSIZE)
+			abort();
+
+		/* memmove, as *source may itself point into inval_buf */
+		memmove(c->inval_buf, (char *) *source, *sourcelen);
+		c->inval_buf[*sourcelen] = '\0';
+		c->inval_len = *sourcelen;
+
+		*source += *sourcelen;
+		*sourcelen = 0;
+
+		return HUBBUB_OK;
+	} else if (error == HUBBUB_INVALID) {
+		/* Illegal input sequence */
+		uint32_t nextchar;
+
+		/* Clear inval buffer */
+		c->inval_buf[0] = '\0';
+		c->inval_len = 0;
+
+		/* Strict errormode; simply flag invalid character */
+		if (c->base.errormode == HUBBUB_CHARSETCODEC_ERROR_STRICT) {
+			return HUBBUB_INVALID;
+		}
+
+		/* Find next valid UTF-8 sequence.
+		 * We're processing client-provided data, so let's
+		 * be paranoid about its validity. */
+		error = hubbub_utf8_next_paranoid(*source, *sourcelen,
+				0, &nextchar);
+		if (error != HUBBUB_OK) {
+			if (error == HUBBUB_NEEDDATA) {
+				/* Need more data to be sure */
+
+				/* As above: leave room for the NUL */
+				if (*sourcelen >= INVAL_BUFSIZE)
+					abort();
+
+				memmove(c->inval_buf, (char *) *source,
+						*sourcelen);
+				c->inval_buf[*sourcelen] = '\0';
+				c->inval_len = *sourcelen;
+
+				*source += *sourcelen;
+				*sourcelen = 0;
+
+				/* Everything has been claimed already, so
+				 * there is nothing further to skip below */
+				nextchar = 0;
+			} else {
+				return error;
+			}
+		}
+
+		/* output U+FFFD and continue processing. */
+		error = hubbub_utf8_codec_filter_decoded_char(c,
+				0xFFFD, dest, destlen);
+		if (error == HUBBUB_OK || error == HUBBUB_NOMEM) {
+			/* filter function succeeded; update source pointers */
+			*source += nextchar;
+			*sourcelen -= nextchar;
+		}
+
+		return error;
+	}
+
+	/* Any other result from the conversion is not handled here */
+	return HUBBUB_OK;
+}
+
+/**
+ * Feed a UCS4 character through the registered filter and output the result
+ *
+ * \param c        Codec to use
+ * \param ucs4     UCS4 character (host endian)
+ * \param dest     Pointer to pointer to output buffer
+ * \param destlen  Pointer to output buffer length
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_NOMEM if output buffer is too small,
+ *         <any_other_error> as a result of the failure of the
+ *         client-provided filter function.
+ *
+ * Output is written big endian. If the output buffer cannot hold the
+ * whole result, none of it is written; it is saved in the codec's read
+ * buffer instead, for hubbub_utf8_codec_decode to flush on the next call.
+ *
+ * A filter replacement of more than READ_BUFSIZE characters which does
+ * not fit in the output buffer cannot be saved, and causes an abort.
+ */
+hubbub_error hubbub_utf8_codec_filter_decoded_char(hubbub_utf8_codec *c,
+		uint32_t ucs4, uint8_t **dest, size_t *destlen)
+{
+	if (c->base.filter != NULL) {
+		uint32_t *rep;
+		size_t replen;
+		hubbub_error error;
+
+		error = c->base.filter(ucs4, &rep, &replen,
+				c->base.filter_pw);
+		if (error != HUBBUB_OK) {
+			return error;
+		}
+
+		/* Both sides of the size test shrink by 4 bytes per
+		 * iteration, so this writes either all of the
+		 * replacement or none of it. */
+		while (replen > 0 && *destlen >= replen * 4) {
+			*((uint32_t *) (void *) *dest) = htonl(*rep);
+
+			*dest += 4;
+			*destlen -= 4;
+
+			rep++;
+			replen--;
+		}
+
+		if (*destlen < replen * 4) {
+			/* Run out of output buffer */
+			size_t i;
+
+			/* The save area is of fixed size; the filter's
+			 * output must not be allowed to overrun it. There
+			 * is no means of recovery, so give up (as the
+			 * encoder does in the same situation). */
+			if (replen > READ_BUFSIZE)
+				abort();
+
+			/* Buffer remaining output */
+			c->read_len = replen;
+
+			for (i = 0; i < replen; i++) {
+				c->read_buf[i] = rep[i];
+			}
+
+			return HUBBUB_NOMEM;
+		}
+
+	} else {
+		if (*destlen < 4) {
+			/* Run out of output buffer */
+			c->read_len = 1;
+			c->read_buf[0] = ucs4;
+
+			return HUBBUB_NOMEM;
+		}
+
+		*((uint32_t *) (void *) *dest) = htonl(ucs4);
+		*dest += 4;
+		*destlen -= 4;
+	}
+
+	return HUBBUB_OK;
+}
+
+
+/** Factory entry points for the built-in UTF-8 codec */
+const hubbub_charsethandler hubbub_utf8_codec_handler = {
+	.handles_charset = hubbub_utf8_codec_handles_charset,
+	.create = hubbub_utf8_codec_create
+};
diff --git a/src/charset/detect.c b/src/charset/detect.c
new file mode 100644
index 0000000..8ff3b87
--- /dev/null
+++ b/src/charset/detect.c
@@ -0,0 +1,673 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "utils/utils.h"
+
+#include "detect.h"
+
+/* Forward declarations of the file-local helpers used by
+ * hubbub_charset_extract. Those returning uint16_t yield a MIB enum
+ * value, with 0 meaning that no charset was found. */
+static uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len);
+static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len);
+static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
+ const uint8_t *end);
+static uint16_t hubbub_charset_parse_content(const uint8_t *value,
+ uint32_t valuelen);
+static bool hubbub_charset_get_attribute(const uint8_t **data,
+ const uint8_t *end,
+ const uint8_t **name, uint32_t *namelen,
+ const uint8_t **value, uint32_t *valuelen);
+
+/**
+ * Extract a charset from a chunk of data
+ *
+ * \param data Pointer to pointer to buffer containing data
+ * \param len Pointer to buffer length
+ * \param mibenum Pointer to location to store MIB enum representing charset
+ * \param source Pointer to location to receive charset source
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * The data pointer and length will be modified by this function if
+ * a byte order mark is encountered at the start of the buffer. The updated
+ * data pointer will point to the first byte in the buffer after the BOM.
+ * The length will be modified appropriately.
+ *
+ * The larger a chunk of data fed to this routine, the better, as it allows
+ * charset autodetection access to a larger dataset for analysis.
+ */
+hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len,
+		uint16_t *mibenum, hubbub_charset_source *source)
+{
+	uint16_t found = 0;
+
+	if (data == NULL || *data == NULL || len == NULL ||
+			mibenum == NULL || source == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Inspecting the document needs at least 4 bytes of data; with
+	 * less than that, go straight to the fallback below. */
+	if (*len >= 4) {
+		/* A byte order mark takes priority over everything else */
+		found = hubbub_charset_read_bom(data, len);
+		if (found != 0) {
+			*mibenum = found;
+			*source = HUBBUB_CHARSET_DOCUMENT;
+
+			return HUBBUB_OK;
+		}
+
+		/* No BOM, so look for a meta charset within the
+		 * document itself. */
+		found = hubbub_charset_scan_meta(*data, *len);
+		if (found != 0) {
+			/* ISO-8859-1 becomes Windows-1252 */
+			if (found == hubbub_mibenum_from_name("ISO-8859-1",
+					SLEN("ISO-8859-1"))) {
+				found = hubbub_mibenum_from_name(
+						"Windows-1252",
+						SLEN("Windows-1252"));
+				/* Fallback to 8859-1 if that failed */
+				if (found == 0)
+					found = hubbub_mibenum_from_name(
+							"ISO-8859-1",
+							SLEN("ISO-8859-1"));
+			}
+
+			/* A meta charset naming an encoding which is not
+			 * ASCII-compatible cannot be trusted: such a
+			 * document should have carried a BOM (detected
+			 * above) and, besides, the meta element was just
+			 * found using an ASCII-only parser, so the
+			 * document plainly is not in that encoding.
+			 *
+			 * Such claims are ignored, leaving the decision to
+			 * autodetection (or the fallback, if that fails).
+			 */
+			if (!(found == hubbub_mibenum_from_name("UTF-16",
+						SLEN("UTF-16")) ||
+					found == hubbub_mibenum_from_name(
+						"UTF-16LE",
+						SLEN("UTF-16LE")) ||
+					found == hubbub_mibenum_from_name(
+						"UTF-16BE",
+						SLEN("UTF-16BE")) ||
+					found == hubbub_mibenum_from_name(
+						"UTF-32",
+						SLEN("UTF-32")) ||
+					found == hubbub_mibenum_from_name(
+						"UTF-32LE",
+						SLEN("UTF-32LE")) ||
+					found == hubbub_mibenum_from_name(
+						"UTF-32BE",
+						SLEN("UTF-32BE")))) {
+				*mibenum = found;
+				*source = HUBBUB_CHARSET_DOCUMENT;
+
+				return HUBBUB_OK;
+			}
+		}
+
+		/* The document did not specify a usable charset; this is
+		 * where the encoding would be autodetected from the data
+		 * available. */
+
+		/** \todo Charset autodetection */
+	}
+
+	/* Nothing was determined, so use the default fallback */
+	found = hubbub_mibenum_from_name("Windows-1252",
+			SLEN("Windows-1252"));
+	if (found == 0)
+		found = hubbub_mibenum_from_name("ISO-8859-1",
+				SLEN("ISO-8859-1"));
+
+	*mibenum = found;
+	*source = HUBBUB_CHARSET_DEFAULT;
+
+	return HUBBUB_OK;
+}
+
+
+/**
+ * Look for a UTF Byte Order Mark at the very start of a buffer
+ *
+ * \param data  Pointer to pointer to buffer containing data
+ * \param len   Pointer to buffer length
+ * \return MIB enum representing encoding described by BOM, or 0 if not found
+ *
+ * When a BOM is found, the data pointer is moved on to the first byte
+ * after it and the length is reduced to match. Otherwise both are left
+ * untouched.
+ */
+uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len)
+{
+	const uint8_t *b;
+	const char *name;
+	size_t namelen;
+	size_t bomlen;
+
+	if (data == NULL || *data == NULL || len == NULL)
+		return 0;
+
+	/* We require at least 4 bytes of data */
+	if (*len < 4)
+		return 0;
+
+	b = *data;
+
+	/* The UTF-32 marks must be tried before the UTF-16 ones, as the
+	 * UTF-32LE mark begins with the UTF-16LE one. */
+	if (b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) {
+		name = "UTF-32BE";
+		namelen = SLEN("UTF-32BE");
+		bomlen = 4;
+	} else if (b[0] == 0xFF && b[1] == 0xFE &&
+			b[2] == 0x00 && b[3] == 0x00) {
+		name = "UTF-32LE";
+		namelen = SLEN("UTF-32LE");
+		bomlen = 4;
+	} else if (b[0] == 0xFE && b[1] == 0xFF) {
+		name = "UTF-16BE";
+		namelen = SLEN("UTF-16BE");
+		bomlen = 2;
+	} else if (b[0] == 0xFF && b[1] == 0xFE) {
+		name = "UTF-16LE";
+		namelen = SLEN("UTF-16LE");
+		bomlen = 2;
+	} else if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) {
+		name = "UTF-8";
+		namelen = SLEN("UTF-8");
+		bomlen = 3;
+	} else {
+		/* No BOM present */
+		return 0;
+	}
+
+	/* Step over the mark */
+	*data += bomlen;
+	*len -= bomlen;
+
+	return hubbub_mibenum_from_name(name, namelen);
+}
+
+#define PEEK(a) \
+ (pos < end - SLEN(a) && \
+ strncasecmp((const char *) pos, a, SLEN(a)) == 0)
+
+#define ADVANCE(a) \
+ while (pos < end - SLEN(a)) { \
+ if (PEEK(a)) \
+ break; \
+ pos++; \
+ } \
+ \
+ if (pos == end - SLEN(a)) \
+ return 0;
+
+#define ISSPACE(a) \
+ (a == 0x09 || a == 0x0a || a == 0x0b || \
+ a == 0x0c || a == 0x0d || a == 0x20)
+
+/**
+ * Search for a meta charset within a buffer of data
+ *
+ * \param data Pointer to buffer containing data
+ * \param len Length of data in buffer
+ * \return MIB enum representing encoding, or 0 if none found
+ *
+ * At most the first 512 bytes of the buffer are examined.
+ *
+ * NOTE(review): the numbered / lettered step comments in the body look
+ * like references to the steps of a specification's algorithm
+ * (presumably the HTML5 encoding prescan) -- confirm.
+ */
+uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len)
+{
+ const uint8_t *pos = data;
+ const uint8_t *end;
+ uint16_t mibenum;
+
+ if (data == NULL)
+ return 0;
+
+ /* Limit the scan to the first 512 bytes */
+ end = pos + min(512, len);
+
+ /* 1. */
+ while (pos < end) {
+ /* a */
+ /* Comment: skip to its terminator (ADVANCE returns 0 from
+ * this function if the terminator is not found) */
+ if (PEEK("<!--")) {
+ pos += SLEN("<!--");
+ ADVANCE("-->");
+ /* b */
+ /* Candidate meta element; it only counts if "<meta" is
+ * followed by whitespace */
+ } else if (PEEK("<meta")) {
+ if (pos + SLEN("<meta") >= end - 1)
+ return 0;
+
+ if (ISSPACE(*(pos + SLEN("<meta")))) {
+ /* 1 */
+ pos += SLEN("<meta");
+
+ mibenum = hubbub_charset_parse_attributes(
+ &pos, end);
+ if (mibenum != 0)
+ return mibenum;
+
+ if (pos >= end)
+ return 0;
+ }
+ /* c */
+ /* Any other start or end tag: "</" or "<" followed by an
+ * ASCII letter (clearing bit 0x20 folds lower case onto
+ * upper case before the range test) */
+ } else if ((PEEK("</") && (pos < end - 3 &&
+ (0x41 <= (*(pos + 2) & ~ 0x20) &&
+ (*(pos + 2) & ~ 0x20) <= 0x5A))) ||
+ (pos < end - 2 && *pos == '<' &&
+ (0x41 <= (*(pos + 1) & ~ 0x20) &&
+ (*(pos + 1) & ~ 0x20) <= 0x5A))) {
+
+ /* skip '<' */
+ pos++;
+
+ /* 1. */
+ /* Skip over the tag name */
+ while (pos < end) {
+ if (ISSPACE(*pos) ||
+ *pos == '>' || *pos == '<')
+ break;
+ pos++;
+ }
+
+ if (pos >= end)
+ return 0;
+
+ /* 3 */
+ if (*pos != '<') {
+ const uint8_t *n;
+ const uint8_t *v;
+ uint32_t nl, vl;
+
+ /* Consume the tag's attributes, discarding
+ * them */
+ while (hubbub_charset_get_attribute(&pos, end,
+ &n, &nl, &v, &vl))
+ ; /* do nothing */
+ /* 2 */
+ /* A '<' ended the name: reprocess from that '<',
+ * bypassing the increment at the foot of the loop */
+ } else
+ continue;
+ /* d */
+ /* Declaration, processing instruction or other "</"
+ * construct: skip to the next '>' */
+ } else if (PEEK("<!") || PEEK("</") || PEEK("<?")) {
+ pos++;
+ ADVANCE(">");
+ }
+
+ /* e - do nothing */
+
+ /* 2 */
+ pos++;
+ }
+
+ return 0;
+}
+
+/**
+ * Parse attributes on a meta tag
+ *
+ * Attributes are pulled one at a time with hubbub_charset_get_attribute().
+ * A non-empty "charset" attribute is looked up directly (after trimming
+ * surrounding whitespace); a non-empty "content" attribute is handed to
+ * hubbub_charset_parse_content(). The first lookup that yields a non-zero
+ * MIB enum ends the search.
+ *
+ * \param pos  Pointer to pointer to current location (updated on exit)
+ * \param end  Pointer to end of data stream
+ * \return MIB enum of detected encoding, or 0 if none found
+ */
+uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
+		const uint8_t *end)
+{
+	const uint8_t *name;
+	const uint8_t *value;
+	uint32_t namelen, valuelen;
+	uint16_t mibenum;
+
+	if (pos == NULL || *pos == NULL || end == NULL)
+		return 0;
+
+	/* 2 */
+	while (hubbub_charset_get_attribute(pos, end,
+			&name, &namelen, &value, &valuelen)) {
+		/* 3 */
+		/* a */
+		if (namelen == SLEN("charset") && valuelen > 0 &&
+				strncasecmp((const char *) name, "charset",
+					SLEN("charset")) == 0) {
+			/* strip value; bounded by valuelen so that an
+			 * all-whitespace value can neither be over-read
+			 * nor wrap the length below zero */
+			while (valuelen > 0 && ISSPACE(*value)) {
+				value++;
+				valuelen--;
+			}
+
+			while (valuelen > 0 && ISSPACE(value[valuelen - 1]))
+				valuelen--;
+
+			mibenum = hubbub_mibenum_from_name(
+					(const char *) value, valuelen);
+			if (mibenum != 0)
+				return mibenum;
+		/* b */
+		} else if (namelen == SLEN("content") && valuelen > 0 &&
+				strncasecmp((const char *) name, "content",
+					SLEN("content")) == 0) {
+			mibenum = hubbub_charset_parse_content(value,
+					valuelen);
+			if (mibenum != 0)
+				return mibenum;
+		}
+
+		/* c - do nothing */
+
+		/* 1 */
+		/* Move on to the next whitespace before trying again */
+		while (*pos < end) {
+			if (ISSPACE(**pos))
+				break;
+			(*pos)++;
+		}
+
+		if (*pos >= end) {
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Parse a content= attribute's value
+ *
+ * Looks for `;` followed by optional whitespace, the word "charset"
+ * (case-insensitive), optional whitespace, '=', optional whitespace and
+ * then an encoding name, which may be quoted with '"' or '\''. A quoted
+ * name with no closing quote is rejected.
+ *
+ * \param value     Attribute's value
+ * \param valuelen  Length of value
+ * \return MIB enum of detected encoding, or 0 if none found
+ */
+uint16_t hubbub_charset_parse_content(const uint8_t *value,
+		uint32_t valuelen)
+{
+	const uint8_t *end;
+	const uint8_t *tentative = NULL;
+	uint32_t tentative_len = 0;
+
+	if (value == NULL)
+		return 0;
+
+	end = value + valuelen;
+
+	/* 1 */
+	while (value < end) {
+		if (*value == ';') {
+			value++;
+			break;
+		}
+
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 2 */
+	while (value < end && ISSPACE(*value)) {
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 3 */
+	/* Reject input too short to hold "charset" before comparing, so that
+	 * value is never advanced beyond end */
+	if ((size_t) (end - value) < SLEN("charset") ||
+			strncasecmp((const char *) value,
+				"charset", SLEN("charset")) != 0)
+		return 0;
+
+	value += SLEN("charset");
+
+	/* 4 */
+	while (value < end && ISSPACE(*value)) {
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 5 */
+	if (*value != '=')
+		return 0;
+	/* skip '=' */
+	value++;
+
+	/* 6 */
+	while (value < end && ISSPACE(*value)) {
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 7 */
+	tentative = value;
+
+	/* a, b */
+	if (*value == '"' || *value == '\'') {
+		const uint8_t quote = *value;
+
+		/* Count the bytes between the quotes */
+		while (++value < end && *value != quote) {
+			tentative_len++;
+		}
+
+		if (value < end)
+			tentative++;		/* name starts after the quote */
+		else
+			tentative = NULL;	/* unterminated: no result */
+	/* c */
+	} else {
+		while (value < end && !ISSPACE(*value)) {
+			value++;
+			tentative_len++;
+		}
+	}
+
+	/* 8 */
+	if (tentative != NULL) {
+		return hubbub_mibenum_from_name((const char *) tentative,
+				tentative_len);
+	}
+
+	/* 9 */
+	return 0;
+}
+
+/**
+ * Extract an attribute from the data stream
+ *
+ * \param data      Pointer to pointer to current location (updated on exit)
+ * \param end       Pointer to end of data stream
+ * \param name      Pointer to location to receive attribute name
+ * \param namelen   Pointer to location to receive attribute name length
+ * \param value     Pointer to location to receive attribute value
+ * \param valuelen  Pointer to location to receive attribute value length
+ * \return true if attribute extracted, false otherwise.
+ *
+ * Note: The caller should heed the returned lengths; these are the only
+ *       indicator that useful content resides in name or value.
+ *
+ * Every path that returns true stores the position at which scanning
+ * stopped in ::data. Callers invoke this function in a loop, so a true
+ * return that left ::data untouched would yield the same attribute forever.
+ */
+bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end,
+		const uint8_t **name, uint32_t *namelen,
+		const uint8_t **value, uint32_t *valuelen)
+{
+	const uint8_t *pos;
+
+	if (data == NULL || *data == NULL || end == NULL || name == NULL ||
+			namelen == NULL || value == NULL || valuelen == NULL)
+		return false;
+
+	pos = *data;
+
+	/* 1. Skip leading spaces or '/' characters */
+	while (pos < end && (ISSPACE(*pos) || *pos == '/')) {
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	/* 2. Invalid element open character */
+	if (*pos == '<') {
+		/* Step back one byte before reporting the position */
+		pos--;
+		*data = pos;
+		return false;
+	}
+
+	/* 3. End of element */
+	if (*pos == '>') {
+		*data = pos;
+		return false;
+	}
+
+	/* 4. Initialise name & value to empty string */
+	*name = pos;
+	*namelen = 0;
+	*value = (const uint8_t *) "";
+	*valuelen = 0;
+
+	/* 5. Extract name */
+	while (pos < end) {
+		/* a */
+		if (*pos == '=') {
+			break;
+		}
+
+		/* b */
+		if (ISSPACE(*pos)) {
+			break;
+		}
+
+		/* c */
+		if (*pos == '/' || *pos == '<' || *pos == '>') {
+			/* Valueless attribute: record how far we got, so
+			 * that the caller's next call makes progress */
+			*data = pos;
+			return true;
+		}
+
+		/* d is handled by strncasecmp in _parse_attributes */
+
+		/* e */
+		(*namelen)++;
+
+		/* 6 */
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	if (ISSPACE(*pos)) {
+		/* 7. Skip trailing spaces */
+		while (pos < end && ISSPACE(*pos)) {
+			pos++;
+		}
+
+		if (pos >= end) {
+			*data = pos;
+			return false;
+		}
+
+		/* 8. Must be '=' */
+		if (*pos != '=') {
+			/* No value follows; leave the position on the
+			 * preceding whitespace byte */
+			pos--;
+			*data = pos;
+			return true;
+		}
+	}
+
+	/* 9. Skip '=' */
+	pos++;
+
+	/* 10. Skip any spaces after '=' */
+	while (pos < end && ISSPACE(*pos)) {
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	/* 11. Extract value, if quoted */
+	/* a */
+	if (*pos == '\'' || *pos == '"') {
+		/* 1 */
+		const uint8_t *quote = pos;
+
+		/* 2 */
+		while (++pos < end) {
+			/* 3 */
+			if (*pos == *quote) {
+				*value = (quote + 1);
+				*data = ++pos;
+				return true;
+			}
+
+			/* 4 is handled by strncasecmp */
+
+			/* 5 */
+			(*valuelen)++;
+
+			/* 6 */
+		}
+
+		/* Ran out of data before the closing quote */
+		*data = pos;
+		return false;
+	}
+
+	/* b */
+	if (*pos == '<' || *pos == '>') {
+		*data = pos;
+		return true;
+	}
+
+	/* c is handled by strncasecmp */
+
+	/* d */
+	*value = pos;
+
+	while (pos < end) {
+		/* 12. Extract unquoted value */
+		/* a */
+		if (ISSPACE(*pos) || *pos == '<' || *pos == '>') {
+			*data = pos;
+			return true;
+		}
+
+		/* b is handled by strncasecmp */
+
+		/* c */
+		(*valuelen)++;
+
+		/* 13. Advance */
+		pos++;
+	}
+
+	/* Ran out of data before the unquoted value was terminated */
+	*data = pos;
+	return false;
+}
diff --git a/src/charset/detect.h b/src/charset/detect.h
new file mode 100644
index 0000000..854a8d6
--- /dev/null
+++ b/src/charset/detect.h
@@ -0,0 +1,22 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_charset_detect_h_
+#define hubbub_charset_detect_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+/* Extract a charset from a chunk of data */
+hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len,
+ uint16_t *mibenum, hubbub_charset_source *source);
+
+#endif
+
diff --git a/src/hubbub.c b/src/hubbub.c
new file mode 100644
index 0000000..32e0a1f
--- /dev/null
+++ b/src/hubbub.c
@@ -0,0 +1,63 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <hubbub/hubbub.h>
+
+#include "charset/aliases.h"
+#include "tokeniser/entities.h"
+
+/**
+ * Initialise the Hubbub library for use.
+ *
+ * This _must_ be called before using any hubbub functions. The encoding
+ * alias data is set up first, then the entity data; should the latter
+ * fail, the alias data is torn down again before returning.
+ *
+ * \param aliases_file  Pointer to name of file containing encoding alias data
+ * \param alloc         Pointer to (de)allocation function
+ * \param pw            Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success, applicable error otherwise.
+ */
+hubbub_error hubbub_initialise(const char *aliases_file,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_error status;
+
+	if (alloc == NULL || aliases_file == NULL)
+		return HUBBUB_BADPARM;
+
+	status = hubbub_aliases_create(aliases_file, alloc, pw);
+	if (status == HUBBUB_OK) {
+		status = hubbub_entities_create(alloc, pw);
+		if (status != HUBBUB_OK) {
+			/* Undo the half of the setup that succeeded */
+			hubbub_aliases_destroy(alloc, pw);
+		}
+	}
+
+	return status;
+}
+
+/**
+ * Clean up after Hubbub
+ *
+ * Destroys the entity data, then the encoding alias data.
+ *
+ * \param alloc  Pointer to (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success, applicable error otherwise.
+ */
+hubbub_error hubbub_finalise(hubbub_alloc alloc, void *pw)
+{
+	if (alloc != NULL) {
+		hubbub_entities_destroy(alloc, pw);
+		hubbub_aliases_destroy(alloc, pw);
+
+		return HUBBUB_OK;
+	}
+
+	return HUBBUB_BADPARM;
+}
+
+
diff --git a/src/input/Makefile b/src/input/Makefile
new file mode 100644
index 0000000..8b06c63
--- /dev/null
+++ b/src/input/Makefile
@@ -0,0 +1,53 @@
+# Makefile for libhubbub
+#
+# Toolchain is exported by top-level makefile
+#
+# Top-level makefile also exports the following variables:
+#
+# COMPONENT Name of component
+# EXPORT Absolute path of export directory
+# TOP Absolute path of source tree root
+#
+# The top-level makefile requires the following targets to exist:
+#
+# clean Clean source tree
+# debug Create a debug binary
+# distclean Fully clean source tree, back to pristine condition
+# export Export distributable components to ${EXPORT}
+# release Create a release binary
+# setup Perform any setup required prior to compilation
+# test Execute any test cases
+
+# Manipulate include paths
+CFLAGS += -I$(CURDIR)
+
+# Objects (basenames only; mapped onto each build directory below)
+OBJS = filter inputstream utf8_stream
+
+# Object files for each build flavour
+RELEASE_OBJS = $(patsubst %,../Release/%.o,$(OBJS))
+DEBUG_OBJS = $(patsubst %,../Debug/%.o,$(OBJS))
+
+.PHONY: clean debug distclean export release setup test
+
+# Targets
+release: $(RELEASE_OBJS)
+
+debug: $(DEBUG_OBJS)
+
+clean:
+	-@${RM} ${RMFLAGS} $(RELEASE_OBJS)
+	-@${RM} ${RMFLAGS} $(DEBUG_OBJS)
+
+# The remaining required targets have nothing to do in this directory
+distclean:
+
+setup:
+
+export:
+
+test:
+
+# Pattern rules
+../Release/%.o: %.c
+	@${ECHO} ${ECHOFLAGS} "==> $<"
+	@${CC} -c ${CFLAGS} -DNDEBUG -o $@ $<
+
+../Debug/%.o: %.c
+	@${ECHO} ${ECHOFLAGS} "==> $<"
+	@${CC} -c -g ${CFLAGS} -o $@ $<
diff --git a/src/input/filter.c b/src/input/filter.c
new file mode 100644
index 0000000..5ac5391
--- /dev/null
+++ b/src/input/filter.c
@@ -0,0 +1,380 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "charset/codec.h"
+#include "utils/utils.h"
+
+#include "input/filter.h"
+
+
+/**
+ * Input filter
+ *
+ * Data passed to hubbub_filter_process_chunk() is decoded by ::read_codec
+ * into ::pivot_buf and then encoded from there by ::write_codec into the
+ * caller's output buffer. When the encode step fails, the unwritten part
+ * of the pivot buffer is remembered in ::pivot_left / ::pivot_len and
+ * ::leftover is set, so that the next call can retry it first.
+ *
+ * ::filter_output and ::last_filter_char belong to the character filter
+ * registered on the read codec, which emits at most two characters per
+ * input character.
+ */
+struct hubbub_filter {
+ hubbub_charsetcodec *read_codec; /**< Read codec */
+ hubbub_charsetcodec *write_codec; /**< Write codec */
+
+ uint32_t filter_output[2]; /**< Filter output buffer */
+ uint32_t last_filter_char; /**< Last filtered character */
+
+ uint32_t pivot_buf[64]; /**< Conversion pivot buffer */
+
+ bool leftover; /**< Data remains from last call */
+ uint8_t *pivot_left; /**< Remaining pivot to write */
+ size_t pivot_len; /**< Length of pivot remaining */
+
+ struct {
+ uint16_t encoding; /**< Input encoding */
+ } settings; /**< Filter settings */
+
+ hubbub_alloc alloc; /**< Memory (de)allocation function */
+ void *pw; /**< Client private data */
+};
+
+static hubbub_error hubbub_filter_set_defaults(hubbub_filter *input);
+static hubbub_error hubbub_filter_set_encoding(hubbub_filter *input,
+ const char *enc);
+static hubbub_error read_character_filter(uint32_t c,
+ uint32_t **output, size_t *outputlen, void *pw);
+
+/**
+ * Create an input filter
+ *
+ * The filter starts out with its default settings applied and with a write
+ * codec for ::int_enc. Any codec created along the way is destroyed again
+ * if a later step fails.
+ *
+ * \param int_enc  Desired encoding of document
+ * \param alloc    Function used to (de)allocate data
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to filter instance, or NULL on failure
+ */
+hubbub_filter *hubbub_filter_create(const char *int_enc,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_filter *filter;
+
+	if (alloc == NULL)
+		return NULL;
+
+	filter = alloc(NULL, sizeof(*filter), pw);
+	if (!filter)
+		return NULL;
+
+	/* Make the codec pointers well-defined before anything can fail */
+	filter->read_codec = NULL;
+	filter->write_codec = NULL;
+
+	filter->last_filter_char = 0;
+
+	filter->leftover = false;
+	filter->pivot_left = NULL;
+	filter->pivot_len = 0;
+
+	filter->alloc = alloc;
+	filter->pw = pw;
+
+	if (hubbub_filter_set_defaults(filter) != HUBBUB_OK) {
+		/* Configuring the defaults can fail after the read codec
+		 * has been created; don't leak it */
+		if (filter->read_codec != NULL)
+			hubbub_charsetcodec_destroy(filter->read_codec);
+		filter->alloc(filter, 0, pw);
+		return NULL;
+	}
+
+	filter->write_codec = hubbub_charsetcodec_create(int_enc, alloc, pw);
+	if (filter->write_codec == NULL) {
+		if (filter->read_codec != NULL)
+			hubbub_charsetcodec_destroy(filter->read_codec);
+		filter->alloc(filter, 0, pw);
+		return NULL;
+	}
+
+	return filter;
+}
+
+/**
+ * Destroy an input filter
+ *
+ * Releases both codecs (where present) and then the filter itself.
+ * Passing NULL is a no-op.
+ *
+ * \param input  Pointer to filter instance
+ */
+void hubbub_filter_destroy(hubbub_filter *input)
+{
+	if (input != NULL) {
+		if (input->read_codec != NULL)
+			hubbub_charsetcodec_destroy(input->read_codec);
+
+		if (input->write_codec != NULL)
+			hubbub_charsetcodec_destroy(input->write_codec);
+
+		input->alloc(input, 0, input->pw);
+	}
+}
+
+/**
+ * Configure an input filter
+ *
+ * The only option acted upon is HUBBUB_FILTER_SET_ENCODING, which changes
+ * the input encoding; any other option type is accepted and ignored.
+ *
+ * \param input   Pointer to filter instance
+ * \param type    Input option type to configure
+ * \param params  Option-specific parameters
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_filter_setopt(hubbub_filter *input,
+		hubbub_filter_opttype type,
+		hubbub_filter_optparams *params)
+{
+	if (input == NULL || params == NULL)
+		return HUBBUB_BADPARM;
+
+	if (type == HUBBUB_FILTER_SET_ENCODING) {
+		return hubbub_filter_set_encoding(input,
+				params->encoding.name);
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Process a chunk of data
+ *
+ * Input is decoded by the read codec into the filter's pivot buffer and
+ * then encoded by the write codec into the output buffer, one pivot
+ * buffer's worth at a time, until the input is exhausted or an error
+ * occurs. If a previous call could not write out everything it had
+ * decoded, that remainder is written before any new input is consumed.
+ *
+ * \param input Pointer to filter instance
+ * \param data Pointer to pointer to input buffer
+ * \param len Pointer to length of input buffer
+ * \param output Pointer to pointer to output buffer
+ * \param outlen Pointer to length of output buffer
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * Call this with an input buffer length of 0 to flush any buffers.
+ */
+hubbub_error hubbub_filter_process_chunk(hubbub_filter *input,
+ const uint8_t **data, size_t *len,
+ uint8_t **output, size_t *outlen)
+{
+ hubbub_error read_error, write_error;
+
+ if (input == NULL || data == NULL || *data == NULL || len == NULL ||
+ output == NULL || *output == NULL || outlen == NULL)
+ return HUBBUB_BADPARM;
+
+ if (input->leftover) {
+ /* Some data left to be written from last call */
+
+ /* Attempt to flush the remaining data. */
+ write_error = hubbub_charsetcodec_encode(input->write_codec,
+ (const uint8_t **) &input->pivot_left,
+ &input->pivot_len,
+ output, outlen);
+
+ /* On failure the leftover state is kept, so the caller can
+ * retry with more output space */
+ if (write_error != HUBBUB_OK) {
+ return write_error;
+ }
+
+ /* And clear leftover */
+ input->pivot_left = NULL;
+ input->pivot_len = 0;
+ input->leftover = false;
+ }
+
+ while (*len > 0) {
+ /* Start each pass with the whole pivot buffer available */
+ size_t pivot_len = sizeof(input->pivot_buf);
+ uint8_t *pivot = (uint8_t *) input->pivot_buf;
+
+ read_error = hubbub_charsetcodec_decode(input->read_codec,
+ data, len,
+ (uint8_t **) &pivot, &pivot_len);
+
+ /* Rewind to the start of the pivot buffer and work out how
+ * much of it was filled.
+ * NOTE(review): assumes decode leaves the unused byte count
+ * in pivot_len -- confirm against the codec API. */
+ pivot = (uint8_t *) input->pivot_buf;
+ pivot_len = sizeof(input->pivot_buf) - pivot_len;
+
+ if (pivot_len > 0) {
+ write_error = hubbub_charsetcodec_encode(
+ input->write_codec,
+ (const uint8_t **) &pivot,
+ &pivot_len,
+ output, outlen);
+
+ if (write_error != HUBBUB_OK) {
+ /* Remember what is still to be written, so that
+ * the next call can pick up from here */
+ input->leftover = true;
+ input->pivot_left = pivot;
+ input->pivot_len = pivot_len;
+
+ return write_error;
+ }
+ }
+
+ /* HUBBUB_NOMEM from the decoder is not treated as fatal here.
+ * NOTE(review): presumably it signals a full pivot buffer,
+ * which has just been drained above -- confirm. */
+ if (read_error != HUBBUB_OK && read_error != HUBBUB_NOMEM)
+ return read_error;
+ }
+
+ return HUBBUB_OK;
+}
+
+/**
+ * Reset an input filter's state
+ *
+ * Forgets any pivot data left over from a previous call and resets the
+ * read codec followed by the write codec, stopping at the first failure.
+ *
+ * \param input  The input filter to reset
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_filter_reset(hubbub_filter *input)
+{
+	hubbub_error status;
+
+	if (input == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Discard whatever was still waiting in the pivot buffer */
+	input->leftover = false;
+	input->pivot_len = 0;
+	input->pivot_left = NULL;
+
+	/* Read side first; only touch the write side if that worked */
+	status = hubbub_charsetcodec_reset(input->read_codec);
+	if (status == HUBBUB_OK)
+		status = hubbub_charsetcodec_reset(input->write_codec);
+
+	return status;
+}
+
+/**
+ * Set an input filter's default settings
+ *
+ * Clears both codec pointers and the recorded encoding, then selects
+ * ISO-8859-1 as the input encoding.
+ *
+ * \param input  Input filter to configure
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_filter_set_defaults(hubbub_filter *input)
+{
+	if (input == NULL)
+		return HUBBUB_BADPARM;
+
+	input->read_codec = NULL;
+	input->write_codec = NULL;
+	input->settings.encoding = 0;
+
+	return hubbub_filter_set_encoding(input, "ISO-8859-1");
+}
+
+/**
+ * Set an input filter's encoding
+ *
+ * The replacement read codec is created and fully configured before the
+ * current one is released. On any failure the filter is therefore left
+ * exactly as it was: same read codec, same recorded encoding.
+ *
+ * \param input  Input filter to configure
+ * \param enc    Encoding name
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if a parameter is NULL,
+ *         HUBBUB_INVALID if the encoding name is not recognised,
+ *         HUBBUB_NOMEM if a codec could not be created,
+ *         or the error reported while configuring the codec
+ */
+hubbub_error hubbub_filter_set_encoding(hubbub_filter *input,
+		const char *enc)
+{
+	hubbub_charsetcodec *codec;
+	hubbub_charsetcodec_optparams params;
+	hubbub_error error;
+	uint16_t mibenum;
+
+	if (input == NULL || enc == NULL)
+		return HUBBUB_BADPARM;
+
+	mibenum = hubbub_mibenum_from_name(enc, strlen(enc));
+	if (mibenum == 0)
+		return HUBBUB_INVALID;
+
+	/* Exit early if we're already using this encoding */
+	if (input->settings.encoding == mibenum)
+		return HUBBUB_OK;
+
+	codec = hubbub_charsetcodec_create(enc, input->alloc, input->pw);
+	if (codec == NULL)
+		return HUBBUB_NOMEM;
+
+	/* Register filter function */
+	params.filter_func.filter = read_character_filter;
+	params.filter_func.pw = (void *) input;
+	error = hubbub_charsetcodec_setopt(codec,
+			HUBBUB_CHARSETCODEC_FILTER_FUNC,
+			(hubbub_charsetcodec_optparams *) &params);
+	if (error != HUBBUB_OK) {
+		/* Don't install a codec without its character filter */
+		hubbub_charsetcodec_destroy(codec);
+		return error;
+	}
+
+	/* The new codec is usable: swap it in and retire the old one */
+	if (input->read_codec != NULL)
+		hubbub_charsetcodec_destroy(input->read_codec);
+
+	input->read_codec = codec;
+	input->settings.encoding = mibenum;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Character filter function for read characters
+ *
+ * Replaces NUL with U+FFFD and normalises line endings to LF. A CR is
+ * never emitted directly: it is held back until the following character
+ * is seen, at which point it is replaced by a single LF (unless that
+ * following character is the LF of a CRLF pair, which is emitted on its
+ * own). At most two characters are produced per call.
+ *
+ * \param c          The read character (UCS4 - host byte order)
+ * \param output     Pointer to pointer to output buffer (filled on exit)
+ * \param outputlen  Pointer to output buffer length (filled on exit)
+ * \param pw         Pointer to client-specific private data.
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error read_character_filter(uint32_t c, uint32_t **output,
+		size_t *outputlen, void *pw)
+{
+	hubbub_filter *input = (hubbub_filter *) pw;
+	size_t len = 0;
+
+	if (output == NULL || outputlen == NULL || pw == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Line ending normalisation:
+	 * CRLF -> LF (trap CR and let LF through unmodified)
+	 * CR   -> LF (trap CR and convert to LF if not CRLF)
+	 * LF   -> LF (leave LF alone)
+	 */
+
+#define NUL (0x00000000)
+#define CR (0x0000000D)
+#define LF (0x0000000A)
+#define REP (0x0000FFFD)
+
+	/* Settle the CR trapped by the previous call first. This has to
+	 * happen whatever the current character is, otherwise a CR
+	 * followed by another CR or by NUL would vanish from the output. */
+	if (input->last_filter_char == CR && c != LF)
+		input->filter_output[len++] = LF;
+
+	if (c == NUL) {
+		/* Replace NUL (U+0000) characters in input with U+FFFD */
+		input->filter_output[len++] = REP;
+	} else if (c != CR) {
+		/* Let character through unchanged */
+		input->filter_output[len++] = c;
+	}
+	/* else: trap CR characters until the next character is known */
+
+#undef NUL
+#undef CR
+#undef LF
+#undef REP
+
+	input->last_filter_char = c;
+
+	*output = input->filter_output;
+	*outputlen = len;
+
+	return HUBBUB_OK;
+}
diff --git a/src/input/filter.h b/src/input/filter.h
new file mode 100644
index 0000000..6650e09
--- /dev/null
+++ b/src/input/filter.h
@@ -0,0 +1,57 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_input_filter_h_
+#define hubbub_input_filter_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+
+typedef struct hubbub_filter hubbub_filter;
+
+/**
+ * Input filter option types
+ */
+typedef enum hubbub_filter_opttype {
+ HUBBUB_FILTER_SET_ENCODING = 0,
+} hubbub_filter_opttype;
+
+/**
+ * Input filter option parameters
+ */
+typedef union hubbub_filter_optparams {
+ /** Parameters for encoding setting */
+ struct {
+ /** Encoding name */
+ const char *name;
+ } encoding;
+} hubbub_filter_optparams;
+
+
+/* Create an input filter */
+hubbub_filter *hubbub_filter_create(const char *int_enc,
+ hubbub_alloc alloc, void *pw);
+/* Destroy an input filter */
+void hubbub_filter_destroy(hubbub_filter *input);
+
+/* Configure an input filter */
+hubbub_error hubbub_filter_setopt(hubbub_filter *input,
+ hubbub_filter_opttype type,
+ hubbub_filter_optparams *params);
+
+/* Process a chunk of data */
+hubbub_error hubbub_filter_process_chunk(hubbub_filter *input,
+ const uint8_t **data, size_t *len,
+ uint8_t **output, size_t *outlen);
+
+/* Reset an input filter's state */
+hubbub_error hubbub_filter_reset(hubbub_filter *input);
+
+#endif
+
diff --git a/src/input/inputstream.c b/src/input/inputstream.c
new file mode 100644
index 0000000..f82d279
--- /dev/null
+++ b/src/input/inputstream.c
@@ -0,0 +1,479 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdlib.h>
+
+#include "charset/aliases.h"
+#include "input/streamimpl.h"
+
+/**
+ * Buffer moving claimant context
+ *
+ * One of these is allocated per registered "buffer moved" handler. They
+ * form a doubly linked list headed by the stream's handlers field, with
+ * new entries inserted at the head.
+ */
+struct hubbub_inputstream_bm_handler {
+ hubbub_inputstream_buffermoved handler; /**< Handler function */
+ void *pw; /**< Client private data */
+
+ /* List links; NULL marks either end of the list */
+ struct hubbub_inputstream_bm_handler *next;
+ struct hubbub_inputstream_bm_handler *prev;
+};
+
+/* Stream handler for UTF-8; defined in another translation unit */
+extern hubbub_streamhandler utf8stream;
+
+/* NULL-terminated table of the available stream handlers, searched in
+ * order by hubbub_inputstream_create() */
+static hubbub_streamhandler *handler_table[] = {
+ &utf8stream,
+ NULL
+};
+
+/**
+ * Create an input stream
+ *
+ * The first entry in the handler table that claims to support ::int_enc
+ * is used to construct the stream.
+ *
+ * \param enc      Document charset, or NULL to autodetect
+ * \param int_enc  Desired encoding of document
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to stream instance, or NULL on failure
+ */
+hubbub_inputstream *hubbub_inputstream_create(const char *enc,
+		const char *int_enc, hubbub_alloc alloc, void *pw)
+{
+	hubbub_streamhandler **candidate = handler_table;
+	hubbub_inputstream *stream;
+
+	if (alloc == NULL || int_enc == NULL)
+		return NULL;
+
+	/* Search for handler class */
+	while (*candidate != NULL && !(*candidate)->uses_encoding(int_enc))
+		candidate++;
+
+	/* None found */
+	if (*candidate == NULL)
+		return NULL;
+
+	stream = (*candidate)->create(enc, int_enc, alloc, pw);
+	if (stream != NULL) {
+		stream->handlers = NULL;
+
+		stream->alloc = alloc;
+		stream->pw = pw;
+	}
+
+	return stream;
+}
+
+/**
+ * Destroy an input stream
+ *
+ * Frees every registered buffer-moved handler record and then hands the
+ * stream to its own destroy method. Passing NULL is a no-op.
+ *
+ * \param stream  Input stream to destroy
+ */
+void hubbub_inputstream_destroy(hubbub_inputstream *stream)
+{
+	hubbub_inputstream_bm_handler *cur, *following;
+
+	if (stream == NULL)
+		return;
+
+	cur = stream->handlers;
+	while (cur != NULL) {
+		/* Fetch the link before the record is released */
+		following = cur->next;
+
+		stream->alloc(cur, 0, stream->pw);
+
+		cur = following;
+	}
+
+	stream->destroy(stream);
+}
+
+/**
+ * Append data to an input stream
+ *
+ * \param stream  Input stream to append data to
+ * \param data    Data to append (in document charset), or NULL to flag EOF
+ * \param len     Length, in bytes, of data
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *         (HUBBUB_BADPARM for a NULL stream, HUBBUB_INVALID once the
+ *         stream's buffer has been disowned)
+ */
+hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream,
+		const uint8_t *data, size_t len)
+{
+	if (stream == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Calling this if we've disowned the buffer is foolish */
+	return (stream->buffer == NULL) ? HUBBUB_INVALID :
+			stream->append(stream, data, len);
+}
+
+/**
+ * Insert data into stream at current location
+ *
+ * \param stream  Input stream to insert into
+ * \param data    Data to insert (UTF-8 encoded)
+ * \param len     Length, in bytes, of data
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *         (HUBBUB_BADPARM for a NULL stream or data, HUBBUB_INVALID once
+ *         the stream's buffer has been disowned)
+ */
+hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream,
+		const uint8_t *data, size_t len)
+{
+	if (data == NULL || stream == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Calling this if we've disowned the buffer is foolish */
+	return (stream->buffer == NULL) ? HUBBUB_INVALID :
+			stream->insert(stream, data, len);
+}
+
+/**
+ * Look at the next character in the stream
+ *
+ * \param stream  Stream to look in
+ * \return UCS4 (host-endian) character code, or EOF or OOD.
+ *         OOD is also what a NULL or disowned stream reports.
+ */
+uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream)
+{
+	/* It is illegal to call this after the buffer has been disowned */
+	if (stream != NULL && stream->buffer != NULL)
+		return stream->peek(stream);
+
+	return HUBBUB_INPUTSTREAM_OOD;
+}
+
+/**
+ * Retrieve the byte index and length of the current character in the stream
+ *
+ * \param stream  Stream to look in
+ * \param len     Pointer to location to receive byte length of character
+ * \return Byte index of current character from start of stream,
+ *         or (uint32_t) -1 on error
+ */
+uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream,
+		size_t *len)
+{
+	/* It is illegal to call this after the buffer has been disowned */
+	if (stream != NULL && len != NULL && stream->buffer != NULL)
+		return stream->cur_pos(stream, len);
+
+	return (uint32_t) -1;
+}
+
+/**
+ * Convert the current character to lower case
+ *
+ * Does nothing for a NULL stream or one whose buffer has been disowned.
+ *
+ * \param stream  Stream to look in
+ */
+void hubbub_inputstream_lowercase(hubbub_inputstream *stream)
+{
+	if (stream != NULL && stream->buffer != NULL)
+		stream->lowercase(stream);
+}
+
+/**
+ * Convert the current character to upper case
+ *
+ * Does nothing for a NULL stream or one whose buffer has been disowned.
+ *
+ * \param stream  Stream to look in
+ */
+void hubbub_inputstream_uppercase(hubbub_inputstream *stream)
+{
+	if (stream != NULL && stream->buffer != NULL)
+		stream->uppercase(stream);
+}
+
+/**
+ * Advance the stream's current position
+ *
+ * Does nothing when the cursor is already at the end of the buffered
+ * data, or for a NULL or disowned stream.
+ *
+ * \param stream  The stream whose position to advance
+ */
+void hubbub_inputstream_advance(hubbub_inputstream *stream)
+{
+	/* It is illegal to call this after the buffer has been disowned */
+	if (stream != NULL && stream->buffer != NULL &&
+			stream->cursor != stream->buffer_len)
+		stream->advance(stream);
+}
+
+/**
+ * Push a character back onto the stream
+ *
+ * \param stream     Stream to push back to
+ * \param character  UCS4 (host-endian) codepoint to push back
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * Note that this doesn't actually modify the data in the stream.
+ * It works by ensuring that the character located just before the
+ * current stream location is the same as ::character. If it is,
+ * then the stream pointer is moved back. If it is not, then an
+ * error is returned and the stream pointer remains unmodified.
+ * Nothing can be pushed back while the cursor is at the very start of
+ * the stream; HUBBUB_INVALID is returned in that case.
+ */
+hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream,
+		uint32_t character)
+{
+	/* It is illegal to call this after the buffer has been disowned */
+	if (stream == NULL || stream->buffer == NULL)
+		return HUBBUB_BADPARM;
+
+	return (stream->cursor == 0) ? HUBBUB_INVALID :
+			stream->push_back(stream, character);
+}
+
+/**
+ * Rewind the input stream by a number of bytes
+ *
+ * \param stream  Stream to rewind
+ * \param n       Number of bytes to go back
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *         (HUBBUB_INVALID if ::n would take the cursor before the start)
+ */
+hubbub_error hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n)
+{
+	if (stream == NULL || stream->buffer == NULL)
+		return HUBBUB_BADPARM;
+
+	if (n > stream->cursor)
+		return HUBBUB_INVALID;
+
+	stream->cursor -= n;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Claim ownership of an input stream's buffer
+ *
+ * \param stream  Input stream whose buffer to claim
+ * \param buffer  Pointer to location to receive buffer pointer
+ * \param len     Pointer to location to receive byte length of buffer
+ * \return HUBBUB_OK on success, appropriate error otherwise.
+ *
+ * Once the buffer has been claimed by a client, the input stream disclaims
+ * all ownership rights (and invalidates any internal references it may have
+ * to the buffer). The only input stream call which may be made after
+ * calling this function is therefore to destroy the input stream. For the
+ * same reason, this call returns an error unless EOF has been seen and the
+ * stream pointer is located at the end of the data.
+ */
+hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream,
+		uint8_t **buffer, size_t *len)
+{
+	if (stream == NULL || buffer == NULL || len == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Only a fully consumed stream may give its buffer away */
+	if (!stream->had_eof || stream->cursor != stream->buffer_len)
+		return HUBBUB_INVALID;
+
+	*len = stream->buffer_len;
+	*buffer = stream->buffer;
+
+	/* From here on the stream counts as disowned */
+	stream->buffer = NULL;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Register interest in buffer moved events
+ *
+ * The handler is invoked once, with the current buffer location, before
+ * this function returns.
+ *
+ * \param stream   Input stream to register interest with
+ * \param handler  Pointer to handler function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_inputstream_register_movehandler(
+		hubbub_inputstream *stream,
+		hubbub_inputstream_buffermoved handler, void *pw)
+{
+	hubbub_inputstream_bm_handler *entry;
+
+	if (stream == NULL || handler == NULL)
+		return HUBBUB_BADPARM;
+
+	entry = stream->alloc(NULL, sizeof(hubbub_inputstream_bm_handler),
+			stream->pw);
+	if (entry == NULL)
+		return HUBBUB_NOMEM;
+
+	entry->handler = handler;
+	entry->pw = pw;
+
+	/* Link the new entry in at the head of the list */
+	entry->prev = NULL;
+	entry->next = stream->handlers;
+	if (entry->next != NULL)
+		entry->next->prev = entry;
+	stream->handlers = entry;
+
+	/* And notify claimant of current buffer location */
+	handler(stream->buffer, stream->buffer_len, pw);
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Deregister interest in buffer moved events
+ *
+ * The first registration matching both ::handler and ::pw is removed.
+ *
+ * \param stream   Input stream to deregister from
+ * \param handler  Pointer to handler function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *         (HUBBUB_INVALID if no matching registration exists)
+ */
+hubbub_error hubbub_inputstream_deregister_movehandler(
+		hubbub_inputstream *stream,
+		hubbub_inputstream_buffermoved handler, void *pw)
+{
+	hubbub_inputstream_bm_handler *entry;
+
+	if (stream == NULL || handler == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Find the registration */
+	entry = stream->handlers;
+	while (entry != NULL &&
+			(entry->handler != handler || entry->pw != pw))
+		entry = entry->next;
+
+	if (entry == NULL)
+		return HUBBUB_INVALID;
+
+	/* Unlink it, fixing up the list head if it was the first entry */
+	if (entry->prev != NULL)
+		entry->prev->next = entry->next;
+	else
+		stream->handlers = entry->next;
+
+	if (entry->next != NULL)
+		entry->next->prev = entry->prev;
+
+	stream->alloc(entry, 0, stream->pw);
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Case insensitively compare a pair of ranges in the input stream
+ *
+ * \param stream  Input stream to look in
+ * \param r1      Offset of start of first range
+ * \param r2      Offset of start of second range
+ * \param len     Byte length of ranges
+ * \return 0 if ranges match, non-zero otherwise
+ *         (a NULL or disowned stream never matches)
+ */
+int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream,
+		uint32_t r1, uint32_t r2, size_t len)
+{
+	if (stream != NULL && stream->buffer != NULL)
+		return stream->cmp_range_ci(stream, r1, r2, len);
+
+	return 1; /* arbitrary */
+}
+
+/**
+ * Case sensitively compare a pair of ranges in the input stream
+ *
+ * \param stream  Input stream to look in
+ * \param r1      Offset of start of first range
+ * \param r2      Offset of start of second range
+ * \param len     Byte length of ranges
+ * \return 0 if ranges match, non-zero otherwise
+ *         (a NULL or disowned stream never matches)
+ */
+int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream,
+		uint32_t r1, uint32_t r2, size_t len)
+{
+	if (stream != NULL && stream->buffer != NULL)
+		return stream->cmp_range_cs(stream, r1, r2, len);
+
+	return 1; /* arbitrary */
+}
+
+/**
+ * Case sensitively compare a range of input stream against an ASCII string
+ *
+ * \param stream  Input stream to look in
+ * \param off     Offset of range start
+ * \param len     Byte length of range
+ * \param data    Comparison string
+ * \param dlen    Byte length of comparison string
+ * \return 0 if match, non-zero otherwise
+ *         (a NULL or disowned stream never matches)
+ */
+int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream,
+		uint32_t off, size_t len, const char *data, size_t dlen)
+{
+	if (stream != NULL && stream->buffer != NULL)
+		return stream->cmp_range_ascii(stream, off, len, data, dlen);
+
+	return 1; /* arbitrary */
+}
+
+/**
+ * Replace a range of bytes in the input stream with a single character
+ *
+ * The range must start within the buffered data and must not start before
+ * the stream's current position.
+ *
+ * \param stream  Input stream containing data
+ * \param start   Offset of start of range to replace
+ * \param len     Length (in bytes) of range to replace
+ * \param ucs4    UCS4 (host endian) encoded replacement character
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream,
+		uint32_t start, size_t len, uint32_t ucs4)
+{
+	if (stream == NULL || stream->buffer == NULL)
+		return HUBBUB_BADPARM;
+
+	if (start >= stream->buffer_len || start < stream->cursor)
+		return HUBBUB_INVALID;
+
+	return stream->replace_range(stream, start, len, ucs4);
+}
+
+/**
+ * Read the document charset
+ *
+ * ::source is filled in whenever both arguments are non-NULL, even when
+ * the charset turns out to be unknown.
+ *
+ * \param stream  Input stream to query
+ * \param source  Pointer to location to receive charset source
+ * \return Pointer to charset name (constant; do not free), or NULL if unknown
+ */
+const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream,
+		hubbub_charset_source *source)
+{
+	if (stream == NULL || source == NULL)
+		return NULL;
+
+	*source = stream->encsrc;
+
+	return (*source == HUBBUB_CHARSET_UNKNOWN) ? NULL :
+			hubbub_mibenum_to_name(stream->mibenum);
+}
+
+/**
+ * Inform interested parties that the buffer has moved
+ *
+ * Calls every registered handler, in list order, with the stream's
+ * current buffer pointer and length.
+ *
+ * \param stream  Input stream
+ */
+void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream)
+{
+	hubbub_inputstream_bm_handler *entry;
+
+	if (stream == NULL)
+		return;
+
+	entry = stream->handlers;
+	while (entry != NULL) {
+		entry->handler(stream->buffer, stream->buffer_len, entry->pw);
+		entry = entry->next;
+	}
+}
+
diff --git a/src/input/inputstream.h b/src/input/inputstream.h
new file mode 100644
index 0000000..5325d14
--- /dev/null
+++ b/src/input/inputstream.h
@@ -0,0 +1,98 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_input_inputstream_h_
+#define hubbub_input_inputstream_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+typedef struct hubbub_inputstream hubbub_inputstream;
+
+/* EOF pseudo-character */
+/* Returned by peek once all buffered data has been consumed and the
+ * end of input has been flagged on the stream. */
+#define HUBBUB_INPUTSTREAM_EOF (0xFFFFFFFFU)
+/* Out-of-data indicator */
+/* Returned by peek when no complete character is currently available
+ * but the end of input has not been flagged yet. */
+#define HUBBUB_INPUTSTREAM_OOD (0xFFFFFFFEU)
+
+/* Type of input stream buffer moved handler function */
+/* Invoked with the stream's buffer pointer, the amount of data in the
+ * buffer, and the private data pointer held alongside the handler. */
+typedef void (*hubbub_inputstream_buffermoved)(const uint8_t *buffer,
+ size_t len, void *pw);
+
+/* Create an input stream */
+hubbub_inputstream *hubbub_inputstream_create(const char *enc,
+ const char *int_enc, hubbub_alloc alloc, void *pw);
+/* Destroy an input stream */
+void hubbub_inputstream_destroy(hubbub_inputstream *stream);
+
+/* Append data to an input stream */
+hubbub_error hubbub_inputstream_append(hubbub_inputstream *stream,
+ const uint8_t *data, size_t len);
+/* Insert data into stream at current location */
+hubbub_error hubbub_inputstream_insert(hubbub_inputstream *stream,
+ const uint8_t *data, size_t len);
+
+/* Look at the next character in the stream */
+uint32_t hubbub_inputstream_peek(hubbub_inputstream *stream);
+
+/* Retrieve the byte index and length of the current character in the stream */
+uint32_t hubbub_inputstream_cur_pos(hubbub_inputstream *stream, size_t *len);
+
+/* Convert the current character to lowercase */
+void hubbub_inputstream_lowercase(hubbub_inputstream *stream);
+
+/* Convert the current character to uppercase */
+void hubbub_inputstream_uppercase(hubbub_inputstream *stream);
+
+/* Advance the stream's current position */
+void hubbub_inputstream_advance(hubbub_inputstream *stream);
+
+/* Push a character back onto the stream */
+hubbub_error hubbub_inputstream_push_back(hubbub_inputstream *stream,
+ uint32_t character);
+
+/* Rewind the input stream by a number of bytes */
+hubbub_error hubbub_inputstream_rewind(hubbub_inputstream *stream, size_t n);
+
+/* Claim ownership of an input stream's buffer */
+hubbub_error hubbub_inputstream_claim_buffer(hubbub_inputstream *stream,
+ uint8_t **buffer, size_t *len);
+
+/* Register interest in buffer moved events */
+hubbub_error hubbub_inputstream_register_movehandler(
+ hubbub_inputstream *stream,
+ hubbub_inputstream_buffermoved handler, void *pw);
+
+/* Deregister interest in buffer moved events */
+hubbub_error hubbub_inputstream_deregister_movehandler(
+ hubbub_inputstream *stream,
+ hubbub_inputstream_buffermoved handler, void *pw);
+
+/* Case insensitively compare a pair of ranges in the input stream */
+int hubbub_inputstream_compare_range_ci(hubbub_inputstream *stream,
+ uint32_t r1, uint32_t r2, size_t len);
+
+/* Case sensitively compare a pair of ranges in the input stream */
+int hubbub_inputstream_compare_range_cs(hubbub_inputstream *stream,
+ uint32_t r1, uint32_t r2, size_t len);
+
+/* Case sensitively compare a range of input stream against an ASCII string */
+int hubbub_inputstream_compare_range_ascii(hubbub_inputstream *stream,
+ uint32_t off, size_t len, const char *data, size_t dlen);
+
+/* Replace a range of bytes in the input stream with a single character */
+hubbub_error hubbub_inputstream_replace_range(hubbub_inputstream *stream,
+ uint32_t start, size_t len, uint32_t ucs4);
+
+/* Read the document charset */
+const char *hubbub_inputstream_read_charset(hubbub_inputstream *stream,
+ hubbub_charset_source *source);
+
+#endif
+
diff --git a/src/input/streamimpl.h b/src/input/streamimpl.h
new file mode 100644
index 0000000..f44f6da
--- /dev/null
+++ b/src/input/streamimpl.h
@@ -0,0 +1,77 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_input_streamimpl_h_
+#define hubbub_input_streamimpl_h_
+
+#include <stdbool.h>
+
+#include <hubbub/types.h>
+
+#include "input/filter.h"
+#include "input/inputstream.h"
+
+typedef struct hubbub_inputstream_bm_handler hubbub_inputstream_bm_handler;
+
+/**
+ * Input stream definition: implementations extend this
+ *
+ * The first group of members is the shared stream state; the trailing
+ * function pointers are the operations a concrete stream implementation
+ * fills in when it creates a stream.
+ */
+struct hubbub_inputstream {
+ uint8_t *buffer; /**< Document buffer */
+ size_t buffer_len; /**< Amount of data in buffer */
+ size_t buffer_alloc; /**< Allocated size of buffer */
+
+ uint32_t cursor; /**< Byte offset of current position */
+
+ bool had_eof; /**< Whether EOF has been reached */
+
+ uint16_t mibenum; /**< MIB enum for charset, or 0 */
+ hubbub_charset_source encsrc; /**< Charset source */
+
+ hubbub_filter *input; /**< Charset conversion filter */
+
+ hubbub_inputstream_bm_handler *handlers; /**< List of buffer
+ * moved handlers */
+ hubbub_alloc alloc; /**< Memory (de)allocation function */
+ void *pw; /**< Client private data */
+
+ /* Implementation operations. Their signatures mirror the public
+ * hubbub_inputstream_* functions declared in input/inputstream.h. */
+
+ /* Release the stream and everything it owns */
+ void (*destroy)(hubbub_inputstream *stream);
+ /* Append data (in the document charset) to the end of the stream */
+ hubbub_error (*append)(hubbub_inputstream *stream,
+ const uint8_t *data, size_t len);
+ /* Insert data at the current position */
+ hubbub_error (*insert)(hubbub_inputstream *stream,
+ const uint8_t *data, size_t len);
+ /* Look at the next character without consuming it */
+ uint32_t (*peek)(hubbub_inputstream *stream);
+ /* Byte index and byte length of the current character */
+ uint32_t (*cur_pos)(hubbub_inputstream *stream, size_t *len);
+ /* Change the case of the current character in place */
+ void (*lowercase)(hubbub_inputstream *stream);
+ void (*uppercase)(hubbub_inputstream *stream);
+ /* Move the current position on to the next character */
+ void (*advance)(hubbub_inputstream *stream);
+ /* Step the current position back over the given character */
+ hubbub_error (*push_back)(hubbub_inputstream *stream,
+ uint32_t character);
+ /* Compare two ranges of the buffer: case insensitively ... */
+ int (*cmp_range_ci)(hubbub_inputstream *stream, uint32_t r1,
+ uint32_t r2, size_t len);
+ /* ... and case sensitively */
+ int (*cmp_range_cs)(hubbub_inputstream *stream, uint32_t r1,
+ uint32_t r2, size_t len);
+ /* Compare a range of the buffer against an ASCII string */
+ int (*cmp_range_ascii)(hubbub_inputstream *stream,
+ uint32_t off, size_t len,
+ const char *data, size_t dlen);
+ /* Replace a range of bytes with a single character */
+ hubbub_error (*replace_range)(hubbub_inputstream *stream,
+ uint32_t start, size_t len, uint32_t ucs4);
+};
+
+/**
+ * Input stream factory component definition
+ *
+ * uses_encoding reports whether an implementation handles the given
+ * internal encoding; create builds a stream instance for it and returns
+ * NULL on failure.
+ */
+typedef struct hubbub_streamhandler {
+ bool (*uses_encoding)(const char *int_enc);
+ hubbub_inputstream *(*create)(const char *enc, const char *int_enc,
+ hubbub_alloc alloc, void *pw);
+} hubbub_streamhandler;
+
+/* Notification of stream buffer moving */
+void hubbub_inputstream_buffer_moved(hubbub_inputstream *stream);
+
+#endif
diff --git a/src/input/utf8_stream.c b/src/input/utf8_stream.c
new file mode 100644
index 0000000..5d08993
--- /dev/null
+++ b/src/input/utf8_stream.c
@@ -0,0 +1,567 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "charset/detect.h"
+#include "input/streamimpl.h"
+#include "utils/utf8.h"
+#include "utils/utils.h"
+
+#define BUFFER_CHUNK (4096)
+
+static bool hubbub_utf8stream_uses_encoding(const char *int_enc);
+static hubbub_inputstream *hubbub_utf8stream_create(const char *enc,
+ const char *int_enc, hubbub_alloc alloc, void *pw);
+static void hubbub_utf8stream_destroy(hubbub_inputstream *stream);
+static hubbub_error hubbub_utf8stream_append(hubbub_inputstream *stream,
+ const uint8_t *data, size_t len);
+static hubbub_error hubbub_utf8stream_insert(hubbub_inputstream *stream,
+ const uint8_t *data, size_t len);
+static uint32_t hubbub_utf8stream_peek(hubbub_inputstream *stream);
+static uint32_t hubbub_utf8stream_cur_pos(hubbub_inputstream *stream,
+ size_t *len);
+static void hubbub_utf8stream_lowercase(hubbub_inputstream *stream);
+static void hubbub_utf8stream_uppercase(hubbub_inputstream *stream);
+static void hubbub_utf8stream_advance(hubbub_inputstream *stream);
+static hubbub_error hubbub_utf8stream_push_back(hubbub_inputstream *stream,
+ uint32_t character);
+static int hubbub_utf8stream_compare_range_ci(hubbub_inputstream *stream,
+ uint32_t r1, uint32_t r2, size_t len);
+static int hubbub_utf8stream_compare_range_cs(hubbub_inputstream *stream,
+ uint32_t r1, uint32_t r2, size_t len);
+static int hubbub_utf8stream_compare_range_ascii(hubbub_inputstream *stream,
+ uint32_t off, size_t len, const char *data, size_t dlen);
+static hubbub_error hubbub_utf8stream_replace_range(
+ hubbub_inputstream *stream,
+ uint32_t start, size_t len, uint32_t ucs4);
+
+/**
+ * Report whether this stream implementation handles an internal encoding
+ *
+ * The encoding name is resolved to its MIB enum and compared with the
+ * MIB enum of "UTF-8".
+ *
+ * \param int_enc  The desired encoding
+ * \return true if handled, false otherwise
+ */
+bool hubbub_utf8stream_uses_encoding(const char *int_enc)
+{
+	if (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) !=
+			hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")))
+		return false;
+
+	return true;
+}
+
+/**
+ * Create an input stream
+ *
+ * \param enc      Document charset, or NULL if unknown
+ * \param int_enc  Desired encoding of document (must resolve to UTF-8)
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to stream instance, or NULL on failure
+ *
+ * Every member of the returned stream is initialised. If ::enc is given
+ * but its name is not recognised, the stream is left with no charset
+ * (mibenum 0, source unknown), exactly as if ::enc had been NULL.
+ */
+hubbub_inputstream *hubbub_utf8stream_create(const char *enc,
+		const char *int_enc, hubbub_alloc alloc, void *pw)
+{
+	hubbub_inputstream *stream;
+
+	if (int_enc == NULL || alloc == NULL)
+		return NULL;
+
+	if (hubbub_mibenum_from_name(int_enc, strlen(int_enc)) !=
+			hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")))
+		return NULL;
+
+	stream = alloc(NULL, sizeof(hubbub_inputstream), pw);
+	if (stream == NULL)
+		return NULL;
+
+	stream->buffer = alloc(NULL, BUFFER_CHUNK, pw);
+	if (stream->buffer == NULL) {
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	stream->buffer_len = 0;
+	stream->buffer_alloc = BUFFER_CHUNK;
+
+	stream->cursor = 0;
+
+	stream->had_eof = false;
+
+	/* Start with no charset; overridden below if one is dictated.
+	 * This also covers an unrecognised ::enc, which previously left
+	 * encsrc uninitialised. */
+	stream->mibenum = 0;
+	stream->encsrc = HUBBUB_CHARSET_UNKNOWN;
+
+	/* The destroy/append/insert operations below use these, so make
+	 * sure they are valid as soon as the stream exists. */
+	stream->handlers = NULL;
+	stream->alloc = alloc;
+	stream->pw = pw;
+
+	stream->input = hubbub_filter_create(int_enc, alloc, pw);
+	if (stream->input == NULL) {
+		alloc(stream->buffer, 0, pw);
+		alloc(stream, 0, pw);
+		return NULL;
+	}
+
+	if (enc != NULL) {
+		hubbub_error error;
+		hubbub_filter_optparams params;
+
+		stream->mibenum = hubbub_mibenum_from_name(enc, strlen(enc));
+
+		if (stream->mibenum != 0) {
+			params.encoding.name = enc;
+
+			error = hubbub_filter_setopt(stream->input,
+					HUBBUB_FILTER_SET_ENCODING, &params);
+			/* HUBBUB_INVALID from the filter is tolerated here;
+			 * any other failure aborts creation. */
+			if (error != HUBBUB_OK && error != HUBBUB_INVALID) {
+				hubbub_filter_destroy(stream->input);
+				alloc(stream->buffer, 0, pw);
+				alloc(stream, 0, pw);
+				return NULL;
+			}
+
+			stream->encsrc = HUBBUB_CHARSET_DICTATED;
+		}
+	}
+
+	stream->destroy = hubbub_utf8stream_destroy;
+	stream->append = hubbub_utf8stream_append;
+	stream->insert = hubbub_utf8stream_insert;
+	stream->peek = hubbub_utf8stream_peek;
+	stream->cur_pos = hubbub_utf8stream_cur_pos;
+	stream->lowercase = hubbub_utf8stream_lowercase;
+	stream->uppercase = hubbub_utf8stream_uppercase;
+	stream->advance = hubbub_utf8stream_advance;
+	stream->push_back = hubbub_utf8stream_push_back;
+	stream->cmp_range_ci = hubbub_utf8stream_compare_range_ci;
+	stream->cmp_range_cs = hubbub_utf8stream_compare_range_cs;
+	stream->cmp_range_ascii = hubbub_utf8stream_compare_range_ascii;
+	stream->replace_range = hubbub_utf8stream_replace_range;
+
+	return stream;
+}
+
+/**
+ * Destroy an input stream
+ *
+ * Tears down the charset filter (if any), then releases the document
+ * buffer (if any) and finally the stream itself, using the stream's own
+ * allocation function and private data.
+ *
+ * \param stream  Input stream to destroy
+ */
+void hubbub_utf8stream_destroy(hubbub_inputstream *stream)
+{
+	/* Fetch these first: they live inside the object being freed */
+	hubbub_alloc release = stream->alloc;
+	void *client = stream->pw;
+
+	if (stream->input != NULL)
+		hubbub_filter_destroy(stream->input);
+
+	if (stream->buffer != NULL)
+		release(stream->buffer, 0, client);
+
+	release(stream, 0, client);
+}
+
+/**
+ * Append data to an input stream
+ *
+ * \param stream  Input stream to append data to
+ * \param data    Data to append (in document charset), or NULL to flag EOF
+ * \param len     Length, in bytes, of data
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * The data is passed through the stream's charset filter into the
+ * document buffer, which is grown a chunk at a time whenever the filter
+ * reports that it has run out of output space. When ::data is NULL the
+ * filter is run with zero-length input to flush anything it is still
+ * holding, and the stream is then marked as having reached EOF.
+ */
+hubbub_error hubbub_utf8stream_append(hubbub_inputstream *stream,
+		const uint8_t *data, size_t len)
+{
+	/* Stand-in input for the zero-length flush performed at EOF */
+	static const uint8_t no_data = 0;
+	const bool eof = (data == NULL);
+	hubbub_error error;
+	uint8_t *base;
+	size_t space;
+
+	if (eof) {
+		data = &no_data;
+		len = 0;
+	} else if (stream->mibenum == 0) {
+		/* Haven't found charset yet; detect it */
+		error = hubbub_charset_extract(&data, &len,
+				&stream->mibenum, &stream->encsrc);
+		if (error) {
+			return error;
+		}
+
+		/* We should always have a charset by now */
+		if (stream->mibenum == 0)
+			abort();
+	}
+
+	base = stream->buffer + stream->buffer_len;
+	space = stream->buffer_alloc - stream->buffer_len;
+
+	/* Convert to UTF-8, growing the buffer for as long as the filter
+	 * reports that it ran out of output space */
+	while ((error = hubbub_filter_process_chunk(stream->input,
+			&data, &len, &base, &space)) == HUBBUB_NOMEM) {
+		bool moved;
+		uint8_t *temp = stream->alloc(stream->buffer,
+				stream->buffer_alloc + BUFFER_CHUNK,
+				stream->pw);
+
+		if (temp == NULL) {
+			return HUBBUB_NOMEM;
+		}
+
+		moved = (temp != stream->buffer);
+
+		stream->buffer = temp;
+		/* Everything but the space the filter left unused is data */
+		stream->buffer_len = stream->buffer_alloc - space;
+		stream->buffer_alloc += BUFFER_CHUNK;
+
+		/* Resume output directly after the data written so far.
+		 * The free space is simply what lies between the data and
+		 * the end of the (enlarged) allocation; the old leftover
+		 * space must not be subtracted a second time. */
+		base = stream->buffer + stream->buffer_len;
+		space = stream->buffer_alloc - stream->buffer_len;
+
+		if (moved)
+			hubbub_inputstream_buffer_moved(stream);
+	}
+
+	/* NOTE(review): filter errors other than HUBBUB_NOMEM are not
+	 * reported to the caller; this matches the original behaviour. */
+
+	/* And fix up buffer length */
+	stream->buffer_len = stream->buffer_alloc - space;
+
+	if (eof)
+		stream->had_eof = true;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Insert data into stream at current location
+ *
+ * \param stream  Input stream to insert into
+ * \param data    Data to insert (UTF-8 encoded)
+ * \param len     Length, in bytes, of data
+ * \return HUBBUB_OK on success, HUBBUB_NOMEM if the buffer could not
+ *         be enlarged
+ *
+ * The new bytes are placed just before the next character to be read;
+ * everything from the cursor onwards is shifted up to make room.
+ */
+hubbub_error hubbub_utf8stream_insert(hubbub_inputstream *stream,
+		const uint8_t *data, size_t len)
+{
+	size_t space;
+	uint8_t *curpos;
+
+	/* Nothing to insert */
+	if (len == 0)
+		return HUBBUB_OK;
+
+	space = stream->buffer_alloc - stream->buffer_len;
+
+	/* Need to grow buffer, if there's insufficient space */
+	if (space <= len) {
+		/* Round the request up to a whole number of chunks and add
+		 * one spare chunk. The same figure is used for both the
+		 * allocation and the bookkeeping so they cannot disagree. */
+		size_t growth = ((len + BUFFER_CHUNK - 1) &
+				~((size_t) BUFFER_CHUNK - 1)) + BUFFER_CHUNK;
+		bool moved;
+		uint8_t *temp = stream->alloc(stream->buffer,
+				stream->buffer_alloc + growth,
+				stream->pw);
+
+		if (temp == NULL)
+			return HUBBUB_NOMEM;
+
+		moved = (temp != stream->buffer);
+
+		stream->buffer = temp;
+		stream->buffer_alloc += growth;
+
+		if (moved)
+			hubbub_inputstream_buffer_moved(stream);
+	}
+
+	/* Find the insertion point
+	 * (just before the next character to be read) */
+	curpos = stream->buffer + stream->cursor;
+
+	/* Move data above this point up */
+	memmove(curpos + len, curpos, stream->buffer_len - stream->cursor);
+
+	/* Copy new data into gap created by memmove */
+	memcpy(curpos, data, len);
+
+	/* Fix up buffer length */
+	stream->buffer_len += len;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Look at the next character in the stream without consuming it
+ *
+ * \param stream  Stream to look in
+ * \return UCS4 (host-endian) character code, or EOF or OOD.
+ *
+ * EOF/OOD is chosen by whether the end of input has been flagged, both
+ * when the cursor is at the end of the buffer and when the decoder needs
+ * more data. Any other decoder failure is reported as OOD.
+ */
+uint32_t hubbub_utf8stream_peek(hubbub_inputstream *stream)
+{
+	if (stream->cursor != stream->buffer_len) {
+		uint32_t ucs4;
+		size_t clen;
+		hubbub_error error = hubbub_utf8_to_ucs4(
+				stream->buffer + stream->cursor,
+				stream->buffer_len - stream->cursor,
+				&ucs4, &clen);
+
+		if (error == HUBBUB_OK)
+			return ucs4;
+
+		if (error != HUBBUB_NEEDDATA)
+			return HUBBUB_INPUTSTREAM_OOD;
+	}
+
+	/* Either nothing is buffered or the last character is incomplete */
+	if (stream->had_eof)
+		return HUBBUB_INPUTSTREAM_EOF;
+
+	return HUBBUB_INPUTSTREAM_OOD;
+}
+
+/**
+ * Retrieve the byte index and length of the current character in the stream
+ *
+ * \param stream  Stream to look in
+ * \param len     Pointer to location to receive byte length of character
+ * \return Byte index of current character from start of stream
+ *
+ * The result of the length lookup is not inspected; this implementation
+ * always returns the cursor.
+ */
+uint32_t hubbub_utf8stream_cur_pos(hubbub_inputstream *stream,
+		size_t *len)
+{
+	const uint8_t *current = stream->buffer + stream->cursor;
+
+	hubbub_utf8_char_byte_length(current, len);
+
+	return stream->cursor;
+}
+
+/**
+ * Convert the current character to lower case
+ *
+ * Only the ASCII letters 'A'-'Z' are affected; the byte at the cursor is
+ * rewritten in place.
+ *
+ * \param stream  Stream to look in
+ */
+void hubbub_utf8stream_lowercase(hubbub_inputstream *stream)
+{
+	uint8_t *c = stream->buffer + stream->cursor;
+
+	if (*c >= 'A' && *c <= 'Z')
+		*c += 0x0020;
+}
+
+/**
+ * Convert the current character to upper case
+ *
+ * Only the ASCII letters 'a'-'z' are affected; the byte at the cursor is
+ * rewritten in place.
+ *
+ * \param stream  Stream to look in
+ */
+void hubbub_utf8stream_uppercase(hubbub_inputstream *stream)
+{
+	uint8_t *c = stream->buffer + stream->cursor;
+
+	if (*c >= 'a' && *c <= 'z')
+		*c -= 0x0020;
+}
+
+/**
+ * Advance the stream's current position
+ *
+ * The cursor is moved to the offset of the next character; if that
+ * offset cannot be determined the cursor is left where it is.
+ *
+ * \param stream  The stream whose position to advance
+ */
+void hubbub_utf8stream_advance(hubbub_inputstream *stream)
+{
+	uint32_t next;
+
+	if (hubbub_utf8_next(stream->buffer, stream->buffer_len,
+			stream->cursor, &next) != HUBBUB_OK)
+		return;
+
+	stream->cursor = next;
+}
+
+/**
+ * Push a character back onto the stream
+ *
+ * \param stream     Stream to push back to
+ * \param character  UCS4 (host-endian) codepoint to push back
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * The stream's data is never modified. Instead, the character that sits
+ * immediately before the current position is compared, byte for byte,
+ * with the UTF-8 encoding of ::character. On a match the stream pointer
+ * steps back over it; otherwise HUBBUB_INVALID is returned and the
+ * stream pointer is left untouched.
+ */
+hubbub_error hubbub_utf8stream_push_back(hubbub_inputstream *stream,
+		uint32_t character)
+{
+	uint8_t encoded[6];
+	size_t enclen;
+	uint32_t prev;
+	hubbub_error error;
+
+	/* Locate the previous character, then encode the one pushed back */
+	error = hubbub_utf8_prev(stream->buffer, stream->cursor, &prev);
+	if (error == HUBBUB_OK)
+		error = hubbub_utf8_from_ucs4(character, encoded, &enclen);
+
+	if (error != HUBBUB_OK)
+		return error;
+
+	if ((stream->cursor - prev) == enclen &&
+			memcmp(stream->buffer + prev, encoded, enclen) == 0) {
+		stream->cursor = prev;
+		return HUBBUB_OK;
+	}
+
+	return HUBBUB_INVALID;
+}
+
+/**
+ * Case insensitively compare a pair of ranges in the input stream
+ *
+ * \param stream  Input stream to look in
+ * \param r1      Offset of start of first range
+ * \param r2      Offset of start of second range
+ * \param len     Byte length of ranges
+ * \return 0 if ranges match, non-zero otherwise
+ *
+ * Only the ASCII letters are case-folded; every other byte (including
+ * NUL and the bytes of multi-byte UTF-8 sequences) must match exactly.
+ * All ::len bytes are always examined, independent of the C locale.
+ */
+int hubbub_utf8stream_compare_range_ci(hubbub_inputstream *stream,
+		uint32_t r1, uint32_t r2, size_t len)
+{
+	const uint8_t *a = stream->buffer + r1;
+	const uint8_t *b = stream->buffer + r2;
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		uint8_t ca = a[i];
+		uint8_t cb = b[i];
+
+		/* Fold ASCII upper case onto lower case */
+		if ('A' <= ca && ca <= 'Z')
+			ca += 0x20;
+		if ('A' <= cb && cb <= 'Z')
+			cb += 0x20;
+
+		if (ca != cb)
+			return (int) ca - (int) cb;
+	}
+
+	return 0;
+}
+
+/**
+ * Case sensitively compare a pair of ranges in the input stream
+ *
+ * \param stream  Input stream to look in
+ * \param r1      Offset of start of first range
+ * \param r2      Offset of start of second range
+ * \param len     Byte length of ranges
+ * \return 0 if ranges match, non-zero otherwise
+ *
+ * The ranges are compared as raw bytes, so a NUL byte inside a range
+ * does not cut the comparison short.
+ */
+int hubbub_utf8stream_compare_range_cs(hubbub_inputstream *stream,
+		uint32_t r1, uint32_t r2, size_t len)
+{
+	return memcmp(stream->buffer + r1, stream->buffer + r2, len);
+}
+
+/**
+ * Case sensitively compare a range of input stream against an ASCII string
+ *
+ * \param stream  Input stream to look in
+ * \param off     Offset of range start
+ * \param len     Byte length of range
+ * \param data    Comparison string
+ * \param dlen    Byte length of comparison string
+ * \return 0 if match, non-zero otherwise
+ *
+ * Ranges of differing length never match; the contents are only looked
+ * at when the two lengths agree.
+ */
+int hubbub_utf8stream_compare_range_ascii(hubbub_inputstream *stream,
+		uint32_t off, size_t len, const char *data, size_t dlen)
+{
+	if (len == dlen) {
+		const char *range = (const char *) (stream->buffer + off);
+
+		return strncmp(range, data, len);
+	}
+
+	/* Lengths differ, so the strings do too (value is arbitrary) */
+	return 1;
+}
+
+/**
+ * Replace a range of bytes in the input stream with a single character
+ *
+ * \param stream  Input stream containing data
+ * \param start   Offset of start of range to replace
+ * \param len     Length (in bytes) of range to replace
+ * \param ucs4    UCS4 (host endian) encoded replacement character
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_INVALID if the range does not lie within the buffered
+ *         data,
+ *         HUBBUB_NOMEM if the buffer could not be enlarged,
+ *         or the error reported when ::ucs4 cannot be encoded
+ *
+ * The data following the range is shifted so that it sits directly
+ * after the UTF-8 encoding of the replacement character.
+ */
+hubbub_error hubbub_utf8stream_replace_range(hubbub_inputstream *stream,
+		uint32_t start, size_t len, uint32_t ucs4)
+{
+	uint8_t buf[6];
+	size_t replen;
+	size_t tail;
+	hubbub_error error;
+
+	/* The range must lie wholly inside the buffered data, otherwise
+	 * the length of the tail computed below would underflow */
+	if (start > stream->buffer_len || len > stream->buffer_len - start)
+		return HUBBUB_INVALID;
+
+	/* Get UTF8 version of replacement character */
+	error = hubbub_utf8_from_ucs4(ucs4, buf, &replen);
+	if (error)
+		return error;
+
+	if (replen > len) {
+		/* The replacement is longer than the range it replaces */
+		size_t extra = replen - len;
+
+		if (stream->buffer_len + extra >= stream->buffer_alloc) {
+			/* Need more buffer space: round up to whole chunks
+			 * plus one spare, and record exactly what was
+			 * allocated */
+			size_t growth = ((extra + BUFFER_CHUNK - 1) &
+					~((size_t) BUFFER_CHUNK - 1)) +
+					BUFFER_CHUNK;
+			bool moved;
+			uint8_t *temp = stream->alloc(stream->buffer,
+					stream->buffer_alloc + growth,
+					stream->pw);
+
+			if (temp == NULL)
+				return HUBBUB_NOMEM;
+
+			moved = (temp != stream->buffer);
+
+			stream->buffer = temp;
+			stream->buffer_alloc += growth;
+
+			if (moved)
+				hubbub_inputstream_buffer_moved(stream);
+		}
+	}
+
+	/* Move subsequent input to correct location */
+	tail = stream->buffer_len - (start + len);
+	memmove(stream->buffer + start + replen,
+			stream->buffer + start + len, tail);
+
+	/* And fill the gap with the replacement character */
+	memcpy(stream->buffer + start, buf, replen);
+
+	/* Finally, update length */
+	stream->buffer_len = stream->buffer_len - len + replen;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Stream handler entry for the UTF-8 input stream implementation:
+ * its encoding test followed by its stream constructor.
+ */
+hubbub_streamhandler utf8stream = {
+ hubbub_utf8stream_uses_encoding,
+ hubbub_utf8stream_create
+};
diff --git a/src/parser.c b/src/parser.c
new file mode 100644
index 0000000..e7a4fe8
--- /dev/null
+++ b/src/parser.c
@@ -0,0 +1,237 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <hubbub/parser.h>
+
+#include "input/inputstream.h"
+#include "tokeniser/tokeniser.h"
+
+/**
+ * Hubbub parser object
+ *
+ * Ties together the input stream and the tokeniser that reads from it,
+ * and remembers the allocator and client data the parser was created
+ * with so that it can release itself on destruction.
+ */
+struct hubbub_parser {
+ hubbub_inputstream *stream; /**< Input stream instance */
+ hubbub_tokeniser *tok; /**< Tokeniser instance */
+
+ hubbub_alloc alloc; /**< Memory (de)allocation function */
+ void *pw; /**< Client data */
+};
+
+/**
+ * Create a hubbub parser
+ *
+ * \param enc      Source document encoding, or NULL to autodetect
+ * \param int_enc  Desired encoding of document
+ * \param alloc    Memory (de)allocation function
+ * \param pw       Pointer to client-specific private data (may be NULL)
+ * \return Pointer to parser instance, or NULL on error
+ *
+ * The parser owns an input stream and a tokeniser reading from it. If
+ * either cannot be created, whatever was built so far is released again.
+ */
+hubbub_parser *hubbub_parser_create(const char *enc, const char *int_enc,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_parser *parser;
+
+	if (alloc == NULL)
+		return NULL;
+
+	parser = alloc(NULL, sizeof(hubbub_parser), pw);
+	if (parser == NULL)
+		return NULL;
+
+	parser->stream = hubbub_inputstream_create(enc, int_enc, alloc, pw);
+	if (parser->stream != NULL) {
+		parser->tok = hubbub_tokeniser_create(parser->stream,
+				alloc, pw);
+		if (parser->tok != NULL) {
+			parser->alloc = alloc;
+			parser->pw = pw;
+
+			return parser;
+		}
+
+		/* No tokeniser: the stream is of no use on its own */
+		hubbub_inputstream_destroy(parser->stream);
+	}
+
+	alloc(parser, 0, pw);
+
+	return NULL;
+}
+
+/**
+ * Destroy a hubbub parser
+ *
+ * The tokeniser is destroyed first, then the input stream, and finally
+ * the parser object itself is released.
+ *
+ * \param parser  Parser instance to destroy (NULL is ignored)
+ */
+void hubbub_parser_destroy(hubbub_parser *parser)
+{
+	if (parser != NULL) {
+		hubbub_tokeniser_destroy(parser->tok);
+		hubbub_inputstream_destroy(parser->stream);
+
+		parser->alloc(parser, 0, parser->pw);
+	}
+}
+
+/**
+ * Configure a hubbub parser
+ *
+ * \param parser  Parser instance to configure
+ * \param type    Option to set
+ * \param params  Option-specific parameters
+ * \return HUBBUB_OK on success, HUBBUB_BADPARM if an argument is NULL
+ *         or ::type is not a known option, otherwise the tokeniser's
+ *         result
+ *
+ * Each parser option is translated to the corresponding tokeniser
+ * option and forwarded, together with ::params, to the tokeniser.
+ */
+hubbub_error hubbub_parser_setopt(hubbub_parser *parser,
+		hubbub_parser_opttype type,
+		hubbub_parser_optparams *params)
+{
+	hubbub_tokeniser_opttype toktype;
+
+	if (parser == NULL || params == NULL)
+		return HUBBUB_BADPARM;
+
+	switch (type) {
+	case HUBBUB_PARSER_TOKEN_HANDLER:
+		toktype = HUBBUB_TOKENISER_TOKEN_HANDLER;
+		break;
+	case HUBBUB_PARSER_BUFFER_HANDLER:
+		toktype = HUBBUB_TOKENISER_BUFFER_HANDLER;
+		break;
+	case HUBBUB_PARSER_ERROR_HANDLER:
+		/* Previously mapped to the buffer handler option, which
+		 * would have overwritten the buffer handler instead.
+		 * NOTE(review): assumes tokeniser.h declares
+		 * HUBBUB_TOKENISER_ERROR_HANDLER -- confirm. */
+		toktype = HUBBUB_TOKENISER_ERROR_HANDLER;
+		break;
+	case HUBBUB_PARSER_CONTENT_MODEL:
+		toktype = HUBBUB_TOKENISER_CONTENT_MODEL;
+		break;
+	default:
+		/* Unknown option: never forward an uninitialised type */
+		return HUBBUB_BADPARM;
+	}
+
+	return hubbub_tokeniser_setopt(parser->tok, toktype,
+			(hubbub_tokeniser_optparams *) params);
+}
+
+/**
+ * Pass a chunk of data to a hubbub parser for parsing
+ *
+ * \param parser  Parser instance to use
+ * \param data    Data to parse (encoded in the input charset)
+ * \param len     Length, in bytes, of data
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * The data is appended to the input stream and, if that succeeded, the
+ * tokeniser is run.
+ */
+hubbub_error hubbub_parser_parse_chunk(hubbub_parser *parser,
+		uint8_t *data, size_t len)
+{
+	hubbub_error error;
+
+	if (parser == NULL || data == NULL)
+		return HUBBUB_BADPARM;
+
+	error = hubbub_inputstream_append(parser->stream, data, len);
+	if (error == HUBBUB_OK)
+		error = hubbub_tokeniser_run(parser->tok);
+
+	return error;
+}
+
+/**
+ * Pass a chunk of extraneous data to a hubbub parser for parsing
+ *
+ * \param parser  Parser instance to use
+ * \param data    Data to parse (encoded in internal charset)
+ * \param len     Length, in bytes, of data
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * The data is inserted into the input stream at its current location
+ * and, if that succeeded, the tokeniser is run.
+ */
+hubbub_error hubbub_parser_parse_extraneous_chunk(hubbub_parser *parser,
+		uint8_t *data, size_t len)
+{
+	hubbub_error error;
+
+	/** \todo In some cases, we don't actually want script-inserted
+	 * data to be parsed until later. We'll need some way of flagging
+	 * this through the public API, and the inputstream API will need
+	 * some way of marking the insertion point so that, when the
+	 * tokeniser is run, only the inserted chunk is parsed. */
+
+	if (parser == NULL || data == NULL)
+		return HUBBUB_BADPARM;
+
+	error = hubbub_inputstream_insert(parser->stream, data, len);
+	if (error == HUBBUB_OK)
+		error = hubbub_tokeniser_run(parser->tok);
+
+	return error;
+}
+
+/**
+ * Inform the parser that the last chunk of data has been parsed
+ *
+ * \param parser  Parser to inform
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ *
+ * End of input is signalled by appending a NULL chunk to the input
+ * stream; the tokeniser is then run once more.
+ */
+hubbub_error hubbub_parser_completed(hubbub_parser *parser)
+{
+	hubbub_error error;
+
+	if (parser == NULL)
+		return HUBBUB_BADPARM;
+
+	error = hubbub_inputstream_append(parser->stream, NULL, 0);
+	if (error == HUBBUB_OK)
+		error = hubbub_tokeniser_run(parser->tok);
+
+	return error;
+}
+
+/**
+ * Read the document charset
+ *
+ * \param parser  Parser instance to query
+ * \param source  Pointer to location to receive charset source
+ * \return Pointer to charset name (constant; do not free), or NULL if
+ *         unknown or if either argument is NULL
+ */
+const char *hubbub_parser_read_charset(hubbub_parser *parser,
+		hubbub_charset_source *source)
+{
+	if (parser != NULL && source != NULL)
+		return hubbub_inputstream_read_charset(parser->stream,
+				source);
+
+	return NULL;
+}
+
+/**
+ * Claim ownership of the document buffer
+ *
+ * \param parser  Parser whose buffer to claim
+ * \param buffer  Pointer to location to receive buffer pointer
+ * \param len     Pointer to location to receive byte length of buffer
+ * \return HUBBUB_OK on success, appropriate error otherwise.
+ *
+ * After a client has claimed the buffer, the parser gives up all
+ * ownership of it and invalidates any internal references it may hold
+ * to it. Consequently, destroying the parser is the only parser call
+ * that may follow this one.
+ */
+hubbub_error hubbub_parser_claim_buffer(hubbub_parser *parser,
+		uint8_t **buffer, size_t *len)
+{
+	if (parser != NULL && buffer != NULL && len != NULL)
+		return hubbub_inputstream_claim_buffer(parser->stream,
+				buffer, len);
+
+	return HUBBUB_BADPARM;
+}
diff --git a/src/tokeniser/Makefile b/src/tokeniser/Makefile
new file mode 100644
index 0000000..539625f
--- /dev/null
+++ b/src/tokeniser/Makefile
@@ -0,0 +1,53 @@
+# Makefile for libhubbub
+#
+# Toolchain is exported by top-level makefile
+#
+# Top-level makefile also exports the following variables:
+#
+# COMPONENT Name of component
+# EXPORT Absolute path of export directory
+# TOP Absolute path of source tree root
+#
+# The top-level makefile requires the following targets to exist:
+#
+# clean Clean source tree
+# debug Create a debug binary
+# distclean Fully clean source tree, back to pristine condition
+# export Export distributable components to ${EXPORT}
+# release Create a release binary
+# setup Perform any setup required prior to compilation
+# test Execute any test cases
+
+# Manipulate include paths
+# (adds this directory to the include search path inherited via CFLAGS)
+CFLAGS += -I$(CURDIR)
+
+# Objects
+# (base names only; the output directory and .o suffix are added below)
+OBJS = entities tokeniser
+
+.PHONY: clean debug distclean export release setup test
+
+# Targets
+# release and debug build the same objects into ../Release and ../Debug
+release: $(addprefix ../Release/, $(addsuffix .o, $(OBJS)))
+
+debug: $(addprefix ../Debug/, $(addsuffix .o, $(OBJS)))
+
+# Remove this component's objects from both output directories.
+# The leading '-' ignores failures; '@' keeps the command from being echoed.
+clean:
+ -@${RM} ${RMFLAGS} $(addprefix ../Release/, $(addsuffix .o, ${OBJS}))
+ -@${RM} ${RMFLAGS} $(addprefix ../Debug/, $(addsuffix .o, ${OBJS}))
+
+# The remaining required targets have nothing to do for this component
+distclean:
+
+setup:
+
+export:
+
+test:
+
+# Pattern rules
+# Neither rule creates its output directory.
+# NOTE(review): presumably ../Release and ../Debug are created elsewhere
+# (e.g. by a setup step in a parent makefile) -- confirm.
+
+# Release objects are compiled with assertions disabled (-DNDEBUG)
+../Release/%.o: %.c
+ @${ECHO} ${ECHOFLAGS} "==> $<"
+ @${CC} -c ${CFLAGS} -DNDEBUG -o $@ $<
+
+# Debug objects are compiled with debugging information (-g)
+../Debug/%.o: %.c
+ @${ECHO} ${ECHOFLAGS} "==> $<"
+ @${CC} -c -g ${CFLAGS} -o $@ $<
diff --git a/src/tokeniser/entities.c b/src/tokeniser/entities.c
new file mode 100644
index 0000000..8a9acf5
--- /dev/null
+++ b/src/tokeniser/entities.c
@@ -0,0 +1,363 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include "utils/dict.h"
+#include "utils/utils.h"
+#include "tokeniser/entities.h"
+
+typedef struct hubbub_entity hubbub_entity;
+
+static const struct hubbub_entity {
+ const char *name;
+ uint32_t ucs4;
+} entities[] = {
+ { "AElig", 0x00C6 },
+ { "Aacute", 0x00C1 },
+ { "Acirc", 0x00C2 },
+ { "Agrave", 0x00C0 },
+ { "Alpha", 0x0391 },
+ { "Aring", 0x00C5 },
+ { "Atilde", 0x00C3 },
+ { "Auml", 0x00C4 },
+ { "Beta", 0x0392 },
+ { "Ccedil", 0x00C7 },
+ { "Chi", 0x03A7 },
+ { "Dagger", 0x2021 },
+ { "Delta", 0x0394 },
+ { "ETH", 0x00D0 },
+ { "Eacute", 0x00C9 },
+ { "Ecirc", 0x00CA },
+ { "Egrave", 0x00C8 },
+ { "Epsilon", 0x0395 },
+ { "Eta", 0x0397 },
+ { "Euml", 0x00CB },
+ { "Gamma", 0x0393 },
+ { "Iacute", 0x00CD },
+ { "Icirc", 0x00CE },
+ { "Igrave", 0x00CC },
+ { "Iota", 0x0399 },
+ { "Iuml", 0x00CF },
+ { "Kappa", 0x039A },
+ { "Lambda", 0x039B },
+ { "Mu", 0x039C },
+ { "Ntilde", 0x00D1 },
+ { "Nu", 0x039D },
+ { "OElig", 0x0152 },
+ { "Oacute", 0x00D3 },
+ { "Ocirc", 0x00D4 },
+ { "Ograve", 0x00D2 },
+ { "Omega", 0x03A9 },
+ { "Omicron", 0x039F },
+ { "Oslash", 0x00D8 },
+ { "Otilde", 0x00D5 },
+ { "Ouml", 0x00D6 },
+ { "Phi", 0x03A6 },
+ { "Pi", 0x03A0 },
+ { "Prime", 0x2033 },
+ { "Psi", 0x03A8 },
+ { "Rho", 0x03A1 },
+ { "Scaron", 0x0160 },
+ { "Sigma", 0x03A3 },
+ { "THORN", 0x00DE },
+ { "Tau", 0x03A4 },
+ { "Theta", 0x0398 },
+ { "Uacute", 0x00DA },
+ { "Ucirc", 0x00DB },
+ { "Ugrave", 0x00D9 },
+ { "Upsilon", 0x03A5 },
+ { "Uuml", 0x00DC },
+ { "Xi", 0x039E },
+ { "Yacute", 0x00DD },
+ { "Yuml", 0x0178 },
+ { "Zeta", 0x0396 },
+ { "aacute", 0x00E1 },
+ { "acirc", 0x00E2 },
+ { "acute", 0x00B4 },
+ { "aelig", 0x00E6 },
+ { "agrave", 0x00E0 },
+ { "alefsym", 0x2135 },
+ { "alpha", 0x03B1 },
+ { "amp", 0x0026 },
+ { "AMP", 0x0026 },
+ { "and", 0x2227 },
+ { "ang", 0x2220 },
+ { "apos", 0x0027 },
+ { "aring", 0x00E5 },
+ { "asymp", 0x2248 },
+ { "atilde", 0x00E3 },
+ { "auml", 0x00E4 },
+ { "bdquo", 0x201E },
+ { "beta", 0x03B2 },
+ { "brvbar", 0x00A6 },
+ { "bull", 0x2022 },
+ { "cap", 0x2229 },
+ { "ccedil", 0x00E7 },
+ { "cedil", 0x00B8 },
+ { "cent", 0x00A2 },
+ { "chi", 0x03C7 },
+ { "circ", 0x02C6 },
+ { "clubs", 0x2663 },
+ { "cong", 0x2245 },
+ { "copy", 0x00A9 },
+ { "COPY", 0x00A9 },
+ { "crarr", 0x21B5 },
+ { "cup", 0x222A },
+ { "curren", 0x00A4 },
+ { "dArr", 0x21D3 },
+ { "dagger", 0x2020 },
+ { "darr", 0x2193 },
+ { "deg", 0x00B0 },
+ { "delta", 0x03B4 },
+ { "diams", 0x2666 },
+ { "divide", 0x00F7 },
+ { "eacute", 0x00E9 },
+ { "ecirc", 0x00EA },
+ { "egrave", 0x00E8 },
+ { "empty", 0x2205 },
+ { "emsp", 0x2003 },
+ { "ensp", 0x2002 },
+ { "epsilon", 0x03B5 },
+ { "equiv", 0x2261 },
+ { "eta", 0x03B7 },
+ { "eth", 0x00F0 },
+ { "euml", 0x00EB },
+ { "euro", 0x20AC },
+ { "exist", 0x2203 },
+ { "fnof", 0x0192 },
+ { "forall", 0x2200 },
+ { "frac12", 0x00BD },
+ { "frac14", 0x00BC },
+ { "frac34", 0x00BE },
+ { "frasl", 0x2044 },
+ { "gamma", 0x03B3 },
+ { "ge", 0x2265 },
+ { "gt", 0x003E },
+ { "GT", 0x003E },
+ { "hArr", 0x21D4 },
+ { "harr", 0x2194 },
+ { "hearts", 0x2665 },
+ { "hellip", 0x2026 },
+ { "iacute", 0x00ED },
+ { "icirc", 0x00EE },
+ { "iexcl", 0x00A1 },
+ { "igrave", 0x00EC },
+ { "image", 0x2111 },
+ { "infin", 0x221E },
+ { "int", 0x222B },
+ { "iota", 0x03B9 },
+ { "iquest", 0x00BF },
+ { "isin", 0x2208 },
+ { "iuml", 0x00EF },
+ { "kappa", 0x03BA },
+ { "lArr", 0x21D0 },
+ { "lambda", 0x03BB },
+ { "lang", 0x2329 },
+ { "laquo", 0x00AB },
+ { "larr", 0x2190 },
+ { "lceil", 0x2308 },
+ { "ldquo", 0x201C },
+ { "le", 0x2264 },
+ { "lfloor", 0x230A },
+ { "lowast", 0x2217 },
+ { "loz", 0x25CA },
+ { "lrm", 0x200E },
+ { "lsaquo", 0x2039 },
+ { "lsquo", 0x2018 },
+ { "lt", 0x003C },
+ { "LT", 0x003C },
+ { "macr", 0x00AF },
+ { "mdash", 0x2014 },
+ { "micro", 0x00B5 },
+ { "middot", 0x00B7 },
+ { "minus", 0x2212 },
+ { "mu", 0x03BC },
+ { "nabla", 0x2207 },
+ { "nbsp", 0x00A0 },
+ { "ndash", 0x2013 },
+ { "ne", 0x2260 },
+ { "ni", 0x220B },
+ { "not", 0x00AC },
+ { "notin", 0x2209 },
+ { "nsub", 0x2284 },
+ { "ntilde", 0x00F1 },
+ { "nu", 0x03BD },
+ { "oacute", 0x00F3 },
+ { "ocirc", 0x00F4 },
+ { "oelig", 0x0153 },
+ { "ograve", 0x00F2 },
+ { "oline", 0x203E },
+ { "omega", 0x03C9 },
+ { "omicron", 0x03BF },
+ { "oplus", 0x2295 },
+ { "or", 0x2228 },
+ { "ordf", 0x00AA },
+ { "ordm", 0x00BA },
+ { "oslash", 0x00F8 },
+ { "otilde", 0x00F5 },
+ { "otimes", 0x2297 },
+ { "ouml", 0x00F6 },
+ { "para", 0x00B6 },
+ { "part", 0x2202 },
+ { "permil", 0x2030 },
+ { "perp", 0x22A5 },
+ { "phi", 0x03C6 },
+ { "pi", 0x03C0 },
+ { "piv", 0x03D6 },
+ { "plusmn", 0x00B1 },
+ { "pound", 0x00A3 },
+ { "prime", 0x2032 },
+ { "prod", 0x220F },
+ { "prop", 0x221D },
+ { "psi", 0x03C8 },
+ { "quot", 0x0022 },
+ { "QUOT", 0x0022 },
+ { "rArr", 0x21D2 },
+ { "radic", 0x221A },
+ { "rang", 0x232A },
+ { "raquo", 0x00BB },
+ { "rarr", 0x2192 },
+ { "rceil", 0x2309 },
+ { "rdquo", 0x201D },
+ { "real", 0x211C },
+ { "reg", 0x00AE },
+ { "REG", 0x00AE },
+ { "rfloor", 0x230B },
+ { "rho", 0x03C1 },
+ { "rlm", 0x200F },
+ { "rsaquo", 0x203A },
+ { "rsquo", 0x2019 },
+ { "sbquo", 0x201A },
+ { "scaron", 0x0161 },
+ { "sdot", 0x22C5 },
+ { "sect", 0x00A7 },
+ { "shy", 0x00AD },
+ { "sigma", 0x03C3 },
+ { "sigmaf", 0x03C2 },
+ { "sim", 0x223C },
+ { "spades", 0x2660 },
+ { "sub", 0x2282 },
+ { "sube", 0x2286 },
+ { "sum", 0x2211 },
+ { "sup", 0x2283 },
+ { "sup1", 0x00B9 },
+ { "sup2", 0x00B2 },
+ { "sup3", 0x00B3 },
+ { "supe", 0x2287 },
+ { "szlig", 0x00DF },
+ { "tau", 0x03C4 },
+ { "there4", 0x2234 },
+ { "theta", 0x03B8 },
+ { "thetasym", 0x03D1 },
+ { "thinsp", 0x2009 },
+ { "thorn", 0x00FE },
+ { "tilde", 0x02DC },
+ { "times", 0x00D7 },
+ { "trade", 0x2122 },
+ { "uArr", 0x21D1 },
+ { "uacute", 0x00FA },
+ { "uarr", 0x2191 },
+ { "ucirc", 0x00FB },
+ { "ugrave", 0x00F9 },
+ { "uml", 0x00A8 },
+ { "upsih", 0x03D2 },
+ { "upsilon", 0x03C5 },
+ { "uuml", 0x00FC },
+ { "weierp", 0x2118 },
+ { "xi", 0x03BE },
+ { "yacute", 0x00FD },
+ { "yen", 0x00A5 },
+ { "yuml", 0x00FF },
+ { "zeta", 0x03B6 },
+ { "zwj", 0x200D },
+ { "zwnj", 0x200C },
+};
+
+static hubbub_dict *dict;
+
+/**
+ * Create the entities dictionary
+ *
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if ::alloc is NULL,
+ *         HUBBUB_NOMEM if the dictionary could not be created,
+ *         otherwise the error reported by hubbub_dict_insert
+ *
+ * Every entry of the static entities table is inserted into a freshly
+ * created dictionary, keyed by entity name. If any insertion fails, the
+ * partially populated dictionary is destroyed and the module-level
+ * pointer is reset to NULL, so that a later search or destroy call never
+ * sees a pointer to freed memory.
+ */
+hubbub_error hubbub_entities_create(hubbub_alloc alloc, void *pw)
+{
+	hubbub_error error;
+	size_t i;
+
+	if (alloc == NULL)
+		return HUBBUB_BADPARM;
+
+	dict = hubbub_dict_create(alloc, pw);
+	if (dict == NULL)
+		return HUBBUB_NOMEM;
+
+	for (i = 0; i < sizeof(entities) / sizeof(entities[0]); i++) {
+		/* The dictionary stores a pointer to the table entry itself */
+		error = hubbub_dict_insert(dict, entities[i].name,
+				&entities[i]);
+		if (error != HUBBUB_OK) {
+			hubbub_dict_destroy(dict);
+			/* Don't leave a dangling pointer behind */
+			dict = NULL;
+			return error;
+		}
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Destroy the entities dictionary
+ *
+ * \param alloc  Memory (de)allocation function (unused)
+ * \param pw     Pointer to client-specific private data (unused)
+ *
+ * Does nothing if no dictionary currently exists, so calling this before
+ * a create, or twice in a row, is harmless.
+ */
+void hubbub_entities_destroy(hubbub_alloc alloc, void *pw)
+{
+	UNUSED(alloc);
+	UNUSED(pw);
+
+	/* Nothing to do if there is no dictionary */
+	if (dict == NULL)
+		return;
+
+	hubbub_dict_destroy(dict);
+
+	/* Forget the dictionary so it can't be used or destroyed again */
+	dict = NULL;
+}
+
+/**
+ * Advance an incremental entity-name lookup by one character
+ *
+ * \param c        Next character of the candidate entity name
+ * \param result   Pointer to location to receive the UCS4 codepoint
+ * \param context  Pointer to location holding the search context
+ * \return HUBBUB_OK if the characters fed so far name an entity,
+ *         HUBBUB_NEEDDATA if more steps are required,
+ *         HUBBUB_INVALID if nothing matches,
+ *         HUBBUB_BADPARM if ::result or ::context is NULL
+ *
+ * The value pointed to by ::context should be NULL for the first call.
+ * Thereafter, pass in the same value as returned by the previous call.
+ * The context is opaque to the caller and should not be inspected.
+ *
+ * Whenever the step does not yield HUBBUB_OK, the location pointed to by
+ * ::result is set to U+FFFD.
+ */
+hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result,
+		void **context)
+{
+	const void *match;
+	hubbub_error status;
+
+	if (!result || !context)
+		return HUBBUB_BADPARM;
+
+	status = hubbub_dict_search_step(dict, c, &match, context);
+
+	if (status == HUBBUB_OK) {
+		/* Dictionary values are pointers into the entities table */
+		const hubbub_entity *entity = match;
+
+		*result = entity->ucs4;
+	} else {
+		*result = 0xFFFD;
+	}
+
+	return status;
+}
diff --git a/src/tokeniser/entities.h b/src/tokeniser/entities.h
new file mode 100644
index 0000000..efd1987
--- /dev/null
+++ b/src/tokeniser/entities.h
@@ -0,0 +1,25 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_tokeniser_entities_h_
+#define hubbub_tokeniser_entities_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+
+/* Create the entities dictionary.
+ * Returns HUBBUB_OK on success, appropriate error otherwise. */
+hubbub_error hubbub_entities_create(hubbub_alloc alloc, void *pw);
+/* Destroy the entities dictionary. */
+void hubbub_entities_destroy(hubbub_alloc alloc, void *pw);
+
+/* Step-wise search for an entity in the dictionary.
+ * Feed one character per call; *context must be NULL on the first call and
+ * passed back unchanged afterwards. Returns HUBBUB_OK when a key is found
+ * (its codepoint is written to *result), HUBBUB_NEEDDATA if more steps are
+ * required, HUBBUB_INVALID if nothing matches. */
+hubbub_error hubbub_entities_search_step(uint8_t c, uint32_t *result,
+ void **context);
+
+#endif
diff --git a/src/tokeniser/tokeniser.c b/src/tokeniser/tokeniser.c
new file mode 100644
index 0000000..f8b6bb3
--- /dev/null
+++ b/src/tokeniser/tokeniser.c
@@ -0,0 +1,2282 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "utils/utils.h"
+
+#include "tokeniser/entities.h"
+#include "tokeniser/tokeniser.h"
+
+/**
+ * Table of mappings between Windows-1252 codepoints 128-159 and UCS4
+ *
+ * The table has 32 entries, one per codepoint in that range; presumably it
+ * is indexed by (codepoint - 128) -- confirm against the numeric entity
+ * handling code. Entries of 0xFFFD (U+FFFD REPLACEMENT CHARACTER) stand in
+ * for codepoints that have no mapping.
+ */
+static const uint32_t cp1252Table[32] = {
+ 0x20AC, 0xFFFD, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
+ 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0xFFFD, 0x017D, 0xFFFD,
+ 0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
+ 0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0xFFFD, 0x017E, 0x0178
+};
+
+/**
+ * Tokeniser states
+ *
+ * hubbub_tokeniser_run() switches on the current state and calls the
+ * handler function of the corresponding name (STATE_FOO is handled by
+ * hubbub_tokeniser_handle_foo()). A newly created tokeniser starts in
+ * HUBBUB_TOKENISER_STATE_DATA.
+ */
+typedef enum hubbub_tokeniser_state {
+ HUBBUB_TOKENISER_STATE_DATA,
+ HUBBUB_TOKENISER_STATE_ENTITY_DATA,
+ HUBBUB_TOKENISER_STATE_TAG_OPEN,
+ HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN,
+ HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH,
+ HUBBUB_TOKENISER_STATE_TAG_NAME,
+ HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME,
+ HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME,
+ HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME,
+ HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE,
+ HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ,
+ HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ,
+ HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ,
+ HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE,
+ HUBBUB_TOKENISER_STATE_BOGUS_COMMENT,
+ HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN,
+ HUBBUB_TOKENISER_STATE_COMMENT_START,
+ HUBBUB_TOKENISER_STATE_COMMENT,
+ HUBBUB_TOKENISER_STATE_COMMENT_DASH,
+ HUBBUB_TOKENISER_STATE_COMMENT_END,
+ HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE,
+ HUBBUB_TOKENISER_STATE_DOCTYPE,
+ HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME,
+ HUBBUB_TOKENISER_STATE_DOCTYPE_NAME,
+ HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME,
+ HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE,
+ HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY,
+ HUBBUB_TOKENISER_STATE_NAMED_ENTITY
+} hubbub_tokeniser_state;
+
+/**
+ * Context for tokeniser
+ *
+ * Working storage shared by the state handlers. The whole structure is
+ * zero-filled when the tokeniser is created.
+ */
+typedef struct hubbub_tokeniser_context {
+ hubbub_token_type current_tag_type; /**< Type of current_tag */
+ hubbub_tag current_tag; /**< Current tag */
+
+ hubbub_string current_comment; /**< Current comment */
+
+ hubbub_doctype current_doctype; /**< Current doctype */
+
+ hubbub_string current_chars; /**< Pending characters */
+
+ hubbub_tokeniser_state prev_state; /**< Previous state */
+
+ /* Scratch data for the close tag match state */
+ struct {
+ hubbub_string tag; /**< Pending close tag */
+ } close_tag_match;
+
+ /* Scratch data for the match doctype state */
+ struct {
+ uint32_t count; /**< Index into "DOCTYPE" */
+ } match_doctype;
+
+ /* Scratch data for entity matching; "complete" is reset to false by
+ * the entity data state once the matched character has been emitted */
+ struct {
+ hubbub_string str; /**< Pending string */
+ uint8_t base; /**< Base for numeric
+ * entities */
+ uint32_t codepoint; /**< UCS4 codepoint */
+ bool had_data; /**< Whether we read
+ * anything after &#(x)? */
+ hubbub_tokeniser_state return_state; /**< State we were
+ * called from */
+ bool complete; /**< Flag that entity
+ * matching completed */
+ bool done_setup; /**< Flag that match setup
+ * has completed */
+ void *context; /**< Context for named
+ * entity search */
+ size_t prev_len; /**< Previous byte length
+ * of str */
+ } match_entity;
+
+ /* Position within the input */
+ struct {
+ uint32_t line; /**< Current line of input */
+ uint32_t col; /**< Current character in
+ * line */
+ } position;
+} hubbub_tokeniser_context;
+
+/**
+ * Tokeniser data structure
+ *
+ * The three handler/pw pairs are NULL after creation and are installed
+ * through hubbub_tokeniser_setopt().
+ */
+struct hubbub_tokeniser {
+ hubbub_tokeniser_state state; /**< Current tokeniser state */
+ hubbub_content_model content_model; /**< Current content
+ * model flag */
+
+ hubbub_inputstream *input; /**< Input stream */
+
+ const uint8_t *input_buffer; /**< Start of input stream's buffer */
+ size_t input_buffer_len; /**< Length of input buffer */
+
+ hubbub_tokeniser_context context; /**< Tokeniser context */
+
+ hubbub_token_handler token_handler; /**< Token handling callback */
+ void *token_pw; /**< Private data for token_handler */
+
+ hubbub_buffer_handler buffer_handler; /**< Callback given the input
+ * buffer's start and length */
+ void *buffer_pw; /**< Private data for buffer_handler */
+
+ hubbub_error_handler error_handler; /**< Error handling callback */
+ void *error_pw; /**< Private data for error_handler */
+
+ hubbub_alloc alloc; /**< Memory (de)allocation function */
+ void *alloc_pw; /**< Client private data */
+};
+
+/* Forward declarations.
+ *
+ * The hubbub_tokeniser_handle_* functions are the per-state handlers called
+ * from hubbub_tokeniser_run(). Each returns a bool: true makes the run loop
+ * dispatch again on the (possibly updated) state, false makes it stop.
+ */
+static bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_close_tag_open(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_close_tag_match(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_before_attribute_name(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_attribute_name(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_after_attribute_name(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_before_attribute_value(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_attribute_value_dq(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_attribute_value_sq(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_attribute_value_uq(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_entity_in_attribute_value(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_bogus_comment(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_markup_declaration_open(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_comment_start(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_comment_dash(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_match_doctype(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_before_doctype_name(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_doctype_name(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_after_doctype_name(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_bogus_doctype(
+ hubbub_tokeniser *tokeniser);
+/* Entity consumption, used by the entity data state */
+static bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_numbered_entity(
+ hubbub_tokeniser *tokeniser);
+static bool hubbub_tokeniser_handle_named_entity(
+ hubbub_tokeniser *tokeniser);
+/* Callback registered with the input stream as its move handler */
+static void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer,
+ size_t len, void *pw);
+/* Token emission helper used by the state handlers */
+static void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
+ hubbub_token *token);
+
+/**
+ * Create a hubbub tokeniser
+ *
+ * \param input  Input stream instance the tokeniser will read from
+ * \param alloc  Memory (de)allocation function
+ * \param pw     Pointer to client-specific private data (may be NULL)
+ * \return Pointer to tokeniser instance, or NULL on failure
+ *
+ * The new tokeniser starts in the data state with the PCDATA content
+ * model, has no handlers installed, and is registered as a move handler
+ * on ::input.
+ */
+hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
+		hubbub_alloc alloc, void *pw)
+{
+	hubbub_tokeniser *self;
+	hubbub_error status;
+
+	if (!input || !alloc)
+		return NULL;
+
+	self = alloc(NULL, sizeof(*self), pw);
+	if (!self)
+		return NULL;
+
+	/* Initial state machine configuration */
+	self->state = HUBBUB_TOKENISER_STATE_DATA;
+	self->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
+
+	/* Input source; the buffer details are not known yet */
+	self->input = input;
+	self->input_buffer = NULL;
+	self->input_buffer_len = 0;
+
+	/* No client callbacks until hubbub_tokeniser_setopt() is used */
+	self->token_handler = NULL;
+	self->token_pw = NULL;
+	self->buffer_handler = NULL;
+	self->buffer_pw = NULL;
+	self->error_handler = NULL;
+	self->error_pw = NULL;
+
+	/* Remember the allocator so the tokeniser can free itself later */
+	self->alloc = alloc;
+	self->alloc_pw = pw;
+
+	status = hubbub_inputstream_register_movehandler(input,
+			hubbub_tokeniser_buffer_moved_handler, self);
+	if (status != HUBBUB_OK) {
+		/* Registration failed: release the instance again */
+		alloc(self, 0, pw);
+		return NULL;
+	}
+
+	memset(&self->context, 0, sizeof(self->context));
+
+	return self;
+}
+
+/**
+ * Destroy a hubbub tokeniser
+ *
+ * \param tokeniser  The tokeniser instance to destroy (NULL is ignored)
+ *
+ * Deregisters the tokeniser's move handler from its input stream, then
+ * releases the current tag's attribute storage (if any) and the
+ * tokeniser itself through the allocator supplied at creation.
+ */
+void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *tag;
+
+	if (!tokeniser)
+		return;
+
+	hubbub_inputstream_deregister_movehandler(tokeniser->input,
+			hubbub_tokeniser_buffer_moved_handler, tokeniser);
+
+	tag = &tokeniser->context.current_tag;
+	if (tag->attributes != NULL)
+		tokeniser->alloc(tag->attributes, 0, tokeniser->alloc_pw);
+
+	tokeniser->alloc(tokeniser, 0, tokeniser->alloc_pw);
+}
+
+/**
+ * Configure a hubbub tokeniser
+ *
+ * \param tokeniser  The tokeniser instance to configure
+ * \param type       The option type to set
+ * \param params     Option-specific parameters
+ * \return HUBBUB_BADPARM if ::tokeniser or ::params is NULL,
+ *         HUBBUB_OK otherwise
+ *
+ * Installing a buffer handler immediately notifies it of the input
+ * buffer currently known to the tokeniser. Passing a NULL buffer handler
+ * removes any existing handler and no notification is made.
+ */
+hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
+		hubbub_tokeniser_opttype type,
+		hubbub_tokeniser_optparams *params)
+{
+	if (tokeniser == NULL || params == NULL)
+		return HUBBUB_BADPARM;
+
+	switch (type) {
+	case HUBBUB_TOKENISER_TOKEN_HANDLER:
+		tokeniser->token_handler = params->token_handler.handler;
+		tokeniser->token_pw = params->token_handler.pw;
+		break;
+	case HUBBUB_TOKENISER_BUFFER_HANDLER:
+		tokeniser->buffer_handler = params->buffer_handler.handler;
+		tokeniser->buffer_pw = params->buffer_handler.pw;
+		/* NULL means "no handler" (it is the value set at creation),
+		 * so only notify a handler that actually exists */
+		if (tokeniser->buffer_handler != NULL) {
+			tokeniser->buffer_handler(tokeniser->input_buffer,
+					tokeniser->input_buffer_len,
+					tokeniser->buffer_pw);
+		}
+		break;
+	case HUBBUB_TOKENISER_ERROR_HANDLER:
+		tokeniser->error_handler = params->error_handler.handler;
+		tokeniser->error_pw = params->error_handler.pw;
+		break;
+	case HUBBUB_TOKENISER_CONTENT_MODEL:
+		tokeniser->content_model = params->content_model.model;
+		break;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Process remaining data in the input stream
+ *
+ * \param tokeniser  The tokeniser instance to invoke
+ * \return HUBBUB_BADPARM if ::tokeniser is NULL, HUBBUB_OK otherwise
+ *
+ * Repeatedly calls the handler belonging to the tokeniser's current
+ * state. Handlers update the state themselves; the loop ends as soon as
+ * one of them returns false.
+ */
+hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser)
+{
+	bool more = true;
+
+	if (!tokeniser)
+		return HUBBUB_BADPARM;
+
+	do {
+		switch (tokeniser->state) {
+		case HUBBUB_TOKENISER_STATE_DATA:
+			more = hubbub_tokeniser_handle_data(tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_ENTITY_DATA:
+			more = hubbub_tokeniser_handle_entity_data(tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_TAG_OPEN:
+			more = hubbub_tokeniser_handle_tag_open(tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN:
+			more = hubbub_tokeniser_handle_close_tag_open(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH:
+			more = hubbub_tokeniser_handle_close_tag_match(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_TAG_NAME:
+			more = hubbub_tokeniser_handle_tag_name(tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME:
+			more = hubbub_tokeniser_handle_before_attribute_name(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME:
+			more = hubbub_tokeniser_handle_attribute_name(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME:
+			more = hubbub_tokeniser_handle_after_attribute_name(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE:
+			more = hubbub_tokeniser_handle_before_attribute_value(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ:
+			more = hubbub_tokeniser_handle_attribute_value_dq(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ:
+			more = hubbub_tokeniser_handle_attribute_value_sq(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ:
+			more = hubbub_tokeniser_handle_attribute_value_uq(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE:
+			more = hubbub_tokeniser_handle_entity_in_attribute_value(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_BOGUS_COMMENT:
+			more = hubbub_tokeniser_handle_bogus_comment(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN:
+			more = hubbub_tokeniser_handle_markup_declaration_open(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_COMMENT_START:
+			more = hubbub_tokeniser_handle_comment_start(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_COMMENT:
+			more = hubbub_tokeniser_handle_comment(tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_COMMENT_DASH:
+			more = hubbub_tokeniser_handle_comment_dash(tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_COMMENT_END:
+			more = hubbub_tokeniser_handle_comment_end(tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE:
+			more = hubbub_tokeniser_handle_match_doctype(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_DOCTYPE:
+			more = hubbub_tokeniser_handle_doctype(tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME:
+			more = hubbub_tokeniser_handle_before_doctype_name(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_DOCTYPE_NAME:
+			more = hubbub_tokeniser_handle_doctype_name(tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME:
+			more = hubbub_tokeniser_handle_after_doctype_name(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE:
+			more = hubbub_tokeniser_handle_bogus_doctype(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY:
+			more = hubbub_tokeniser_handle_numbered_entity(
+					tokeniser);
+			break;
+		case HUBBUB_TOKENISER_STATE_NAMED_ENTITY:
+			more = hubbub_tokeniser_handle_named_entity(
+					tokeniser);
+			break;
+		}
+	} while (more);
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Handle the data state
+ *
+ * \param tokeniser  The tokeniser instance
+ * \return true if the run loop should continue, false if the input
+ *         stream reported end of file or out of data
+ *
+ * Consumes input until an '&' (PCDATA/RCDATA content models) or a '<'
+ * (any content model but PLAINTEXT) is seen, accumulating everything
+ * else as pending character data. Pending characters are emitted as a
+ * character token before leaving the state; an EOF token is emitted when
+ * the end of the input is reached.
+ */
+bool hubbub_tokeniser_handle_data(hubbub_tokeniser *tokeniser)
+{
+	hubbub_token token;
+	uint32_t c;
+
+	/* Clear current characters */
+	tokeniser->context.current_chars.data_off = 0;
+	tokeniser->context.current_chars.len = 0;
+
+	while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
+			HUBBUB_INPUTSTREAM_EOF &&
+			c != HUBBUB_INPUTSTREAM_OOD) {
+		if (c == '&' && (tokeniser->content_model ==
+				HUBBUB_CONTENT_MODEL_PCDATA ||
+				tokeniser->content_model ==
+				HUBBUB_CONTENT_MODEL_RCDATA)) {
+			tokeniser->state =
+					HUBBUB_TOKENISER_STATE_ENTITY_DATA;
+			/* Don't eat the '&'; it'll be handled by
+			 * entity consumption */
+			break;
+		} else if (c == '<' && tokeniser->content_model !=
+				HUBBUB_CONTENT_MODEL_PLAINTEXT) {
+			if (tokeniser->context.current_chars.len > 0) {
+				/* Emit any pending characters */
+				token.type = HUBBUB_TOKEN_CHARACTER;
+				token.data.character =
+					tokeniser->context.current_chars;
+
+				hubbub_tokeniser_emit_token(tokeniser,
+						&token);
+			}
+
+			/* Buffer '<' */
+			tokeniser->context.current_chars.data_off =
+				hubbub_inputstream_cur_pos(tokeniser->input,
+					&tokeniser->context.current_chars.len);
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_OPEN;
+			hubbub_inputstream_advance(tokeniser->input);
+			break;
+		} else {
+			uint32_t pos;
+			size_t len;
+
+			/* Accumulate characters into buffer */
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+
+			if (tokeniser->context.current_chars.len == 0) {
+				tokeniser->context.current_chars.data_off =
+						pos;
+			}
+			/* len is the byte length of the current character;
+			 * the pending string's length is kept in bytes too,
+			 * so add it rather than counting characters */
+			tokeniser->context.current_chars.len += len;
+
+			hubbub_inputstream_advance(tokeniser->input);
+		}
+	}
+
+	/* When a tag was opened, current_chars holds the buffered '<',
+	 * which must not be emitted here */
+	if (tokeniser->state != HUBBUB_TOKENISER_STATE_TAG_OPEN &&
+			tokeniser->context.current_chars.len > 0) {
+		/* Emit any pending characters */
+		token.type = HUBBUB_TOKEN_CHARACTER;
+		token.data.character = tokeniser->context.current_chars;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->context.current_chars.data_off = 0;
+		tokeniser->context.current_chars.len = 0;
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_EOF) {
+		token.type = HUBBUB_TOKEN_EOF;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+	}
+
+	return (c != HUBBUB_INPUTSTREAM_EOF && c != HUBBUB_INPUTSTREAM_OOD);
+}
+
+/**
+ * Handle the entity data state
+ *
+ * \param tokeniser  The tokeniser instance
+ * \return Result of entity consumption while matching is in progress,
+ *         true once the matched character has been emitted
+ *
+ * Until entity matching has completed, the work is delegated to
+ * hubbub_tokeniser_consume_entity(). Afterwards the character at the
+ * current input position is emitted as a character token and the
+ * tokeniser returns to the data state.
+ */
+bool hubbub_tokeniser_handle_entity_data(hubbub_tokeniser *tokeniser)
+{
+	hubbub_token token;
+	uint32_t c;
+
+	/* Still matching: let entity consumption carry on */
+	if (!tokeniser->context.match_entity.complete)
+		return hubbub_tokeniser_consume_entity(tokeniser);
+
+	c = hubbub_inputstream_peek(tokeniser->input);
+	if (c == HUBBUB_INPUTSTREAM_OOD || c == HUBBUB_INPUTSTREAM_EOF) {
+		/* Should never happen */
+		abort();
+	}
+
+	/* Emit character */
+	token.type = HUBBUB_TOKEN_CHARACTER;
+	token.data.character.data_off = hubbub_inputstream_cur_pos(
+			tokeniser->input, &token.data.character.len);
+
+	hubbub_tokeniser_emit_token(tokeniser, &token);
+
+	/* Reset for next time */
+	tokeniser->context.match_entity.complete = false;
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the tag open state (the character after a buffered '<')
+ *
+ * \param tokeniser  The tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ *
+ * In the RCDATA and CDATA content models only '/' is significant; any
+ * other character causes the buffered '<' to be emitted as character
+ * data. In the PCDATA content model the next character selects between
+ * a markup declaration, a close tag, a start tag, a literal "<>", a
+ * bogus comment, or a literal '<'.
+ */
+bool hubbub_tokeniser_handle_tag_open(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	hubbub_tag *ctag = &ctx->current_tag;
+	const uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	hubbub_token token;
+	uint32_t pos;
+	size_t len;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	switch (tokeniser->content_model) {
+	case HUBBUB_CONTENT_MODEL_RCDATA:
+	case HUBBUB_CONTENT_MODEL_CDATA:
+		if (c != '/') {
+			/* Emit '<' */
+			token.type = HUBBUB_TOKEN_CHARACTER;
+			token.data.character = ctx->current_chars;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+			break;
+		}
+
+		/* Buffer the '/' after the '<' */
+		(void) hubbub_inputstream_cur_pos(tokeniser->input, &len);
+		ctx->current_chars.len += len;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN;
+
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	case HUBBUB_CONTENT_MODEL_PCDATA:
+	{
+		const bool upper = ('A' <= c && c <= 'Z');
+		const bool lower = ('a' <= c && c <= 'z');
+
+		if (c == '!' || c == '/') {
+			/* Buffer the character and move to the state
+			 * that deals with it */
+			(void) hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->current_chars.len += len;
+
+			tokeniser->state = (c == '!')
+				? HUBBUB_TOKENISER_STATE_MARKUP_DECLARATION_OPEN
+				: HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN;
+
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if (upper || lower) {
+			/* First character of a start tag name; uppercase
+			 * input is lowercased in the stream first */
+			if (upper)
+				hubbub_inputstream_lowercase(tokeniser->input);
+
+			ctx->current_tag_type = HUBBUB_TOKEN_START_TAG;
+
+			ctag->name.data_off = hubbub_inputstream_cur_pos(
+					tokeniser->input, &ctag->name.len);
+			ctag->n_attributes = 0;
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME;
+
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if (c == '>') {
+			(void) hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->current_chars.len += len;
+
+			/* Emit "<>" */
+			token.type = HUBBUB_TOKEN_CHARACTER;
+			token.data.character = ctx->current_chars;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if (c == '?') {
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->current_chars.len += len;
+
+			/* The bogus comment starts at the '?' */
+			ctx->current_comment.data_off = pos;
+			ctx->current_comment.len = len;
+
+			tokeniser->state =
+					HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+
+			hubbub_inputstream_advance(tokeniser->input);
+		} else {
+			/* Emit '<' */
+			token.type = HUBBUB_TOKEN_CHARACTER;
+			token.data.character = ctx->current_chars;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		}
+		break;
+	}
+	default:
+		/* Other content models are not handled in this state */
+		break;
+	}
+
+	return true;
+}
+
+/**
+ * Handle the close tag open state (the character after "</")
+ *
+ * \param tokeniser  The tokeniser instance
+ * \return false if the input stream is out of data (PCDATA content
+ *         model only), true otherwise
+ *
+ * In the RCDATA and CDATA content models the tokeniser simply moves on
+ * to the close tag match state. In the PCDATA content model the next
+ * character selects between an end tag name, an ignored "</>", a
+ * literal "</" at end of file, or a bogus comment.
+ */
+bool hubbub_tokeniser_handle_close_tag_open(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+
+	switch (tokeniser->content_model) {
+	case HUBBUB_CONTENT_MODEL_RCDATA:
+	case HUBBUB_CONTENT_MODEL_CDATA:
+		/* Start matching against the last open tag from scratch */
+		ctx->close_tag_match.tag.len = 0;
+		tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_MATCH;
+		break;
+	case HUBBUB_CONTENT_MODEL_PCDATA:
+	{
+		hubbub_tag *ctag = &ctx->current_tag;
+		const uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+		const bool upper = ('A' <= c && c <= 'Z');
+		const bool lower = ('a' <= c && c <= 'z');
+		uint32_t pos;
+		size_t len;
+
+		if (upper || lower) {
+			/* First character of an end tag name; uppercase
+			 * input is lowercased in the stream first */
+			if (upper)
+				hubbub_inputstream_lowercase(tokeniser->input);
+
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+
+			ctx->current_tag_type = HUBBUB_TOKEN_END_TAG;
+			ctag->name.data_off = pos;
+			ctag->name.len = len;
+			ctag->n_attributes = 0;
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_TAG_NAME;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if (c == '>') {
+			/* "</>": consume the '>' and go back to data */
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+			hubbub_inputstream_advance(tokeniser->input);
+		} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+			hubbub_token token;
+
+			/* Emit "</" */
+			token.type = HUBBUB_TOKEN_CHARACTER;
+			token.data.character = ctx->current_chars;
+
+			hubbub_tokeniser_emit_token(tokeniser, &token);
+
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		} else if (c == HUBBUB_INPUTSTREAM_OOD) {
+			/* Out of data */
+			return false;
+		} else {
+			/* Anything else starts a bogus comment */
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+
+			ctx->current_comment.data_off = pos;
+			ctx->current_comment.len = len;
+
+			tokeniser->state =
+					HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+			hubbub_inputstream_advance(tokeniser->input);
+		}
+		break;
+	}
+	default:
+		/* Other content models are not handled in this state */
+		break;
+	}
+
+	return true;
+}
+/**
+ * Handle the close tag match state.
+ *
+ * Consumes input until as many bytes as the current tag's name
+ * (context.current_tag.name) have been collected in
+ * context.close_tag_match.tag, compares the two ranges with
+ * hubbub_inputstream_compare_range_ci() and then inspects the character
+ * that follows the candidate name.
+ *
+ * Except where the stream is out of data, the input is rewound to the
+ * start of the candidate name before returning.  If the name matched and
+ * the following character is acceptable, the content model is switched to
+ * PCDATA and the state becomes close tag open; otherwise
+ * context.current_chars is emitted as a character token and the state
+ * becomes data.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_close_tag_match(hubbub_tokeniser *tokeniser)
+{
+ hubbub_tokeniser_context *ctx = &tokeniser->context;
+ hubbub_tag *ctag = &tokeniser->context.current_tag;
+ uint32_t c = 0; /* Stays 0 if the loop below is never entered */
+
+ while (ctx->close_tag_match.tag.len < ctag->name.len &&
+ (c = hubbub_inputstream_peek(tokeniser->input)) !=
+ HUBBUB_INPUTSTREAM_EOF &&
+ c != HUBBUB_INPUTSTREAM_OOD) {
+ /* Match last open tag: each iteration adds one character to the
+ * candidate range held in ctx->close_tag_match.tag */
+ uint32_t off;
+ size_t len;
+
+ off = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+ if (ctx->close_tag_match.tag.len == 0) {
+ ctx->close_tag_match.tag.data_off = off;
+ ctx->close_tag_match.tag.len = len;
+ } else {
+ ctx->close_tag_match.tag.len += len;
+ }
+
+ hubbub_inputstream_advance(tokeniser->input);
+
+ if (ctx->close_tag_match.tag.len > ctag->name.len ||
+ (ctx->close_tag_match.tag.len == ctag->name.len &&
+ hubbub_inputstream_compare_range_ci(
+ tokeniser->input,
+ ctag->name.data_off,
+ ctx->close_tag_match.tag.data_off,
+ ctag->name.len) != 0)) {
+ hubbub_token token; /* Candidate overshot the name or differs */
+
+ /* Rewind input stream to start of tag name */
+ if (hubbub_inputstream_rewind(tokeniser->input,
+ ctx->close_tag_match.tag.len) !=
+ HUBBUB_OK)
+ abort();
+
+ /* Emit "</" */
+ token.type = HUBBUB_TOKEN_CHARACTER;
+ token.data.character =
+ tokeniser->context.current_chars;
+
+ hubbub_tokeniser_emit_token(tokeniser, &token);
+
+ tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+ return true;
+ } else if (ctx->close_tag_match.tag.len == ctag->name.len &&
+ hubbub_inputstream_compare_range_ci(
+ tokeniser->input,
+ ctag->name.data_off,
+ ctx->close_tag_match.tag.data_off,
+ ctag->name.len) == 0) {
+ /* Matched => stop searching */
+ break;
+ }
+ }
+
+ if (c == HUBBUB_INPUTSTREAM_OOD) {
+ /* Need more data; the candidate range collected so far is left in
+ * ctx->close_tag_match.tag and nothing is rewound */
+ return false;
+ }
+
+ if (c == HUBBUB_INPUTSTREAM_EOF) {
+ /* Ran out of data - parse error */
+ hubbub_token token;
+
+ /* Rewind input stream to start of tag name */
+ if (hubbub_inputstream_rewind(tokeniser->input,
+ ctx->close_tag_match.tag.len) != HUBBUB_OK)
+ abort();
+
+ /* Emit "</" */
+ token.type = HUBBUB_TOKEN_CHARACTER;
+ token.data.character = tokeniser->context.current_chars;
+
+ hubbub_tokeniser_emit_token(tokeniser, &token);
+
+ tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+ return true;
+ }
+
+ /* Match following char (peeked only, it is not consumed here) */
+ c = hubbub_inputstream_peek(tokeniser->input);
+
+ if (c == HUBBUB_INPUTSTREAM_OOD) {
+ /* Need more data */
+ return false;
+ }
+
+ /* Rewind input stream to start of tag name */
+ if (hubbub_inputstream_rewind(tokeniser->input,
+ ctx->close_tag_match.tag.len) != HUBBUB_OK)
+ abort();
+
+ /* Check that following char was valid: anything other than
+ * whitespace, '>', '/', '<' or EOF means this is not a close tag */
+ if (c != '\t' && c != '\n' && c != '\v' && c != '\f' &&
+ c != ' ' && c != '>' && c != '/' && c != '<' &&
+ c != HUBBUB_INPUTSTREAM_EOF) {
+ hubbub_token token;
+
+ /* Emit "</" */
+ token.type = HUBBUB_TOKEN_CHARACTER;
+ token.data.character = tokeniser->context.current_chars;
+
+ hubbub_tokeniser_emit_token(tokeniser, &token);
+
+ tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+ return true;
+ }
+
+ /* Switch the content model back to PCDATA */
+ tokeniser->content_model = HUBBUB_CONTENT_MODEL_PCDATA;
+
+ /* Finally, transition back to close tag open state */
+ tokeniser->state = HUBBUB_TOKENISER_STATE_CLOSE_TAG_OPEN;
+
+ return true;
+}
+
+/**
+ * Handle the tag name state.
+ *
+ * Looks at the next input character and either extends the name of the
+ * tag under construction, moves on to attribute parsing, or emits the
+ * tag and returns to the data state.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_tag_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *tag = &tokeniser->context.current_tag;
+	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+	size_t ch_len;
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (ch == '>' || ch == '<' || ch == HUBBUB_INPUTSTREAM_EOF) {
+		/* The tag ends here, one way or another: emit it */
+		hubbub_token tok;
+
+		tok.type = tokeniser->context.current_tag_type;
+		tok.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tok);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		/* Only '>' is consumed; '<' and EOF stay in the stream */
+		if (ch == '>')
+			hubbub_inputstream_advance(tokeniser->input);
+
+		return true;
+	}
+
+	switch (ch) {
+	case '\t':
+	case '\n':
+	case '\v':
+	case '\f':
+	case ' ':
+	case '/':
+		/** \todo permitted slash */
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	default:
+		/* Part of the name; A-Z goes through the stream's
+		 * lowercase call before being measured */
+		if ('A' <= ch && ch <= 'Z')
+			hubbub_inputstream_lowercase(tokeniser->input);
+
+		/* Only the character's length is needed here */
+		(void) hubbub_inputstream_cur_pos(tokeniser->input, &ch_len);
+
+		tag->name.len += ch_len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	}
+
+	return true;
+}
+
+/**
+ * Handle the before attribute name state.
+ *
+ * Skips whitespace and '/', emits the current tag on '>', '<' or EOF, and
+ * otherwise starts a new attribute whose name begins at the current
+ * character.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ *
+ * \note If growing the attribute array fails, the process is aborted.
+ *       Previously the NULL result was stored in the tag and then
+ *       dereferenced.
+ */
+bool hubbub_tokeniser_handle_before_attribute_name(
+		hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '/') {
+		/** \todo permitted slash */
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '>' || c == '<' || c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit current tag */
+		token.type = tokeniser->context.current_tag_type;
+		token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		/* '<' and EOF are left in the stream; only '>' is consumed */
+		if (c == '>')
+			hubbub_inputstream_advance(tokeniser->input);
+	} else {
+		uint32_t pos;
+		size_t len;
+		hubbub_attribute *attr;
+
+		/* A-Z goes through the stream's lowercase call first */
+		if ('A' <= c && c <= 'Z')
+			hubbub_inputstream_lowercase(tokeniser->input);
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		/* Grow the attribute array by one entry */
+		attr = tokeniser->alloc(ctag->attributes,
+				(ctag->n_attributes + 1) *
+					sizeof(hubbub_attribute),
+				tokeniser->alloc_pw);
+		if (attr == NULL) {
+			/** \todo handle memory exhaustion gracefully.
+			 * Until then, stop before ctag->attributes is
+			 * overwritten with NULL and dereferenced. */
+			abort();
+		}
+
+		ctag->attributes = attr;
+
+		/* New attribute: name starts here, value is empty */
+		attr[ctag->n_attributes].name.data_off = pos;
+		attr[ctag->n_attributes].name.len = len;
+		attr[ctag->n_attributes].value.data_off = 0;
+		attr[ctag->n_attributes].value.len = 0;
+
+		ctag->n_attributes++;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
+
+/**
+ * Handle the attribute name state.
+ *
+ * Extends the name of the most recently added attribute, or leaves the
+ * state on whitespace, '=', '/', '>', '<' or EOF.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_attribute_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *tag = &tokeniser->context.current_tag;
+	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (ch == '>' || ch == '<' || ch == HUBBUB_INPUTSTREAM_EOF) {
+		/* End of the tag: emit what has been collected */
+		hubbub_token tok;
+
+		tok.type = tokeniser->context.current_tag_type;
+		tok.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tok);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		/* '<' and EOF are not consumed */
+		if (ch == '>')
+			hubbub_inputstream_advance(tokeniser->input);
+
+		return true;
+	}
+
+	switch (ch) {
+	case '\t':
+	case '\n':
+	case '\v':
+	case '\f':
+	case ' ':
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_AFTER_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	case '=':
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE;
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	case '/':
+		/** \todo permitted slash */
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	default: {
+		size_t ch_len;
+
+		/* A-Z goes through the stream's lowercase call first */
+		if ('A' <= ch && ch <= 'Z')
+			hubbub_inputstream_lowercase(tokeniser->input);
+
+		/* Only the character's length is needed here */
+		(void) hubbub_inputstream_cur_pos(tokeniser->input, &ch_len);
+
+		tag->attributes[tag->n_attributes - 1].name.len += ch_len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	}
+	}
+
+	return true;
+}
+
+/**
+ * Handle the after attribute name state.
+ *
+ * Skips whitespace, moves to the attribute value on '=', emits the tag on
+ * '>', '<' or EOF, returns to the before attribute name state on '/', and
+ * otherwise starts a new attribute at the current character.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ *
+ * \note If growing the attribute array fails, the process is aborted.
+ *       Previously the NULL result was stored in the tag and then
+ *       dereferenced.
+ */
+bool hubbub_tokeniser_handle_after_attribute_name(
+		hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *ctag = &tokeniser->context.current_tag;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '=') {
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_VALUE;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '>' || c == '<' || c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit current tag */
+		token.type = tokeniser->context.current_tag_type;
+		token.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		/* '<' and EOF are left in the stream; only '>' is consumed */
+		if (c == '>')
+			hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '/') {
+		/** \todo permitted slash */
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else {
+		uint32_t pos;
+		size_t len;
+		hubbub_attribute *attr;
+
+		/* NOTE(review): the lowercase call is made for every
+		 * character that starts a new attribute here, not just
+		 * A-Z as in the before attribute name state. Kept as is;
+		 * confirm it is a no-op for other characters. */
+		hubbub_inputstream_lowercase(tokeniser->input);
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		/* Grow the attribute array by one entry */
+		attr = tokeniser->alloc(ctag->attributes,
+				(ctag->n_attributes + 1) *
+					sizeof(hubbub_attribute),
+				tokeniser->alloc_pw);
+		if (attr == NULL) {
+			/** \todo handle memory exhaustion gracefully.
+			 * Until then, stop before ctag->attributes is
+			 * overwritten with NULL and dereferenced. */
+			abort();
+		}
+
+		ctag->attributes = attr;
+
+		/* New attribute: name starts here, value is empty */
+		attr[ctag->n_attributes].name.data_off = pos;
+		attr[ctag->n_attributes].name.len = len;
+		attr[ctag->n_attributes].value.data_off = 0;
+		attr[ctag->n_attributes].value.len = 0;
+
+		ctag->n_attributes++;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_NAME;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
+
+/**
+ * Handle the before attribute value state.
+ *
+ * Skips whitespace, selects the quoted or unquoted value state according
+ * to the next character, or emits the tag on '>', '<' or EOF.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_before_attribute_value(
+		hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *tag = &tokeniser->context.current_tag;
+	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (ch == '>' || ch == '<' || ch == HUBBUB_INPUTSTREAM_EOF) {
+		/* Tag ends without a value: emit it */
+		hubbub_token tok;
+
+		tok.type = tokeniser->context.current_tag_type;
+		tok.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tok);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		/* '<' and EOF are not consumed */
+		if (ch == '>')
+			hubbub_inputstream_advance(tokeniser->input);
+
+		return true;
+	}
+
+	switch (ch) {
+	case '\t':
+	case '\n':
+	case '\v':
+	case '\f':
+	case ' ':
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	case '"':
+		tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_DQ;
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	case '\'':
+		tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_SQ;
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	case '&':
+		/* Not consumed: the unquoted value state sees the '&' */
+		tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ;
+		break;
+	default: {
+		/* First character of an unquoted value */
+		hubbub_attribute *last;
+		uint32_t off;
+		size_t ch_len;
+
+		off = hubbub_inputstream_cur_pos(tokeniser->input, &ch_len);
+
+		last = &tag->attributes[tag->n_attributes - 1];
+		last->value.data_off = off;
+		last->value.len = ch_len;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_ATTRIBUTE_VALUE_UQ;
+
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	}
+	}
+
+	return true;
+}
+
+/**
+ * Handle the double-quoted attribute value state.
+ *
+ * Appends characters to the value of the most recently added attribute
+ * until the closing '"', diverting to entity handling on '&' and emitting
+ * the tag on EOF.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_attribute_value_dq(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *tag = &tokeniser->context.current_tag;
+	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+	hubbub_attribute *last;
+	uint32_t off;
+	size_t ch_len;
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (ch == '"') {
+		/* Closing quote */
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (ch == '&') {
+		/* Don't eat the '&'; entity consumption handles this */
+		tokeniser->context.prev_state = tokeniser->state;
+		tokeniser->state =
+			HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE;
+		return true;
+	}
+
+	if (ch == HUBBUB_INPUTSTREAM_EOF) {
+		/* Input ended inside the value: emit the tag as it stands */
+		hubbub_token tok;
+
+		tok.type = tokeniser->context.current_tag_type;
+		tok.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tok);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		return true;
+	}
+
+	/* Ordinary value character */
+	off = hubbub_inputstream_cur_pos(tokeniser->input, &ch_len);
+
+	last = &tag->attributes[tag->n_attributes - 1];
+
+	/* An empty value has no start offset yet */
+	if (last->value.len == 0)
+		last->value.data_off = off;
+
+	last->value.len += ch_len;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the single-quoted attribute value state.
+ *
+ * Appends characters to the value of the most recently added attribute
+ * until the closing '\'', diverting to entity handling on '&' and
+ * emitting the tag on EOF.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_attribute_value_sq(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *tag = &tokeniser->context.current_tag;
+	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+	hubbub_attribute *last;
+	uint32_t off;
+	size_t ch_len;
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (ch == '\'') {
+		/* Closing quote */
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (ch == '&') {
+		/* Don't eat the '&'; entity consumption handles this */
+		tokeniser->context.prev_state = tokeniser->state;
+		tokeniser->state =
+			HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE;
+		return true;
+	}
+
+	if (ch == HUBBUB_INPUTSTREAM_EOF) {
+		/* Input ended inside the value: emit the tag as it stands */
+		hubbub_token tok;
+
+		tok.type = tokeniser->context.current_tag_type;
+		tok.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tok);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		return true;
+	}
+
+	/* Ordinary value character */
+	off = hubbub_inputstream_cur_pos(tokeniser->input, &ch_len);
+
+	last = &tag->attributes[tag->n_attributes - 1];
+
+	/* An empty value has no start offset yet */
+	if (last->value.len == 0)
+		last->value.data_off = off;
+
+	last->value.len += ch_len;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the unquoted attribute value state.
+ *
+ * Appends characters to the value of the most recently added attribute
+ * until whitespace, diverting to entity handling on '&' and emitting the
+ * tag on '>', '<' or EOF.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_attribute_value_uq(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *tag = &tokeniser->context.current_tag;
+	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (ch == '>' || ch == '<' || ch == HUBBUB_INPUTSTREAM_EOF) {
+		/* End of the tag: emit it */
+		hubbub_token tok;
+
+		tok.type = tokeniser->context.current_tag_type;
+		tok.data.tag = tokeniser->context.current_tag;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tok);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		/* '<' and EOF are not consumed */
+		if (ch == '>')
+			hubbub_inputstream_advance(tokeniser->input);
+
+		return true;
+	}
+
+	switch (ch) {
+	case '\t':
+	case '\n':
+	case '\v':
+	case '\f':
+	case ' ':
+		/* Whitespace terminates an unquoted value */
+		tokeniser->state =
+				HUBBUB_TOKENISER_STATE_BEFORE_ATTRIBUTE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	case '&':
+		/* Don't eat the '&'; entity consumption handles this */
+		tokeniser->context.prev_state = tokeniser->state;
+		tokeniser->state =
+			HUBBUB_TOKENISER_STATE_ENTITY_IN_ATTRIBUTE_VALUE;
+		break;
+	default: {
+		/* Ordinary value character */
+		hubbub_attribute *last;
+		uint32_t off;
+		size_t ch_len;
+
+		off = hubbub_inputstream_cur_pos(tokeniser->input, &ch_len);
+
+		last = &tag->attributes[tag->n_attributes - 1];
+
+		/* An empty value has no start offset yet */
+		if (last->value.len == 0)
+			last->value.data_off = off;
+
+		last->value.len += ch_len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	}
+	}
+
+	return true;
+}
+
+/**
+ * Handle the entity in attribute value state.
+ *
+ * While entity matching is incomplete, delegates to
+ * hubbub_tokeniser_consume_entity().  Once it is complete, appends the
+ * current character to the value of the most recently added attribute and
+ * returns to the state saved in context.prev_state.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return result of hubbub_tokeniser_consume_entity() while matching is
+ *         incomplete, true otherwise
+ */
+bool hubbub_tokeniser_handle_entity_in_attribute_value(
+		hubbub_tokeniser *tokeniser)
+{
+	hubbub_tag *tag = &tokeniser->context.current_tag;
+	hubbub_attribute *last;
+	uint32_t ch;
+	uint32_t off;
+	size_t ch_len;
+
+	/* Entity not yet resolved: keep consuming it */
+	if (tokeniser->context.match_entity.complete == false)
+		return hubbub_tokeniser_consume_entity(tokeniser);
+
+	ch = hubbub_inputstream_peek(tokeniser->input);
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD || ch == HUBBUB_INPUTSTREAM_EOF) {
+		/* Should never happen */
+		abort();
+	}
+
+	off = hubbub_inputstream_cur_pos(tokeniser->input, &ch_len);
+
+	last = &tag->attributes[tag->n_attributes - 1];
+
+	/* An empty value has no start offset yet */
+	if (last->value.len == 0)
+		last->value.data_off = off;
+
+	last->value.len += ch_len;
+
+	/* Reset for next time */
+	tokeniser->context.match_entity.complete = false;
+
+	/* And back to the previous state */
+	tokeniser->state = tokeniser->context.prev_state;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the bogus comment state.
+ *
+ * Collects every character up to (but not including) the next '>' or EOF
+ * into context.current_comment, then emits it as a comment token and
+ * returns to the data state.  A terminating '>' is consumed.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream ran out of data before the comment
+ *         ended, true otherwise
+ */
+bool hubbub_tokeniser_handle_bogus_comment(hubbub_tokeniser *tokeniser)
+{
+	hubbub_token tok;
+
+	for (;;) {
+		const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+		uint32_t off;
+		size_t ch_len;
+
+		/* Nothing is emitted until the end of the comment is seen */
+		if (ch == HUBBUB_INPUTSTREAM_OOD)
+			return false;
+
+		if (ch == HUBBUB_INPUTSTREAM_EOF)
+			break;
+
+		if (ch == '>') {
+			hubbub_inputstream_advance(tokeniser->input);
+			break;
+		}
+
+		off = hubbub_inputstream_cur_pos(tokeniser->input, &ch_len);
+
+		/* An empty comment has no start offset yet */
+		if (tokeniser->context.current_comment.len == 0)
+			tokeniser->context.current_comment.data_off = off;
+		tokeniser->context.current_comment.len += ch_len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	/* Emit comment */
+	tok.type = HUBBUB_TOKEN_COMMENT;
+	tok.data.comment = tokeniser->context.current_comment;
+
+	hubbub_tokeniser_emit_token(tokeniser, &tok);
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+	return true;
+}
+
+/**
+ * Handle the markup declaration open state.
+ *
+ * A '-' leads to the comment start state, a 'D' or 'd' starts DOCTYPE
+ * matching, and anything else is treated as a bogus comment.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_markup_declaration_open(
+		hubbub_tokeniser *tokeniser)
+{
+	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	switch (ch) {
+	case '-':
+		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_START;
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	case 'D':
+	case 'd':
+		/* First letter of "DOCTYPE", in either case */
+		hubbub_inputstream_uppercase(tokeniser->input);
+		tokeniser->context.match_doctype.count = 1;
+		tokeniser->state = HUBBUB_TOKENISER_STATE_MATCH_DOCTYPE;
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	default:
+		/* Start an empty comment; the character is not consumed */
+		tokeniser->context.current_comment.data_off = 0;
+		tokeniser->context.current_comment.len = 0;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+		break;
+	}
+
+	return true;
+}
+
+/**
+ * Handle the comment start state.
+ *
+ * Resets context.current_comment, then either enters the comment state on
+ * a '-' or pushes a '-' back onto the input and treats what follows as a
+ * bogus comment.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_comment_start(hubbub_tokeniser *tokeniser)
+{
+	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Start with an empty comment either way */
+	tokeniser->context.current_comment.data_off = 0;
+	tokeniser->context.current_comment.len = 0;
+
+	if (ch != '-') {
+		/* Put a '-' back on the stream for the bogus comment */
+		hubbub_inputstream_push_back(tokeniser->input, '-');
+		tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+		return true;
+	}
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT;
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the comment state.
+ *
+ * Appends characters to context.current_comment, moving to the comment
+ * dash state on '-' and emitting the comment on EOF.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_comment(hubbub_tokeniser *tokeniser)
+{
+	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+	uint32_t off;
+	size_t ch_len;
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (ch == '-') {
+		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_DASH;
+		hubbub_inputstream_advance(tokeniser->input);
+		return true;
+	}
+
+	if (ch == HUBBUB_INPUTSTREAM_EOF) {
+		/* Input ended inside the comment: emit what was collected */
+		hubbub_token tok;
+
+		tok.type = HUBBUB_TOKEN_COMMENT;
+		tok.data.comment = tokeniser->context.current_comment;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tok);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+		return true;
+	}
+
+	/* Ordinary comment character */
+	off = hubbub_inputstream_cur_pos(tokeniser->input, &ch_len);
+
+	/* An empty comment has no start offset yet */
+	if (tokeniser->context.current_comment.len == 0)
+		tokeniser->context.current_comment.data_off = off;
+	tokeniser->context.current_comment.len += ch_len;
+
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle the comment dash state (one '-' has been seen inside a comment).
+ *
+ * A second '-' leads to the comment end state.  EOF emits the comment
+ * collected so far.  Any other character means the '-' was ordinary
+ * comment data: both the '-' and the character are added to the comment
+ * and the tokeniser returns to the comment state.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ *
+ * \note Previously, when the comment was still empty, the pending '-' was
+ *       left out of the comment data (e.g. "<!---x-->" produced "x"
+ *       rather than "-x").
+ */
+bool hubbub_tokeniser_handle_comment_dash(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '-') {
+		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit comment */
+		token.type = HUBBUB_TOKEN_COMMENT;
+		token.data.comment = tokeniser->context.current_comment;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+	} else {
+		uint32_t pos;
+		size_t len;
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		if (tokeniser->context.current_comment.len == 0) {
+			/* Nothing collected yet: the comment starts at the
+			 * '-' immediately before this character.
+			 * NOTE(review): assumes '-' occupies SLEN("-")
+			 * bytes in the stream buffer -- confirm. */
+			tokeniser->context.current_comment.data_off =
+					pos - (uint32_t) SLEN("-");
+		}
+
+		/* Cover everything from the start of the comment up to and
+		 * including this character; that takes in the '-' too */
+		tokeniser->context.current_comment.len = len +
+				(pos -
+				tokeniser->context.current_comment.data_off);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
+
+/**
+ * Handle the comment end state (two consecutive '-' have been seen).
+ *
+ * - '>' or EOF: emit the comment collected so far ('>' is consumed).
+ * - '-': one more dash than needed; the oldest of the three pending
+ *   dashes becomes comment data and the state is unchanged.
+ * - anything else: the two pending dashes and the character become
+ *   comment data and the tokeniser returns to the comment state.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ *
+ * \note Previously the '-' case over-counted by one dash when the comment
+ *       was non-empty ("<!--a--->" gave "a--") and never grew when it was
+ *       empty, and the "anything else" case dropped the two dashes when
+ *       the comment was empty.
+ */
+bool hubbub_tokeniser_handle_comment_end(hubbub_tokeniser *tokeniser)
+{
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '>' || c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* Emit comment */
+		token.type = HUBBUB_TOKEN_COMMENT;
+		token.data.comment = tokeniser->context.current_comment;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		/* EOF is left in the stream; only '>' is consumed */
+		if (c == '>')
+			hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '-') {
+		uint32_t pos;
+		size_t len;
+
+		/* The current character is a '-', so len is the width of
+		 * one dash in the stream buffer */
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		if (tokeniser->context.current_comment.len == 0) {
+			/* Nothing collected yet: the comment starts at the
+			 * oldest pending dash, two dashes before this one */
+			tokeniser->context.current_comment.data_off =
+					(uint32_t) (pos - 2 * len);
+		}
+
+		/* The dash before this one and this one may still turn out
+		 * to be the terminator, so the comment stops just short of
+		 * the previous dash */
+		tokeniser->context.current_comment.len = (pos - len) -
+				tokeniser->context.current_comment.data_off;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT_END;
+		hubbub_inputstream_advance(tokeniser->input);
+	} else {
+		uint32_t pos;
+		size_t len;
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		if (tokeniser->context.current_comment.len == 0) {
+			/* Nothing collected yet: the comment starts at the
+			 * first of the two pending dashes.
+			 * NOTE(review): assumes '-' occupies SLEN("-")
+			 * bytes in the stream buffer -- confirm. */
+			tokeniser->context.current_comment.data_off =
+					pos - 2 * (uint32_t) SLEN("-");
+		}
+
+		/* Cover everything from the start of the comment up to and
+		 * including this character; that takes in the '--' too */
+		tokeniser->context.current_comment.len = len +
+				(pos -
+				tokeniser->context.current_comment.data_off);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_COMMENT;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
+
+/**
+ * Handle the match doctype state.
+ *
+ * Matches the remaining letters of "DOCTYPE" case-insensitively, one per
+ * call; context.match_doctype.count holds how many letters have matched.
+ * On a mismatch the letters matched so far are pushed back onto the input
+ * and the construct is treated as a bogus comment.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_match_doctype(hubbub_tokeniser *tokeniser)
+{
+	/* letters[i] is the letter expected once i letters have matched */
+	static const char letters[] = "DOCTYPE";
+	const size_t last = sizeof(letters) - 2;	/* index of 'E' */
+	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+	const size_t matched = tokeniser->context.match_doctype.count;
+	const bool in_range = (1 <= matched && matched <= last);
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (in_range && (ch & ~0x20) == (uint32_t) letters[matched]) {
+		hubbub_inputstream_uppercase(tokeniser->input);
+
+		if (matched == last) {
+			/* That was the final 'E' */
+			tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE;
+		} else {
+			tokeniser->context.match_doctype.count++;
+		}
+
+		hubbub_inputstream_advance(tokeniser->input);
+	} else {
+		/* Push back the matched letters, last one first */
+		if (in_range) {
+			size_t i;
+
+			for (i = matched; i > 0; i--) {
+				hubbub_inputstream_push_back(
+						tokeniser->input,
+						letters[i - 1]);
+			}
+		}
+
+		tokeniser->context.current_comment.data_off = 0;
+		tokeniser->context.current_comment.len = 0;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_COMMENT;
+	}
+
+	return true;
+}
+
+/**
+ * Handle the doctype state.
+ *
+ * Consumes a single whitespace character if one is present, then moves to
+ * the before doctype name state regardless.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_doctype(hubbub_tokeniser *tokeniser)
+{
+	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	switch (ch) {
+	case '\t':
+	case '\n':
+	case '\v':
+	case '\f':
+	case ' ':
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	default:
+		/* Anything else is left for the next state */
+		break;
+	}
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_BEFORE_DOCTYPE_NAME;
+
+	return true;
+}
+
+/**
+ * Handle the before doctype name state.
+ *
+ * Skips whitespace.  On '>' or EOF a DOCTYPE token with an empty name is
+ * emitted.  Any other character becomes the first character of the
+ * doctype name (a-z goes through the stream's uppercase call first).
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ *
+ * \note The '>' and EOF paths used to emit context.current_doctype without
+ *       setting it in this state, so whatever name and correctness flag
+ *       it already held were emitted.  It is now reset first.
+ */
+bool hubbub_tokeniser_handle_before_doctype_name(
+		hubbub_tokeniser *tokeniser)
+{
+	hubbub_doctype *cdoc = &tokeniser->context.current_doctype;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == ' ') {
+		hubbub_inputstream_advance(tokeniser->input);
+	} else if (c == '>' || c == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token token;
+
+		/* No name was seen: emit an empty, incorrect doctype */
+		cdoc->name.data_off = 0;
+		cdoc->name.len = 0;
+		cdoc->correct = false;
+
+		/* Emit doctype */
+		token.type = HUBBUB_TOKEN_DOCTYPE;
+		token.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &token);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		/* EOF is left in the stream; only '>' is consumed */
+		if (c == '>')
+			hubbub_inputstream_advance(tokeniser->input);
+	} else {
+		uint32_t pos;
+		size_t len;
+
+		/* a-z goes through the stream's uppercase call first */
+		if ('a' <= c && c <= 'z')
+			hubbub_inputstream_uppercase(tokeniser->input);
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+
+		/* Start the name here; correctness is decided when the
+		 * doctype is emitted */
+		cdoc->name.data_off = pos;
+		cdoc->name.len = len;
+		cdoc->correct = false;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DOCTYPE_NAME;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	return true;
+}
+
+/**
+ * Handle the doctype name state.
+ *
+ * Extends the doctype name until whitespace, '>' or EOF.  On '>' the
+ * emitted token's correct flag is set according to whether the name
+ * compares equal to "HTML"; on EOF the token is emitted as it stands.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_doctype_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_doctype *doc = &tokeniser->context.current_doctype;
+	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+	size_t ch_len;
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (ch == '>' || ch == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token tok;
+
+		tok.type = HUBBUB_TOKEN_DOCTYPE;
+		tok.data.doctype = tokeniser->context.current_doctype;
+
+		/* Correctness is only evaluated for a properly closed
+		 * doctype */
+		if (ch == '>') {
+			tok.data.doctype.correct =
+				(hubbub_inputstream_compare_range_ascii(
+					tokeniser->input,
+					tok.data.doctype.name.data_off,
+					tok.data.doctype.name.len,
+					"HTML", SLEN("HTML")) == 0);
+		}
+
+		hubbub_tokeniser_emit_token(tokeniser, &tok);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		/* EOF is not consumed */
+		if (ch == '>')
+			hubbub_inputstream_advance(tokeniser->input);
+
+		return true;
+	}
+
+	switch (ch) {
+	case '\t':
+	case '\n':
+	case '\v':
+	case '\f':
+	case ' ':
+		tokeniser->state = HUBBUB_TOKENISER_STATE_AFTER_DOCTYPE_NAME;
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	default:
+		/* Part of the name; a-z goes through the stream's
+		 * uppercase call before being measured */
+		if ('a' <= ch && ch <= 'z')
+			hubbub_inputstream_uppercase(tokeniser->input);
+
+		/* Only the character's length is needed here */
+		(void) hubbub_inputstream_cur_pos(tokeniser->input, &ch_len);
+
+		doc->name.len += ch_len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	}
+
+	return true;
+}
+
+/**
+ * Handle the after doctype name state.
+ *
+ * Skips whitespace and emits the doctype on '>' or EOF.  Any other
+ * character marks the doctype as incorrect and switches to the bogus
+ * doctype state.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_after_doctype_name(hubbub_tokeniser *tokeniser)
+{
+	hubbub_doctype *doc = &tokeniser->context.current_doctype;
+	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (ch == '>' || ch == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token tok;
+
+		tok.type = HUBBUB_TOKEN_DOCTYPE;
+		tok.data.doctype = tokeniser->context.current_doctype;
+
+		/* Correctness is only evaluated for a properly closed
+		 * doctype */
+		if (ch == '>') {
+			tok.data.doctype.correct =
+				(hubbub_inputstream_compare_range_ascii(
+					tokeniser->input,
+					tok.data.doctype.name.data_off,
+					tok.data.doctype.name.len,
+					"HTML", SLEN("HTML")) == 0);
+		}
+
+		hubbub_tokeniser_emit_token(tokeniser, &tok);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		/* EOF is not consumed */
+		if (ch == '>')
+			hubbub_inputstream_advance(tokeniser->input);
+
+		return true;
+	}
+
+	switch (ch) {
+	case '\t':
+	case '\n':
+	case '\v':
+	case '\f':
+	case ' ':
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	default:
+		/* Unexpected content after the name */
+		doc->correct = false;
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_BOGUS_DOCTYPE;
+
+		hubbub_inputstream_advance(tokeniser->input);
+		break;
+	}
+
+	return true;
+}
+
+/**
+ * Handle the bogus doctype state.
+ *
+ * Discards input one character at a time until '>' or EOF, at which point
+ * the current doctype is emitted and the data state is entered.
+ *
+ * \param tokeniser  Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_bogus_doctype(hubbub_tokeniser *tokeniser)
+{
+	const uint32_t ch = hubbub_inputstream_peek(tokeniser->input);
+
+	if (ch == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (ch == '>' || ch == HUBBUB_INPUTSTREAM_EOF) {
+		hubbub_token tok;
+
+		/* Emit doctype */
+		tok.type = HUBBUB_TOKEN_DOCTYPE;
+		tok.data.doctype = tokeniser->context.current_doctype;
+
+		hubbub_tokeniser_emit_token(tokeniser, &tok);
+
+		tokeniser->state = HUBBUB_TOKENISER_STATE_DATA;
+
+		/* EOF has nothing to consume */
+		if (ch == HUBBUB_INPUTSTREAM_EOF)
+			return true;
+	}
+
+	/* Consume the '>' or skip the junk character */
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Begin consuming an entity at the current input position
+ *
+ * On first entry the entity-matching context is initialised from the
+ * current stream position and the character there is consumed. The next
+ * character then selects the numbered ('#') or named entity state.
+ *
+ * \param tokeniser Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_consume_entity(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	uint32_t next;
+	uint32_t offset;
+	size_t nbytes;
+
+	if (!ctx->match_entity.done_setup) {
+		offset = hubbub_inputstream_cur_pos(tokeniser->input,
+				&nbytes);
+
+		/* Matched range starts at (and so far only covers) the
+		 * current character */
+		ctx->match_entity.str.data_off = offset;
+		ctx->match_entity.str.len = nbytes;
+		ctx->match_entity.prev_len = nbytes;
+
+		/* Nothing parsed yet */
+		ctx->match_entity.base = 0;
+		ctx->match_entity.codepoint = 0;
+		ctx->match_entity.had_data = false;
+		ctx->match_entity.complete = false;
+		ctx->match_entity.context = NULL;
+
+		/* Remember where to go back to once matching is done */
+		ctx->match_entity.return_state = tokeniser->state;
+
+		ctx->match_entity.done_setup = true;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	next = hubbub_inputstream_peek(tokeniser->input);
+
+	if (next == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	if (next != '#') {
+		tokeniser->state = HUBBUB_TOKENISER_STATE_NAMED_ENTITY;
+		return true;
+	}
+
+	/* Include the '#' in the matched range and consume it */
+	offset = hubbub_inputstream_cur_pos(tokeniser->input, &nbytes);
+	ctx->match_entity.str.len += nbytes;
+
+	tokeniser->state = HUBBUB_TOKENISER_STATE_NUMBERED_ENTITY;
+	hubbub_inputstream_advance(tokeniser->input);
+
+	return true;
+}
+
+/**
+ * Handle a numbered (decimal or hexadecimal) entity
+ *
+ * Digits are accumulated into the match context's codepoint. Once a
+ * non-digit is seen, an optional trailing ';' is consumed, the stream is
+ * rewound over the matched text and, if at least one digit was seen, the
+ * matched range is replaced in the input stream by the resulting
+ * character. The tokeniser then returns to the state it was in when
+ * entity matching began.
+ *
+ * \param tokeniser Tokeniser instance
+ * \return false if the input stream is out of data (call again once more
+ *         is available), true otherwise
+ */
+bool hubbub_tokeniser_handle_numbered_entity(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	uint32_t c = hubbub_inputstream_peek(tokeniser->input);
+	uint32_t pos;
+	size_t len;
+	hubbub_error error;
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Base is only determined once, on first entry */
+	if (ctx->match_entity.base == 0) {
+		if ((c & ~0x20) == 'X') {
+			ctx->match_entity.base = 16;
+
+			pos = hubbub_inputstream_cur_pos(tokeniser->input,
+					&len);
+			ctx->match_entity.str.len += len;
+
+			hubbub_inputstream_advance(tokeniser->input);
+		} else {
+			ctx->match_entity.base = 10;
+		}
+	}
+
+	while ((c = hubbub_inputstream_peek(tokeniser->input)) !=
+			HUBBUB_INPUTSTREAM_EOF &&
+			c != HUBBUB_INPUTSTREAM_OOD) {
+		if (ctx->match_entity.base == 10 &&
+				('0' <= c && c <= '9')) {
+			ctx->match_entity.had_data = true;
+
+			ctx->match_entity.codepoint =
+				ctx->match_entity.codepoint * 10 + (c - '0');
+		} else if (ctx->match_entity.base == 16 &&
+				(('0' <= c && c <= '9') ||
+				('A' <= (c & ~0x20) &&
+						(c & ~0x20) <= 'F'))) {
+			ctx->match_entity.had_data = true;
+
+			ctx->match_entity.codepoint *= 16;
+
+			if ('0' <= c && c <= '9') {
+				ctx->match_entity.codepoint += (c - '0');
+			} else {
+				ctx->match_entity.codepoint +=
+						((c & ~0x20) - 'A' + 10);
+			}
+		} else {
+			break;
+		}
+
+		/* Saturate just above the largest valid code point so that
+		 * a long run of digits cannot wrap the 32-bit accumulator
+		 * round to a value that looks valid. 0x110000 * 16 + 15
+		 * still fits in 32 bits, so the next digit cannot overflow
+		 * before it is clamped again. */
+		if (ctx->match_entity.codepoint > 0x10FFFF)
+			ctx->match_entity.codepoint = 0x110000;
+
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+		ctx->match_entity.str.len += len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	if (c == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Eat trailing semicolon, if any */
+	if (c == ';') {
+		pos = hubbub_inputstream_cur_pos(tokeniser->input, &len);
+		ctx->match_entity.str.len += len;
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	/* Rewind the inputstream to start of matched sequence */
+	hubbub_inputstream_rewind(tokeniser->input,
+			ctx->match_entity.str.len);
+
+	if (ctx->match_entity.had_data) {
+		/* Had data, so calculate final codepoint */
+		if (0x80 <= ctx->match_entity.codepoint &&
+				ctx->match_entity.codepoint <= 0x9F) {
+			/* 0x80-0x9F are looked up in cp1252Table */
+			ctx->match_entity.codepoint =
+				cp1252Table[ctx->match_entity.codepoint -
+						0x80];
+		} else if (ctx->match_entity.codepoint == 0 ||
+				ctx->match_entity.codepoint > 0x10FFFF) {
+			/* Zero and out-of-range (including saturated)
+			 * values become U+FFFD */
+			ctx->match_entity.codepoint = 0xFFFD;
+		}
+
+		/* And replace the matched range with it */
+		error = hubbub_inputstream_replace_range(tokeniser->input,
+				ctx->match_entity.str.data_off,
+				ctx->match_entity.str.len,
+				ctx->match_entity.codepoint);
+		if (error != HUBBUB_OK) {
+			/** \todo handle memory exhaustion */
+		}
+	}
+
+	/* Reset for next time */
+	ctx->match_entity.done_setup = false;
+
+	/* Flag completion */
+	ctx->match_entity.complete = true;
+
+	/* And back to the state we were entered in */
+	tokeniser->state = ctx->match_entity.return_state;
+
+	return true;
+}
+
+/**
+ * Handle a named entity
+ *
+ * Characters are fed one at a time to the entity search. The longest
+ * match seen is remembered; when no further match is possible the stream
+ * is rewound and, if a match was found, the matched text is replaced by
+ * its code point.
+ *
+ * \param tokeniser Tokeniser instance
+ * \return false if the input stream is out of data, true otherwise
+ */
+bool hubbub_tokeniser_handle_named_entity(hubbub_tokeniser *tokeniser)
+{
+	hubbub_tokeniser_context *ctx = &tokeniser->context;
+	uint32_t next;
+	uint32_t offset;
+	size_t nbytes;
+	hubbub_error error;
+
+	for (;;) {
+		uint32_t found;
+
+		next = hubbub_inputstream_peek(tokeniser->input);
+
+		if (next == HUBBUB_INPUTSTREAM_EOF ||
+				next == HUBBUB_INPUTSTREAM_OOD)
+			break;
+
+		/* Entity names are ASCII only */
+		if (next > 0x7F)
+			break;
+
+		error = hubbub_entities_search_step((uint8_t) next,
+				&found,
+				&ctx->match_entity.context);
+
+		/* No further matches - use last found */
+		if (error == HUBBUB_INVALID)
+			break;
+
+		/* Character is part of a (potential) name: account for it */
+		offset = hubbub_inputstream_cur_pos(tokeniser->input,
+				&nbytes);
+		ctx->match_entity.str.len += nbytes;
+
+		if (error == HUBBUB_OK) {
+			/* Had a match - store it, and the length of text
+			 * it covers, for the replacement below */
+			ctx->match_entity.codepoint = found;
+			ctx->match_entity.prev_len =
+					ctx->match_entity.str.len;
+		}
+
+		hubbub_inputstream_advance(tokeniser->input);
+	}
+
+	if (next == HUBBUB_INPUTSTREAM_OOD)
+		return false;
+
+	/* Eat trailing semicolon, if any. Only applies when the match
+	 * covers everything consumed so far. */
+	if (ctx->match_entity.codepoint != 0 && next == ';' &&
+			ctx->match_entity.prev_len ==
+			ctx->match_entity.str.len) {
+		offset = hubbub_inputstream_cur_pos(tokeniser->input,
+				&nbytes);
+		ctx->match_entity.prev_len += nbytes;
+	}
+
+	/* Rewind the inputstream to start of processed sequence */
+	hubbub_inputstream_rewind(tokeniser->input,
+			ctx->match_entity.str.len);
+
+	/* Now, replace range, if we found a named entity */
+	if (ctx->match_entity.codepoint != 0) {
+		error = hubbub_inputstream_replace_range(tokeniser->input,
+				ctx->match_entity.str.data_off,
+				ctx->match_entity.prev_len,
+				ctx->match_entity.codepoint);
+		if (error != HUBBUB_OK) {
+			/** \todo handle memory exhaustion */
+		}
+	}
+
+	/* Reset for next time */
+	ctx->match_entity.done_setup = false;
+
+	/* Flag completion */
+	ctx->match_entity.complete = true;
+
+	/* And back to the state from whence we came */
+	tokeniser->state = ctx->match_entity.return_state;
+
+	return true;
+}
+
+/**
+ * Input stream callback: the stream's buffer has moved
+ *
+ * Records the new buffer location and length in the tokeniser, then
+ * forwards the notification to the client's buffer handler, if one is
+ * registered.
+ *
+ * \param buffer Pointer to buffer
+ * \param len Length of data in buffer (bytes)
+ * \param pw Pointer to our context (the tokeniser instance)
+ */
+void hubbub_tokeniser_buffer_moved_handler(const uint8_t *buffer,
+		size_t len, void *pw)
+{
+	hubbub_tokeniser *tokeniser = (hubbub_tokeniser *) pw;
+
+	tokeniser->input_buffer_len = len;
+	tokeniser->input_buffer = buffer;
+
+	/* Nobody else to tell */
+	if (tokeniser->buffer_handler == NULL)
+		return;
+
+	tokeniser->buffer_handler(buffer, len, tokeniser->buffer_pw);
+}
+
+/**
+ * Emit a token, performing sanity checks if necessary
+ *
+ * For start and end tag tokens, attributes whose names compare equal to
+ * that of an earlier attribute are discarded (the first occurrence is
+ * kept and the relative order of the survivors is preserved). The
+ * token's attribute count is updated to match before it is passed to the
+ * registered token handler.
+ *
+ * \param tokeniser Tokeniser instance
+ * \param token Token to emit
+ */
+void hubbub_tokeniser_emit_token(hubbub_tokeniser *tokeniser,
+		hubbub_token *token)
+{
+	if (tokeniser == NULL || token == NULL)
+		return;
+
+	/* Nothing to do if there's no registered handler */
+	if (tokeniser->token_handler == NULL)
+		return;
+
+	if (token->type == HUBBUB_TOKEN_START_TAG ||
+			token->type == HUBBUB_TOKEN_END_TAG) {
+		uint32_t i, j;
+		uint32_t n_attributes = token->data.tag.n_attributes;
+		hubbub_attribute *attrs =
+				token->data.tag.attributes;
+
+		/* Discard duplicate attributes. For each attribute, only
+		 * later entries need examining: earlier ones have already
+		 * been checked against it. */
+		for (i = 0; i < n_attributes; i++) {
+			j = i + 1;
+
+			while (j < n_attributes) {
+				uint32_t tail;
+
+				if (attrs[i].name.len !=
+						attrs[j].name.len ||
+					hubbub_inputstream_compare_range_cs(
+						tokeniser->input,
+						attrs[i].name.data_off,
+						attrs[j].name.data_off,
+						attrs[i].name.len) != 0) {
+					/* Attributes don't match */
+					j++;
+					continue;
+				}
+
+				/* Close the gap left by the duplicate */
+				tail = n_attributes - 1 - j;
+
+				if (tail > 0) {
+					memmove(&attrs[j], &attrs[j + 1],
+						tail *
+						sizeof(hubbub_attribute));
+				}
+
+				/* And reduce the number of attributes */
+				n_attributes--;
+
+				/* j is deliberately not advanced: the slot
+				 * now holds the next candidate, which must
+				 * be examined too */
+			}
+		}
+
+		token->data.tag.n_attributes = n_attributes;
+	}
+
+	/* Finally, emit token */
+	tokeniser->token_handler(token, tokeniser->token_pw);
+}
diff --git a/src/tokeniser/tokeniser.h b/src/tokeniser/tokeniser.h
new file mode 100644
index 0000000..20bbe20
--- /dev/null
+++ b/src/tokeniser/tokeniser.h
@@ -0,0 +1,71 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_tokeniser_tokeniser_h_
+#define hubbub_tokeniser_tokeniser_h_
+
+#include <stdbool.h>
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/functypes.h>
+#include <hubbub/types.h>
+
+#include "input/inputstream.h"
+
+typedef struct hubbub_tokeniser hubbub_tokeniser;
+
+/**
+ * Hubbub tokeniser option types
+ *
+ * Passed as the type argument of hubbub_tokeniser_setopt() to identify
+ * the option being configured. Each value presumably selects the
+ * similarly named member of hubbub_tokeniser_optparams -- confirm
+ * against the setopt implementation.
+ */
+typedef enum hubbub_tokeniser_opttype {
+ HUBBUB_TOKENISER_TOKEN_HANDLER,
+ HUBBUB_TOKENISER_BUFFER_HANDLER,
+ HUBBUB_TOKENISER_ERROR_HANDLER,
+ HUBBUB_TOKENISER_CONTENT_MODEL,
+} hubbub_tokeniser_opttype;
+
+/**
+ * Hubbub tokeniser option parameters
+ *
+ * Passed by pointer as the params argument of hubbub_tokeniser_setopt().
+ * This is a union, so only one member is meaningful for any given call.
+ * The handler members each pair a callback with an opaque pw pointer.
+ */
+typedef union hubbub_tokeniser_optparams {
+ struct {
+ hubbub_token_handler handler;
+ void *pw;
+ } token_handler;
+
+ struct {
+ hubbub_buffer_handler handler;
+ void *pw;
+ } buffer_handler;
+
+ struct {
+ hubbub_error_handler handler;
+ void *pw;
+ } error_handler;
+
+ struct {
+ hubbub_content_model model;
+ } content_model;
+} hubbub_tokeniser_optparams;
+
+/* Create a hubbub tokeniser */
+hubbub_tokeniser *hubbub_tokeniser_create(hubbub_inputstream *input,
+ hubbub_alloc alloc, void *pw);
+/* Destroy a hubbub tokeniser */
+void hubbub_tokeniser_destroy(hubbub_tokeniser *tokeniser);
+
+/* Configure a hubbub tokeniser */
+hubbub_error hubbub_tokeniser_setopt(hubbub_tokeniser *tokeniser,
+ hubbub_tokeniser_opttype type,
+ hubbub_tokeniser_optparams *params);
+
+/* Process remaining data in the input stream */
+hubbub_error hubbub_tokeniser_run(hubbub_tokeniser *tokeniser);
+
+#endif
+
diff --git a/src/utils/Makefile b/src/utils/Makefile
new file mode 100644
index 0000000..59b5512
--- /dev/null
+++ b/src/utils/Makefile
@@ -0,0 +1,53 @@
+# Makefile for libhubbub
+#
+# Toolchain is exported by top-level makefile
+#
+# Top-level makefile also exports the following variables:
+#
+# COMPONENT Name of component
+# EXPORT Absolute path of export directory
+# TOP Absolute path of source tree root
+#
+# The top-level makefile requires the following targets to exist:
+#
+# clean Clean source tree
+# debug Create a debug binary
+# distclean Fully clean source tree, back to pristine condition
+# export Export distributable components to ${EXPORT}
+# release Create a release binary
+# setup Perform any setup required prior to compilation
+# test Execute any test cases
+
+# Let sources in this directory find their own headers
+CFLAGS += -I$(CURDIR)
+
+# Objects (base names, without directory or extension)
+OBJS = dict errors utf8
+
+# Object files for each build flavour
+RELEASE_OBJS = $(OBJS:%=../Release/%.o)
+DEBUG_OBJS = $(OBJS:%=../Debug/%.o)
+
+.PHONY: clean debug distclean export release setup test
+
+# Targets
+release: $(RELEASE_OBJS)
+
+debug: $(DEBUG_OBJS)
+
+# Failures are ignored: the objects may never have been built
+clean:
+	-@${RM} ${RMFLAGS} $(RELEASE_OBJS)
+	-@${RM} ${RMFLAGS} $(DEBUG_OBJS)
+
+# Nothing to do for the remaining required targets
+distclean:
+
+setup:
+
+export:
+
+test:
+
+# Pattern rules
+../Release/%.o: %.c
+	@${ECHO} ${ECHOFLAGS} "==> $<"
+	@${CC} -c ${CFLAGS} -DNDEBUG -o $@ $<
+
+../Debug/%.o: %.c
+	@${ECHO} ${ECHOFLAGS} "==> $<"
+	@${CC} -c -g ${CFLAGS} -o $@ $<
diff --git a/src/utils/dict.c b/src/utils/dict.c
new file mode 100644
index 0000000..f50ffab
--- /dev/null
+++ b/src/utils/dict.c
@@ -0,0 +1,219 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdbool.h>
+
+#include "utils/dict.h"
+
+/**
+ * Node in a dictionary tree
+ *
+ * Keys are stored one byte per node: lt and gt lead to alternative bytes
+ * at the same position in the key, while eq leads on to the next byte.
+ * A node whose split is '\0' marks the end of a key.
+ */
+typedef struct hubbub_dict_node {
+ uint8_t split; /**< Data to split on */
+ struct hubbub_dict_node *lt; /**< Subtree for data less than
+ * split */
+ struct hubbub_dict_node *eq; /**< Subtree for data equal to split
+ * If split == '\0', this stores the
+ * pointer to the actual data, not a
+ * subtree */
+ struct hubbub_dict_node *gt; /**< Subtree for data greater than
+ * split */
+} hubbub_dict_node;
+
+/**
+ * Dictionary object
+ *
+ * The dictionary itself and all of its nodes are allocated and freed
+ * through alloc, which is always passed pw.
+ */
+struct hubbub_dict {
+ hubbub_dict_node *dict; /**< Root of tree */
+
+ hubbub_alloc alloc; /**< Memory (de)allocation function */
+ void *pw; /**< Pointer to client data */
+};
+
+static void hubbub_dict_destroy_internal(hubbub_dict *dict,
+ hubbub_dict_node *root);
+static hubbub_dict_node *hubbub_dict_insert_internal(hubbub_dict *dict,
+ hubbub_dict_node *parent, const char *key,
+ const void *value);
+
+
+/**
+ * Create an empty dictionary
+ *
+ * \param alloc Memory (de)allocation function (must not be NULL)
+ * \param pw Pointer to client-specific private data (may be NULL)
+ * \return Pointer to dictionary instance, or NULL on error
+ */
+hubbub_dict *hubbub_dict_create(hubbub_alloc alloc, void *pw)
+{
+	hubbub_dict *created = NULL;
+
+	if (alloc != NULL)
+		created = alloc(NULL, sizeof(hubbub_dict), pw);
+
+	if (created != NULL) {
+		/* No entries yet; remember how to allocate them */
+		created->dict = NULL;
+		created->alloc = alloc;
+		created->pw = pw;
+	}
+
+	return created;
+}
+
+/**
+ * Destroy a dictionary, releasing every node and the dictionary itself
+ *
+ * \param dict Dictionary to destroy (NULL is ignored)
+ */
+void hubbub_dict_destroy(hubbub_dict *dict)
+{
+	if (dict != NULL) {
+		/* Tear down the tree before the object that owns it */
+		hubbub_dict_destroy_internal(dict, dict->dict);
+
+		dict->alloc(dict, 0, dict->pw);
+	}
+}
+
+/**
+ * Recursively free a dictionary (sub)tree
+ *
+ * \param dict Dictionary being destroyed
+ * \param root Root node of dictionary (sub)tree to destroy (may be NULL)
+ */
+void hubbub_dict_destroy_internal(hubbub_dict *dict, hubbub_dict_node *root)
+{
+	/* A terminator node's eq holds client data rather than a subtree */
+	bool eq_is_subtree;
+
+	if (root == NULL)
+		return;
+
+	eq_is_subtree = (root->split != '\0');
+
+	hubbub_dict_destroy_internal(dict, root->lt);
+
+	if (eq_is_subtree)
+		hubbub_dict_destroy_internal(dict, root->eq);
+
+	hubbub_dict_destroy_internal(dict, root->gt);
+
+	/* Children are gone; now the node itself */
+	dict->alloc(root, 0, dict->pw);
+}
+
+/**
+ * Insert a key-value pair into a dictionary
+ *
+ * If the key is already present, its value is replaced. If memory is
+ * exhausted part-way through, the dictionary is left exactly as it was
+ * before the call.
+ *
+ * \param dict Dictionary to insert into
+ * \param key Key string
+ * \param value Value to associate with key (may be NULL)
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if dict or key is NULL,
+ *         HUBBUB_NOMEM on memory exhaustion
+ */
+hubbub_error hubbub_dict_insert(hubbub_dict *dict, const char *key,
+		const void *value)
+{
+	hubbub_dict_node *root;
+
+	if (dict == NULL || key == NULL)
+		return HUBBUB_BADPARM;
+
+	root = hubbub_dict_insert_internal(dict, dict->dict, key, value);
+	if (root == NULL)
+		return HUBBUB_NOMEM;
+
+	dict->dict = root;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Helper routine for insertion into dictionary
+ *
+ * \param dict Dictionary being inserted into
+ * \param parent Parent node of subtree to insert into (NULL to create one)
+ * \param key Key string
+ * \param value Value to associate with key
+ * \return Pointer to root of tree created, or NULL on memory exhaustion
+ *
+ * On failure, every node allocated by this call (and the recursive calls
+ * it made) has been freed and no existing node has been modified.
+ */
+hubbub_dict_node *hubbub_dict_insert_internal(hubbub_dict *dict,
+		hubbub_dict_node *parent, const char *key, const void *value)
+{
+	bool created = false;
+	hubbub_dict_node **branch;
+	hubbub_dict_node *child;
+
+	if (parent == NULL) {
+		parent = dict->alloc(NULL,
+				sizeof(hubbub_dict_node), dict->pw);
+		if (parent == NULL)
+			return NULL;
+		parent->split = (uint8_t) key[0];
+		parent->lt = parent->eq = parent->gt = NULL;
+		created = true;
+	}
+
+	if ((uint8_t) key[0] < parent->split) {
+		branch = &parent->lt;
+	} else if ((uint8_t) key[0] == parent->split) {
+		if (key[0] == '\0') {
+			/* End of key: eq stores the value itself */
+			parent->eq = (hubbub_dict_node *) value;
+			return parent;
+		}
+
+		/* This byte is matched; carry on with the next one */
+		branch = &parent->eq;
+		key++;
+	} else {
+		branch = &parent->gt;
+	}
+
+	child = hubbub_dict_insert_internal(dict, *branch, key, value);
+	if (child == NULL) {
+		/* Only undo what this call did: a pre-existing node stays
+		 * linked into the tree, untouched */
+		if (created)
+			dict->alloc(parent, 0, dict->pw);
+		return NULL;
+	}
+
+	*branch = child;
+
+	return parent;
+}
+
+/**
+ * Step-wise search for a key in a dictionary
+ *
+ * \param dict Dictionary to search
+ * \param c Character to look for
+ * \param result Pointer to location for result
+ * \param context Pointer to location for search context
+ * \return HUBBUB_OK if key found,
+ *         HUBBUB_NEEDDATA if more steps are required
+ *         HUBBUB_INVALID if nothing matches
+ *         HUBBUB_BADPARM if dict, result or context is NULL
+ *
+ * The value pointed to by ::context must be NULL for the first call.
+ * Thereafter, pass in the same value as returned by the previous call.
+ * The context is opaque to the caller and should not be inspected.
+ *
+ * The location pointed to by ::result will be set to NULL unless a match
+ * is found.
+ *
+ * A match is reported whenever the characters supplied so far form a
+ * complete key, regardless of the order in which keys were inserted.
+ * The search may still be continued afterwards to look for longer keys.
+ */
+hubbub_error hubbub_dict_search_step(hubbub_dict *dict, uint8_t c,
+		const void **result, void **context)
+{
+	bool match = false;
+	hubbub_dict_node *p;
+
+	if (dict == NULL || result == NULL || context == NULL)
+		return HUBBUB_BADPARM;
+
+	*result = NULL;
+
+	if (*context == NULL) {
+		p = dict->dict;
+	} else {
+		p = (hubbub_dict_node *) *context;
+	}
+
+	while (p != NULL) {
+		if (c < p->split) {
+			p = p->lt;
+		} else if (c == p->split) {
+			if (p->split == '\0') {
+				match = true;
+				p = NULL;
+			} else {
+				hubbub_dict_node *term = p->eq;
+
+				/* Look for the end-of-key node among the
+				 * candidates for the next byte. '\0' is the
+				 * smallest split value, so insertion can
+				 * only ever have placed it along lt links;
+				 * it is the subtree root only if the shorter
+				 * key happened to be inserted first. */
+				while (term != NULL && term->split != '\0')
+					term = term->lt;
+
+				if (term != NULL) {
+					match = true;
+					*result = (const void *) term->eq;
+				}
+
+				/* Continue from the subtree root next time */
+				p = p->eq;
+			}
+
+			break;
+		} else {
+			p = p->gt;
+		}
+	}
+
+	*context = (void *) p;
+
+	return (match) ? HUBBUB_OK :
+			(p == NULL) ? HUBBUB_INVALID : HUBBUB_NEEDDATA;
+}
diff --git a/src/utils/dict.h b/src/utils/dict.h
new file mode 100644
index 0000000..2cde01d
--- /dev/null
+++ b/src/utils/dict.h
@@ -0,0 +1,31 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_utils_dict_h_
+#define hubbub_utils_dict_h_
+
+#include <inttypes.h>
+
+#include <hubbub/errors.h>
+#include <hubbub/hubbub.h>
+
+typedef struct hubbub_dict hubbub_dict;
+
+/* Create a dictionary */
+hubbub_dict *hubbub_dict_create(hubbub_alloc alloc, void *pw);
+/* Destroy a dictionary */
+void hubbub_dict_destroy(hubbub_dict *dict);
+
+/* Insert a key-value pair into a dictionary */
+hubbub_error hubbub_dict_insert(hubbub_dict *dict, const char *key,
+ const void *value);
+
+/* Step-wise search for a key in a dictionary */
+hubbub_error hubbub_dict_search_step(hubbub_dict *dict, uint8_t c,
+ const void **result, void **context);
+
+#endif
diff --git a/src/utils/errors.c b/src/utils/errors.c
new file mode 100644
index 0000000..e57ba6a
--- /dev/null
+++ b/src/utils/errors.c
@@ -0,0 +1,70 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <string.h>
+
+#include <hubbub/errors.h>
+
+/**
+ * Obtain a human-readable description of a hubbub error code
+ *
+ * \param error The error code to convert
+ * \return Pointer to string representation of error, or NULL if unknown.
+ */
+const char *hubbub_error_to_string(hubbub_error error)
+{
+	switch (error) {
+	case HUBBUB_OK:
+		return "No error";
+	case HUBBUB_NOMEM:
+		return "Insufficient memory";
+	case HUBBUB_BADPARM:
+		return "Bad parameter";
+	case HUBBUB_INVALID:
+		return "Invalid input";
+	case HUBBUB_FILENOTFOUND:
+		return "File not found";
+	case HUBBUB_NEEDDATA:
+		return "Insufficient data";
+	}
+
+	/* Not a code we know about */
+	return NULL;
+}
+
+/**
+ * Determine whether a counted string is exactly a given name
+ *
+ * \param str String to test (not necessarily NUL terminated)
+ * \param len Length of str (bytes)
+ * \param name NUL-terminated name to compare against
+ * \return Non-zero if str is exactly name, zero otherwise
+ */
+static int hubbub_error_name_equals(const char *str, size_t len,
+		const char *name)
+{
+	/* Comparing lengths first stops a mere prefix of name matching */
+	return strlen(name) == len && strncmp(str, name, len) == 0;
+}
+
+/**
+ * Convert a string representation of an error name to a hubbub error code
+ *
+ * The whole of the string must match an error name; a prefix of a name
+ * is treated as unknown.
+ *
+ * \param str String containing error name
+ * \param len Length of string (bytes)
+ * \return Hubbub error code, or HUBBUB_OK if unknown
+ */
+hubbub_error hubbub_error_from_string(const char *str, size_t len)
+{
+	/* No string is never a known name */
+	if (str == NULL)
+		return HUBBUB_OK;
+
+	if (hubbub_error_name_equals(str, len, "HUBBUB_OK")) {
+		return HUBBUB_OK;
+	} else if (hubbub_error_name_equals(str, len, "HUBBUB_NOMEM")) {
+		return HUBBUB_NOMEM;
+	} else if (hubbub_error_name_equals(str, len, "HUBBUB_BADPARM")) {
+		return HUBBUB_BADPARM;
+	} else if (hubbub_error_name_equals(str, len, "HUBBUB_INVALID")) {
+		return HUBBUB_INVALID;
+	} else if (hubbub_error_name_equals(str, len,
+			"HUBBUB_FILENOTFOUND")) {
+		return HUBBUB_FILENOTFOUND;
+	} else if (hubbub_error_name_equals(str, len, "HUBBUB_NEEDDATA")) {
+		return HUBBUB_NEEDDATA;
+	}
+
+	return HUBBUB_OK;
+}
diff --git a/src/utils/utf8.c b/src/utils/utf8.c
new file mode 100644
index 0000000..062d629
--- /dev/null
+++ b/src/utils/utf8.c
@@ -0,0 +1,368 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-8 manipulation functions (implementation).
+ */
+
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "utils/utf8.h"
+
+/**
+ * Number of continuation bytes for a given start byte
+ *
+ * Indexed by byte value. 0x00-0xBF map to 0 (this covers ASCII and also
+ * the continuation byte values 0x80-0xBF), 0xC0-0xDF to 1, 0xE0-0xEF to
+ * 2, 0xF0-0xF7 to 3, 0xF8-0xFB to 4 and 0xFC-0xFF to 5.
+ */
+static const uint8_t numContinuations[256] = {
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
+};
+
+/**
+ * Convert a UTF-8 multibyte sequence into a single UCS4 character
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * Only the form of the sequence is checked (a start byte followed by the
+ * right number of continuation bytes); overlong encodings are not
+ * rejected.
+ *
+ * \param s The sequence to process
+ * \param len Length of sequence
+ * \param ucs4 Pointer to location to receive UCS4 character (host endian)
+ * \param clen Pointer to location to receive byte length of UTF-8 sequence
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if any pointer is NULL,
+ *         HUBBUB_NEEDDATA if the sequence is longer than ::len,
+ *         HUBBUB_INVALID if the sequence is malformed
+ */
+inline hubbub_error hubbub_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen)
+{
+	if (s == NULL || ucs4 == NULL || clen == NULL)
+		return HUBBUB_BADPARM;
+
+	if (len == 0)
+		return HUBBUB_NEEDDATA;
+
+	if (*s < 0x80) {
+		*ucs4 = *s;
+		*clen = 1;
+	} else if ((*s & 0xE0) == 0xC0) {
+		/* 110xxxxx 10xxxxxx */
+		if (len < 2)
+			return HUBBUB_NEEDDATA;
+		else if ((*(s+1) & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			*ucs4 = ((uint32_t) (*s & 0x1F) << 6) |
+					(uint32_t) (*(s+1) & 0x3F);
+			*clen = 2;
+		}
+	} else if ((*s & 0xF0) == 0xE0) {
+		/* 1110xxxx + 2 continuations */
+		if (len < 3)
+			return HUBBUB_NEEDDATA;
+		else if ((*(s+1) & 0xC0) != 0x80 ||
+				(*(s+2) & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			*ucs4 = ((uint32_t) (*s & 0x0F) << 12) |
+					((uint32_t) (*(s+1) & 0x3F) << 6) |
+					(uint32_t) (*(s+2) & 0x3F);
+			*clen = 3;
+		}
+	} else if ((*s & 0xF8) == 0xF0) {
+		/* 11110xxx + 3 continuations */
+		if (len < 4)
+			return HUBBUB_NEEDDATA;
+		else if ((*(s+1) & 0xC0) != 0x80 ||
+				(*(s+2) & 0xC0) != 0x80 ||
+				(*(s+3) & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			*ucs4 = ((uint32_t) (*s & 0x07) << 18) |
+					((uint32_t) (*(s+1) & 0x3F) << 12) |
+					((uint32_t) (*(s+2) & 0x3F) << 6) |
+					(uint32_t) (*(s+3) & 0x3F);
+			*clen = 4;
+		}
+	} else if ((*s & 0xFC) == 0xF8) {
+		/* 111110xx + 4 continuations: only the low two bits of
+		 * the start byte are payload */
+		if (len < 5)
+			return HUBBUB_NEEDDATA;
+		else if ((*(s+1) & 0xC0) != 0x80 ||
+				(*(s+2) & 0xC0) != 0x80 ||
+				(*(s+3) & 0xC0) != 0x80 ||
+				(*(s+4) & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			*ucs4 = ((uint32_t) (*s & 0x03) << 24) |
+					((uint32_t) (*(s+1) & 0x3F) << 18) |
+					((uint32_t) (*(s+2) & 0x3F) << 12) |
+					((uint32_t) (*(s+3) & 0x3F) << 6) |
+					(uint32_t) (*(s+4) & 0x3F);
+			*clen = 5;
+		}
+	} else if ((*s & 0xFE) == 0xFC) {
+		/* 1111110x + 5 continuations: one payload bit in the start
+		 * byte, which sits above 5 * 6 = 30 continuation bits */
+		if (len < 6)
+			return HUBBUB_NEEDDATA;
+		else if ((*(s+1) & 0xC0) != 0x80 ||
+				(*(s+2) & 0xC0) != 0x80 ||
+				(*(s+3) & 0xC0) != 0x80 ||
+				(*(s+4) & 0xC0) != 0x80 ||
+				(*(s+5) & 0xC0) != 0x80)
+			return HUBBUB_INVALID;
+		else {
+			*ucs4 = ((uint32_t) (*s & 0x01) << 30) |
+					((uint32_t) (*(s+1) & 0x3F) << 24) |
+					((uint32_t) (*(s+2) & 0x3F) << 18) |
+					((uint32_t) (*(s+3) & 0x3F) << 12) |
+					((uint32_t) (*(s+4) & 0x3F) << 6) |
+					(uint32_t) (*(s+5) & 0x3F);
+			*clen = 6;
+		}
+	} else {
+		/* Stray continuation byte, or 0xFE / 0xFF */
+		return HUBBUB_INVALID;
+	}
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Encode a single UCS4 character as a UTF-8 multibyte sequence
+ *
+ * Encoding of UCS values outside the UTF-16 plane has been removed from
+ * RFC3629. This function conforms to RFC2279, however.
+ *
+ * \param ucs4 The character to process (0 <= c <= 0x7FFFFFFF) (host endian)
+ * \param s Pointer to 6 byte long output buffer
+ * \param len Pointer to location to receive length of multibyte sequence
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+inline hubbub_error hubbub_utf8_from_ucs4(uint32_t ucs4, uint8_t *s,
+		size_t *len)
+{
+	uint32_t nbytes;	/* Total length of the sequence */
+	uint32_t lead;		/* Marker bits of the start byte */
+	uint32_t payload;	/* Mask of data bits in the start byte */
+	uint32_t i;
+
+	if (s == NULL || len == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Select the shortest form able to represent the character */
+	if (ucs4 < 0x80) {
+		nbytes = 1; lead = 0x00; payload = 0x7F;
+	} else if (ucs4 < 0x800) {
+		nbytes = 2; lead = 0xC0; payload = 0x1F;
+	} else if (ucs4 < 0x10000) {
+		nbytes = 3; lead = 0xE0; payload = 0x0F;
+	} else if (ucs4 < 0x200000) {
+		nbytes = 4; lead = 0xF0; payload = 0x07;
+	} else if (ucs4 < 0x4000000) {
+		nbytes = 5; lead = 0xF8; payload = 0x03;
+	} else if (ucs4 <= 0x7FFFFFFF) {
+		nbytes = 6; lead = 0xFC; payload = 0x01;
+	} else {
+		return HUBBUB_INVALID;
+	}
+
+	/* Continuation bytes carry six bits each, least significant last */
+	for (i = nbytes - 1; i > 0; i--) {
+		s[i] = (uint8_t) (0x80 | (ucs4 & 0x3F));
+		ucs4 >>= 6;
+	}
+
+	/* Whatever remains belongs in the start byte */
+	s[0] = (uint8_t) (lead | (ucs4 & payload));
+
+	*len = nbytes;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Calculate the length (in characters) of a bounded UTF-8 string
+ *
+ * Only start bytes are examined; continuation bytes are stepped over
+ * unchecked. A final character whose sequence extends beyond ::max is
+ * still counted.
+ *
+ * \param s The string
+ * \param max Maximum length (bytes)
+ * \param len Pointer to location to receive length of string
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM if s or len is NULL,
+ *         HUBBUB_INVALID if a byte that cannot start a character is
+ *         found where a start byte is expected (::len is not written)
+ */
+inline hubbub_error hubbub_utf8_length(const uint8_t *s, size_t max,
+		size_t *len)
+{
+	size_t remaining = max;
+	size_t count = 0;
+
+	if (s == NULL || len == NULL)
+		return HUBBUB_BADPARM;
+
+	while (remaining > 0) {
+		size_t step;
+
+		if ((*s & 0x80) == 0x00)
+			step = 1;
+		else if ((*s & 0xE0) == 0xC0)
+			step = 2;
+		else if ((*s & 0xF0) == 0xE0)
+			step = 3;
+		else if ((*s & 0xF8) == 0xF0)
+			step = 4;
+		else if ((*s & 0xFC) == 0xF8)
+			step = 5;
+		else if ((*s & 0xFE) == 0xFC)
+			step = 6;
+		else
+			return HUBBUB_INVALID;
+
+		count++;
+
+		/* Stop here rather than stepping the pointer beyond the
+		 * end of the buffer */
+		if (step >= remaining)
+			break;
+
+		s += step;
+		remaining -= step;
+	}
+
+	*len = count;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Calculate the length (in bytes) of a UTF-8 character
+ *
+ * The length is derived from the first byte alone.
+ *
+ * \param s Pointer to start of character
+ * \param len Pointer to location to receive length
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+inline hubbub_error hubbub_utf8_char_byte_length(const uint8_t *s,
+		size_t *len)
+{
+	if (s != NULL && len != NULL) {
+		/* One start byte, plus however many follow it */
+		*len = 1 + numContinuations[*s];
+
+		return HUBBUB_OK;
+	}
+
+	return HUBBUB_BADPARM;
+}
+
+/**
+ * Find previous legal UTF-8 char in string
+ *
+ * Steps backwards from ::off until a byte that is not a continuation
+ * byte is found, or the start of the string is reached.
+ *
+ * \param s The string
+ * \param off Offset in the string to start at
+ * \param prevoff Pointer to location to receive offset of first byte of
+ *                previous legal character
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+inline hubbub_error hubbub_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff)
+{
+	uint32_t cursor = off;
+
+	if (s == NULL || prevoff == NULL)
+		return HUBBUB_BADPARM;
+
+	while (cursor > 0) {
+		cursor--;
+
+		/* Anything but 10xxxxxx begins a character */
+		if ((s[cursor] & 0xC0) != 0x80)
+			break;
+	}
+
+	*prevoff = cursor;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * \param s The string (assumed valid)
+ * \param len Maximum offset in string
+ * \param off Offset in the string to start at
+ * \param nextoff Pointer to location to receive offset of first byte of
+ *                next legal character
+ * \return HUBBUB_OK on success, appropriate error otherwise
+ */
+inline hubbub_error hubbub_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	uint32_t cursor = off;
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Step over the current start byte, unless we were handed an
+	 * offset in the middle of a sequence */
+	if ((s[cursor] & 0xC0) != 0x80)
+		cursor++;
+
+	/* Then over any continuation bytes */
+	for (; cursor < len; cursor++) {
+		if ((s[cursor] & 0xC0) != 0x80)
+			break;
+	}
+
+	*nextoff = cursor;
+
+	return HUBBUB_OK;
+}
+
+/**
+ * Find next legal UTF-8 char in string
+ *
+ * Unlike hubbub_utf8_next(), a candidate start byte is only accepted if
+ * it is followed by the number of continuation bytes it calls for;
+ * otherwise it is skipped and the search continues.
+ *
+ * \param s The string (assumed to be of dubious validity)
+ * \param len Maximum offset in string
+ * \param off Offset in the string to start at
+ * \param nextoff Pointer to location to receive offset of first byte of
+ *                next legal character
+ * \return HUBBUB_OK on success,
+ *         HUBBUB_BADPARM on bad parameters,
+ *         HUBBUB_NEEDDATA if the end of the data was reached before a
+ *         complete character was found
+ */
+inline hubbub_error hubbub_utf8_next_paranoid(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff)
+{
+	bool valid;
+
+	if (s == NULL || off >= len || nextoff == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Skip current start byte (if present - may be mid-sequence) */
+	if (s[off] < 0x80 || (s[off] & 0xC0) == 0xC0)
+		off++;
+
+	while (1) {
+		/* Find next possible start byte */
+		while (off < len && (s[off] & 0xC0) == 0x80)
+			off++;
+
+		/* Ran off end of data */
+		if (off == len || off + numContinuations[s[off]] >= len)
+			return HUBBUB_NEEDDATA;
+
+		/* Found if start byte is ascii,
+		 * or next n bytes are valid continuations */
+		valid = true;
+
+		switch (numContinuations[s[off]]) {
+		case 5:
+			valid &= ((s[off + 5] & 0xC0) == 0x80);
+			/* Fall through */
+		case 4:
+			valid &= ((s[off + 4] & 0xC0) == 0x80);
+			/* Fall through */
+		case 3:
+			valid &= ((s[off + 3] & 0xC0) == 0x80);
+			/* Fall through */
+		case 2:
+			valid &= ((s[off + 2] & 0xC0) == 0x80);
+			/* Fall through */
+		case 1:
+			valid &= ((s[off + 1] & 0xC0) == 0x80);
+			/* Must not fall into the ASCII test below: a
+			 * multibyte start byte is never < 0x80 */
+			break;
+		case 0:
+			valid &= (s[off + 0] < 0x80);
+			break;
+		}
+
+		if (valid)
+			break;
+
+		/* Otherwise, skip this (invalid) start byte and try again */
+		off++;
+	}
+
+	*nextoff = off;
+
+	return HUBBUB_OK;
+}
+
diff --git a/src/utils/utf8.h b/src/utils/utf8.h
new file mode 100644
index 0000000..8836338
--- /dev/null
+++ b/src/utils/utf8.h
@@ -0,0 +1,38 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+/** \file
+ * UTF-8 manipulation functions (interface).
+ */
+
+#ifndef hubbub_utils_utf8_h_
+#define hubbub_utils_utf8_h_
+
+#include <inttypes.h>
+#include <stddef.h>
+
+#include <hubbub/errors.h>
+
+/* These are deliberately declared without `inline': under C99 rules a
+ * function whose every file-scope declaration says `inline' (without
+ * `extern') has no external definition, which would leave other
+ * translation units with nothing to link against. */
+
+/* Convert a UTF-8 multibyte sequence into a single UCS4 character */
+hubbub_error hubbub_utf8_to_ucs4(const uint8_t *s, size_t len,
+		uint32_t *ucs4, size_t *clen);
+/* Convert a single UCS4 character into a UTF-8 multibyte sequence */
+hubbub_error hubbub_utf8_from_ucs4(uint32_t ucs4, uint8_t *s,
+		size_t *len);
+
+/* Calculate the length (in characters) of a bounded UTF-8 string */
+hubbub_error hubbub_utf8_length(const uint8_t *s, size_t max,
+		size_t *len);
+/* Calculate the length (in bytes) of a UTF-8 character */
+hubbub_error hubbub_utf8_char_byte_length(const uint8_t *s,
+		size_t *len);
+
+/* Find previous legal UTF-8 char in string */
+hubbub_error hubbub_utf8_prev(const uint8_t *s, uint32_t off,
+		uint32_t *prevoff);
+/* Find next legal UTF-8 char in string (input assumed valid) */
+hubbub_error hubbub_utf8_next(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff);
+
+/* Find next legal UTF-8 char in string (input of dubious validity) */
+hubbub_error hubbub_utf8_next_paranoid(const uint8_t *s, uint32_t len,
+		uint32_t off, uint32_t *nextoff);
+
+#endif
+
diff --git a/src/utils/utils.h b/src/utils/utils.h
new file mode 100644
index 0000000..a1e0230
--- /dev/null
+++ b/src/utils/utils.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#ifndef hubbub_utils_h_
+#define hubbub_utils_h_
+
+/* Larger of two values. This is a macro: each argument may be evaluated
+ * twice, so do not pass expressions with side effects. */
+#ifndef max
+#define max(a,b) ((a)>(b)?(a):(b))
+#endif
+
+/* Smaller of two values. Same double-evaluation caveat as max(). */
+#ifndef min
+#define min(a,b) ((a)<(b)?(a):(b))
+#endif
+
+#ifndef SLEN
+/* Calculate length of a string constant */
+#define SLEN(s) (sizeof((s)) - 1) /* -1 for '\0' */
+#endif
+
+#ifndef UNUSED
+/* Mark a variable or parameter as deliberately unused. The cast to void
+ * evaluates the name without assigning to it, so it also works on const
+ * objects and does not provoke self-assignment warnings. */
+#define UNUSED(x) ((void)(x))
+#endif
+
+#endif
diff --git a/test/INDEX b/test/INDEX
new file mode 100644
index 0000000..100dd21
--- /dev/null
+++ b/test/INDEX
@@ -0,0 +1,15 @@
+# Index for libhubbub testcases
+#
+# Test Description DataDir
+
+aliases Encoding alias handling
+cscodec Charset codec implementation cscodec
+csdetect Charset detection csdetect
+dict Generic string dictionary
+entities Named entity dictionary
+filter Input stream filtering
+hubbub Library initialisation/finalisation
+inputstream Buffered input stream html
+parser Public parser API html
+tokeniser HTML tokeniser html
+tokeniser2 HTML tokeniser (again) tokeniser2 \ No newline at end of file
diff --git a/test/Makefile b/test/Makefile
new file mode 100644
index 0000000..ef50365
--- /dev/null
+++ b/test/Makefile
@@ -0,0 +1,63 @@
+# Makefile for Hubbub testcases
+#
+# Toolchain is exported by top-level makefile
+#
+# Top-level makefile also exports the following variables:
+#
+# COMPONENT Name of component
+# EXPORT Absolute path of export directory
+# TOP Absolute path of source tree root
+#
+# The top-level makefile requires the following targets to exist:
+#
+# clean Clean source tree
+# debug Create a debug binary
+# distclean Fully clean source tree, back to pristine condition
+# export Export distributable components to ${EXPORT}
+# release Create a release binary
+# setup Perform any setup required prior to compilation
+# test Execute any test cases
+
+# Extend toolchain settings
+# We require the presence of libjson -- http://oss.metaparadigm.com/json-c/
+# The backquoted pkg-config commands are kept as plain text by make; the
+# shell runs them whenever a recipe that uses ${CFLAGS} or ${LDFLAGS} executes.
+CFLAGS += -I${TOP}/src/ -I$(CURDIR) \
+ `${PKGCONFIG} ${PKGCONFIGFLAGS} --cflags json`
+LDFLAGS += `${PKGCONFIG} ${PKGCONFIGFLAGS} --libs json`
+
+# Release output
+# (left empty in this makefile)
+RELEASE =
+
+# Debug output
+# (left empty in this makefile)
+DEBUG =
+
+# Objects
+# Names of the test programs. Each is built from the .c file of the same
+# name by the pattern rule in this makefile; the regression/ entries live in
+# a subdirectory.
+OBJS = aliases cscodec csdetect dict entities filter hubbub \
+ inputstream parser tokeniser tokeniser2
+OBJS += regression/cscodec-segv regression/filter-segv
+
+# None of these targets names a real file. Declaring every one of them phony
+# (distclean included) stops a stray file of the same name from making the
+# target look up to date and silently skipping its recipe.
+.PHONY: clean debug distclean export release setup test
+
+# Targets
+# release/debug/setup/export have nothing to do in this directory; they exist
+# because the top-level makefile requires them.
+release:
+
+debug:
+
+# Remove the built test programs. The leading "-" ignores errors from ${RM}.
+clean:
+	-@${RM} ${RMFLAGS} $(addsuffix ${EXEEXT}, $(OBJS))
+
+# Additionally remove the "log" file.
+distclean:
+	-@${RM} ${RMFLAGS} log
+
+setup:
+
+export:
+
+# Build every test program, then hand over to the Perl test runner.
+test: $(OBJS)
+	@${PERL} testrunner.pl ${EXEEXT}
+
+# Pattern rules
+# Build a test program <name> from <name>.c in four steps: announce the
+# source, compile it to a temporary <name>.o, link that object against
+# -lhubbub-debug, then delete the temporary object again.
+%: %.c
+ @${ECHO} ${ECHOFLAGS} "==> $<"
+ @${CC} -c -g ${CFLAGS} -o $@.o $<
+ @${LD} -g -o $@ $@.o ${LDFLAGS} -lhubbub-debug
+ @${RM} ${RMFLAGS} $@.o
diff --git a/test/README b/test/README
new file mode 100644
index 0000000..e4a895b
--- /dev/null
+++ b/test/README
@@ -0,0 +1,84 @@
+Hubbub testcases
+================
+
+Testcases for hubbub are self-contained binaries which test various parts
+of the hubbub library. These may make use of external data files to drive
+the testing.
+
+Testcase command lines
+----------------------
+
+Testcase command lines are in a unified format, thus:
+
+ <aliases_file> [ <data_file> ]
+
+The aliases file parameter will always be specified (as it is required for
+the library to work at all).
+
+The data file parameter is optional and may be provided on a test-by-test
+basis.
+
+Testcase output
+---------------
+
+Testcases may output anything at all to stdout. The final line of the
+output must begin with either PASS or FAIL (case sensitive), indicating
+the success status of the test.
+
+Test Index
+----------
+
+The test sources directory contains a file, named INDEX, which provides an
+index of all available test binaries. Any new test applications should be
+added to this index as they are created.
+
+The test index file format is as follows:
+
+ file = *line
+
+ line = ( entry / comment / blank ) LF
+
+ entry = testname 1*HTAB description [ 1*HTAB datadir ]
+ comment = "#" *non-newline
+ blank = 0<OCTET>
+
+ testname = 1*non-reserved
+ description = 1*non-reserved
+ datadir = 1*non-reserved
+
+ non-newline = VCHAR / WSP
+ non-reserved = VCHAR / SP
+
+Each entry contains a mandatory binary name and description followed by
+an optional data directory specifier. The data directory specifier is
+used to state the name of the directory containing data files for the
+test name. This directory will be searched for within the "data"
+directory in the source tree.
+
+If a data directory is specified, the test binary will be invoked for
+each data file listed within the data directory INDEX, passing the
+filename as the second parameter (<data_file>, above).
+
+Data Index
+----------
+
+Each test data directory contains a file, named INDEX, which provides an
+index of all available test data files.
+
+The data index file format is as follows:
+
+ file = *line
+
+ line = ( entry / comment / blank ) LF
+
+ entry = dataname 1*HTAB description
+ comment = "#" *non-newline
+ blank = 0<OCTET>
+
+ dataname = 1*non-reserved
+ description = 1*non-reserved
+
+ non-newline = VCHAR / WSP
+ non-reserved = VCHAR / SP
+
+Each entry contains a mandatory data file name and description.
diff --git a/test/aliases.c b/test/aliases.c
new file mode 100644
index 0000000..1cbf2a4
--- /dev/null
+++ b/test/aliases.c
@@ -0,0 +1,61 @@
+#include <stdio.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+
+#include "testutils.h"
+
+extern void hubbub_aliases_dump(void);
+
+/**
+ * Allocation callback passed to the alias routines; forwards to realloc().
+ *
+ * \param ptr  Block to resize, as accepted by realloc()
+ * \param len  Requested size in bytes
+ * \param pw   Client data (ignored)
+ * \return Whatever realloc() returns
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	void *resized = realloc(ptr, len);
+
+	UNUSED(pw);
+
+	return resized;
+}
+
+/**
+ * Test driver for the charset alias table.
+ *
+ * Loads the alias file named by argv[1], dumps it, then checks that an
+ * unknown name is rejected and that two known aliases resolve. Prints a
+ * line starting with FAIL and returns 1 on the first failed check;
+ * prints PASS and returns 0 otherwise.
+ */
+int main (int argc, char **argv)
+{
+	hubbub_aliases_canon *entry;
+
+	if (argc != 2) {
+		printf("Usage: %s <filename>\n", argv[0]);
+		return 1;
+	}
+
+	hubbub_aliases_create(argv[1], myrealloc, NULL);
+
+	hubbub_aliases_dump();
+
+	/* A name that is not in the table must not resolve */
+	entry = hubbub_alias_canonicalise("moose", 5);
+	if (entry != NULL) {
+		printf("FAIL - found invalid encoding 'moose'\n");
+		return 1;
+	}
+
+	/* Known aliases must resolve; report what they resolved to */
+	entry = hubbub_alias_canonicalise("csinvariant", 11);
+	if (entry == NULL) {
+		printf("FAIL - failed finding encoding 'csinvariant'\n");
+		return 1;
+	}
+	printf("%s %d\n", entry->name, entry->mib_enum);
+
+	entry = hubbub_alias_canonicalise("nats-sefi-add", 13);
+	if (entry == NULL) {
+		printf("FAIL - failed finding encoding 'nats-sefi-add'\n");
+		return 1;
+	}
+	printf("%s %d\n", entry->name, entry->mib_enum);
+
+	/* Round-trip the last result through the name <-> MIB enum lookups */
+	printf("%d\n", hubbub_mibenum_from_name(entry->name,
+			strlen(entry->name)));
+
+	printf("%s\n", hubbub_mibenum_to_name(entry->mib_enum));
+
+	hubbub_aliases_destroy(myrealloc, NULL);
+
+	printf("PASS\n");
+
+	return 0;
+}
diff --git a/test/cscodec.c b/test/cscodec.c
new file mode 100644
index 0000000..525b275
--- /dev/null
+++ b/test/cscodec.c
@@ -0,0 +1,247 @@
+#include <stdio.h>
+#include <string.h>
+
+#include <hubbub/hubbub.h>
+
+#include "charset/codec.h"
+#include "utils/utils.h"
+
+#include "testutils.h"
+
+/* State carried across the lines of a codec test file while it is parsed. */
+typedef struct line_ctx {
+ /* Codec under test (main() creates it for "UTF-8") */
+ hubbub_charsetcodec *codec;
+
+ /* Input section: main() sets buflen from the size of the test file;
+ * bufused counts the bytes accumulated in buf so far */
+ size_t buflen;
+ size_t bufused;
+ uint8_t *buf;
+ /* Expected-output section: main() points exp at buf + buflen, i.e. the
+ * second half of the same allocation, and sets explen to buflen */
+ size_t explen;
+ size_t expused;
+ uint8_t *exp;
+
+ /* Which section the lines currently being read belong to */
+ bool indata;
+ bool inexp;
+
+ /* Return code the codec call is expected to produce, parsed from the
+ * "#expected" line */
+ hubbub_error exp_ret;
+
+ /* Direction chosen by the "#data" line */
+ enum { ENCODE, DECODE } dir;
+} line_ctx;
+
+static bool handle_line(const char *data, size_t datalen, void *pw);
+static void run_test(line_ctx *ctx);
+static hubbub_error filter(uint32_t c, uint32_t **output,
+ size_t *outputlen, void *pw);
+
+
+/**
+ * Allocation callback given to hubbub and to the codec; forwards to
+ * realloc().
+ *
+ * \param ptr  Block to resize, as accepted by realloc()
+ * \param len  Requested size in bytes
+ * \param pw   Client data (ignored)
+ * \return Whatever realloc() returns
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	void *resized = realloc(ptr, len);
+
+	UNUSED(pw);
+
+	return resized;
+}
+
+/**
+ * Test driver for the charset codec.
+ *
+ * argv[1] is the aliases file, argv[2] the test data file. The data file is
+ * fed line by line to handle_line(), which runs each testcase as soon as the
+ * next one starts; the last testcase is run here once parsing has finished.
+ *
+ * NOTE(review): several calls with side effects sit inside assert();
+ * presumably testutils.h supplies an assert that is never compiled out --
+ * confirm.
+ *
+ * \return 0 after printing PASS, 1 on usage or setup failure
+ */
+int main(int argc, char **argv)
+{
+	line_ctx ctx;
+
+	if (argc != 3) {
+		printf("Usage: %s <aliases_file> <filename>\n", argv[0]);
+		return 1;
+	}
+
+	assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK);
+
+	/* Requesting a codec for this charset is expected to fail */
+	assert(hubbub_charsetcodec_create("NATS-SEFI-ADD",
+			myrealloc, NULL) == NULL);
+
+	ctx.codec = hubbub_charsetcodec_create("UTF-8", myrealloc, NULL);
+	assert(ctx.codec != NULL);
+
+	ctx.buflen = parse_filesize(argv[2]);
+	if (ctx.buflen == 0)
+		return 1;
+
+	/* One allocation serves both buffers: input in the first half,
+	 * expected output in the second */
+	ctx.buf = malloc(2 * ctx.buflen);
+	if (ctx.buf == NULL) {
+		/* Report the size that was actually requested */
+		printf("Failed allocating %u bytes\n",
+				(unsigned int) (2 * ctx.buflen));
+		return 1;
+	}
+
+	ctx.exp = ctx.buf + ctx.buflen;
+	ctx.explen = ctx.buflen;
+
+	ctx.buf[0] = '\0';
+	ctx.exp[0] = '\0';
+	ctx.bufused = 0;
+	ctx.expused = 0;
+	ctx.indata = false;
+	ctx.inexp = false;
+	ctx.exp_ret = HUBBUB_OK;
+	/* run_test() below always reads dir, even if the file never supplied
+	 * a "#data" line; give it a defined value (the same fallback that
+	 * handle_line() uses for an unrecognised direction) */
+	ctx.dir = ENCODE;
+
+	assert(parse_testfile(argv[2], handle_line, &ctx) == true);
+
+	/* and run final test */
+	if (ctx.bufused > 0 && ctx.buf[ctx.bufused - 1] == '\n')
+		ctx.bufused -= 1;
+
+	if (ctx.expused > 0 && ctx.exp[ctx.expused - 1] == '\n')
+		ctx.expused -= 1;
+
+	run_test(&ctx);
+
+	free(ctx.buf);
+
+	hubbub_charsetcodec_destroy(ctx.codec);
+
+	assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
+
+/**
+ * Per-line callback for the codec test file.
+ *
+ * Lines starting with '#' are directives:
+ *   "#data <dir> <mode> [filter]"  start an input section. <dir> is
+ *       "decode", anything else selects encode. <mode> is LOOSE or
+ *       STRICT, anything else selects TRANSLIT. A following "filter"
+ *       installs filter() on the codec.
+ *   "#expected <error>"            start an expected-output section;
+ *       <error> is converted with hubbub_error_from_string().
+ *   "#reset"                       reset the codec.
+ * If an expected-output section was open when a directive arrives, the
+ * testcase collected so far is run first. Any other line is appended to
+ * whichever section is currently open.
+ *
+ * \param data     Line contents
+ * \param datalen  Length of the line in bytes
+ * \param pw       Pointer to the line_ctx
+ * \return true always
+ */
+bool handle_line(const char *data, size_t datalen, void *pw)
+{
+	line_ctx *ctx = (line_ctx *) pw;
+
+	if (data[0] == '#') {
+		if (ctx->inexp) {
+			/* This marks end of testcase, so run it */
+
+			/* Drop the newline preceding this directive. Either
+			 * section may be empty, so check the length before
+			 * indexing (as main() does for the final testcase). */
+			if (ctx->bufused > 0 &&
+					ctx->buf[ctx->bufused - 1] == '\n')
+				ctx->bufused -= 1;
+
+			if (ctx->expused > 0 &&
+					ctx->exp[ctx->expused - 1] == '\n')
+				ctx->expused -= 1;
+
+			run_test(ctx);
+
+			ctx->buf[0] = '\0';
+			ctx->exp[0] = '\0';
+			ctx->bufused = 0;
+			ctx->expused = 0;
+			ctx->exp_ret = HUBBUB_OK;
+		}
+
+		if (strncasecmp(data+1, "data", 4) == 0) {
+			hubbub_charsetcodec_optparams params;
+			/* Skip "#data " */
+			const char *ptr = data + 6;
+
+			ctx->indata = true;
+			ctx->inexp = false;
+
+			if (strncasecmp(ptr, "decode", 6) == 0)
+				ctx->dir = DECODE;
+			else
+				ctx->dir = ENCODE;
+
+			/* Both direction words are six characters long;
+			 * step over the word and the separator */
+			ptr += 7;
+
+			if (strncasecmp(ptr, "LOOSE", 5) == 0) {
+				params.error_mode.mode =
+					HUBBUB_CHARSETCODEC_ERROR_LOOSE;
+				ptr += 6;
+			} else if (strncasecmp(ptr, "STRICT", 6) == 0) {
+				params.error_mode.mode =
+					HUBBUB_CHARSETCODEC_ERROR_STRICT;
+				ptr += 7;
+			} else {
+				params.error_mode.mode =
+					HUBBUB_CHARSETCODEC_ERROR_TRANSLIT;
+				ptr += 9;
+			}
+
+			assert(hubbub_charsetcodec_setopt(ctx->codec,
+				HUBBUB_CHARSETCODEC_ERROR_MODE,
+				(hubbub_charsetcodec_optparams *) &params)
+				== HUBBUB_OK);
+
+			if (strncasecmp(ptr, "filter", 6) == 0) {
+				params.filter_func.filter = filter;
+				params.filter_func.pw = ctx;
+
+				assert(hubbub_charsetcodec_setopt(ctx->codec,
+					HUBBUB_CHARSETCODEC_FILTER_FUNC,
+					(hubbub_charsetcodec_optparams *)
+					&params) == HUBBUB_OK);
+			}
+		} else if (strncasecmp(data+1, "expected", 8) == 0) {
+			ctx->indata = false;
+			ctx->inexp = true;
+
+			/* Skip "#expected " */
+			ctx->exp_ret = hubbub_error_from_string(data + 10,
+					datalen - 10 - 1 /* \n */);
+		} else if (strncasecmp(data+1, "reset", 5) == 0) {
+			ctx->indata = false;
+			ctx->inexp = false;
+
+			hubbub_charsetcodec_reset(ctx->codec);
+		}
+	} else {
+		if (ctx->indata) {
+			memcpy(ctx->buf + ctx->bufused, data, datalen);
+			ctx->bufused += datalen;
+		}
+		if (ctx->inexp) {
+			memcpy(ctx->exp + ctx->expused, data, datalen);
+			ctx->expused += datalen;
+		}
+	}
+
+	return true;
+}
+
+/**
+ * Run one testcase: push the collected input through the codec in the
+ * configured direction, check the return code against ctx->exp_ret, print
+ * the result and the expectation as hex, and check that the first
+ * ctx->expused output bytes equal the expected bytes.
+ *
+ * \param ctx  Parsing context holding the testcase
+ */
+void run_test(line_ctx *ctx)
+{
+	static int testnum;
+	/* Space offered to the codec: four output bytes per input byte */
+	size_t destlen = ctx->bufused * 4;
+	/* The backing array is also read for expused bytes below, so make it
+	 * large enough for that too, and never zero-length (a zero-sized
+	 * variable length array is undefined behaviour) */
+	size_t alloclen = (destlen > ctx->expused ?
+			destlen : ctx->expused) + 1;
+	uint8_t dest[alloclen];
+	uint8_t *pdest = dest;
+	const uint8_t *psrc = ctx->buf;
+	size_t srclen = ctx->bufused;
+	size_t i;
+
+	/* Bytes the codec does not write must still compare and print
+	 * deterministically */
+	memset(dest, 0, alloclen);
+
+	if (ctx->dir == DECODE) {
+		assert(hubbub_charsetcodec_decode(ctx->codec,
+				&psrc, &srclen,
+				&pdest, &destlen) == ctx->exp_ret);
+	} else {
+		assert(hubbub_charsetcodec_encode(ctx->codec,
+				&psrc, &srclen,
+				&pdest, &destlen) == ctx->exp_ret);
+	}
+
+	printf("%d: Read '", ++testnum);
+	for (i = 0; i < ctx->expused; i++) {
+		printf("%c%c ", "0123456789abcdef"[(dest[i] >> 4) & 0xf],
+				"0123456789abcdef"[dest[i] & 0xf]);
+	}
+	printf("' Expected '");
+	for (i = 0; i < ctx->expused; i++) {
+		printf("%c%c ", "0123456789abcdef"[(ctx->exp[i] >> 4) & 0xf],
+				"0123456789abcdef"[ctx->exp[i] & 0xf]);
+	}
+	printf("'\n");
+
+	assert(memcmp(dest, ctx->exp, ctx->expused) == 0);
+}
+
+/**
+ * Codec filter callback, installed when a "#data" line asks for "filter".
+ * Hands each character back unchanged through a single static slot.
+ *
+ * \param c          Character to filter, or HUBBUB_CHARSETCODEC_NULL
+ * \param output     Receives a pointer to the filtered character
+ * \param outputlen  Receives the number of characters at *output (1)
+ * \param pw         Client data (ignored)
+ * \return HUBBUB_OK always
+ */
+hubbub_error filter(uint32_t c, uint32_t **output,
+		size_t *outputlen, void *pw)
+{
+	static uint32_t slot;
+
+	UNUSED(pw);
+
+	if (c != HUBBUB_CHARSETCODEC_NULL) {
+		slot = c;
+
+		*output = &slot;
+		*outputlen = 1;
+	} else {
+		/* Null marker: clear the slot and leave the outputs alone */
+		slot = 0;
+	}
+
+	return HUBBUB_OK;
+}
diff --git a/test/csdetect.c b/test/csdetect.c
new file mode 100644
index 0000000..3b39972
--- /dev/null
+++ b/test/csdetect.c
@@ -0,0 +1,132 @@
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <hubbub/hubbub.h>
+
+#include "charset/aliases.h"
+#include "charset/detect.h"
+#include "utils/utils.h"
+
+#include "testutils.h"
+
+/* State carried across the lines of a charset detection test file. */
+typedef struct line_ctx {
+ /* main() sets buflen from the size of the test file and allocates buf
+ * with that many bytes; bufused counts the bytes accumulated so far */
+ size_t buflen;
+ size_t bufused;
+ uint8_t *buf;
+ /* Expected encoding name, copied from the line following "#encoding" */
+ char enc[64];
+ /* Which section the lines currently being read belong to */
+ bool indata;
+ bool inenc;
+} line_ctx;
+
+static bool handle_line(const char *data, size_t datalen, void *pw);
+static void run_test(const uint8_t *data, size_t len, char *expected);
+
+/**
+ * Allocation callback given to hubbub; forwards to realloc().
+ *
+ * \param ptr  Block to resize, as accepted by realloc()
+ * \param len  Requested size in bytes
+ * \param pw   Client data (ignored)
+ * \return Whatever realloc() returns
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	void *resized = realloc(ptr, len);
+
+	UNUSED(pw);
+
+	return resized;
+}
+
+/**
+ * Test driver for charset detection.
+ *
+ * argv[1] is the aliases file, argv[2] the test data file. The data file is
+ * fed line by line to handle_line(), which runs each testcase when the next
+ * one begins; the last testcase is run here after parsing completes.
+ *
+ * \return 0 after printing PASS, 1 on usage or setup failure
+ */
+int main(int argc, char **argv)
+{
+	line_ctx ctx;
+
+	if (argc != 3) {
+		printf("Usage: %s <aliases_file> <filename>\n", argv[0]);
+		return 1;
+	}
+
+	assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK);
+
+	/* The data buffer is sized to hold the whole test file */
+	ctx.buflen = parse_filesize(argv[2]);
+	if (!ctx.buflen)
+		return 1;
+
+	ctx.buf = malloc(ctx.buflen);
+	if (!ctx.buf) {
+		printf("Failed allocating %u bytes\n",
+				(unsigned int) ctx.buflen);
+		return 1;
+	}
+
+	/* Start outside any section, with no data and no expectation */
+	ctx.indata = false;
+	ctx.inenc = false;
+	ctx.bufused = 0;
+	ctx.buf[0] = '\0';
+	ctx.enc[0] = '\0';
+
+	assert(parse_testfile(argv[2], handle_line, &ctx) == true);
+
+	/* The final testcase has no following directive to trigger it;
+	 * strip its trailing newline and run it now */
+	if (ctx.bufused != 0 && ctx.buf[ctx.bufused - 1] == '\n')
+		ctx.bufused--;
+
+	run_test(ctx.buf, ctx.bufused, ctx.enc);
+
+	free(ctx.buf);
+
+	assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
+
+/**
+ * Per-line callback for the charset detection test file.
+ *
+ * A line starting with '#' is a section marker: "#data" opens the input
+ * section and "#encoding" opens the expected-encoding section. If an
+ * encoding section was open when a marker arrives, the testcase collected
+ * so far is run first. Any other line is appended to the data buffer
+ * (data section) or stored as the expected encoding name (encoding
+ * section).
+ *
+ * \param data     Line contents; treated as NUL-terminated when it is
+ *                 taken as an encoding name
+ * \param datalen  Length of the line in bytes
+ * \param pw       Pointer to the line_ctx
+ * \return true always
+ */
+bool handle_line(const char *data, size_t datalen, void *pw)
+{
+	line_ctx *ctx = (line_ctx *) pw;
+
+	if (data[0] == '#') {
+		if (ctx->inenc) {
+			/* This marks end of testcase, so run it */
+
+			/* The data section may be empty, so check the length
+			 * before looking at the last byte */
+			if (ctx->bufused > 0 &&
+					ctx->buf[ctx->bufused - 1] == '\n')
+				ctx->bufused -= 1;
+
+			run_test(ctx->buf, ctx->bufused, ctx->enc);
+
+			ctx->buf[0] = '\0';
+			ctx->enc[0] = '\0';
+			ctx->bufused = 0;
+		}
+
+		ctx->indata = (strncasecmp(data+1, "data", 4) == 0);
+		ctx->inenc = (strncasecmp(data+1, "encoding", 8) == 0);
+	} else {
+		if (ctx->indata) {
+			memcpy(ctx->buf + ctx->bufused, data, datalen);
+			ctx->bufused += datalen;
+		}
+		if (ctx->inenc) {
+			/* enc is a fixed-size array: copy no more than fits,
+			 * leaving room for the terminator */
+			size_t n = strlen(data);
+
+			if (n > sizeof(ctx->enc) - 1)
+				n = sizeof(ctx->enc) - 1;
+
+			memcpy(ctx->enc, data, n);
+			ctx->enc[n] = '\0';
+
+			/* Strip the trailing newline; the line may be empty */
+			if (n > 0 && ctx->enc[n - 1] == '\n')
+				ctx->enc[n - 1] = '\0';
+		}
+	}
+
+	return true;
+}
+
+/**
+ * Run one detection testcase: extract the charset from the data, print what
+ * was detected next to what was expected, and check that the two MIB enum
+ * values agree.
+ *
+ * \param data      Document data to examine
+ * \param len       Length of data in bytes
+ * \param expected  Name of the encoding that should be detected
+ */
+void run_test(const uint8_t *data, size_t len, char *expected)
+{
+	static int testnum;
+	hubbub_charset_source source;
+	uint16_t mibenum;
+	const size_t explen = strlen(expected);
+
+	assert(hubbub_charset_extract(&data, &len,
+			&mibenum, &source) == HUBBUB_OK);
+
+	/* Detection must always settle on some charset */
+	assert(mibenum != 0);
+
+	printf("%d: Detected charset %s (%d) Source %d Expected %s (%d)\n",
+			++testnum, hubbub_mibenum_to_name(mibenum),
+			mibenum, source, expected,
+			hubbub_mibenum_from_name(expected, explen));
+
+	assert(hubbub_mibenum_from_name(expected, explen) == mibenum);
+}
diff --git a/test/data/Aliases b/test/data/Aliases
new file mode 100644
index 0000000..db61ff1
--- /dev/null
+++ b/test/data/Aliases
@@ -0,0 +1,302 @@
+# > Unicode:Files.Aliases
+# Mapping of character set encoding names to their canonical form
+#
+# Lines starting with a '#' are comments, blank lines are ignored.
+#
+# Based on http://www.iana.org/assignments/character-sets and
+# http://www.iana.org/assignments/ianacharset-mib
+#
+# Canonical Form MIBenum Aliases...
+#
+US-ASCII 3 iso-ir-6 ANSI_X3.4-1986 ISO_646.irv:1991 ASCII ISO646-US ANSI_X3.4-1968 us IBM367 cp367 csASCII
+ISO-10646-UTF-1 27 csISO10646UTF1
+ISO_646.basic:1983 28 ref csISO646basic1983
+INVARIANT 29 csINVARIANT
+ISO_646.irv:1983 30 iso-ir-2 irv csISO2IntlRefVersion
+BS_4730 20 iso-ir-4 ISO646-GB gb uk csISO4UnitedKingdom
+NATS-SEFI 31 iso-ir-8-1 csNATSSEFI
+NATS-SEFI-ADD 32 iso-ir-8-2 csNATSSEFIADD
+NATS-DANO 33 iso-ir-9-1 csNATSDANO
+NATS-DANO-ADD 34 iso-ir-9-2 csNATSDANOADD
+SEN_850200_B 35 iso-ir-10 FI ISO646-FI ISO646-SE se csISO10Swedish
+SEN_850200_C 21 iso-ir-11 ISO646-SE2 se2 csISO11SwedishForNames
+KS_C_5601-1987 36 iso-ir-149 KS_C_5601-1989 KSC_5601 korean csKSC56011987
+ISO-2022-KR 37 csISO2022KR
+EUC-KR 38 csEUCKR EUCKR
+ISO-2022-JP 39 csISO2022JP
+ISO-2022-JP-2 40 csISO2022JP2
+ISO-2022-CN 104
+ISO-2022-CN-EXT 105
+JIS_C6220-1969-jp 41 JIS_C6220-1969 iso-ir-13 katakana x0201-7 csISO13JISC6220jp
+JIS_C6220-1969-ro 42 iso-ir-14 jp ISO646-JP csISO14JISC6220ro
+IT 22 iso-ir-15 ISO646-IT csISO15Italian
+PT 43 iso-ir-16 ISO646-PT csISO16Portuguese
+ES 23 iso-ir-17 ISO646-ES csISO17Spanish
+greek7-old 44 iso-ir-18 csISO18Greek7Old
+latin-greek 45 iso-ir-19 csISO19LatinGreek
+DIN_66003 24 iso-ir-21 de ISO646-DE csISO21German
+NF_Z_62-010_(1973) 46 iso-ir-25 ISO646-FR1 csISO25French
+Latin-greek-1 47 iso-ir-27 csISO27LatinGreek1
+ISO_5427 48 iso-ir-37 csISO5427Cyrillic
+JIS_C6226-1978 49 iso-ir-42 csISO42JISC62261978
+BS_viewdata 50 iso-ir-47 csISO47BSViewdata
+INIS 51 iso-ir-49 csISO49INIS
+INIS-8 52 iso-ir-50 csISO50INIS8
+INIS-cyrillic 53 iso-ir-51 csISO51INISCyrillic
+ISO_5427:1981 54 iso-ir-54 ISO5427Cyrillic1981
+ISO_5428:1980 55 iso-ir-55 csISO5428Greek
+GB_1988-80 56 iso-ir-57 cn ISO646-CN csISO57GB1988
+GB_2312-80 57 iso-ir-58 chinese csISO58GB231280
+NS_4551-1 25 iso-ir-60 ISO646-NO no csISO60DanishNorwegian csISO60Norwegian1
+NS_4551-2 58 ISO646-NO2 iso-ir-61 no2 csISO61Norwegian2
+NF_Z_62-010 26 iso-ir-69 ISO646-FR fr csISO69French
+videotex-suppl 59 iso-ir-70 csISO70VideotexSupp1
+PT2 60 iso-ir-84 ISO646-PT2 csISO84Portuguese2
+ES2 61 iso-ir-85 ISO646-ES2 csISO85Spanish2
+MSZ_7795.3 62 iso-ir-86 ISO646-HU hu csISO86Hungarian
+JIS_C6226-1983 63 iso-ir-87 x0208 JIS_X0208-1983 csISO87JISX0208
+greek7 64 iso-ir-88 csISO88Greek7
+ASMO_449 65 ISO_9036 arabic7 iso-ir-89 csISO89ASMO449
+iso-ir-90 66 csISO90
+JIS_C6229-1984-a 67 iso-ir-91 jp-ocr-a csISO91JISC62291984a
+JIS_C6229-1984-b 68 iso-ir-92 ISO646-JP-OCR-B jp-ocr-b csISO92JISC62991984b
+JIS_C6229-1984-b-add 69 iso-ir-93 jp-ocr-b-add csISO93JIS62291984badd
+JIS_C6229-1984-hand 70 iso-ir-94 jp-ocr-hand csISO94JIS62291984hand
+JIS_C6229-1984-hand-add 71 iso-ir-95 jp-ocr-hand-add csISO95JIS62291984handadd
+JIS_C6229-1984-kana 72 iso-ir-96 csISO96JISC62291984kana
+ISO_2033-1983 73 iso-ir-98 e13b csISO2033
+ANSI_X3.110-1983 74 iso-ir-99 CSA_T500-1983 NAPLPS csISO99NAPLPS
+ISO-8859-1 4 iso-ir-100 ISO_8859-1 ISO_8859-1:1987 latin1 l1 IBM819 CP819 csISOLatin1 8859_1 ISO8859-1
+ISO-8859-2 5 iso-ir-101 ISO_8859-2 ISO_8859-2:1987 latin2 l2 csISOLatin2 8859_2 ISO8859-2
+T.61-7bit 75 iso-ir-102 csISO102T617bit
+T.61-8bit 76 T.61 iso-ir-103 csISO103T618bit
+ISO-8859-3 6 iso-ir-109 ISO_8859-3 ISO_8859-3:1988 latin3 l3 csISOLatin3 8859_3 ISO8859-3
+ISO-8859-4 7 iso-ir-110 ISO_8859-4 ISO_8859-4:1988 latin4 l4 csISOLatin4 8859_4 ISO8859-4
+ECMA-cyrillic 77 iso-ir-111 KOI8-E csISO111ECMACyrillic
+CSA_Z243.4-1985-1 78 iso-ir-121 ISO646-CA csa7-1 ca csISO121Canadian1
+CSA_Z243.4-1985-2 79 iso-ir-122 ISO646-CA2 csa7-2 csISO122Canadian2
+CSA_Z243.4-1985-gr 80 iso-ir-123 csISO123CSAZ24341985gr
+ISO-8859-6 9 iso-ir-127 ISO_8859-6 ISO_8859-6:1987 ECMA-114 ASMO-708 arabic csISOLatinArabic
+ISO-8859-6-E 81 csISO88596E ISO_8859-6-E
+ISO-8859-6-I 82 csISO88596I ISO_8859-6-I
+ISO-8859-7 10 iso-ir-126 ISO_8859-7 ISO_8859-7:1987 ELOT_928 ECMA-118 greek greek8 csISOLatinGreek 8859_7 ISO8859-7
+T.101-G2 83 iso-ir-128 csISO128T101G2
+ISO-8859-8 11 iso-ir-138 ISO_8859-8 ISO_8859-8:1988 hebrew csISOLatinHebrew 8859_8 ISO8859-8
+ISO-8859-8-E 84 csISO88598E ISO_8859-8-E
+ISO-8859-8-I 85 csISO88598I ISO_8859-8-I
+CSN_369103 86 iso-ir-139 csISO139CSN369103
+JUS_I.B1.002 87 iso-ir-141 ISO646-YU js yu csISO141JUSIB1002
+ISO_6937-2-add 14 iso-ir-142 csISOTextComm
+IEC_P27-1 88 iso-ir-143 csISO143IECP271
+ISO-8859-5 8 iso-ir-144 ISO_8859-5 ISO_8859-5:1988 cyrillic csISOLatinCyrillic 8859_5 ISO8859-5
+JUS_I.B1.003-serb 89 iso-ir-146 serbian csISO146Serbian
+JUS_I.B1.003-mac 90 macedonian iso-ir-147 csISO147Macedonian
+ISO-8859-9 12 iso-ir-148 ISO_8859-9 ISO_8859-9:1989 latin5 l5 csISOLatin5 8859_9 ISO8859-9
+greek-ccitt 91 iso-ir-150 csISO150 csISO150GreekCCITT
+NC_NC00-10:81 92 cuba iso-ir-151 ISO646-CU csISO151Cuba
+ISO_6937-2-25 93 iso-ir-152 csISO6937Add
+GOST_19768-74 94 ST_SEV_358-88 iso-ir-153 csISO153GOST1976874
+ISO_8859-supp 95 iso-ir-154 latin1-2-5 csISO8859Supp
+ISO_10367-box 96 iso-ir-155 csISO10367Box
+ISO-8859-10 13 iso-ir-157 l6 ISO_8859-10:1992 csISOLatin6 latin6 8859_10 ISO8859-10
+latin-lap 97 lap iso-ir-158 csISO158Lap
+JIS_X0212-1990 98 x0212 iso-ir-159 csISO159JISX02121990
+DS_2089 99 DS2089 ISO646-DK dk csISO646Danish
+us-dk 100 csUSDK
+dk-us 101 csDKUS
+JIS_X0201 15 X0201 csHalfWidthKatakana
+KSC5636 102 ISO646-KR csKSC5636
+ISO-10646-UCS-2 1000 csUnicode UCS-2 UCS2
+ISO-10646-UCS-4 1001 csUCS4 UCS-4 UCS4
+DEC-MCS 2008 dec csDECMCS
+hp-roman8 2004 roman8 r8 csHPRoman8
+macintosh 2027 mac csMacintosh MACROMAN MAC-ROMAN X-MAC-ROMAN
+IBM037 2028 cp037 ebcdic-cp-us ebcdic-cp-ca ebcdic-cp-wt ebcdic-cp-nl csIBM037
+IBM038 2029 EBCDIC-INT cp038 csIBM038
+IBM273 2030 CP273 csIBM273
+IBM274 2031 EBCDIC-BE CP274 csIBM274
+IBM275 2032 EBCDIC-BR cp275 csIBM275
+IBM277 2033 EBCDIC-CP-DK EBCDIC-CP-NO csIBM277
+IBM278 2034 CP278 ebcdic-cp-fi ebcdic-cp-se csIBM278
+IBM280 2035 CP280 ebcdic-cp-it csIBM280
+IBM281 2036 EBCDIC-JP-E cp281 csIBM281
+IBM284 2037 CP284 ebcdic-cp-es csIBM284
+IBM285 2038 CP285 ebcdic-cp-gb csIBM285
+IBM290 2039 cp290 EBCDIC-JP-kana csIBM290
+IBM297 2040 cp297 ebcdic-cp-fr csIBM297
+IBM420 2041 cp420 ebcdic-cp-ar1 csIBM420
+IBM423 2042 cp423 ebcdic-cp-gr csIBM423
+IBM424 2043 cp424 ebcdic-cp-he csIBM424
+IBM437 2011 cp437 437 csPC8CodePage437
+IBM500 2044 CP500 ebcdic-cp-be ebcdic-cp-ch csIBM500
+IBM775 2087 cp775 csPC775Baltic
+IBM850 2009 cp850 850 csPC850Multilingual
+IBM851 2045 cp851 851 csIBM851
+IBM852 2010 cp852 852 csPCp852
+IBM855 2046 cp855 855 csIBM855
+IBM857 2047 cp857 857 csIBM857
+IBM860 2048 cp860 860 csIBM860
+IBM861 2049 cp861 861 cp-is csIBM861
+IBM862 2013 cp862 862 csPC862LatinHebrew
+IBM863 2050 cp863 863 csIBM863
+IBM864 2051 cp864 csIBM864
+IBM865 2052 cp865 865 csIBM865
+IBM866 2086 cp866 866 csIBM866
+IBM868 2053 CP868 cp-ar csIBM868
+IBM869 2054 cp869 869 cp-gr csIBM869
+IBM870 2055 CP870 ebcdic-cp-roece ebcdic-cp-yu csIBM870
+IBM871 2056 CP871 ebcdic-cp-is csIBM871
+IBM880 2057 cp880 EBCDIC-Cyrillic csIBM880
+IBM891 2058 cp891 csIBM891
+IBM903 2059 cp903 csIBM903
+IBM904 2060 cp904 904 csIBBM904
+IBM905 2061 CP905 ebcdic-cp-tr csIBM905
+IBM918 2062 CP918 ebcdic-cp-ar2 csIBM918
+IBM1026 2063 CP1026 csIBM1026
+EBCDIC-AT-DE 2064 csIBMEBCDICATDE
+EBCDIC-AT-DE-A 2065 csEBCDICATDEA
+EBCDIC-CA-FR 2066 csEBCDICCAFR
+EBCDIC-DK-NO 2067 csEBCDICDKNO
+EBCDIC-DK-NO-A 2068 csEBCDICDKNOA
+EBCDIC-FI-SE 2069 csEBCDICFISE
+EBCDIC-FI-SE-A 2070 csEBCDICFISEA
+EBCDIC-FR 2071 csEBCDICFR
+EBCDIC-IT 2072 csEBCDICIT
+EBCDIC-PT 2073 csEBCDICPT
+EBCDIC-ES 2074 csEBCDICES
+EBCDIC-ES-A 2075 csEBCDICESA
+EBCDIC-ES-S 2076 csEBCDICESS
+EBCDIC-UK 2077 csEBCDICUK
+EBCDIC-US 2078 csEBCDICUS
+UNKNOWN-8BIT 2079 csUnknown8BiT
+MNEMONIC 2080 csMnemonic
+MNEM 2081 csMnem
+VISCII 2082 csVISCII
+VIQR 2083 csVIQR
+KOI8-R 2084 csKOI8R
+KOI8-U 2088
+IBM00858 2089 CCSID00858 CP00858 PC-Multilingual-850+euro
+IBM00924 2090 CCSID00924 CP00924 ebcdic-Latin9--euro
+IBM01140 2091 CCSID01140 CP01140 ebcdic-us-37+euro
+IBM01141 2092 CCSID01141 CP01141 ebcdic-de-273+euro
+IBM01142 2093 CCSID01142 CP01142 ebcdic-dk-277+euro ebcdic-no-277+euro
+IBM01143 2094 CCSID01143 CP01143 ebcdic-fi-278+euro ebcdic-se-278+euro
+IBM01144 2095 CCSID01144 CP01144 ebcdic-it-280+euro
+IBM01145 2096 CCSID01145 CP01145 ebcdic-es-284+euro
+IBM01146 2097 CCSID01146 CP01146 ebcdic-gb-285+euro
+IBM01147 2098 CCSID01147 CP01147 ebcdic-fr-297+euro
+IBM01148 2099 CCSID01148 CP01148 ebcdic-international-500+euro
+IBM01149 2100 CCSID01149 CP01149 ebcdic-is-871+euro
+Big5-HKSCS 2101
+IBM1047 2102 IBM-1047
+PTCP154 2103 csPTCP154 PT154 CP154 Cyrillic-Asian
+Amiga-1251 2104 Ami1251 Amiga1251 Ami-1251
+KOI7-switched 2105
+UNICODE-1-1 1010 csUnicode11
+SCSU 1011
+UTF-7 1012
+UTF-16BE 1013
+UTF-16LE 1014
+UTF-16 1015
+CESU-8 1016 csCESU-8
+UTF-32 1017
+UTF-32BE 1018
+UTF-32LE 1019
+BOCU-1 1020 csBOCU-1
+UNICODE-1-1-UTF-7 103 csUnicode11UTF7
+UTF-8 106 UNICODE-1-1-UTF-8 UNICODE-2-0-UTF-8 utf8
+ISO-8859-13 109 8859_13 ISO8859-13
+ISO-8859-14 110 iso-ir-199 ISO_8859-14:1998 ISO_8859-14 latin8 iso-celtic l8 8859_14 ISO8859-14
+ISO-8859-15 111 ISO_8859-15 Latin-9 8859_15 ISO8859-15
+ISO-8859-16 112 iso-ir-226 ISO_8859-16:2001 ISO_8859-16 latin10 l10
+GBK 113 CP936 MS936 windows-936
+GB18030 114
+OSD_EBCDIC_DF04_15 115
+OSD_EBCDIC_DF03_IRV 116
+OSD_EBCDIC_DF04_1 117
+JIS_Encoding 16 csJISEncoding
+Shift_JIS 17 MS_Kanji csShiftJIS X-SJIS Shift-JIS
+EUC-JP 18 csEUCPkdFmtJapanese Extended_UNIX_Code_Packed_Format_for_Japanese EUCJP
+Extended_UNIX_Code_Fixed_Width_for_Japanese 19 csEUCFixWidJapanese
+ISO-10646-UCS-Basic 1002 csUnicodeASCII
+ISO-10646-Unicode-Latin1 1003 csUnicodeLatin1 ISO-10646
+ISO-Unicode-IBM-1261 1005 csUnicodeIBM1261
+ISO-Unicode-IBM-1268 1006 csUnicodeIBM1268
+ISO-Unicode-IBM-1276 1007 csUnicodeIBM1276
+ISO-Unicode-IBM-1264 1008 csUnicodeIBM1264
+ISO-Unicode-IBM-1265 1009 csUnicodeIBM1265
+ISO-8859-1-Windows-3.0-Latin-1 2000 csWindows30Latin1
+ISO-8859-1-Windows-3.1-Latin-1 2001 csWindows31Latin1
+ISO-8859-2-Windows-Latin-2 2002 csWindows31Latin2
+ISO-8859-9-Windows-Latin-5 2003 csWindows31Latin5
+Adobe-Standard-Encoding 2005 csAdobeStandardEncoding
+Ventura-US 2006 csVenturaUS
+Ventura-International 2007 csVenturaInternational
+PC8-Danish-Norwegian 2012 csPC8DanishNorwegian
+PC8-Turkish 2014 csPC8Turkish
+IBM-Symbols 2015 csIBMSymbols
+IBM-Thai 2016 csIBMThai
+HP-Legal 2017 csHPLegal
+HP-Pi-font 2018 csHPPiFont
+HP-Math8 2019 csHPMath8
+Adobe-Symbol-Encoding 2020 csHPPSMath
+HP-DeskTop 2021 csHPDesktop
+Ventura-Math 2022 csVenturaMath
+Microsoft-Publishing 2023 csMicrosoftPublishing
+Windows-31J 2024 csWindows31J
+GB2312 2025 csGB2312 EUC-CN EUCCN CN-GB
+Big5 2026 csBig5 BIG-FIVE BIG-5 CN-BIG5 BIG_FIVE
+windows-1250 2250 CP1250 MS-EE
+windows-1251 2251 CP1251 MS-CYRL
+windows-1252 2252 CP1252 MS-ANSI
+windows-1253 2253 CP1253 MS-GREEK
+windows-1254 2254 CP1254 MS-TURK
+windows-1255 2255
+windows-1256 2256 CP1256 MS-ARAB
+windows-1257 2257 CP1257 WINBALTRIM
+windows-1258 2258
+TIS-620 2259
+HZ-GB-2312 2085
+
+# Additional encodings not defined by IANA
+
+# Arbitrary allocations
+#CP737 3001
+#CP853 3002
+#CP856 3003
+CP874 3004 WINDOWS-874
+#CP922 3005
+#CP1046 3006
+#CP1124 3007
+#CP1125 3008 WINDOWS-1125
+#CP1129 3009
+#CP1133 3010 IBM-CP1133
+#CP1161 3011 IBM-1161 IBM1161 CSIBM1161
+#CP1162 3012 IBM-1162 IBM1162 CSIBM1162
+#CP1163 3013 IBM-1163 IBM1163 CSIBM1163
+#GEORGIAN-ACADEMY 3014
+#GEORGIAN-PS 3015
+#KOI8-RU 3016
+#KOI8-T 3017
+#MACARABIC 3018 X-MAC-ARABIC MAC-ARABIC
+#MACCROATIAN 3019 X-MAC-CROATIAN MAC-CROATIAN
+#MACGREEK 3020 X-MAC-GREEK MAC-GREEK
+#MACHEBREW 3021 X-MAC-HEBREW MAC-HEBREW
+#MACICELAND 3022 X-MAC-ICELAND MAC-ICELAND
+#MACROMANIA 3023 X-MAC-ROMANIA MAC-ROMANIA
+#MACTHAI 3024 X-MAC-THAI MAC-THAI
+#MACTURKISH 3025 X-MAC-TURKISH MAC-TURKISH
+#MULELAO-1 3026
+
+# From Unicode Lib
+ISO-IR-182 4000
+ISO-IR-197 4002
+ISO-2022-JP-1 4008
+MACCYRILLIC 4009 X-MAC-CYRILLIC MAC-CYRILLIC
+MACUKRAINE 4010 X-MAC-UKRAINIAN MAC-UKRAINIAN
+MACCENTRALEUROPE 4011 X-MAC-CENTRALEURROMAN MAC-CENTRALEURROMAN
+JOHAB 4012
+ISO-8859-11 4014 iso-ir-166 ISO_8859-11 ISO8859-11 8859_11
+X-CURRENT 4999 X-SYSTEM
+X-ACORN-LATIN1 5001
+X-ACORN-FUZZY 5002
diff --git a/test/data/cscodec/INDEX b/test/data/cscodec/INDEX
new file mode 100644
index 0000000..326cff5
--- /dev/null
+++ b/test/data/cscodec/INDEX
@@ -0,0 +1,5 @@
+# Index file for charset codec tests
+#
+# Test Description
+
+simple.dat Simple tests, designed to validate testdriver \ No newline at end of file
diff --git a/test/data/cscodec/simple.dat b/test/data/cscodec/simple.dat
new file mode 100644
index 0000000..6a3cad1
--- /dev/null
+++ b/test/data/cscodec/simple.dat
Binary files differ
diff --git a/test/data/csdetect/INDEX b/test/data/csdetect/INDEX
new file mode 100644
index 0000000..e292063
--- /dev/null
+++ b/test/data/csdetect/INDEX
@@ -0,0 +1,9 @@
+# Index file for charset detection tests
+#
+# Test Description
+
+bom.dat UTF Byte Order Mark detection tests
+non-ascii-meta.dat Tests for meta charsets claiming to be non-ASCII
+test-yahoo-jp.dat Yahoo! Japan, from html5lib testcases
+tests1.dat Assorted tests, including edge cases, from html5lib
+tests2.dat Further tests from html5lib
diff --git a/test/data/csdetect/bom.dat b/test/data/csdetect/bom.dat
new file mode 100644
index 0000000..9a2f719
--- /dev/null
+++ b/test/data/csdetect/bom.dat
Binary files differ
diff --git a/test/data/csdetect/non-ascii-meta.dat b/test/data/csdetect/non-ascii-meta.dat
new file mode 100644
index 0000000..ea2a707
--- /dev/null
+++ b/test/data/csdetect/non-ascii-meta.dat
@@ -0,0 +1,129 @@
+#data
+<html>
+<head>
+<meta charset="utf-16">
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset="utf-16le">
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset="utf-16be">
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset='utf-16'>
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset='utf-16le'>
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset='utf-16be'>
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset=utf-16>
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset=utf-16le>
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset=utf-16be>
+#encoding
+windows-1252
+
+
+
+#data
+<html>
+<head>
+<meta charset="utf-32">
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset="utf-32le">
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset="utf-32be">
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset='utf-32'>
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset='utf-32le'>
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset='utf-32be'>
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset=utf-32>
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset=utf-32le>
+#encoding
+windows-1252
+
+#data
+<html>
+<head>
+<meta charset=utf-32be>
+#encoding
+windows-1252
+
+
diff --git a/test/data/csdetect/test-yahoo-jp.dat b/test/data/csdetect/test-yahoo-jp.dat
new file mode 100644
index 0000000..daf6125
--- /dev/null
+++ b/test/data/csdetect/test-yahoo-jp.dat
@@ -0,0 +1,10 @@
+#data
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=euc-jp">
+<!--京-->
+<title>Yahoo! JAPAN</title>
+<meta name="description" content="日本最大級ã®ãƒãƒ¼ã‚¿ãƒ«ã‚µã‚¤ãƒˆã€‚検索ã€ã‚ªãƒ¼ã‚¯ã‚·ãƒ§ãƒ³ã€ãƒ‹ãƒ¥ãƒ¼ã‚¹ã€ãƒ¡ãƒ¼ãƒ«ã€ã‚³ãƒŸãƒ¥ãƒ‹ãƒ†ã‚£ã€ã‚·ãƒ§ãƒƒãƒ”ングã€ãªã©80以上ã®ã‚µãƒ¼ãƒ“スを展開。ã‚ãªãŸã®ç”Ÿæ´»ã‚’より豊ã‹ã«ã™ã‚‹ã€Œãƒ©ã‚¤ãƒ•・エンジンã€ã‚’目指ã—ã¦ã„ãã¾ã™ã€‚">
+<style type="text/css" media="all">
+#encoding
+euc-jp \ No newline at end of file
diff --git a/test/data/csdetect/tests1.dat b/test/data/csdetect/tests1.dat
new file mode 100644
index 0000000..8a62676
--- /dev/null
+++ b/test/data/csdetect/tests1.dat
@@ -0,0 +1,392 @@
+#data
+<!DOCTYPE HTML>
+<!-- (control test - for the other tests to work, this should pass - you may have to set your defaults appropriately) -->
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<meta charset="ISO-8859-1">
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<meta charset="ISO-8859-9">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta charset='ISO-8859-9'>
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta charset=ISO-8859-9>
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta
+charset=ISO-8859-9>
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<metacharset=ISO-8859-9>
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-9">
+<!-- XXX this is a tough one, not sure how to do this one, unless we explictly do content= processing -->
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta content="text/html; charset=ISO-8859-9" http-equiv="Content-Type">
+<!-- XXX this is a tough one, not sure how to do this one, unless we explictly do content= processing -->
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta http-equiv="Content-Type" content=text/html; charset=ISO-8859-9>
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta http-equiv="Content-Type content="text/html; charset=ISO-8859-9">
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<meta http-equiv="Content-Type " content="text/html; charset=ISO-8859-9">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta content="text/html; charset=ISO-8859-9" http-equiv="Content-Type ">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta http-equiv="Content-Type>" content="text/html; charset=ISO-8859-9">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta content="text/html; charset=ISO-8859-9" http-equiv="Content-Type>">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta http-equiv="Content-Style-Type" content="text/html; charset=ISO-8859-9">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta content="text/html; charset=ISO-8859-9" http-equiv="Content-Style-Type">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta name="Content-Style-Type" content="text/html; charset=ISO-8859-9">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta content="text/html; charset=ISO-8859-9" name="Content-Style-Type">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta content="text/html; charset=ISO-8859-9">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta content=" text/html; charset = ISO-8859-9 ">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta content="
+text/html; charset=ISO-8859-9
+">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta charset="
+ISO-8859-9
+">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta charset=
+ISO-8859-9
+>
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta charset="ISO-8859-9>
+<p>"</p>
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<meta charset=ISO-8859-9">
+<p>"</p>
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<meta " charset=ISO-8859-9>
+<p>"</p>
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta test" charset=ISO-8859-9>
+<p>"</p>
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta test=" charset=ISO-8859-9>
+<p>"</p>
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<meta test="' charset=ISO-8859-9>
+<p>"'</p>
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<meta test='" charset=ISO-8859-9>
+<p>'"</p>
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<meta test="" charset=ISO-8859-9>
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta test=x" charset=ISO-8859-9>
+<p>"</p>
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<head></head><p title="x>
+<meta test=x" charset=ISO-8859-9>
+<p>"</p>
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<head></head><p title="x>
+<meta test=x charset=ISO-8859-9>
+<p>"</p>
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<head></head><p title="x>
+<meta charset=ISO-8859-9>
+<p>"</p>
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<head></head><p title="x>">
+<meta charset=ISO-8859-9>
+<p>"</p>
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta charset="ISO-8859-1">
+<meta charset="ISO-8859-9">
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<meta charset="ISO-8859-9">
+<meta charset="ISO-8859-1">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<!--<meta charset="ISO-8859-1">-->
+<meta charset="ISO-8859-9">
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<!--<meta charset="ISO-8859-9">-->
+<meta charset="ISO-8859-1">
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<!-- Starts with UTF-8 BOM -->
+#encoding
+UTF-8
+
+#data
+<!DOCTYPE HTML>
+<meta charset="ISO-8859-1">
+<!-- Starts with UTF-8 BOM -->
+#encoding
+UTF-8
+
+#data
+<!-- 511 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx-->
+<meta charset="ISO-8859-9">
+#encoding
+ISO-8859-9
+
+#data
+<!-- 512 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx-->
+<meta charset="ISO-8859-9">
+#encoding
+ISO-8859-9
+
+#data
+<!-- 1024 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx-->
+<meta charset="ISO-8859-9">
+#encoding
+Windows-1252
+
+#data
+<!-- 1025 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz-->
+<meta charset="ISO-8859-9">
+#encoding
+Windows-1252
+
+#data
+<!-- 2048 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxx-->
+<meta charset="ISO-8859-9">
+#encoding
+Windows-1252
+
+#data
+<!-- 2049 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxz-->
+<meta charset="ISO-8859-9">
+#encoding
+Windows-1252
+
+#data <!-- 4096 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx-->
+<meta charset="ISO-8859-9">
+#encoding
+Windows-1252
+
+#data <!-- 4097 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz-->
+<meta charset="ISO-8859-9">
+#encoding
+Windows-1252
+
+#data
+<!-- 8192 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx-->
+<meta charset="ISO-8859-9">
+#encoding
+Windows-1252
+
+#data
+<!-- 8193 characters xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxz-->
+<meta charset="ISO-8859-9">
+#encoding
+Windows-1252
+
+#data
+<!-- multi-script test -->
+<script>alert('step 1 of 3 ("þ")')</script>
+<!-- ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -->
+<script>alert('step 2 of 3 ("þ")')</script>
+<meta charset="ISO-8859-9">
+<script>alert('step 3 of 3 ("þ")')</script>
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<script>document.write('<meta charset="ISO-8859-' + '9">')</script>
+#encoding
+Windows-1252
+
+#data
+<!DOCTYPE HTML>
+<script>document.write('<meta charset="ISO-8859-9">')</script>
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<script type="text/plain"><meta charset="ISO-8859-9"></script>
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<style type="text/plain"><meta charset="ISO-8859-9"></style>
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<p><meta charset="ISO-8859-9"></p>
+#encoding
+ISO-8859-9
+
+#data
+<!DOCTYPE HTML>
+<meta charset="bogus">
+<meta charset="ISO-8859-9">
+#encoding
+ISO-8859-9
diff --git a/test/data/csdetect/tests2.dat b/test/data/csdetect/tests2.dat
new file mode 100644
index 0000000..dd43f85
--- /dev/null
+++ b/test/data/csdetect/tests2.dat
@@ -0,0 +1,82 @@
+#data
+<meta
+#encoding
+windows-1252
+
+#data
+<
+#encoding
+windows-1252
+
+#data
+<!
+#encoding
+windows-1252
+
+#data
+<meta charset = "
+#encoding
+windows-1252
+
+#data
+<meta charset=EUC-jp
+#encoding
+windows-1252
+
+#data
+<meta <meta charset='EUC-jp'>
+#encoding
+EUC-jp
+
+#data
+<meta charset = 'EUC-jp'>
+#encoding
+EUC-jp
+
+
+#data
+<!-- -->
+<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
+#encoding
+utf-8
+
+#data
+<!-- -->
+<meta http-equiv="Content-Type" content="text/html; charset=utf
+#encoding
+windows-1252
+
+#data
+<meta http-equiv="Content-Type<meta charset="utf-8">
+#encoding
+windows-1252
+
+#data
+<meta http-equiv="Content-Type" content="text/html; charset='utf-8'">
+#encoding
+utf-8
+
+#data
+<meta http-equiv="Content-Type" content="text/html; charset='utf-8">
+#encoding
+windows-1252
+
+#data
+<meta
+#encoding
+windows-1252
+
+#data
+<meta charset =
+#encoding
+windows-1252
+
+#data
+<meta charset= utf-8
+#encoding
+windows-1252
+
+#data
+<meta content = "text/html;
+#encoding
+windows-1252
diff --git a/test/data/html/INDEX b/test/data/html/INDEX
new file mode 100644
index 0000000..03d6e04
--- /dev/null
+++ b/test/data/html/INDEX
@@ -0,0 +1,6 @@
+# Index file for generic HTML content
+#
+# Test Description
+
+section-tree-construction.html HTML5 tree construction algorithm
+web-apps.html HTML5 specification
diff --git a/test/data/html/section-tree-construction.html b/test/data/html/section-tree-construction.html
new file mode 100644
index 0000000..45ce9ab
--- /dev/null
+++ b/test/data/html/section-tree-construction.html
@@ -0,0 +1,2783 @@
+<!DOCTYPE HTML>
+
+
+<html lang="en-GB-hixie">
+ <head>
+ <title>HTML 5</title>
+ <link href="/style/specification" type="text/css" rel="stylesheet">
+ <link href="/images/icon" rel="icon">
+
+ <style type="text/css">
+ h4 + .element { margin-top: -2.5em; padding-top: 2em; }
+ h4 + p + .element { margin-top: -5em; padding-top: 4em; }
+ .element { background: #EEFFEE; color: black; margin: 0 0 1em -1em; padding: 0 1em 0.25em 0.75em; border-left: solid #99FF99 0.25em; -padding: 0; /* that last decl is for IE6. Try removing it, it's hilarious! */ }
+ .proposal { border: blue solid; padding: 1em; }
+ table.matrix, table.matrix td { border: none; text-align: right; }
+ table.matrix { margin-left: 2em; }
+ </style>
+
+ <link href="section-tokenisation.html#nav-bar" rel="prev" title="8.2.3. Tokenisation"><link href="index.html#contents" rel="index" title="Table of contents"><link href="section-namespaces.html#nav-bar" rel="next" title="8.3. Namespaces"></head><body class="draft"><div class="head">
+ <p><a href="http://www.whatwg.org/" class="logo" rel="home"><img src="/images/logo" alt="WHATWG"></a></p>
+
+ <h1 id="html-5">HTML 5</h1>
+
+ <h2 id="working" class="no-num no-toc">Working Draft — 12 June 2007</h2></div><nav id="nav-bar"><a href="section-tokenisation.html#nav-bar">&lt; 8.2.3. Tokenisation</a> – <a href="index.html#contents">Table of contents</a> – <a href="section-namespaces.html#nav-bar">8.3. Namespaces &gt;</a></nav><h4 id="tree-construction"><span class="secno">8.2.4. </span><dfn id="tree-construction0">Tree construction</dfn></h4>
+
+ <p>The input to the tree construction stage is a sequence of tokens from
+ the <a href="section-tokenisation.html#tokenisation0">tokenisation</a> stage. The tree construction
+ stage is associated with a DOM <code>Document</code> object when a parser
+ is created. The &quot;output&quot; of this stage consists of dynamically modifying
+ or extending that document's DOM tree.
+
+ </p><p>Tree construction passes through several phases. Initially, UAs must act
+ according to the steps described as being those of <a href="#the-initial0">the initial phase</a>.
+
+ </p><p>This specification does not define when an interactive user agent has to
+ render the <code>Document</code> available to the user, or when it has to
+ begin accepting user input.
+
+ </p><p>When the steps below require the UA to <dfn id="append">append a
+ character</dfn> to a node, the UA must collect it and all subsequent
+ consecutive characters that would be appended to that node, and insert one
+ <code>Text</code> node whose data is the concatenation of all those
+ characters.
+
+ </p><p id="mutation-during-parsing">DOM mutation events must not fire for changes
+ caused by the UA parsing the document. (Conceptually, the parser is not
+ mutating the DOM, it is constructing it.) This includes the parsing of any
+ content inserted using <code title="dom-document-write-HTML"><a href="section-dynamic.html#document.write0">document.write()</a></code> and <code title="dom-document-writeln"><a href="section-dynamic.html#document.writeln">document.writeln()</a></code> calls.<!--
+ XXX xref -->
+ <a href="#refsDOM3EVENTS">[DOM3EVENTS]</a></p>
+ <!-- XXX
+ what about innerHTML? -->
+
+ <p class="note">Not all of the tag names mentioned below are conformant tag
+ names in this specification; many are included to handle legacy content.
+ They still form part of the algorithm that implementations are required to
+ implement to claim conformance.
+
+ </p><p class="note">The algorithm described below places no limit on the depth of
+ the DOM tree generated, or on the length of tag names, attribute names,
+ attribute values, text nodes, etc. While implementors are encouraged to
+ avoid arbitrary limits, it is recognised that <a href="section-conformance.html#hardwareLimitations">practical concerns</a> will likely force user
+ agents to impose nesting depth limits.
+
+ </p><h5 id="the-initial"><span class="secno">8.2.4.1. </span><dfn id="the-initial0">The initial phase</dfn></h5>
+
+ <p>Initially, the tree construction stage must handle each token emitted
+ from the <a href="section-tokenisation.html#tokenisation0">tokenisation</a> stage as follows:
+
+ </p><dl class="switch">
+ <dt>A DOCTYPE token that is marked as being in error
+
+ </dt><dt>A comment token
+
+ </dt><dt>A start tag token
+
+ </dt><dt>An end tag token
+
+ </dt><dt>A character token that is not one of U+0009 CHARACTER
+ TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM
+ FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dt>An end-of-file token
+
+ </dt><dd>
+ <p>This specification does not define how to handle this case. In
+ particular, user agents may ignore the entirety of this specification
+ altogether for such documents, and instead invoke special parse modes
+ with a greater emphasis on backwards compatibility.</p>
+
+ <div class="note">
+ <p>Browsers in particular have generally used DOCTYPE-based sniffing to
+ invoke an &quot;alternative conformance mode&quot; known as <em>quirks mode</em>
+ on certain documents. In this mode, emphasis is put on legacy
+ compatibility rather than on standards compliance. This specification
+ takes no position on this behaviour; documents without DOCTYPEs or with
+ DOCTYPEs that do not conform to the syntax allowed by this
+ specification are considered to be out of scope of this specification.</p>
+ </div>
+
+ <div class="big-issue">
+ <p>As far as parsing goes, the quirks I know of are:</p>
+
+ <ul>
+ <li>Comment parsing is different.
+
+ </li><li>The following is considered one script block (!):
+ <pre>&lt;script&gt;&lt;!-- document.write('&lt;/script&gt;'); --&gt;&lt;/script&gt;</pre>
+
+ </li><li><code title="">&lt;/br&gt;</code> and <code title="">&lt;/p&gt;</code> do
+ magical things.
+
+ </li><li><code><a href="section-prose.html#p">p</a></code> can contain <code><a href="section-tabular.html#table">table</a></code>
+
+ </li><li>Safari and IE have special parsing rules for &lt;% ... %&gt; (even
+ in standards mode, though clearly this should be quirks-only).
+ </li></ul>
+
+ <p>Maybe we should just adopt all those and be done with it. One parsing
+ mode to rule them all. Or legitimise/codify the quirks mode parsing in
+ some way.</p>
+
+ <p>Would be interesting to do a search to see how many pages hit each of
+ the above.</p>
+ <!-- biased by page rank? --></div>
+
+ </dd><dt>A DOCTYPE token marked as being correct
+
+ </dt><dd>
+ <p>Append a <code>DocumentType</code> node to the <code>Document</code>
+ node, with the <code title="">name</code> attribute set to the name
+ given in the DOCTYPE token (which will be &quot;HTML&quot;), and the other
+ attributes specific to <code>DocumentType</code> objects set to null,
+ empty lists, or the empty string as appropriate.</p>
+
+ <p>Then, switch to <a href="#the-root1">the root element phase</a> of the
+ tree construction stage.</p>
+ <!-- XXX should set doctype on the Document object, too, unless
+ spec is defined to already point to it if you append -->
+
+
+ </dd><dt>A character token that <em>is</em> one of U+0009 CHARACTER
+ TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM
+ FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dd>
+ <p><a href="#append" title="append a character">Append that character</a>
+ to the <code>Document</code> node.</p>
+ </dd></dl>
+
+ <h5 id="the-root0"><span class="secno">8.2.4.2. </span><dfn id="the-root1">The
+ root element phase</dfn></h5>
+
+ <p>After <a href="#the-initial0">the initial phase</a>, as each token is
+ emitted from the <a href="section-tokenisation.html#tokenisation0">tokenisation</a> stage, it must
+ be processed as described in this section.
+
+ </p><dl class="switch">
+ <dt>A DOCTYPE token
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+
+ </dd><dt>A comment token
+
+ </dt><dd>
+ <p>Append a <code>Comment</code> node to the <code>Document</code> object
+ with the <code title="">data</code> attribute set to the data given in
+ the comment token.</p>
+
+ </dd><dt>A character token that is one of U+0009 CHARACTER TABULATION,
+ U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
+ U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dd>
+ <p><a href="#append" title="append a character">Append that character</a>
+ to the <code>Document</code> node.</p>
+
+ </dd><dt>A character token that is <em>not</em> one of U+0009 CHARACTER
+ TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM
+ FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dt>A start tag token
+
+ </dt><dt>An end tag token
+
+ </dt><dt>An end-of-file token
+
+ </dt><dd>
+ <p>Create an <code><a href="section-elements.html#htmlelement">HTMLElement</a></code> node
+ with the tag name <code><a href="section-the-root.html#html">html</a></code>, in the <a href="section-namespaces.html#html-namespace0">HTML namespace</a>. Append it to the
+ <code>Document</code> object. Switch to <a href="#the-main0">the main
+ phase</a> and reprocess the current token.</p>
+
+ <p class="big-issue">Should probably make end tags be ignored, so that
+ &quot;&lt;/head&gt;&lt;!-- --&gt;&lt;html&gt;&quot; puts the comment before the root node
+ (or should we?)</p>
+ </dd></dl>
+
+ <p>The root element can end up being removed from the <code>Document</code>
+ object, e.g. by scripts; nothing in particular happens in such cases,
+ content continues being appended to the nodes as described in the next
+ section.
+
+ </p><h5 id="the-main"><span class="secno">8.2.4.3. </span><dfn id="the-main0">The
+ main phase</dfn></h5>
+
+ <p>After <a href="#the-root1">the root element phase</a>, each token
+ emitted from the <a href="section-tokenisation.html#tokenisation0">tokenisation</a> stage must be
+ processed as described in <em>this</em> section. This is by far the most
+ involved part of parsing an HTML document.
+
+ </p><p>The tree construction stage in this phase has several pieces of state: a
+ <a href="#stack">stack of open elements</a>, a <a href="#list-of4">list of
+ active formatting elements</a>, a <a href="#head-element"><code title="">head</code> element pointer</a>, a <a href="#form-element"><code title="">form</code> element pointer</a>, and an <a href="#insertion0">insertion mode</a>.
+
+ </p><p class="big-issue">We could just fold insertion modes and phases into one
+ concept (and duplicate the two rules common to all insertion modes into
+ all of them).
+
+ </p><h6 id="the-stack"><span class="secno">8.2.4.3.1. </span>The stack of open
+ elements</h6>
+
+ <p>Initially the <dfn id="stack">stack of open elements</dfn> contains just
+ the <code><a href="section-the-root.html#html">html</a></code> root element node created in the
+ <a href="#the-root1" title="the root element phase">last phase</a> before
+ switching to <em>this</em> phase (or, in the <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>, the <code><a href="section-the-root.html#html">html</a></code> element created to represent the element
+ whose <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code> attribute is being set). That's
+ the topmost node of the stack. It never gets popped off the stack. (This
+ stack grows downwards.)
+
+ </p><p>The <dfn id="current4">current node</dfn> is the bottommost node in this
+ stack.
+
+ </p><p>Elements in the stack fall into the following categories:
+
+ </p><dl>
+ <dt><dfn id="special">Special</dfn>
+
+ </dt><dd>
+ <p>The following HTML elements have varying levels of special parsing
+ rules: <code><a href="section-sections.html#address">address</a></code>, <code><a href="section-the-canvas.html#area">area</a></code>, <code><a href="section-document.html#base">base</a></code>,
+ <code>basefont</code>, <code>bgsound</code>, <code><a href="section-sections.html#blockquote">blockquote</a></code>, <code><a href="section-sections.html#body0">body</a></code>, <code><a href="section-prose.html#br">br</a></code>,
+ <code>center</code>, <code><a href="section-tabular.html#col">col</a></code>, <code><a href="section-tabular.html#colgroup">colgroup</a></code>, <code><a href="section-lists0.html#dd">dd</a></code>,
+ <code>dir</code>, <code><a href="section-miscellaneous.html#div">div</a></code>, <code><a href="section-lists0.html#dl">dl</a></code>, <code><a href="section-lists0.html#dt">dt</a></code>, <code><a href="section-embedded.html#embed">embed</a></code>, <code>fieldset</code>,
+ <code>form</code>, <code>frame</code>, <code>frameset</code>, <code><a href="section-sections.html#h1">h1</a></code>, <code><a href="section-sections.html#h2">h2</a></code>, <code><a href="section-sections.html#h3">h3</a></code>, <code><a href="section-sections.html#h4">h4</a></code>, <code><a href="section-sections.html#h5">h5</a></code>, <code><a href="section-sections.html#h6">h6</a></code>, <code><a href="section-document.html#head">head</a></code>, <code><a href="section-prose.html#hr">hr</a></code>,
+ <code><a href="section-embedded.html#iframe">iframe</a></code>,
+ <code>image</code><!-- XXX ? this isn't an element that can end up
+ on the stack-->,
+ <code><a href="section-embedded.html#img">img</a></code>, <code>input</code>,
+ <code>isindex</code>, <code><a href="section-lists0.html#li">li</a></code>, <code><a href="section-document.html#link">link</a></code>, <code>listing</code>, <code><a href="section-the-command.html#menu">menu</a></code>, <code><a href="section-document.html#meta0">meta</a></code>,
+ <code>noembed</code>, <code>noframes</code>, <code><a href="section-scripting0.html#noscript">noscript</a></code>, <code><a href="section-lists0.html#ol">ol</a></code>,
+ <code>optgroup</code>, <code>option</code>, <code><a href="section-prose.html#p">p</a></code>, <code><a href="section-embedded.html#param">param</a></code>,
+ <code>plaintext</code>, <code><a href="section-preformatted.html#pre">pre</a></code>, <code><a href="section-scripting0.html#script0">script</a></code>, <code>select</code>,
+ <code>spacer</code>, <code><a href="section-document.html#style">style</a></code>, <code><a href="section-tabular.html#tbody">tbody</a></code>, <code>textarea</code>, <code><a href="section-tabular.html#tfoot0">tfoot</a></code>, <code><a href="section-tabular.html#thead0">thead</a></code>, <code><a href="section-document.html#title1">title</a></code>, <code><a href="section-tabular.html#tr">tr</a></code>,
+ <code><a href="section-lists0.html#ul">ul</a></code>, and <code>wbr</code>.
+
+ </p></dd><dt><dfn id="scoping">Scoping</dfn>
+
+ </dt><dd>
+ <p>The following HTML elements introduce new <a href="#have-an" title="has an element in scope">scopes</a> for various parts of the
+ parsing: <code>button</code>, <code><a href="section-tabular.html#caption0">caption</a></code>, <code><a href="section-the-root.html#html">html</a></code>, <code>marquee</code>, <code><a href="section-embedded.html#object">object</a></code>, <code><a href="section-tabular.html#table">table</a></code>, <code><a href="section-tabular.html#td">td</a></code> and
+ <code><a href="section-tabular.html#th">th</a></code>.
+
+ </p></dd><dt><dfn id="formatting">Formatting</dfn>
+
+ </dt><dd>
+ <p>The following HTML elements are those that end up in the <a href="#list-of4">list of active formatting elements</a>: <code><a href="section-phrase.html#a">a</a></code>, <code><a href="section-phrase.html#b">b</a></code>,
+ <code>big</code>, <code><a href="section-phrase.html#em">em</a></code>, <code><a href="section-presentational.html#font">font</a></code>, <code><a href="section-phrase.html#i">i</a></code>,
+ <code>nobr</code>, <code>s</code>, <code><a href="section-phrase.html#small">small</a></code>, <code>strike</code>, <code><a href="section-phrase.html#strong">strong</a></code>, <code>tt</code>, and <code>u</code>.
+
+ </p></dd><dt><dfn id="phrasing">Phrasing</dfn>
+
+ </dt><dd>
+ <p>All other elements found while parsing an HTML document.
+ </p></dd></dl>
+
+ <p class="big-issue">Still need to add these new elements to the lists:
+ <code><a href="section-scripting0.html#event-source">event-source</a></code>, <code><a href="section-sections.html#section">section</a></code>, <code><a href="section-sections.html#nav">nav</a></code>,
+ <code><a href="section-sections.html#article">article</a></code>, <code><a href="section-sections.html#aside">aside</a></code>, <code><a href="section-sections.html#header">header</a></code>,
+ <code><a href="section-sections.html#footer">footer</a></code>, <code><a href="section-interactive.html#datagrid0">datagrid</a></code>, <code><a href="section-the-command.html#command0">command</a></code>
+
+ </p><p>The <a href="#stack">stack of open elements</a> is said to <dfn id="have-an" title="has an element in scope">have an element in scope</dfn>
+ or <dfn id="have-an0" title="has an element in table scope">have an element
+ in <em>table scope</em></dfn> when the following algorithm terminates in a
+ match state:
+
+ </p><ol>
+ <li>
+ <p>Initialise <var title="">node</var> to be the <a href="#current4">current node</a> (the bottommost node of the stack).
+
+ </p></li><li>
+ <p>If <var title="">node</var> is the target node, terminate in a match
+ state.
+
+ </p></li><li>
+ <p>Otherwise, if <var title="">node</var> is a <code><a href="section-tabular.html#table">table</a></code> element, terminate in a failure state.
+
+ </p></li><li>
+ <p>Otherwise, if the algorithm is the &quot;has an element in scope&quot; variant
+ (rather than the &quot;has an element in table scope&quot; variant), and <var title="">node</var> is one of the following, terminate in a failure
+ state:</p>
+
+ <ul class="brief">
+ <li><code><a href="section-tabular.html#caption0">caption</a></code>
+
+ </li><li><code><a href="section-tabular.html#td">td</a></code>
+
+ </li><li><code><a href="section-tabular.html#th">th</a></code>
+
+ </li><li><code>button</code>
+
+ </li><li><code>marquee</code>
+
+ </li><li><code><a href="section-embedded.html#object">object</a></code>
+ </li></ul>
+
+ </li><li>
+ <p>Otherwise, if <var title="">node</var> is an <code><a href="section-the-root.html#html">html</a></code> element, terminate in a failure state.
+ (This can only happen if the <var title="">node</var> is the topmost
+ node of the <a href="#stack">stack of open elements</a>, and prevents
+ the next step from being invoked if there are no more elements in the
+ stack.)
+
+ </p></li><li>
+ <p>Otherwise, set <var title="">node</var> to the previous entry in the
+ <a href="#stack">stack of open elements</a> and return to step 2. (This
+ will never fail, since the loop will always terminate in the previous
+ step if the top of the stack is reached.)
+ </p></li></ol>
+
+ <p>Nothing happens if at any time any of the elements in the <a href="#stack">stack of open elements</a> are moved to a new location in,
+ or removed from, the <code>Document</code> tree. In particular, the stack
+ is not changed in this situation. This can cause, amongst other strange
+ effects, content to be appended to nodes that are no longer in the DOM.
+
+ </p><p class="note">In some cases (namely, when <a href="#adoptionAgency">closing
+ misnested formatting elements</a>), the stack is manipulated in a
+ random-access fashion.
+
+ </p><h6 id="the-list"><span class="secno">8.2.4.3.2. </span>The list of active
+ formatting elements</h6>
+
+ <p>Initially the <dfn id="list-of4">list of active formatting elements</dfn>
+ is empty. It is used to handle mis-nested <a href="#formatting" title="formatting">formatting element tags</a>.
+
+ </p><p>The list contains elements in the <a href="#formatting">formatting</a>
+ category, and scope markers. The scope markers are inserted when entering
+ buttons, <code><a href="section-embedded.html#object">object</a></code> elements, marquees,
+ table cells, and table captions, and are used to prevent formatting from
+ &quot;leaking&quot; into tables, buttons, <code><a href="section-embedded.html#object">object</a></code>
+ elements, and marquees.
+
+ </p><p>When the steps below require the UA to <dfn id="reconstruct">reconstruct
+ the active formatting elements</dfn>, the UA must perform the following
+ steps:
+
+ </p><ol>
+ <li>If there are no entries in the <a href="#list-of4">list of active
+ formatting elements</a>, then there is nothing to reconstruct; stop this
+ algorithm.
+
+ </li><li>If the last (most recently added) entry in the <a href="#list-of4">list of active formatting elements</a> is a marker, or
+ if it is an element that is in the <a href="#stack">stack of open
+ elements</a>, then there is nothing to reconstruct; stop this algorithm.
+
+ </li><li>Let <var title="">entry</var> be the last (most recently added)
+ element in the <a href="#list-of4">list of active formatting
+ elements</a>.
+
+ </li><li>If there are no entries before <var title="">entry</var> in the <a href="#list-of4">list of active formatting elements</a>, then jump to
+ step 8.
+
+ </li><li>Let <var title="">entry</var> be the entry one earlier than <var title="">entry</var> in the <a href="#list-of4">list of active formatting
+ elements</a>.
+
+ </li><li>If <var title="">entry</var> is neither a marker nor an element that
+ is also in the <a href="#stack">stack of open elements</a>, go to step 4.
+
+ </li><li>Let <var title="">entry</var> be the element one later than <var title="">entry</var> in the <a href="#list-of4">list of active formatting
+ elements</a>.
+
+ </li><li>Perform a shallow clone of the element <var title="">entry</var> to
+ obtain <var title="">clone</var>. <a href="#refsDOM3CORE">[DOM3CORE]</a>
+
+ </li><li>Append <var title="">clone</var> to the <a href="#current4">current
+ node</a> and push it onto the <a href="#stack">stack of open elements</a>
+ so that it is the new <a href="#current4">current node</a>.
+
+ </li><li>Replace the entry for <var title="">entry</var> in the list with an
+ entry for <var title="">clone</var>.
+
+ </li><li>If the entry for <var title="">clone</var> in the <a href="#list-of4">list of active formatting elements</a> is not the last
+ entry in the list, return to step 7.
+ </li></ol>
+
+ <p>This has the effect of reopening all the formatting elements that were
+ opened in the current body, cell, or caption (whichever is youngest) that
+ haven't been explicitly closed.
+
+ </p><p class="note">The way this specification is written, the <a href="#list-of4">list of active formatting elements</a> always consists of
+ elements in chronological order with the least recently added element
+ first and the most recently added element last (except while steps 8
+ to 11 of the above algorithm are being executed, of course).
+
+ </p><p>When the steps below require the UA to <dfn id="clear0">clear the list of
+ active formatting elements up to the last marker</dfn>, the UA must
+ perform the following steps:
+
+ </p><ol>
+ <li>Let <var title="">entry</var> be the last (most recently added) entry
+ in the <a href="#list-of4">list of active formatting elements</a>.
+
+ </li><li>Remove <var title="">entry</var> from the <a href="#list-of4">list of
+ active formatting elements</a>.
+
+ </li><li>If <var title="">entry</var> was a marker, then stop the algorithm at
+ this point. The list has been cleared up to the last marker.
+
+ </li><li>Go to step 1.
+ </li></ol>
+
+ <h6 id="creating"><span class="secno">8.2.4.3.3. </span>Creating and inserting
+ HTML elements</h6>
+
+ <p>When the steps below require the UA to <dfn id="create" title="create an
+ element for the token">create an element for a token</dfn>, the UA must
+ create a node implementing the interface appropriate for the element type
+ corresponding to the tag name of the token (as given in the section of
+ this specification that defines that element, e.g. for an <code><a href="section-phrase.html#a">a</a></code> element it would be the <code><a href="section-phrase.html#htmlanchorelement">HTMLAnchorElement</a></code> interface), with
+ the tag name being the name of that element, with the node being in the <a href="section-namespaces.html#html-namespace0">HTML namespace</a>, and with the attributes on the
+ node being those given in the given token.
+
+ </p><p>When the steps below require the UA to <dfn id="insert">insert an HTML
+ element</dfn> for a token, the UA must first <a href="#create">create an
+ element for the token</a>, and then append this node to the <a href="#current4">current node</a>, and push it onto the <a href="#stack">stack of open elements</a> so that it is the new <a href="#current4">current node</a>.
+
+ </p><p>The steps below may also require that the UA insert an HTML element in a
+ particular place, in which case the UA must <a href="#create">create an
+ element for the token</a> and then insert or append the new node in the
+ location specified. (This happens in particular during the parsing of
+ tables with invalid content.)
+
+ </p><p>The interface appropriate for an element that is not defined in this
+ specification is <code><a href="section-elements.html#htmlelement">HTMLElement</a></code>.
+
+ </p><h6 id="closing"><span class="secno">8.2.4.3.4. </span>Closing elements that
+ have implied end tags</h6>
+
+ <p>When the steps below require the UA to <dfn id="generate">generate implied
+ end tags</dfn>, then, if the <a href="#current4">current node</a> is a
+ <code><a href="section-lists0.html#dd">dd</a></code> element, a <code><a href="section-lists0.html#dt">dt</a></code> element, an <code><a href="section-lists0.html#li">li</a></code>
+ element, a <code><a href="section-prose.html#p">p</a></code> element, a <code><a href="section-tabular.html#td">td</a></code> element, a <code><a href="section-tabular.html#th">th</a></code>
+ element, or a <code><a href="section-tabular.html#tr">tr</a></code> element, the UA must act
+ as if an end tag with the respective tag name had been seen and then <a href="#generate">generate implied end tags</a> again.
+
+ </p><p>If a step requires the UA to generate implied end tags but lists an
+ element to exclude from the process, then the UA must perform the above
+ steps as if that element was not in the above list.
+
+ </p><h6 id="the-element"><span class="secno">8.2.4.3.5. </span>The element pointers</h6>
+
+ <p>Initially the <dfn id="head-element"><code title="">head</code> element
+ pointer</dfn> and the <dfn id="form-element"><code title="">form</code>
+ element pointer</dfn> are both null.
+
+ </p><p>Once a <code><a href="section-document.html#head">head</a></code> element has been parsed
+ (whether implicitly or explicitly) the <a href="#head-element"><code title="">head</code> element pointer</a> gets set to point to this node.
+
+ </p><p>The <a href="#form-element"><code title="">form</code> element
+ pointer</a> points to the last <code>form</code> element that was opened
+ and whose end tag has not yet been seen. It is used to make form controls
+ associate with forms in the face of dramatically bad markup, for
+ historical reasons.
+
+ </p><h6 id="the-insertion"><span class="secno">8.2.4.3.6. </span>The insertion mode</h6>
+
+ <p>Initially the <dfn id="insertion0">insertion mode</dfn> is &quot;<a href="#before2" title="insertion mode: before head">before head</a>&quot;. It
+ can change to &quot;<a href="#in-head" title="insertion mode: in head">in
+ head</a>&quot;, &quot;<a href="#after1" title="insertion mode: after head">after
+ head</a>&quot;, &quot;<a href="#in-body" title="insertion mode: in body">in
+ body</a>&quot;, &quot;<a href="#in-table" title="insertion mode: in table">in
+ table</a>&quot;, &quot;<a href="#in-caption" title="insertion mode: in caption">in
+ caption</a>&quot;, &quot;<a href="#in-column" title="insertion mode: in column
+ group">in column group</a>&quot;, &quot;<a href="#in-table0" title="insertion mode:
+ in table body">in table body</a>&quot;, &quot;<a href="#in-row" title="insertion
+ mode: in row">in row</a>&quot;, &quot;<a href="#in-cell" title="insertion mode: in
+ cell">in cell</a>&quot;, &quot;<a href="#in-select" title="insertion mode: in
+ select">in select</a>&quot;, &quot;<a href="#after2" title="insertion mode: after
+ body">after body</a>&quot;, &quot;<a href="#in-frameset" title="insertion mode: in
+ frameset">in frameset</a>&quot;, and &quot;<a href="#after3" title="insertion mode:
+ after frameset">after frameset</a>&quot; during the course of the parsing, as
+ described below. It affects how certain tokens are processed.
+
+ </p><p>If the tree construction stage is switched from <a href="#the-main0">the
+ main phase</a> to <a href="#the-trailing0">the trailing end phase</a> and
+ back again, the various pieces of state are not reset; the UA must act as
+ if the state was maintained.
+
+ </p><p>When the steps below require the UA to <dfn id="reset">reset the insertion
+ mode appropriately</dfn>, it means the UA must follow these steps:
+
+ </p><ol>
+ <li>Let <var title="">last</var> be false.
+
+ </li><li>Let <var title="">node</var> be the last node in the <a href="#stack">stack of open elements</a>.
+
+ </li><li>If <var title="">node</var> is the first node in the stack of open
+ elements, then set <var title="">last</var> to true. If the element whose
+ <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code>
+ attribute is being set is neither a <code><a href="section-tabular.html#td">td</a></code>
+ element nor a <code><a href="section-tabular.html#th">th</a></code> element, then set <var title="">node</var> to the element whose <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code>
+ attribute is being set. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code>
+ case</a>)
+
+ </li><li>If <var title="">node</var> is a <code>select</code> element, then
+ switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-select" title="insertion mode: in select">in select</a>&quot; and
+ abort these steps. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code>
+ case</a>)
+
+ </li><li>If <var title="">node</var> is a <code><a href="section-tabular.html#td">td</a></code> or
+ <code><a href="section-tabular.html#th">th</a></code> element, then switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-cell" title="insertion mode: in cell">in cell</a>&quot; and abort these steps.
+
+ </li><li>If <var title="">node</var> is a <code><a href="section-tabular.html#tr">tr</a></code>
+ element, then switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-row" title="insertion mode: in row">in row</a>&quot; and abort these
+ steps.
+
+ </li><li>If <var title="">node</var> is a <code><a href="section-tabular.html#tbody">tbody</a></code>, <code><a href="section-tabular.html#thead0">thead</a></code>,
+ or <code><a href="section-tabular.html#tfoot0">tfoot</a></code> element, then switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-table0" title="insertion mode: in table body">in table body</a>&quot; and abort these
+ steps.
+
+ </li><li>If <var title="">node</var> is a <code><a href="section-tabular.html#caption0">caption</a></code> element, then switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-caption" title="insertion mode: in caption">in caption</a>&quot; and abort these steps.
+
+ </li><li>If <var title="">node</var> is a <code><a href="section-tabular.html#colgroup">colgroup</a></code> element, then switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-column" title="insertion mode: in column group">in column group</a>&quot; and abort
+ these steps. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)
+
+ </li><li>If <var title="">node</var> is a <code><a href="section-tabular.html#table">table</a></code> element, then switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-table" title="insertion mode: in table">in table</a>&quot; and abort these steps.
+
+ </li><li>If <var title="">node</var> is a <code><a href="section-document.html#head">head</a></code>
+ element, then switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-body" title="insertion mode: in body">in body</a>&quot; (&quot;<a href="#in-body" title="insertion mode: in body">in body</a>&quot;! <em> not
+ &quot;<a href="#in-head" title="insertion mode: in head">in head</a>&quot;</em>!)
+ and abort these steps. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code>
+ case</a>)
+
+ </li><li>If <var title="">node</var> is a <code><a href="section-sections.html#body0">body</a></code> element, then switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-body" title="insertion mode: in body">in body</a>&quot; and abort these steps.
+
+ </li><li>If <var title="">node</var> is a <code>frameset</code> element, then
+ switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-frameset" title="insertion mode: in frameset">in frameset</a>&quot;
+ and abort these steps. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code>
+ case</a>)
+
+ </li><li>If <var title="">node</var> is an <code><a href="section-the-root.html#html">html</a></code> element, then: if the <a href="#head-element"><code title="">head</code> element pointer</a> is
+ null, switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#before2" title="insertion mode: before head">before head</a>&quot;,
+ otherwise, switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#after1" title="insertion mode: after head">after head</a>&quot;. In
+ either case, abort these steps. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</li>
+ <!-- XXX can the head element pointer ever be
+ non-null when we're going through these steps? -->
+
+ <li>If <var title="">last</var> is true, then set the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-body" title="insertion mode: in body">in body</a>&quot; and abort these steps. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)
+
+ </li><li>Let <var title="">node</var> now be the node before <var title="">node</var> in the <a href="#stack">stack of open elements</a>.
+
+ </li><li>Return to step 3.
+ </li></ol>
+ <!--When you don't have to handle innerHTML, you can use this
+simplified explanation instead:
+
+ <ol>
+
+ <li><p>If the <span>stack of open elements</span> <span title="has
+ an element in table scope">has a <code>td</code> or <code>th</code>
+ element in table scope</span>, then switch the <span>insertion
+ mode</span> to "<span title="insertion mode: in cell">in
+ cell</span>".</p></li>
+
+ <li><p>Otherwise, if the <span>stack of open elements</span> <span
+ title="has an element in table scope">has a <code>tr</code> element
+ in table scope</span>, then switch the <span>insertion mode</span>
+ to "<span title="insertion mode: in row">in row</span>".</p></li>
+
+ <li><p>Otherwise, if the <span>stack of open elements</span> <span
+ title="has an element in table scope">has a <code>tbody</code>,
+ <code>tfoot</code>, or <code>thead</code> element in table
+ scope</span>, then switch the <span>insertion mode</span> to "<span
+ title="insertion mode: in table body">in table
+ body</span>".</p></li>
+
+ <li><p>Otherwise, if the <span>stack of open elements</span> <span
+ title="has an element in table scope">has a <code>caption</code>
+ element in table scope</span>, then switch the <span>insertion
+ mode</span> to "<span title="insertion mode: in caption">in
+ caption</span>".</p></li>
+
+ ( you can't reach this point with a colgroup element on the
+ stack )
+
+ <li><p>Otherwise, if the <span>stack of open elements</span> <span
+ title="has an element in table scope">has a <code>table</code>
+ element in table scope</span>, then switch the <span>insertion
+ mode</span> to "<span title="insertion mode: in table">in
+ table</span>".</p></li>
+
+ <li><p>Otherwise, switch the <span>insertion mode</span> to "<span
+ title="insertion mode: in body">in body</span>".</p></li>
+
+ </ol>
+-->
+
+ <h6 id="how-to0"><span class="secno">8.2.4.3.7. </span>How to handle tokens in
+ the main phase</h6>
+
+ <p>Tokens in the main phase must be handled as follows:
+
+ </p><dl class="switch">
+ <dt>A DOCTYPE token
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+
+ </dd><dt>A start tag token with the tag name &quot;html&quot;
+
+ </dt><dd>
+ <p>If this start tag token was not the first start tag token, then it is
+ a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ <p>For each attribute on the token, check to see if the attribute is
+ already present on the top element of the <a href="#stack">stack of open
+ elements</a>. If it is not, add the attribute and its corresponding
+ value to that element.</p>
+
+ </dd><dt>An end-of-file token
+
+ </dt><dd>
+ <p><a href="#generate">Generate implied end tags.</a></p>
+
+ <p>If there are more than two nodes on the <a href="#stack">stack of open
+ elements</a>, or if there are two nodes but the second node is not a
+ <code><a href="section-sections.html#body0">body</a></code> node, this is a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ <p>Otherwise, if the parser was originally created in order to handle the
+ setting of an element's <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code> attribute, and there's more than
+ one element in the <a href="#stack">stack of open elements</a>, and the
+ second node on the <a href="#stack">stack of open elements</a> is not a
+ <code><a href="section-sections.html#body0">body</a></code> node, then this is a <a href="section-parsing.html#parse">parse error</a>. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p>
+
+ <p><a href="#stops">Stop parsing.</a></p>
+
+ <p class="big-issue">This fails because it doesn't imply HEAD and BODY
+ tags. We should probably expand out the insertion modes and merge them
+ with phases and then put the three things here into each insertion mode
+ instead of trying to factor them out so carefully.</p>
+
+ </dd><dt>Anything else
+
+ </dt><dd>
+ <p>Depends on the <a href="#insertion0">insertion mode</a>:</p>
+
+ <dl class="switch">
+ <dt>If the <a href="#insertion0">insertion mode</a> is &quot;<dfn id="before2" title="insertion mode: before head">before head</dfn>&quot;
+
+ </dt><dd>
+ <p>Handle the token as follows:</p>
+
+ <dl class="switch">
+ <dt>A character token that is one of U+0009 CHARACTER
+ TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C
+ FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dd>
+ <p><a href="#append" title="append a character">Append the
+ character</a> to the <a href="#current4">current node</a>.</p>
+
+ </dd><dt>A comment token
+
+ </dt><dd>
+ <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment
+ token.</p>
+
+ </dd><dt>A start tag token with the tag name &quot;head&quot;
+
+ </dt><dd>
+ <p><a href="#create">Create an element for the token</a>.</p>
+
+ <p>Set the <a href="#head-element"><code title="">head</code> element
+ pointer</a> to this new element node.</p>
+
+ <p>Append the new element to the <a href="#current4">current node</a>
+ and push it onto the <a href="#stack">stack of open elements</a>.</p>
+
+ <p>Change the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-head" title="insertion mode: in head">in head</a>&quot;.</p>
+
+ </dd><dt>A start tag token whose tag name is one of: &quot;base&quot;, &quot;link&quot;,
+ &quot;meta&quot;, &quot;script&quot;, &quot;style&quot;, &quot;title&quot;
+
+ </dt><dd>
+ <p>Act as if a start tag token with the tag name &quot;head&quot; and no
+ attributes had been seen, then reprocess the current token.</p>
+
+ <p class="note">This will result in a <code><a href="section-document.html#head">head</a></code> element being generated, and with the
+ current token being reprocessed in the &quot;<a href="#in-head" title="insertion mode: in head">in head</a>&quot; <a href="#insertion0">insertion mode</a>.</p>
+
+ </dd><dt>An end tag with the tag name &quot;html&quot;
+
+ </dt><dd>
+ <p>Act as if a start tag token with the tag name &quot;head&quot; and no
+ attributes had been seen, then reprocess the current token.</p>
+
+ </dd><dt>Any other end tag
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+
+ </dd><dt>A character token that is <em>not</em> one of U+0009 CHARACTER
+ TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C
+ FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dt>Any other start tag token
+
+ </dt><dd>
+ <p>Act as if a start tag token with the tag name &quot;head&quot; and no
+ attributes had been seen, then reprocess the current token.</p>
+
+ <p class="note">This will result in an empty <code><a href="section-document.html#head">head</a></code> element being generated, with the
+ current token being reprocessed in the &quot;<a href="#after1" title="insertion mode: after head">after head</a>&quot; <a href="#insertion0">insertion mode</a>.</p>
+ </dd></dl>
+
+ </dd><dt id="parsing-main-inhead">If the <a href="#insertion0">insertion
+ mode</a> is &quot;<dfn id="in-head" title="insertion mode: in head">in
+ head</dfn>&quot;
+
+ </dt><dd>
+ <p>Handle the token as follows.</p>
+
+ <p class="note">The rules for handling &quot;title&quot;, &quot;style&quot;, and &quot;script&quot;
+ start tags are similar, but not identical.</p>
+
+ <p class="note">It is possible for the <a href="#tree-construction0">tree
+ construction</a> stage's <a href="#the-main0" title="the main
+ phase">main phase</a> to be in the &quot;<a href="#in-head" title="insertion mode: in head">in head</a>&quot; <a href="#insertion0">insertion mode</a> without the <a href="#current4">current node</a> being a <code><a href="section-document.html#head">head</a></code> element, e.g. if a <code><a href="section-document.html#head">head</a></code> end tag is immediately followed by a
+ <code><a href="section-document.html#meta0">meta</a></code> start tag.</p>
+
+ <dl class="switch">
+ <dt>A character token that is one of U+0009 CHARACTER
+ TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C
+ FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dd>
+ <p><a href="#append" title="append a character">Append the
+ character</a> to the <a href="#current4">current node</a>.</p>
+
+ </dd><dt>A comment token
+
+ </dt><dd>
+ <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment
+ token.</p>
+
+ </dd><dt>A start tag with the tag name &quot;title&quot;
+
+ </dt><dd>
+ <p><a href="#create">Create an element for the token</a>.</p>
+
+ <p>Append the new element to the node pointed to by the <a href="#head-element"><code title="">head</code> element pointer</a>,
+ or, if that is null (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code>
+ case</a>), to the <a href="#current4">current node</a>.</p>
+
+ <p>Switch the tokeniser's <a href="section-tokenisation.html#content2">content model flag</a>
+ to the RCDATA state.</p>
+
+ <p>Then, collect all the character tokens that the tokeniser returns
+ until it returns a token that is not a character token.</p>
+
+ <p>If this process resulted in a collection of character tokens,
+ append a single <code>Text</code> node to the <code><a href="section-document.html#title1">title</a></code> element node whose content is the
+ concatenation of all those tokens' characters.</p>
+
+ <p>The tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> will
+ have switched back to the PCDATA state.</p>
+
+ <p>If the next token is an end tag token with the tag name &quot;title&quot;,
+ ignore it. Otherwise, this is a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ </dd><dt>A start tag with the tag name &quot;style&quot;
+
+ </dt><dd>
+ <p><a href="#create">Create an element for the token</a>.</p>
+
+ <p>Append the new element to the <a href="#current4">current
+ node</a>, unless the <a href="#insertion0">insertion mode</a> is &quot;<a href="#in-head" title="insertion mode: in head">in head</a>&quot; and the
+ <a href="#head-element"><code title="">head</code> element
+ pointer</a> is not null, in which case append it to the node pointed
+ to by the <a href="#head-element"><code title="">head</code> element
+ pointer</a>. <!--
+ <head></head><style><body> should put the style block in the
+ head, and does so by switching back to in head, but the head
+ isn't the current node at that point (comments should go
+ between the head and the body) --></p>
+
+ <p>Switch the tokeniser's <a href="section-tokenisation.html#content2">content model flag</a>
+ to the CDATA state.</p>
+
+ <p>Then, collect all the character tokens that the tokeniser returns
+ until it returns a token that is not a character token, or until it
+ stops tokenising.</p>
+
+ <p>If this process resulted in a collection of character tokens,
+ append a single <code>Text</code> node to the <code><a href="section-document.html#style">style</a></code> element node whose content is the
+ concatenation of all those tokens' characters.</p>
+
+ <p>The tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> will
+ have switched back to the PCDATA state.</p>
+
+ <p>If the next token is an end tag token with the tag name &quot;style&quot;,
+ ignore it. Otherwise, this is a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ </dd><dt id="scriptTag">A start tag with the tag name &quot;script&quot;
+
+ </dt><dd>
+ <p><a href="#create">Create an element for the token</a>.</p>
+
+ <p>Mark the element as being <a href="section-scripting0.html#parser-inserted">&quot;parser-inserted&quot;</a>. This ensures that, if
+ the script is external, any <code title="dom-document-write-HTML"><a href="section-dynamic.html#document.write0">document.write()</a></code> calls in the
+ script will execute in-line, instead of blowing the document away,
+ as would happen in most other cases.</p>
+
+ <p>Switch the tokeniser's <a href="section-tokenisation.html#content2">content model flag</a>
+ to the CDATA state.</p>
+
+ <p>Then, collect all the character tokens that the tokeniser returns
+ until it returns a token that is not a character token, or until it
+ stops tokenising.</p>
+
+ <p>If this process resulted in a collection of character tokens,
+ append a single <code>Text</code> node to the <code><a href="section-scripting0.html#script0">script</a></code> element node whose content is the
+ concatenation of all those tokens' characters.</p>
+
+ <p>The tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> will
+ have switched back to the PCDATA state.</p>
+
+ <p>If the next token is not an end tag token with the tag name
+ &quot;script&quot;, then this is a <a href="section-parsing.html#parse">parse error</a>; mark the
+ <code><a href="section-scripting0.html#script0">script</a></code> element as <a href="section-scripting0.html#already">&quot;already executed&quot;</a>. Otherwise, the token is the
+ <code><a href="section-scripting0.html#script0">script</a></code> element's end tag, so
+ ignore it.</p>
+
+ <p>If the parser was originally created in order to handle the
+ setting of a node's <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code> attribute, then mark the
+ <code><a href="section-scripting0.html#script0">script</a></code> element as <a href="section-scripting0.html#already">&quot;already executed&quot;</a>, and skip the rest of the
+ processing described for this token (including the part below where
+ &quot;<a href="section-scripting0.html#the-script" title="the script that will execute as soon
+ as the parser resumes">scripts that will execute as soon as the
+ parser resumes</a>&quot; are executed). (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p>
+
+ <p class="note">Marking the <code><a href="section-scripting0.html#script0">script</a></code>
+ element as &quot;already executed&quot; prevents it from executing when it is
+ inserted into the document a few paragraphs below. Scripts missing
+ their end tags and scripts that were inserted using <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code>
+ aren't executed.</p>
+
+ <p>Let the <var title="">old insertion point</var> have the same
+ value as the current <a href="section-parsing.html#insertion">insertion point</a>. Let
+ the <a href="section-parsing.html#insertion">insertion point</a> be just before the <a href="section-parsing.html#next-input">next input character</a>.</p>
+
+ <p>Append the new element to the <a href="#current4">current
+ node</a>, unless the <a href="#insertion0">insertion mode</a> is &quot;<a href="#in-head" title="insertion mode: in head">in head</a>&quot; and the
+ <a href="#head-element"><code title="">head</code> element
+ pointer</a> is not null, in which case append it to the node pointed
+ to by the <a href="#head-element"><code title="">head</code> element
+ pointer</a>. <!--
+ <head></head><script><body> should put the script in the head,
+ and does so by switching back to in head, but the head isn't
+ the current node at that point (comments should go between the
+ head and the body) -->
+ <a href="section-scripting0.html#running0" title="running a script">Special processing
+ occurs when a <code>script</code> element is inserted into a
+ document</a> that might cause some script to execute, which might
+ cause <a href="section-dynamic.html#document.write0" title="dom-document-write-HTML">new
+ characters to be inserted into the tokeniser</a>.</p>
+
+ <p>Let the <a href="section-parsing.html#insertion">insertion point</a> have the value of
+ the <var title="">old insertion point</var>. (In other words,
+ restore the <a href="section-parsing.html#insertion">insertion point</a> to the value it
+ had before the previous paragraph. This value might be the
+ &quot;undefined&quot; value.)</p>
+
+ <p id="scriptTagParserResumes">At this stage, if there is <a href="section-scripting0.html#the-script" title="the script that will execute as soon as
+ the parser resumes">a script that will execute as soon as the parser
+ resumes</a>, then:</p>
+
+ <dl class="switch">
+ <dt>If the tree construction stage is <a href="section-parsing.html#nestedParsing">being
+ called reentrantly</a>, say from a call to <code title="dom-document-write-HTML"><a href="section-dynamic.html#document.write0">document.write()</a></code>:
+
+ </dt><dd>
+ <p>Abort the processing of any nested invocations of the tokeniser,
+ yielding control back to the caller. (Tokenisation will resume
+ when the caller returns to the &quot;outer&quot; tree construction stage.)
+
+ </p></dd><dt>Otherwise:
+
+ </dt><dd>
+ <p>Follow these steps:</p>
+
+ <ol>
+ <li>
+ <p>Let <var title="">the script</var> be <a href="section-scripting0.html#the-script">the script that will execute as soon as the
+ parser resumes</a>. There is no longer <a href="section-scripting0.html#the-script" title="the script that will execute as soon as the parser
+ resumes">a script that will execute as soon as the parser
+ resumes</a>.
+
+ </p></li><li>
+ <p><a href="section-terminology.html#pause">Pause</a> until the script has
+ <span>completed loading</span><!-- XXX xref -->.
+
+ </p></li><li>
+ <p>Let the <a href="section-parsing.html#insertion">insertion point</a> be just
+ before the <a href="section-parsing.html#next-input">next input character</a>.
+
+ </p></li><li>
+ <p><a href="section-scripting0.html#executing0" title="executing a script block">Execute
+ the script</a>.
+
+ </p></li><li>
+ <p>Let the <a href="section-parsing.html#insertion">insertion point</a> be undefined
+ again.
+
+ </p></li><li>
+ <p>If there is once again <a href="section-scripting0.html#the-script" title="the script
+ that will execute as soon as the parser resumes">a script that
+ will execute as soon as the parser resumes</a>, then repeat
+ these steps from step 1.
+ </p></li></ol>
+ </dd></dl>
+
+ </dd><dt>A start tag with the tag name &quot;base&quot;, &quot;link&quot;, or &quot;meta&quot;
+
+ </dt><dd>
+ <p><a href="#create">Create an element for the token</a>.</p>
+
+ <p>Append the new element to the node pointed to by the <a href="#head-element"><code title="">head</code> element pointer</a>,
+ or, if that is null (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code>
+ case</a>), to the <a href="#current4">current node</a>.</p>
+
+ </dd><dt>An end tag with the tag name &quot;head&quot;
+
+ </dt><dd>
+ <p>If the <a href="#current4">current node</a> is a <code><a href="section-document.html#head">head</a></code> element, pop the <a href="#current4">current node</a> off the <a href="#stack">stack of
+ open elements</a>. Otherwise, this is a <a href="section-parsing.html#parse">parse
+ error</a>.</p>
+ <!-- might happen if you see two </head>s
+ and something in between the two sends you from "after head"
+ back to "in head" -->
+
+ <p>Change the <a href="#insertion0">insertion mode</a> to &quot;<a href="#after1" title="insertion mode: after head">after head</a>&quot;.</p>
+
+ </dd><dt>An end tag with the tag name &quot;html&quot;
+
+ </dt><dd>
+ <p>Act as described in the &quot;anything else&quot; entry below.</p>
+
+ </dd><dt>A start tag with the tag name &quot;head&quot;
+
+ </dt><dt>Any other end tag
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+
+ </dd><dt>Anything else
+
+ </dt><dd>
+ <p>If the <a href="#current4">current node</a> is a <code><a href="section-document.html#head">head</a></code> element, act as if an end tag token
+ with the tag name &quot;head&quot; had been seen.</p>
+
+ <p>Otherwise, change the <a href="#insertion0">insertion mode</a> to
+ &quot;<a href="#after1" title="insertion mode: after head">after
+ head</a>&quot;.</p>
+
+ <p>Then, reprocess the current token.</p>
+
+ <p class="big-issue">In certain UAs, <a href="https://bugzilla.mozilla.org/attachment.cgi?id=180157&amp;action=view">some
+ elements</a> don't trigger the &quot;in body&quot; mode straight away, but
+ instead get put into the head. Do we want to copy that?</p>
+ </dd></dl>
+
+ </dd><dt>If the <a href="#insertion0">insertion mode</a> is &quot;<dfn id="after1" title="insertion mode: after head">after head</dfn>&quot;
+
+ </dt><dd>
+ <p>Handle the token as follows:</p>
+
+ <dl class="switch">
+ <dt>A character token that is one of U+0009 CHARACTER
+ TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C
+ FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dd>
+ <p><a href="#append" title="append a character">Append the
+ character</a> to the <a href="#current4">current node</a>.</p>
+
+ </dd><dt>A comment token
+
+ </dt><dd>
+ <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment
+ token.</p>
+
+ </dd><dt>A start tag token with the tag name &quot;body&quot;
+
+ </dt><dd>
+ <p><a href="#insert" title="insert an HTML element">Insert a
+ <code>body</code> element</a> for the token.</p>
+
+ <p>Change the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-body" title="insertion mode: in body">in body</a>&quot;.</p>
+
+ </dd><dt>A start tag token with the tag name &quot;frameset&quot;
+
+ </dt><dd>
+ <p><a href="#insert" title="insert an HTML element">Insert a
+ <code>frameset</code> element</a> for the token.</p>
+
+ <p>Change the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-frameset" title="insertion mode: in frameset">in
+ frameset</a>&quot;.</p>
+
+ </dd><dt>A start tag token whose tag name is one of: &quot;base&quot;, &quot;link&quot;,
+ &quot;meta&quot;, &quot;script&quot;, &quot;style&quot;, &quot;title&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Switch the <a href="#insertion0">insertion mode</a> back to &quot;<a href="#in-head" title="insertion mode: in head">in head</a>&quot; and reprocess the
+ token.</p>
+
+ </dd><dt>Anything else
+
+ </dt><dd>
+ <p>Act as if a start tag token with the tag name &quot;body&quot; and no
+ attributes had been seen, and then reprocess the current token.</p>
+ </dd></dl>
+
+ </dd><dt id="parsing-main-inbody">If the <a href="#insertion0">insertion
+ mode</a> is &quot;<dfn id="in-body" title="insertion mode: in body">in
+ body</dfn>&quot;
+
+ </dt><dd>
+ <p>Handle the token as follows:</p>
+
+ <dl class="switch">
+ <dt>A character token
+
+ </dt><dd>
+ <p><a href="#reconstruct">Reconstruct the active formatting
+ elements</a>, if any.</p>
+
+ <p><a href="#append" title="append a character">Append the token's
+ character</a> to the <a href="#current4">current node</a>.</p>
+
+ </dd><dt>A comment token
+
+ </dt><dd>
+ <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment
+ token.</p>
+
+ </dd><dt>A start tag token whose tag name is one of: &quot;script&quot;, &quot;style&quot;
+
+ </dt><dd>
+ <p>Process the token as if the <a href="#insertion0">insertion
+ mode</a> had been &quot;<a href="#in-head" title="insertion mode: in
+ head">in head</a>&quot;.</p>
+
+ </dd><dt>A start tag token whose tag name is one of: &quot;base&quot;, &quot;link&quot;,
+ &quot;meta&quot;, &quot;title&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Process the token as if the <a href="#insertion0">insertion mode</a> had been &quot;<a href="#in-head" title="insertion mode: in head">in head</a>&quot;.</p>
+
+ </dd><dt>A start tag token with the tag name &quot;body&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>.</p>
+
+ <p>If the second element on the <a href="#stack">stack of open
+ elements</a> is not a <code><a href="section-sections.html#body0">body</a></code>
+ element, or, if the <a href="#stack">stack of open elements</a> has
+ only one node on it, then ignore the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p>
+
+ <p>Otherwise, for each attribute on the token, check to see if the
+ attribute is already present on the <code><a href="section-sections.html#body0">body</a></code> element (the second element) on the <a href="#stack">stack of open elements</a>. If it is not, add the
+ attribute and its corresponding value to that element.</p>
+
+ </dd><dt>An end tag with the tag name &quot;body&quot;
+
+ </dt><dd>
+ <p>If the second element in the <a href="#stack">stack of open
+ elements</a> is not a <code><a href="section-sections.html#body0">body</a></code>
+ element, this is a <a href="section-parsing.html#parse">parse error</a>. Ignore the
+ token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p>
+
+ <p>Otherwise:</p>
+
+ <p class="big-issue">this needs to handle closing of implied elements,
+ but without closing them</p>
+
+ <p>If the <a href="#current4">current node</a> is not the <code><a href="section-sections.html#body0">body</a></code> element, then this is a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ <p>Change the <a href="#insertion0">insertion mode</a> to &quot;<a href="#after2" title="insertion mode: after body">after body</a>&quot;.</p>
+
+ </dd><dt>An end tag with the tag name &quot;html&quot;
+
+ </dt><dd>
+ <p>Act as if an end tag with tag name &quot;body&quot; had been seen, then, if
+ that token wasn't ignored, reprocess the current token.</p>
+
+ <p class="note">The fake end tag token here can only be ignored in the
+ <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p>
+
+ </dd><dt>A start tag whose tag name is one of: &quot;address&quot;, &quot;blockquote&quot;,
+ &quot;center&quot;, &quot;dir&quot;, &quot;div&quot;, &quot;dl&quot;, &quot;fieldset&quot;, &quot;listing&quot;, &quot;menu&quot;, &quot;ol&quot;,
+ &quot;p&quot;, &quot;ul&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code>
+ element in scope</a>, then act as if an end tag with the tag name
+ <code><a href="section-prose.html#p">p</a></code> had been seen.</p>
+
+ <p><a href="#insert" title="insert an html element">Insert an HTML
+ element</a> for the token.</p>
+
+ </dd><dt>A start tag whose tag name is &quot;pre&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code>
+ element in scope</a>, then act as if an end tag with the tag name
+ <code><a href="section-prose.html#p">p</a></code> had been seen.</p>
+
+ <p><a href="#insert" title="insert an html element">Insert an HTML
+ element</a> for the token.</p>
+
+ <p>If the next token is a U+000A LINE FEED (LF) character token, then
+ ignore that token and move on to the next one. (Newlines at the
+ start of <code><a href="section-preformatted.html#pre">pre</a></code> blocks are ignored as
+ an authoring convenience.)</p>
+
+ </dd><dt>A start tag whose tag name is &quot;form&quot;
+
+ </dt><dd>
+ <p>If the <a href="#form-element"><code title="form">form</code>
+ element pointer</a> is not null, ignore the token with a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ <p>Otherwise:</p>
+
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code>
+ element in scope</a>, then act as if an end tag with the tag name
+ <code><a href="section-prose.html#p">p</a></code> had been seen.</p>
+
+ <p><a href="#insert" title="insert an html Element">Insert an HTML
+ element</a> for the token, and set the <code title="form">form</code>
+ element pointer to point to the element created.</p>
+
+ </dd><dt>A start tag whose tag name is &quot;li&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code>
+ element in scope</a>, then act as if an end tag with the tag name
+ <code><a href="section-prose.html#p">p</a></code> had been seen.</p>
+
+ <p>Run the following algorithm:</p>
+
+ <ol>
+ <li>
+ <p>Initialise <var title="">node</var> to be the <a href="#current4">current node</a> (the bottommost node of the
+ stack).
+
+ </p></li><li>
+ <p>If <var title="">node</var> is an <code><a href="section-lists0.html#li">li</a></code> element, then pop all the nodes from the
+ <a href="#current4">current node</a> up to <var title="">node</var>, including <var title="">node</var>, then stop
+ this algorithm. If more than one node is popped, then this is a <a href="section-parsing.html#parse">parse error</a>.
+
+ </p></li><li>
+ <p>If <var title="">node</var> is not in the <a href="#formatting">formatting</a> category, and is not in the <a href="#phrasing">phrasing</a> category, and is not an <code><a href="section-sections.html#address">address</a></code> or <code><a href="section-miscellaneous.html#div">div</a></code> element, then stop this algorithm.
+ </p></li>
+ <!-- an element <foo> is in this
+ list if the following markup:
+
+ <!DOCTYPE html><body><ol><li><foo><li>
+
+ ...results in the second <li> not being (in any way) a
+ descendant of the first <li>, or if <foo> is a formatting
+ element that gets reopened later. -->
+
+ <li>
+ <p>Otherwise, set <var title="">node</var> to the previous entry in
+ the <a href="#stack">stack of open elements</a> and return to step
+ 2.
+ </p></li></ol>
+
+ <p>Finally, <a href="#insert" title="insert an html element">insert
+ an <code>li</code> element</a>.</p>
+
+ </dd><dt>A start tag whose tag name is &quot;dd&quot; or &quot;dt&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code>
+ element in scope</a>, then act as if an end tag with the tag name
+ <code><a href="section-prose.html#p">p</a></code> had been seen.</p>
+
+ <p>Run the following algorithm:</p>
+
+ <ol>
+ <li>
+ <p>Initialise <var title="">node</var> to be the <a href="#current4">current node</a> (the bottommost node of the
+ stack).
+
+ </p></li><li>
+ <p>If <var title="">node</var> is a <code><a href="section-lists0.html#dd">dd</a></code> or <code><a href="section-lists0.html#dt">dt</a></code>
+ element, then pop all the nodes from the <a href="#current4">current node</a> up to <var title="">node</var>,
+ including <var title="">node</var>, then stop this algorithm. If
+ more than one node is popped, then this is a <a href="section-parsing.html#parse">parse error</a>.
+
+ </p></li><li>
+ <p>If <var title="">node</var> is not in the <a href="#formatting">formatting</a> category, and is not in the <a href="#phrasing">phrasing</a> category, and is not an <code><a href="section-sections.html#address">address</a></code> or <code><a href="section-miscellaneous.html#div">div</a></code> element, then stop this algorithm.
+ </p></li>
+ <!-- an element <foo> is in this
+ list if the following markup:
+
+ <!DOCTYPE html><body><ol><dt><foo><dt>
+
+ ...results in the second <dt> not being (in any way) a
+ descendant of the first <dt>, or if <foo> is a formatting
+ element that gets reopened later. -->
+
+ <li>
+ <p>Otherwise, set <var title="">node</var> to the previous entry in
+ the <a href="#stack">stack of open elements</a> and return to step
+ 2.
+ </p></li></ol>
+
+ <p>Finally, <a href="#insert" title="insert an html element">insert
+ an HTML element</a> with the same tag name as the token's.</p>
+
+ </dd><dt>A start tag token whose tag name is &quot;plaintext&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code>
+ element in scope</a>, then act as if an end tag with the tag name
+ <code><a href="section-prose.html#p">p</a></code> had been seen.</p>
+
+ <p><a href="#insert" title="insert an html element">Insert an HTML
+ element</a> for the token.</p>
+
+ <p>Switch the <a href="section-tokenisation.html#content2">content model flag</a> to the
+ PLAINTEXT state.</p>
+
+ <p class="note">Once a start tag with the tag name &quot;plaintext&quot; has been
+ seen, that will be the last token ever seen other than character
+ tokens (and the end-of-file token), because there is no way to
+ switch the <a href="section-tokenisation.html#content2">content model flag</a> out of the
+ PLAINTEXT state.</p>
+
+ </dd><dt>An end tag whose tag name is one of: &quot;address&quot;, &quot;blockquote&quot;,
+ &quot;center&quot;, &quot;dir&quot;, &quot;div&quot;, &quot;dl&quot;, &quot;fieldset&quot;, &quot;listing&quot;, &quot;menu&quot;, &quot;ol&quot;,
+ &quot;pre&quot;, &quot;ul&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an">has an element in scope</a> with the same tag name
+ as that of the token, then <a href="#generate">generate implied end
+ tags</a>.</p>
+
+ <p>Now, if the <a href="#current4">current node</a> is not an element
+ with the same tag name as that of the token, then this is a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an">has an element in scope</a> with the same tag name
+ as that of the token, then pop elements from this stack until an
+ element with that tag name has been popped from the stack.</p>
+ <!-- XXX quirk (except for in certain cases?):
+ <p>Otherwise, act as if a start tag with the tag name given in
+ the token had been seen, then reprocess the current token.</p>
+ -->
+
+
+ </dd><dt>An end tag whose tag name is &quot;form&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an">has an element in scope</a> with the same tag name
+ as that of the token, then <a href="#generate">generate implied end
+ tags</a>.</p>
+
+ <p>Now, if the <a href="#current4">current node</a> is not an element
+ with the same tag name as that of the token, then this is a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ <p>Otherwise, if the <a href="#current4">current node</a> is an
+ element with the same tag name as that of the token, pop that element
+ from the stack.</p>
+
+ <p>In any case, set the <a href="#form-element"><code title="">form</code> element pointer</a> to null.</p>
+
+ </dd><dt>An end tag whose tag name is &quot;p&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code>
+ element in scope</a>, then <a href="#generate">generate implied end
+ tags</a>, except for <code><a href="section-prose.html#p">p</a></code> elements.</p>
+
+ <p>If the <a href="#current4">current node</a> is not a <code><a href="section-prose.html#p">p</a></code> element, then this is a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code>
+ element in scope</a>, then pop elements from this stack until the
+ stack no longer <a href="#have-an" title="has an element in
+ scope">has a <code>p</code> element in scope</a>.</p>
+ <!-- XXX quirk:
+ <p>Otherwise, act as if a start tag with the tag name
+ <code>p</code> had been seen, then reprocess the current
+ token.</p>
+ -->
+
+
+ </dd><dt>An end tag whose tag name is &quot;dd&quot;, &quot;dt&quot;, or &quot;li&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an">has an element in scope</a> whose tag name matches
+ the tag name of the token, then <a href="#generate">generate implied
+ end tags</a>, except for elements with the same tag name as the
+ token.</p>
+
+ <p>If the <a href="#current4">current node</a> is not an element with
+ the same tag name as the token, then this is a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an">has an element in scope</a> whose tag name matches
+ the tag name of the token, then pop elements from this stack until
+ an element with that tag name has been popped from the stack.</p>
+
+ </dd><dt>A start tag whose tag name is one of: &quot;h1&quot;, &quot;h2&quot;, &quot;h3&quot;, &quot;h4&quot;,
+ &quot;h5&quot;, &quot;h6&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code>
+ element in scope</a>, then act as if an end tag with the tag name
+ <code><a href="section-prose.html#p">p</a></code> had been seen.</p>
+
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has in scope</a> an
+ element whose tag name is one of &quot;h1&quot;, &quot;h2&quot;, &quot;h3&quot;, &quot;h4&quot;, &quot;h5&quot;, or
+ &quot;h6&quot;, then this is a <a href="section-parsing.html#parse">parse error</a>; pop elements
+ from the stack until an element with one of those tag names has been
+ popped from the stack.</p>
+
+ <p><a href="#insert" title="insert an html element">Insert an HTML
+ element</a> for the token.</p>
+
+ </dd><dt>An end tag whose tag name is one of: &quot;h1&quot;, &quot;h2&quot;, &quot;h3&quot;, &quot;h4&quot;, &quot;h5&quot;,
+ &quot;h6&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has in scope</a> an
+ element whose tag name is one of &quot;h1&quot;, &quot;h2&quot;, &quot;h3&quot;, &quot;h4&quot;, &quot;h5&quot;, or
+ &quot;h6&quot;, then <a href="#generate">generate implied end tags</a>.</p>
+
+ <p>Now, if the <a href="#current4">current node</a> is not an element
+ with the same tag name as that of the token, then this is a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has in scope</a> an
+ element whose tag name is one of &quot;h1&quot;, &quot;h2&quot;, &quot;h3&quot;, &quot;h4&quot;, &quot;h5&quot;, or
+ &quot;h6&quot;, then pop elements from the stack until an element with one of
+ those tag names has been popped from the stack.</p>
+ <!-- XXX quirk:
+ <p>Otherwise, act as if a start tag with the tag name given in
+ the token had been seen, then reprocess the current token.</p>
+ -->
+ </dd>
+ <!-- ADOPTION AGENCY ELEMENTS
+ Mozilla-only: bdo blink del ins sub sup q
+ Safari-only: code dfn kbd nobr samp var wbr
+ Both: a b big em font i s small strike strong tt u -->
+
+ <dt>A start tag whose tag name is &quot;a&quot;
+
+ </dt><dd>
+ <p>If the <a href="#list-of4">list of active formatting elements</a>
+ contains an element whose tag name is &quot;a&quot; between the end of the
+ list and the last marker on the list (or the start of the list if
+ there is no marker on the list), then this is a <a href="section-parsing.html#parse">parse error</a>; act as if an end tag with the tag
+ name &quot;a&quot; had been seen, then remove that element from the <a href="#list-of4">list of active formatting elements</a> and the <a href="#stack">stack of open elements</a> if the end tag didn't
+ already remove it (it might not have if the element is not <a href="#have-an0" title="has an element in table scope">in table
+ scope</a>).</p>
+
+ <p class="example">In the non-conforming stream
+ <code>&lt;a href=&quot;a&quot;&gt;a&lt;table&gt;&lt;a href=&quot;b&quot;&gt;b&lt;/table&gt;x</code>,
+ the first <code><a href="section-phrase.html#a">a</a></code> element would be closed
+ upon seeing the second one, and the &quot;x&quot; character would be inside a
+ link to &quot;b&quot;, not to &quot;a&quot;. This is despite the fact that the outer
+ <code><a href="section-phrase.html#a">a</a></code> element is not in table scope
+ (meaning that a regular <code>&lt;/a&gt;</code> end tag at the start of
+ the table wouldn't close the outer <code><a href="section-phrase.html#a">a</a></code>
+ element).</p>
+
+ <p><a href="#reconstruct">Reconstruct the active formatting
+ elements</a>, if any.</p>
+
+ <p><a href="#insert" title="insert an html element">Insert an HTML
+ element</a> for the token. Add that element to the <a href="#list-of4">list of active formatting elements</a>.</p>
+
+ </dd><dt>A start tag whose tag name is one of: &quot;b&quot;, &quot;big&quot;, &quot;em&quot;, &quot;font&quot;,
+ &quot;i&quot;, &quot;nobr&quot;, &quot;s&quot;, &quot;small&quot;, &quot;strike&quot;, &quot;strong&quot;, &quot;tt&quot;, &quot;u&quot;
+
+ </dt><dd>
+ <p><a href="#reconstruct">Reconstruct the active formatting
+ elements</a>, if any.</p>
+
+ <p><a href="#insert" title="insert an html element">Insert an HTML
+ element</a> for the token. Add that element to the <a href="#list-of4">list of active formatting elements</a>.</p>
+
+ </dd><dt id="adoptionAgency">An end tag whose tag name is one of: &quot;a&quot;, &quot;b&quot;,
+ &quot;big&quot;, &quot;em&quot;, &quot;font&quot;, &quot;i&quot;, &quot;nobr&quot;, &quot;s&quot;, &quot;small&quot;, &quot;strike&quot;, &quot;strong&quot;,
+ &quot;tt&quot;, &quot;u&quot;
+
+ </dt><dd>
+ <p>Follow these steps:</p>
+
+ <ol>
+ <li>
+ <p>Let the <var title="">formatting element</var> be the last
+ element in the <a href="#list-of4">list of active formatting
+ elements</a> that:</p>
+
+ <ul>
+ <li>is between the end of the list and the last scope marker in
+ the list, if any, or the start of the list otherwise, and
+
+ </li><li>has the same tag name as the token.
+ </li></ul>
+
+ <p>If there is no such node, or, if that node is also in the <a href="#stack">stack of open elements</a> but the element is not <a href="#have-an" title="has an element in scope">in scope</a>, then
+ this is a <a href="section-parsing.html#parse">parse error</a>. Abort these steps. The
+ token is ignored.</p>
+
+ <p>Otherwise, if there is such a node, but that node is not in the
+ <a href="#stack">stack of open elements</a>, then this is a <a href="section-parsing.html#parse">parse error</a>; remove the element from the list,
+ and abort these steps.</p>
+
+ <p>Otherwise, there is a <var title="">formatting element</var> and
+ that element is in <a href="#stack" title="stack of open
+ elements">the stack</a> and is <a href="#have-an" title="has an
+ element in scope">in scope</a>. If the element is not the <a href="#current4">current node</a>, this is a <a href="section-parsing.html#parse">parse error</a>. In any case, proceed with the
+ algorithm as written in the following steps.</p>
+
+ </li><li>
+ <p>Let the <var title="">furthest block</var> be the topmost node
+ in the <a href="#stack">stack of open elements</a> that is lower
+ in the stack than the <var title="">formatting element</var>, and
+ is not an element in the <a href="#phrasing">phrasing</a> or <a href="#formatting">formatting</a> categories. There might not be
+ one.
+
+ </p></li><li>
+ <p>If there is no <var title="">furthest block</var>, then the UA
+ must skip the subsequent steps and instead just pop all the nodes
+ from the bottom of the <a href="#stack">stack of open
+ elements</a>, from the <a href="#current4">current node</a> up to
+ the <var title="">formatting element</var>, and remove the <var title="">formatting element</var> from the <a href="#list-of4">list of active formatting elements</a>.
+
+ </p></li><li>
+ <p>Let the <var title="">common ancestor</var> be the element
+ immediately above the <var title="">formatting element</var> in
+ the <a href="#stack">stack of open elements</a>.
+
+ </p></li><li>
+ <p>If the <var title="">furthest block</var> has a parent node,
+ then remove the <var title="">furthest block</var> from its parent
+ node.
+
+ </p></li><li>
+ <p>Let a bookmark note the position of the <var title="">formatting
+ element</var> in the <a href="#list-of4">list of active formatting
+ elements</a> relative to the elements on either side of it in the
+ list.
+
+ </p></li><li>
+ <p>Let <var title="">node</var> and <var title="">last node</var>
+ be the <var title="">furthest block</var>. Follow these steps:</p>
+
+ <ol>
+ <li>Let <var title="">node</var> be the element immediately prior
+ to <var title="">node</var> in the <a href="#stack">stack of open
+ elements</a>.
+
+ </li><li>If <var title="">node</var> is not in the <a href="#list-of4">list of active formatting elements</a>, then
+ remove <var title="">node</var> from the <a href="#stack">stack
+ of open elements</a> and then go back to step 1.
+
+ </li><li>Otherwise, if <var title="">node</var> is the <var title="">formatting element</var>, then go to the next step in
+ the overall algorithm.
+
+ </li><li>Otherwise, if <var title="">last node</var> is the <var title="">furthest block</var>, then move the aforementioned
+ bookmark to be immediately after the <var title="">node</var> in
+ the <a href="#list-of4">list of active formatting elements</a>.
+
+ </li><li>If <var title="">node</var> has any children, perform a
+ shallow clone of <var title="">node</var>, replace the entry for
+ <var title="">node</var> in the <a href="#list-of4">list of
+ active formatting elements</a> with an entry for the clone,
+ replace the entry for <var title="">node</var> in the <a href="#stack">stack of open elements</a> with an entry for the
+ clone, and let <var title="">node</var> be the clone.
+
+ </li><li>Insert <var title="">last node</var> into <var title="">node</var>, first removing it from its previous parent
+ node if any.
+
+ </li><li>Let <var title="">last node</var> be <var title="">node</var>.
+
+ </li><li>Return to step 1 of this inner set of steps.
+ </li></ol>
+
+ </li><li>
+ <p>Insert whatever <var title="">last node</var> ended up being in
+ the previous step into the <var title="">common ancestor</var>
+ node, first removing it from its previous parent node if any.
+
+ </p></li><li>
+ <p>Perform a shallow clone of the <var title="">formatting
+ element</var>.
+
+ </p></li><li>
+ <p>Take all of the child nodes of the <var title="">furthest
+ block</var> and append them to the clone created in the last step.
+
+ </p></li><li>
+ <p>Append that clone to the <var title="">furthest block</var>.
+
+ </p></li><li>
+ <p>Remove the <var title="">formatting element</var> from the <a href="#list-of4">list of active formatting elements</a>, and
+ insert the clone into the <a href="#list-of4">list of active
+ formatting elements</a> at the position of the aforementioned
+ bookmark.
+
+ </p></li><li>
+ <p>Remove the <var title="">formatting element</var> from the <a href="#stack">stack of open elements</a>, and insert the clone
+ into the <a href="#stack">stack of open elements</a> immediately
+ after (i.e. in a more deeply nested position than) the position of
+ the <var title="">furthest block</var> in that stack.
+
+ </p></li><li>
+ <p>Jump back to step 1 in this series of steps.
+ </p></li></ol>
+
+ <p class="note">The way these steps are defined, only elements in the
+ <a href="#formatting">formatting</a> category ever get cloned by
+ this algorithm.</p>
+ <!--XXX
+ <div class="example">
+ <p class="big-issue">Need an example.</p>
+ </div>
+-->
+
+ <p class="note">Because of the way this algorithm causes elements to
+ change parents, it has been dubbed the &quot;adoption agency algorithm&quot;
+ (in contrast with other possible algorithms for dealing with
+ misnested content, which included the &quot;incest algorithm&quot;, the
+ &quot;secret affair algorithm&quot;, and the &quot;Heisenberg algorithm&quot;).</p>
+
+ </dd><dt>A start tag token whose tag name is &quot;button&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a
+ <code>button</code> element in scope</a>, then this is a <a href="section-parsing.html#parse">parse error</a>; act as if an end tag with the tag
+ name &quot;button&quot; had been seen, then reprocess the token.</p>
+
+ <p>Otherwise:</p>
+
+ <p><a href="#reconstruct">Reconstruct the active formatting
+ elements</a>, if any.</p>
+
+ <p><a href="#insert">Insert an HTML element</a> for the token.</p>
+
+ <p>Insert a marker at the end of the <a href="#list-of4">list of
+ active formatting elements</a>.</p>
+
+ </dd><dt>A start tag token whose tag name is one of: &quot;marquee&quot;, &quot;object&quot;
+
+ </dt><dd>
+ <p><a href="#reconstruct">Reconstruct the active formatting
+ elements</a>, if any.</p>
+
+ <p><a href="#insert">Insert an HTML element</a> for the token.</p>
+
+ <p>Insert a marker at the end of the <a href="#list-of4">list of
+ active formatting elements</a>.</p>
+
+ </dd><dt>An end tag token whose tag name is one of: &quot;button&quot;, &quot;marquee&quot;,
+ &quot;object&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has in scope</a> an
+ element whose tag name is the same as the tag name of the token,
+ then <a href="#generate">generate implied end tags</a>.</p>
+
+ <p>Now, if the <a href="#current4">current node</a> is not an element
+ with the same tag name as the token, then this is a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ <p>Now, if the <a href="#stack">stack of open elements</a> <a href="#have-an">has an element in scope</a> whose tag name matches
+ the tag name of the token, then pop elements from the stack until
+ that element has been popped from the stack, and <a href="#clear0">clear the list of active formatting elements up to
+ the last marker</a>.</p>
+
+ </dd><dt>A start tag token whose tag name is &quot;xmp&quot;
+
+ </dt><dd>
+ <p><a href="#reconstruct">Reconstruct the active formatting
+ elements</a>, if any.</p>
+
+ <p><a href="#insert">Insert an HTML element</a> for the token.</p>
+
+ <p>Switch the <a href="section-tokenisation.html#content2">content model flag</a> to the CDATA
+ state.</p>
+
+ </dd><dt>A start tag whose tag name is &quot;table&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code>
+ element in scope</a>, then act as if an end tag with the tag name
+ <code><a href="section-prose.html#p">p</a></code> had been seen.</p>
+ <!-- XXX quirks: don't do this -->
+ <p><a href="#insert">Insert an HTML element</a> for the token.</p>
+
+ <p>Change the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-table" title="insertion mode: in table">in table</a>&quot;.</p>
+
+ </dd><dt>A start tag whose tag name is one of: &quot;area&quot;, &quot;basefont&quot;,
+ &quot;bgsound&quot;, &quot;br&quot;, &quot;embed&quot;, &quot;img&quot;, &quot;param&quot;, &quot;spacer&quot;, &quot;wbr&quot;
+
+ </dt><dd>
+ <p><a href="#reconstruct">Reconstruct the active formatting
+ elements</a>, if any.</p>
+
+ <p><a href="#insert" title="insert an html element">Insert an HTML
+ element</a> for the token. Immediately pop the <a href="#current4">current node</a> off the <a href="#stack">stack of
+ open elements</a>.</p>
+
+ </dd><dt>A start tag whose tag name is &quot;hr&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an" title="has an element in scope">has a <code>p</code>
+ element in scope</a>, then act as if an end tag with the tag name
+ <code><a href="section-prose.html#p">p</a></code> had been seen.</p>
+ <!-- XXX quirks: don't do this -->
+ <p><a href="#insert" title="insert an html element">Insert an HTML
+ element</a> for the token. Immediately pop the <a href="#current4">current node</a> off the <a href="#stack">stack of
+ open elements</a>.</p>
+
+ </dd><dt>A start tag whose tag name is &quot;image&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Change the token's tag name to
+ &quot;img&quot; and reprocess it. (Don't ask.)</p>
+ <!-- As of
+ 2005-12, studies showed that around 0.2% of pages used the
+ <image> element. -->
+
+
+ </dd><dt>A start tag whose tag name is &quot;input&quot;
+
+ </dt><dd>
+ <p><a href="#reconstruct">Reconstruct the active formatting
+ elements</a>, if any.</p>
+
+ <p><a href="#insert" title="insert an html element">Insert an
+ <code>input</code> element</a> for the token.</p>
+
+ <p>If the <a href="#form-element"><code title="">form</code> element
+ pointer</a> is not null, then <span>associate</span><!--XXX
+ xref! -->
+ the <code>input</code> element with the <code>form</code> element
+ pointed to by the <a href="#form-element"><code title="">form</code>
+ element pointer</a>.</p>
+
+ <p>Pop that <code>input</code> element off the <a href="#stack">stack
+ of open elements</a>.</p>
+
+ </dd><dt id="isindex">A start tag whose tag name is &quot;isindex&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>.</p>
+
+ <p>If the <a href="#form-element"><code title="">form</code> element
+ pointer</a> is not null, then ignore the token.</p>
+
+ <p>Otherwise:</p>
+
+ <p>Act as if a start tag token with the tag name &quot;form&quot; had been
+ seen.</p>
+
+ <p>Act as if a start tag token with the tag name &quot;hr&quot; had been seen.</p>
+
+ <p>Act as if a start tag token with the tag name &quot;p&quot; had been seen.</p>
+
+ <p>Act as if a start tag token with the tag name &quot;label&quot; had been
+ seen.</p>
+
+ <p>Act as if a stream of character tokens had been seen (see below
+ for what they should say).</p>
+
+ <p>Act as if a start tag token with the tag name &quot;input&quot; had been
+ seen, with all the attributes from the &quot;isindex&quot; token, except with
+ the &quot;name&quot; attribute set to the value &quot;isindex&quot; (ignoring any
+ explicit &quot;name&quot; attribute).</p>
+
+ <p>Act as if a stream of character tokens had been seen (see below
+ for what they should say).</p>
+
+ <p>Act as if an end tag token with the tag name &quot;label&quot; had been
+ seen.</p>
+
+ <p>Act as if an end tag token with the tag name &quot;p&quot; had been seen.</p>
+
+ <p>Act as if a start tag token with the tag name &quot;hr&quot; had been seen.</p>
+
+ <p>Act as if an end tag token with the tag name &quot;form&quot; had been seen.</p>
+
+ <p>The two streams of character tokens together should, together with
+ the <code>input</code> element, express the equivalent of &quot;This is a
+ searchable index. Insert your search keywords here: (input field)&quot;
+ in the user's preferred language.</p>
+
+ <p class="big-issue"> Then need to specify that if the form submission
+ causes just a single form control, whose name is &quot;isindex&quot;, to be
+ submitted, then we submit just the value part, not the &quot;isindex=&quot;
+ part.</p>
+ </dd>
+ <!-- XXX keygen support; don't forget form element pointer!
+
+ <dt>A start tag whose tag name is "keygen"</dt>
+ <dd>
+ ...
+ </dd>
+-->
+
+ <dt>A start tag whose tag name is &quot;textarea&quot;
+
+ </dt><dd>
+ <p><a href="#create">Create an element for the token</a>.</p>
+
+ <p>If the <a href="#form-element"><code title="">form</code> element
+ pointer</a> is not null, then <span>associate</span><!--XXX
+ xref! -->
+ the <code>textarea</code> element with the <code>form</code> element
+ pointed to by the <a href="#form-element"><code title="">form</code>
+ element pointer</a>.</p>
+
+ <p>Append the new element to the <a href="#current4">current
+ node</a>.</p>
+
+ <p>Switch the tokeniser's <a href="section-tokenisation.html#content2">content model flag</a>
+ to the RCDATA state.</p>
+
+ <p>If the next token is a U+000A LINE FEED (LF) character token, then
+ ignore that token and move on to the next one. (Newlines at the
+ start of <code>textarea</code> elements are ignored as an authoring
+ convenience.)</p>
+
+ <p>Then, collect all the character tokens that the tokeniser returns
+ until it returns a token that is not a character token, or until it
+ stops tokenising.</p>
+
+ <p>If this process resulted in a collection of character tokens,
+ append a single <code>Text</code> node, whose contents is the
+ concatenation of all those tokens' characters, to the new element
+ node.</p>
+
+ <p>The tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> will
+ have switched back to the PCDATA state.</p>
+
+ <p>If the next token is an end tag token with the tag name
+ &quot;textarea&quot;, ignore it. Otherwise, this is a <a href="section-parsing.html#parse">parse
+ error</a>.</p>
+
+ </dd><dt>A start tag whose tag name is one of: &quot;iframe&quot;, &quot;noembed&quot;,
+ &quot;noframes&quot;
+
+ </dt><dt>A start tag whose tag name is &quot;noscript&quot;, if <a href="section-scripting.html#scripting2">scripting is enabled</a>:
+
+ </dt><dd>
+ <p><a href="#create">Create an element for the token</a>.</p>
+
+ <p>For &quot;iframe&quot; tags, the node must be an <code><a href="section-embedded.html#htmliframeelement">HTMLIFrameElement</a></code> object, for
+ the other tags it must be an <code><a href="section-elements.html#htmlelement">HTMLElement</a></code> object.</p>
+
+ <p>Append the new element to the <a href="#current4">current
+ node</a>.</p>
+
+ <p>Switch the tokeniser's <a href="section-tokenisation.html#content2">content model flag</a>
+ to the CDATA state.</p>
+
+ <p>Then, collect all the character tokens that the tokeniser returns
+ until it returns a token that is not a character token, or until it
+ stops tokenising.</p>
+
+ <p>If this process resulted in a collection of character tokens,
+ append a single <code>Text</code> node, whose contents is the
+ concatenation of all those tokens' characters, to the new element
+ node.</p>
+
+ <p>The tokeniser's <a href="section-tokenisation.html#content2">content model flag</a> will
+ have switched back to the PCDATA state.</p>
+
+ <p>If the next token is an end tag token with the same tag name as
+ the start tag token, ignore it. Otherwise, this is a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ </dd><dt>A start tag whose tag name is &quot;select&quot;
+
+ </dt><dd>
+ <p><a href="#reconstruct">Reconstruct the active formatting
+ elements</a>, if any.</p>
+
+ <p><a href="#insert">Insert an HTML element</a> for the token.</p>
+
+ <p>Change the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-select" title="insertion mode: in select">in select</a>&quot;.</p>
+ </dd>
+ <!-- XXX quirks:
+ <dt>An end tag whose tag name is "br"</dt>
+ <dd>
+ <p>Act as if a start tag token with the tag name "br" had been
+ seen. Ignore the end tag token.</p>
+ </dd>
+-->
+
+ <dt>A start or end tag whose tag name is one of: &quot;caption&quot;, &quot;col&quot;,
+ &quot;colgroup&quot;, &quot;frame&quot;, &quot;frameset&quot;, &quot;head&quot;, &quot;option&quot;, &quot;optgroup&quot;,
+ &quot;tbody&quot;, &quot;td&quot;, &quot;tfoot&quot;, &quot;th&quot;, &quot;thead&quot;, &quot;tr&quot;
+
+ </dt><dt>An end tag whose tag name is one of: &quot;area&quot;, &quot;basefont&quot;,
+ &quot;bgsound&quot;, <!--XXX quirks: remove br-->&quot;br&quot;, &quot;embed&quot;, &quot;hr&quot;, &quot;iframe&quot;,
+ &quot;image&quot;, &quot;img&quot;, &quot;input&quot;, &quot;isindex&quot;, &quot;noembed&quot;, &quot;noframes&quot;, &quot;param&quot;,
+ &quot;select&quot;, &quot;spacer&quot;, &quot;table&quot;, &quot;textarea&quot;, &quot;wbr&quot;</dt>
+ <!-- add keygen if we add the start tag -->
+
+ <dt>An end tag whose tag name is &quot;noscript&quot;, if <a href="section-scripting.html#scripting2">scripting is enabled</a>:
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+
+ </dd><dt>A start or end tag whose tag name is one of: &quot;event-source&quot;,
+ &quot;section&quot;, &quot;nav&quot;, &quot;article&quot;, &quot;aside&quot;, &quot;header&quot;, &quot;footer&quot;, &quot;datagrid&quot;,
+ &quot;command&quot;
+
+ </dt><dd> <!-- XXXX -->
+ <p class="big-issue">Work in progress!</p>
+
+ </dd><dt>A start tag token not covered by the previous entries
+
+ </dt><dd>
+ <p><a href="#reconstruct">Reconstruct the active formatting
+ elements</a>, if any.</p>
+
+ <p><a href="#insert">Insert an HTML element</a> for the token.</p>
+
+ <p class="note">This element will be a <a href="#phrasing">phrasing</a>
+ element.</p>
+ <!--
+Put the following into the MathML namespace if parsed:
+ math, mrow, mfrac, msqrt, mroot, mstyle, merror, mpadded,
+ mphantom, mfenced, menclose, msub, msup, msubsup, munder,
+ mover, munderover, mmultiscripts, mtable, mlabeledtr, mtr,
+ mtd, maction
+-->
+
+
+ </dd><dt>An end tag token not covered by the previous entries
+
+ </dt><dd>
+ <p>Run the following algorithm:</p>
+
+ <ol>
+ <li>
+ <p>Initialise <var title="">node</var> to be the <a href="#current4">current node</a> (the bottommost node of the
+ stack).
+
+ </p></li><li>
+ <p>If <var title="">node</var> has the same tag name as the end tag
+ token, then:</p>
+
+ <ol>
+ <li>
+ <p><a href="#generate">Generate implied end tags</a>.
+
+ </p></li><li>
+ <p>If the tag name of the end tag token does not match the tag
+ name of the <a href="#current4">current node</a>, this is a <a href="section-parsing.html#parse">parse error</a>.
+
+ </p></li><li>
+ <p>Pop all the nodes from the <a href="#current4">current
+ node</a> up to <var title="">node</var>, including <var title="">node</var>, then stop this algorithm.
+ </p></li></ol>
+
+ </li><li>
+ <p>Otherwise, if <var title="">node</var> is in neither the <a href="#formatting">formatting</a> category nor the <a href="#phrasing">phrasing</a> category, then this is a <a href="section-parsing.html#parse">parse error</a>. Stop this algorithm. The end tag
+ token is ignored.
+
+ </p></li><li>
+ <p>Set <var title="">node</var> to the previous entry in the <a href="#stack">stack of open elements</a>.
+
+ </p></li><li>
+ <p>Return to step 2.
+ </p></li></ol>
+ </dd></dl>
+
+ </dd><dt id="parsing-main-intable">If the <a href="#insertion0">insertion
+ mode</a> is &quot;<dfn id="in-table" title="insertion mode: in table">in
+ table</dfn>&quot;
+
+ </dt><dd>
+ <dl class="switch">
+ <dt>A character token that is one of U+0009 CHARACTER
+ TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C
+ FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dd>
+ <p><a href="#append" title="append a character">Append the
+ character</a> to the <a href="#current4">current node</a>.</p>
+
+ </dd><dt>A comment token
+
+ </dt><dd>
+ <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment
+ token.</p>
+
+ </dd><dt>A start tag whose tag name is &quot;caption&quot;
+
+ </dt><dd>
+ <p><a href="#clear1">Clear the stack back to a table context</a>.
+ (See below.)</p>
+
+ <p>Insert a marker at the end of the <a href="#list-of4">list of
+ active formatting elements</a>.</p>
+
+ <p><a href="#insert">Insert an HTML element</a> for the token, then
+ switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-caption" title="insertion mode: in caption">in
+ caption</a>&quot;.</p>
+
+ </dd><dt>A start tag whose tag name is &quot;colgroup&quot;
+
+ </dt><dd>
+ <p><a href="#clear1">Clear the stack back to a table context</a>.
+ (See below.)</p>
+
+ <p><a href="#insert">Insert an HTML element</a> for the token, then
+ switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-column" title="insertion mode: in column group">in column
+ group</a>&quot;.</p>
+
+ </dd><dt>A start tag whose tag name is &quot;col&quot;
+
+ </dt><dd>
+ <p>Act as if a start tag token with the tag name &quot;colgroup&quot; had been
+ seen, then reprocess the current token.</p>
+
+ </dd><dt>A start tag whose tag name is one of: &quot;tbody&quot;, &quot;tfoot&quot;, &quot;thead&quot;
+
+ </dt><dd>
+ <p><a href="#clear1">Clear the stack back to a table context</a>.
+ (See below.)</p>
+
+ <p><a href="#insert">Insert an HTML element</a> for the token, then
+ switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-table0" title="insertion mode: in table body">in table
+ body</a>&quot;.</p>
+
+ </dd><dt>A start tag whose tag name is one of: &quot;td&quot;, &quot;th&quot;, &quot;tr&quot;
+
+ </dt><dd>
+ <p>Act as if a start tag token with the tag name &quot;tbody&quot; had been
+ seen, then reprocess the current token.</p>
+
+ </dd><dt>A start tag whose tag name is &quot;table&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Act as if an end tag token with
+ the tag name &quot;table&quot; had been seen, then, if that token wasn't
+ ignored, reprocess the current token.</p>
+
+ <p class="note">The fake end tag token here can only be ignored in the
+ <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p>
+
+ </dd><dt>An end tag whose tag name is &quot;table&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an
+ element in table scope</a> with the same tag name as the token, this
+ is a <a href="section-parsing.html#parse">parse error</a>. Ignore the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p>
+
+ <p>Otherwise:</p>
+
+ <p><a href="#generate">Generate implied end tags</a>.</p>
+
+ <p>Now, if the <a href="#current4">current node</a> is not a <code><a href="section-tabular.html#table">table</a></code> element, then this is a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ <p>Pop elements from this stack until a <code><a href="section-tabular.html#table">table</a></code> element has been popped from the
+ stack.</p>
+
+ <p><a href="#reset">Reset the insertion mode appropriately</a>.</p>
+
+ </dd><dt>An end tag whose tag name is one of: &quot;body&quot;, &quot;caption&quot;, &quot;col&quot;,
+ &quot;colgroup&quot;, &quot;html&quot;, &quot;tbody&quot;, &quot;td&quot;, &quot;tfoot&quot;, &quot;th&quot;, &quot;thead&quot;, &quot;tr&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+
+ </dd><dt>Anything else
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Process the token as if the <a href="#insertion0">insertion mode</a> was &quot;<a href="#in-body" title="insertion mode: in body">in body</a>&quot;, with the following
+ exception:</p>
+
+ <p>If the <a href="#current4">current node</a> is a <code><a href="section-tabular.html#table">table</a></code>, <code><a href="section-tabular.html#tbody">tbody</a></code>, <code><a href="section-tabular.html#tfoot0">tfoot</a></code>, <code><a href="section-tabular.html#thead0">thead</a></code>, or <code><a href="section-tabular.html#tr">tr</a></code> element, then, whenever a node would be
+ inserted into the <a href="#current4">current node</a>, it must
+ instead be inserted into the <em><a href="#foster">foster parent
+ element</a></em>.</p>
+
+ <p>The <dfn id="foster">foster parent element</dfn> is the parent
+ element of the last <code><a href="section-tabular.html#table">table</a></code> element
+ in the <a href="#stack">stack of open elements</a>, if there is a
+ <code><a href="section-tabular.html#table">table</a></code> element and it has such a
+ parent element. If there is no <code><a href="section-tabular.html#table">table</a></code> element in the <a href="#stack">stack
+ of open elements</a> (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code>
+ case</a>), then the <em><a href="#foster">foster parent
+ element</a></em> is the first element in the <a href="#stack">stack
+ of open elements</a> (the <code><a href="section-the-root.html#html">html</a></code>
+ element). Otherwise, if there is a <code><a href="section-tabular.html#table">table</a></code> element in the <a href="#stack">stack
+ of open elements</a>, but the last <code><a href="section-tabular.html#table">table</a></code> element in the <a href="#stack">stack
+ of open elements</a> has no parent, or its parent node is not an
+ element, then the <em><a href="#foster">foster parent
+ element</a></em> is the element before the last <code><a href="section-tabular.html#table">table</a></code> element in the <a href="#stack">stack
+ of open elements</a>.</p>
+
+ <p>If the <em><a href="#foster">foster parent element</a></em> is the
+ parent element of the last <code><a href="section-tabular.html#table">table</a></code>
+ element in the <a href="#stack">stack of open elements</a>, then the
+ new node must be inserted immediately <em>before</em> the last
+ <code><a href="section-tabular.html#table">table</a></code> element in the <a href="#stack">stack of open elements</a> in the <a href="#foster">foster parent element</a>; otherwise, the new node
+ must be <em>appended</em> to the <a href="#foster">foster parent
+ element</a>.</p>
+ </dd></dl>
+
+ <p>When the steps above require the UA to <dfn id="clear1">clear the
+ stack back to a table context</dfn>, it means that the UA must, while
+ the <a href="#current4">current node</a> is not a <code><a href="section-tabular.html#table">table</a></code> element or an <code><a href="section-the-root.html#html">html</a></code> element, pop elements from the <a href="#stack">stack of open elements</a>. If this causes any elements
+ to be popped from the stack, then this is a <a href="section-parsing.html#parse">parse
+ error</a>.</p>
+
+ <p class="note">The <a href="#current4">current node</a> being an
+ <code><a href="section-the-root.html#html">html</a></code> element after this process is an
+ <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p>
+
+ </dd><dt id="parsing-main-incaption">If the <a href="#insertion0">insertion
+ mode</a> is &quot;<dfn id="in-caption" title="insertion mode: in caption">in
+ caption</dfn>&quot;
+
+ </dt><dd>
+ <dl class="switch">
+ <dt>An end tag whose tag name is &quot;caption&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an
+ element in table scope</a> with the same tag name as the token, this
+ is a <a href="section-parsing.html#parse">parse error</a>. Ignore the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p>
+
+ <p>Otherwise:</p>
+
+ <p><a href="#generate">Generate implied end tags</a>.</p>
+
+ <p>Now, if the <a href="#current4">current node</a> is not a <code><a href="section-tabular.html#caption0">caption</a></code> element, then this is a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ <p>Pop elements from this stack until a <code><a href="section-tabular.html#caption0">caption</a></code> element has been popped from the
+ stack.</p>
+
+ <p><a href="#clear0">Clear the list of active formatting elements up
+ to the last marker</a>.</p>
+
+ <p>Switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-table" title="insertion mode: in table">in table</a>&quot;.</p>
+
+ </dd><dt>A start tag whose tag name is one of: &quot;caption&quot;, &quot;col&quot;,
+ &quot;colgroup&quot;, &quot;tbody&quot;, &quot;td&quot;, &quot;tfoot&quot;, &quot;th&quot;, &quot;thead&quot;, &quot;tr&quot;
+
+ </dt><dt>An end tag whose tag name is &quot;table&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Act as if an end tag with the
+ tag name &quot;caption&quot; had been seen, then, if that token wasn't
+ ignored, reprocess the current token.</p>
+
+ <p class="note">The fake end tag token here can only be ignored in the
+ <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p>
+
+ </dd><dt>An end tag whose tag name is one of: &quot;body&quot;, &quot;col&quot;, &quot;colgroup&quot;,
+ &quot;html&quot;, &quot;tbody&quot;, &quot;td&quot;, &quot;tfoot&quot;, &quot;th&quot;, &quot;thead&quot;, &quot;tr&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+
+ </dd><dt>Anything else
+
+ </dt><dd>
+ <p>Process the token as if the <a href="#insertion0">insertion
+ mode</a> was &quot;<a href="#in-body" title="insertion mode: in body">in
+ body</a>&quot;.</p>
+ </dd></dl>
+
+ </dd><dt id="parsing-main-incolgroup">If the <a href="#insertion0">insertion
+ mode</a> is &quot;<dfn id="in-column" title="insertion mode: in column
+ group">in column group</dfn>&quot;
+
+ </dt><dd>
+ <dl class="switch">
+ <dt>A character token that is one of U+0009 CHARACTER
+ TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C
+ FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dd>
+ <p><a href="#append" title="append a character">Append the
+ character</a> to the <a href="#current4">current node</a>.</p>
+
+ </dd><dt>A comment token
+
+ </dt><dd>
+ <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment
+ token.</p>
+
+ </dd><dt>A start tag whose tag name is &quot;col&quot;
+
+ </dt><dd>
+ <p><a href="#insert" title="insert an HTML element">Insert a
+ <code>col</code> element</a> for the token. Immediately pop the <a href="#current4">current node</a> off the <a href="#stack">stack of
+ open elements</a>.</p>
+
+ </dd><dt>An end tag whose tag name is &quot;colgroup&quot;
+
+ </dt><dd>
+ <p>If the <a href="#current4">current node</a> is the root <code><a href="section-the-root.html#html">html</a></code> element, then this is a <a href="section-parsing.html#parse">parse error</a>. Ignore the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p>
+
+ <p>Otherwise, pop the <a href="#current4">current node</a> (which
+ will be a <code><a href="section-tabular.html#colgroup">colgroup</a></code> element)
+ from the <a href="#stack">stack of open elements</a>. Switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-table" title="insertion mode: in table">in table</a>&quot;.</p>
+
+ </dd><dt>An end tag whose tag name is &quot;col&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+
+ </dd><dt>Anything else
+
+ </dt><dd>
+ <p>Act as if an end tag with the tag name &quot;colgroup&quot; had been seen,
+ and then, if that token wasn't ignored, reprocess the current token.</p>
+
+ <p class="note">The fake end tag token here can only be ignored in the
+ <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p>
+ </dd></dl>
+
+ </dd><dt id="parsing-main-intbody">If the <a href="#insertion0">insertion
+ mode</a> is &quot;<dfn id="in-table0" title="insertion mode: in table body">in
+ table body</dfn>&quot;
+
+ </dt><dd>
+ <dl class="switch">
+ <dt>A start tag whose tag name is &quot;tr&quot;
+
+ </dt><dd>
+ <p><a href="#clear2">Clear the stack back to a table body
+ context</a>. (See below.)</p>
+
+ <p><a href="#insert" title="insert an HTML element">Insert a
+ <code>tr</code> element</a> for the token, then switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-row" title="insertion mode: in row">in row</a>&quot;.</p>
+
+ </dd><dt>A start tag whose tag name is one of: &quot;th&quot;, &quot;td&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Act as if a start tag with the
+ tag name &quot;tr&quot; had been seen, then reprocess the current token.</p>
+
+ </dd><dt>An end tag whose tag name is one of: &quot;tbody&quot;, &quot;tfoot&quot;, &quot;thead&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an
+ element in table scope</a> with the same tag name as the token, this
+ is a <a href="section-parsing.html#parse">parse error</a>. Ignore the token.</p>
+
+ <p>Otherwise:</p>
+
+ <p><a href="#clear2">Clear the stack back to a table body
+ context</a>. (See below.)</p>
+
+ <p>Pop the <a href="#current4">current node</a> from the <a href="#stack">stack of open elements</a>. Switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-table" title="insertion mode: in table">in table</a>&quot;.</p>
+
+ </dd><dt>A start tag whose tag name is one of: &quot;caption&quot;, &quot;col&quot;,
+ &quot;colgroup&quot;, &quot;tbody&quot;, &quot;tfoot&quot;, &quot;thead&quot;
+
+ </dt><dt>An end tag whose tag name is &quot;table&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have a
+ <code>tbody</code>, <code>thead</code>, or <code>tfoot</code>
+ element in table scope</a>, this is a <a href="section-parsing.html#parse">parse
+ error</a>. Ignore the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p>
+
+ <p>Otherwise:</p>
+
+ <p><a href="#clear2">Clear the stack back to a table body
+ context</a>. (See below.)</p>
+
+ <p>Act as if an end tag with the same tag name as the <a href="#current4">current node</a> (&quot;tbody&quot;, &quot;tfoot&quot;, or &quot;thead&quot;) had
+ been seen, then reprocess the current token.</p>
+
+ </dd><dt>An end tag whose tag name is one of: &quot;body&quot;, &quot;caption&quot;, &quot;col&quot;,
+ &quot;colgroup&quot;, &quot;html&quot;, &quot;td&quot;, &quot;th&quot;, &quot;tr&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+
+ </dd><dt>Anything else
+
+ </dt><dd>
+ <p>Process the token as if the <a href="#insertion0">insertion
+ mode</a> was &quot;<a href="#in-table" title="insertion mode: in
+ table">in table</a>&quot;.</p>
+ </dd></dl>
+
+ <p>When the steps above require the UA to <dfn id="clear2">clear the
+ stack back to a table body context</dfn>, it means that the UA must,
+ while the <a href="#current4">current node</a> is not a <code><a href="section-tabular.html#tbody">tbody</a></code>, <code><a href="section-tabular.html#tfoot0">tfoot</a></code>, <code><a href="section-tabular.html#thead0">thead</a></code>, or <code><a href="section-the-root.html#html">html</a></code> element, pop elements from the <a href="#stack">stack of open elements</a>. If this causes any elements
+ to be popped from the stack, then this is a <a href="section-parsing.html#parse">parse
+ error</a>.</p>
+
+ <p class="note">The <a href="#current4">current node</a> being an
+ <code><a href="section-the-root.html#html">html</a></code> element after this process is an
+ <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p>
+
+ </dd><dt id="parsing-main-intr">If the <a href="#insertion0">insertion mode</a>
+ is &quot;<dfn id="in-row" title="insertion mode: in row">in row</dfn>&quot;
+
+ </dt><dd>
+ <dl class="switch">
+ <dt>A start tag whose tag name is one of: &quot;th&quot;, &quot;td&quot;
+
+ </dt><dd>
+ <p><a href="#clear3">Clear the stack back to a table row context</a>.
+ (See below.)</p>
+
+ <p><a href="#insert" title="insert an HTML element">Insert an HTML
+ element</a> for the token, then switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-cell" title="insertion mode: in cell">in cell</a>&quot;.</p>
+
+ <p>Insert a marker at the end of the <a href="#list-of4">list of
+ active formatting elements</a>.</p>
+
+ </dd><dt>An end tag whose tag name is &quot;tr&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an
+ element in table scope</a> with the same tag name as the token, this
+ is a <a href="section-parsing.html#parse">parse error</a>. Ignore the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p>
+
+ <p>Otherwise:</p>
+
+ <p><a href="#clear3">Clear the stack back to a table row context</a>.
+ (See below.)</p>
+
+ <p>Pop the <a href="#current4">current node</a> (which will be a
+ <code><a href="section-tabular.html#tr">tr</a></code> element) from the <a href="#stack">stack of open elements</a>. Switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-table0" title="insertion mode: in table body">in table body</a>&quot;.</p>
+
+ </dd><dt>A start tag whose tag name is one of: &quot;caption&quot;, &quot;col&quot;,
+ &quot;colgroup&quot;, &quot;tbody&quot;, &quot;tfoot&quot;, &quot;thead&quot;, &quot;tr&quot;
+
+ </dt><dt>An end tag whose tag name is &quot;table&quot;
+
+ </dt><dd>
+ <p>Act as if an end tag with the tag name &quot;tr&quot; had been seen, then,
+ if that token wasn't ignored, reprocess the current token.</p>
+
+ <p class="note">The fake end tag token here can only be ignored in the
+ <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p>
+
+ </dd><dt>An end tag whose tag name is one of: &quot;tbody&quot;, &quot;tfoot&quot;, &quot;thead&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an
+ element in table scope</a> with the same tag name as the token, this
+ is a <a href="section-parsing.html#parse">parse error</a>. Ignore the token.</p>
+
+ <p>Otherwise, act as if an end tag with the tag name &quot;tr&quot; had been
+ seen, then reprocess the current token.</p>
+
+ </dd><dt>An end tag whose tag name is one of: &quot;body&quot;, &quot;caption&quot;, &quot;col&quot;,
+ &quot;colgroup&quot;, &quot;html&quot;, &quot;td&quot;, &quot;th&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+
+ </dd><dt>Anything else
+
+ </dt><dd>
+ <p>Process the token as if the <a href="#insertion0">insertion
+ mode</a> was &quot;<a href="#in-table" title="insertion mode: in
+ table">in table</a>&quot;.</p>
+ </dd></dl>
+
+ <p>When the steps above require the UA to <dfn id="clear3">clear the
+ stack back to a table row context</dfn>, it means that the UA must,
+ while the <a href="#current4">current node</a> is not a <code><a href="section-tabular.html#tr">tr</a></code> element or an <code><a href="section-the-root.html#html">html</a></code> element, pop elements from the <a href="#stack">stack of open elements</a>. If this causes any elements
+ to be popped from the stack, then this is a <a href="section-parsing.html#parse">parse
+ error</a>.</p>
+
+ <p class="note">The <a href="#current4">current node</a> being an
+ <code><a href="section-the-root.html#html">html</a></code> element after this process is an
+ <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>.</p>
+
+ </dd><dt id="parsing-main-intd">If the <a href="#insertion0">insertion mode</a>
+ is &quot;<dfn id="in-cell" title="insertion mode: in cell">in cell</dfn>&quot;
+
+ </dt><dd>
+ <dl class="switch">
+ <dt>An end tag whose tag name is one of: &quot;td&quot;, &quot;th&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an
+ element in table scope</a> with the same tag name as that of the
+ token, then this is a <a href="section-parsing.html#parse">parse error</a> and the token
+ must be ignored.</p>
+
+ <p>Otherwise:</p>
+
+ <p><a href="#generate">Generate implied end tags</a>, except for
+ elements with the same tag name as the token.</p>
+
+ <p>Now, if the <a href="#current4">current node</a> is not an element
+ with the same tag name as the token, then this is a <a href="section-parsing.html#parse">parse error</a>.</p>
+
+ <p>Pop elements from this stack until an element with the same tag
+ name as the token has been popped from the stack.</p>
+
+ <p><a href="#clear0">Clear the list of active formatting elements up
+ to the last marker</a>.</p>
+
+ <p>Switch the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-row" title="insertion mode: in row">in row</a>&quot;. (The <a href="#current4">current node</a> will be a <code><a href="section-tabular.html#tr">tr</a></code> element at this point.)</p>
+
+ </dd><dt>A start tag whose tag name is one of: &quot;caption&quot;, &quot;col&quot;,
+ &quot;colgroup&quot;, &quot;tbody&quot;, &quot;td&quot;, &quot;tfoot&quot;, &quot;th&quot;, &quot;thead&quot;, &quot;tr&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> does
+ <em>not</em> <a href="#have-an0" title="has an element in table
+ scope">have a <code>td</code> or <code>th</code> element in table
+ scope</a>, then this is a <a href="section-parsing.html#parse">parse error</a>; ignore
+ the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p>
+
+ <p>Otherwise, <a href="#close2">close the cell</a> (see below) and
+ reprocess the current token.</p>
+
+ </dd><dt>An end tag whose tag name is one of: &quot;body&quot;, &quot;caption&quot;, &quot;col&quot;,
+ &quot;colgroup&quot;, &quot;html&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+
+ </dd><dt>An end tag whose tag name is one of: &quot;table&quot;, &quot;tbody&quot;, &quot;tfoot&quot;,
+ &quot;thead&quot;, &quot;tr&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an
+ element in table scope</a> with the same tag name as that of the
+ token (which can only happen for &quot;tbody&quot;, &quot;tfoot&quot; and &quot;thead&quot;, or,
+ in the <a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>), then
+ this is a <a href="section-parsing.html#parse">parse error</a> and the token must be
+ ignored.</p>
+
+ <p>Otherwise, <a href="#close2">close the cell</a> (see below) and
+ reprocess the current token.</p>
+
+ </dd><dt>Anything else
+
+ </dt><dd>
+ <p>Process the token as if the <a href="#insertion0">insertion
+ mode</a> was &quot;<a href="#in-body" title="insertion mode: in body">in
+ body</a>&quot;.</p>
+ </dd></dl>
+
+ <p>Where the steps above say to <dfn id="close2">close the cell</dfn>,
+ they mean to run the following algorithm:</p>
+
+ <ol>
+ <li>
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an0" title="has an element in table scope">has a
+ <code>td</code> element in table scope</a>, then act as if an end
+ tag token with the tag name &quot;td&quot; had been seen.
+
+ </p></li><li>
+ <p>Otherwise, the <a href="#stack">stack of open elements</a> will <a href="#have-an0" title="has an element in table scope">have a
+ <code>th</code> element in table scope</a>; act as if an end tag
+ token with the tag name &quot;th&quot; had been seen.
+ </p></li></ol>
+
+ <p class="note">The <a href="#stack">stack of open elements</a> cannot
+ have both a <code><a href="section-tabular.html#td">td</a></code> and a <code><a href="section-tabular.html#th">th</a></code> element <a href="#have-an0" title="has an
+ element in table scope">in table scope</a> at the same time, nor can
+ it have neither when the <a href="#insertion0">insertion mode</a> is
+ &quot;<a href="#in-cell" title="insertion mode: in cell">in cell</a>&quot;.</p>
+
+ </dd><dt id="parsing-main-inselect">If the <a href="#insertion0">insertion
+ mode</a> is &quot;<dfn id="in-select" title="insertion mode: in select">in
+ select</dfn>&quot;
+
+ </dt><dd>
+ <p>Handle the token as follows:</p>
+
+ <dl class="switch">
+ <dt>A character token
+
+ </dt><dd>
+ <p><a href="#append" title="append a character">Append the token's
+ character</a> to the <a href="#current4">current node</a>.</p>
+
+ </dd><dt>A comment token
+
+ </dt><dd>
+ <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment
+ token.</p>
+
+ </dd><dt>A start tag token whose tag name is &quot;option&quot;
+
+ </dt><dd>
+ <p>If the <a href="#current4">current node</a> is an
+ <code>option</code> element, act as if an end tag with the tag name
+ &quot;option&quot; had been seen.</p>
+
+ <p><a href="#insert">Insert an HTML element</a> for the token.</p>
+
+ </dd><dt>A start tag token whose tag name is &quot;optgroup&quot;
+
+ </dt><dd>
+ <p>If the <a href="#current4">current node</a> is an
+ <code>option</code> element, act as if an end tag with the tag name
+ &quot;option&quot; had been seen.</p>
+
+ <p>If the <a href="#current4">current node</a> is an
+ <code>optgroup</code> element, act as if an end tag with the tag
+ name &quot;optgroup&quot; had been seen.</p>
+
+ <p><a href="#insert">Insert an HTML element</a> for the token.</p>
+
+ </dd><dt>An end tag token whose tag name is &quot;optgroup&quot;
+
+ </dt><dd>
+ <p>First, if the <a href="#current4">current node</a> is an
+ <code>option</code> element, and the node immediately before it in
+ the <a href="#stack">stack of open elements</a> is an
+ <code>optgroup</code> element, then act as if an end tag with the
+ tag name &quot;option&quot; had been seen.</p>
+
+ <p>If the <a href="#current4">current node</a> is an
+ <code>optgroup</code> element, then pop that node from the <a href="#stack">stack of open elements</a>. Otherwise, this is a <a href="section-parsing.html#parse">parse error</a>, ignore the token.</p>
+
+ </dd><dt>An end tag token whose tag name is &quot;option&quot;
+
+ </dt><dd>
+ <p>If the <a href="#current4">current node</a> is an
+ <code>option</code> element, then pop that node from the <a href="#stack">stack of open elements</a>. Otherwise, this is a <a href="section-parsing.html#parse">parse error</a>, ignore the token.</p>
+
+ </dd><dt>An end tag whose tag name is &quot;select&quot;
+
+ </dt><dd>
+ <p>If the <a href="#stack">stack of open elements</a> does not <a href="#have-an0" title="has an element in table scope">have an
+ element in table scope</a> with the same tag name as the token, this
+ is a <a href="section-parsing.html#parse">parse error</a>. Ignore the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p>
+
+ <p>Otherwise:</p>
+
+ <p>Pop elements from the <a href="#stack">stack of open elements</a>
+ until a <code>select</code> element has been popped from the stack.</p>
+
+ <p><a href="#reset">Reset the insertion mode appropriately</a>.</p>
+
+ </dd><dt>A start tag whose tag name is &quot;select&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Act as if the token had been an
+ end tag with the tag name &quot;select&quot; instead.</p>
+
+ </dd><dt>An end tag whose tag name is one of: &quot;caption&quot;, &quot;table&quot;, &quot;tbody&quot;,
+ &quot;tfoot&quot;, &quot;thead&quot;, &quot;tr&quot;, &quot;td&quot;, &quot;th&quot;
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>.</p>
+
+ <p>If the <a href="#stack">stack of open elements</a> <a href="#have-an0">has an element in table scope</a> with the same tag
+ name as that of the token, then act as if an end tag with the tag
+ name &quot;select&quot; had been seen, and reprocess the token. Otherwise,
+ ignore the token.</p>
+
+ </dd><dt>Anything else
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+ </dd></dl>
+
+ </dd><dt id="parsing-main-afterbody">If the <a href="#insertion0">insertion
+ mode</a> is &quot;<dfn id="after2" title="insertion mode: after body">after
+ body</dfn>&quot;
+
+ </dt><dd>
+ <p>Handle the token as follows:</p>
+
+ <dl class="switch">
+ <dt>A character token that is one of U+0009 CHARACTER
+ TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C
+ FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dd>
+ <p>Process the token as it would be processed if the <a href="#insertion0">insertion mode</a> was &quot;<a href="#in-body" title="insertion mode: in body">in body</a>&quot;.</p>
+
+ </dd><dt>A comment token
+
+ </dt><dd>
+ <p>Append a <code>Comment</code> node to the first element in the <a href="#stack">stack of open elements</a> (the <code><a href="section-the-root.html#html">html</a></code> element), with the <code title="">data</code> attribute set to the data given in the comment
+ token.</p>
+
+ </dd><dt>An end tag with the tag name &quot;html&quot;
+
+ </dt><dd>
+ <p>If the parser was originally created in order to handle the
+ setting of <em>an element</em>'s <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code> attribute, this is a <a href="section-parsing.html#parse">parse error</a>; ignore the token. (The element will
+ be an <code><a href="section-the-root.html#html">html</a></code> element in this case.)
+ (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p>
+
+ <p>Otherwise, switch to <a href="#the-trailing0">the trailing end
+ phase</a>.</p>
+
+ </dd><dt>Anything else
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Set the <a href="#insertion0">insertion mode</a> to &quot;<a href="#in-body" title="insertion mode: in body">in body</a>&quot; and reprocess the
+ token.</p>
+ </dd></dl>
+
+ </dd><dt id="parsing-main-inframeset">If the <a href="#insertion0">insertion
+ mode</a> is &quot;<dfn id="in-frameset" title="insertion mode: in frameset">in
+ frameset</dfn>&quot;
+
+ </dt><dd>
+ <p>Handle the token as follows:</p>
+
+ <dl class="switch">
+ <dt>A character token that is one of U+0009 CHARACTER
+ TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C
+ FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dd>
+ <p><a href="#append" title="append a character">Append the
+ character</a> to the <a href="#current4">current node</a>.</p>
+
+ </dd><dt>A comment token
+
+ </dt><dd>
+ <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment
+ token.</p>
+
+ </dd><dt>A start tag with the tag name &quot;frameset&quot;
+
+ </dt><dd>
+ <p><a href="#insert" title="Insert an HTML element">Insert a
+ <code>frameset</code> element</a> for the token.</p>
+
+ </dd><dt>An end tag with the tag name &quot;frameset&quot;
+
+ </dt><dd>
+ <p>If the <a href="#current4">current node</a> is the root <code><a href="section-the-root.html#html">html</a></code> element, then this is a <a href="section-parsing.html#parse">parse error</a>; ignore the token. (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>)</p>
+
+ <p>Otherwise, pop the <a href="#current4">current node</a> from the
+ <a href="#stack">stack of open elements</a>.</p>
+
+ <p>If the parser was <em>not</em> originally created in order to
+ handle the setting of an element's <code title="dom-innerHTML-HTML"><a href="section-dynamic.html#innerhtml0">innerHTML</a></code> attribute (<a href="section-dynamic.html#innerhtml1"><code>innerHTML</code> case</a>), and the <a href="#current4">current node</a> is no longer a
+ <code>frameset</code> element, then change the <a href="#insertion0">insertion mode</a> to &quot;<a href="#after3" title="insertion mode: after frameset">after frameset</a>&quot;.</p>
+
+ </dd><dt>A start tag with the tag name &quot;frame&quot;
+
+ </dt><dd>
+ <p><a href="#insert">Insert an HTML element</a> for the token.
+ Immediately pop the <a href="#current4">current node</a> off the <a href="#stack">stack of open elements</a>.</p>
+
+ </dd><dt>A start tag with the tag name &quot;noframes&quot;
+
+ </dt><dd>
+ <p>Process the token as if the <a href="#insertion0">insertion
+ mode</a> had been &quot;<a href="#in-body" title="insertion mode: in
+ body">in body</a>&quot;.</p>
+
+ </dd><dt>Anything else
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+ </dd></dl>
+
+ </dd><dt id="parsing-main-afterframeset">If the <a href="#insertion0">insertion
+ mode</a> is &quot;<dfn id="after3" title="insertion mode: after
+ frameset">after frameset</dfn>&quot;
+
+ </dt><dd>
+ <p>Handle the token as follows:</p>
+
+ <dl class="switch">
+ <dt>A character token that is one of U+0009 CHARACTER
+ TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C
+ FORM FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dd>
+ <p><a href="#append" title="append a character">Append the
+ character</a> to the <a href="#current4">current node</a>.</p>
+
+ </dd><dt>A comment token
+
+ </dt><dd>
+ <p>Append a <code>Comment</code> node to the <a href="#current4">current node</a> with the <code title="">data</code> attribute set to the data given in the comment
+ token.</p>
+
+ </dd><dt>An end tag with the tag name &quot;html&quot;
+
+ </dt><dd>
+ <p>Switch to <a href="#the-trailing0">the trailing end phase</a>.</p>
+
+ </dd><dt>A start tag with the tag name &quot;noframes&quot;
+
+ </dt><dd>
+ <p>Process the token as if the <a href="#insertion0">insertion
+ mode</a> had been &quot;<a href="#in-body" title="insertion mode: in
+ body">in body</a>&quot;.</p>
+
+ </dd><dt>Anything else
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+ </dd></dl>
+ </dd></dl>
+ </dd></dl>
+
+ <p class="big-issue">This doesn't handle UAs that don't support frames, or
+ that do support frames but want to show the NOFRAMES content. Supporting
+ the former is easy; supporting the latter is harder.
+
+ </p><h5 id="the-trailing"><span class="secno">8.2.4.4. </span><dfn id="the-trailing0">The trailing end phase</dfn></h5>
+
+ <p>After <a href="#the-main0">the main phase</a>, as each token is emitted
+ from the <a href="section-tokenisation.html#tokenisation0">tokenisation</a> stage, it must be
+ processed as described in this section.
+
+ </p><dl class="switch">
+ <dt>A DOCTYPE token
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Ignore the token.</p>
+
+ </dd><dt>A comment token
+
+ </dt><dd>
+ <p>Append a <code>Comment</code> node to the <code>Document</code> object
+ with the <code title="">data</code> attribute set to the data given in
+ the comment token.</p>
+
+ </dd><dt>A character token that is one of U+0009 CHARACTER TABULATION,
+ U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
+ U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dd>
+ <p>Process the token as it would be processed in <a href="#the-main0">the
+ main phase</a>.</p>
+
+ </dd><dt>A character token that is <em>not</em> one of U+0009 CHARACTER
+ TABULATION, U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM
+ FEED (FF), U+000D CARRIAGE RETURN (CR), or U+0020 SPACE
+
+ </dt><dt>A start tag token
+
+ </dt><dt>An end tag token
+
+ </dt><dd>
+ <p><a href="section-parsing.html#parse">Parse error</a>. Switch back to <a href="#the-main0">the main phase</a> and reprocess the token.</p>
+
+ </dd><dt>An end-of-file token
+
+ </dt><dd>
+ <p><a href="#stops">Stop parsing</a>.</p>
+ </dd></dl>
+
+ <h4 id="the-end"><span class="secno">8.2.5. </span>The End</h4>
+
+ <p>Once the user agent <dfn id="stops" title="stop parsing">stops
+ parsing</dfn> the document, the user agent must follow the steps in this
+ section.
+
+ </p><p>First, <!--the user agent must <span title="fire a DOMContentLoaded
+ event">fire a <code
+ title="event-DOMContentLoaded">DOMContentLoaded</code> event</span>
+ at <span>the <code>body</code> element</span>.</p>
+
+ <p>Then, -->the
+ rules for <a href="section-scripting0.html#when-a">when a script completes loading</a> start
+ applying (script execution is no longer managed by the parser).
+
+ </p><p>If any of the scripts in the <a href="section-scripting0.html#list-of1">list of scripts that
+ will execute as soon as possible</a> have <span>completed
+ loading</span><!-- XXX xref -->, or if the <a href="section-scripting0.html#list-of0">list of
+ scripts that will execute asynchronously</a> is not empty and the first
+ script in that list has <span>completed loading</span><!-- XXX xref
+ -->,
+ then the user agent must act as if those scripts just completed loading,
+ following the rules given for that in the <code><a href="section-scripting0.html#script0">script</a></code> element definition.
+
+ </p><p>Then, if the <a href="section-scripting0.html#list-of">list of scripts that will execute when
+ the document has finished parsing</a> is not empty, and the first item in
+ this list has already <span>completed loading</span><!--XXX
+ xref -->,
+ then the user agent must act as if that script just finished loading.
+
+ </p><p>By this point, there will be no scripts that have loaded but have not
+ yet been executed.
+
+ </p><p>The user agent must then <a href="section-scripting.html#firing2">fire a simple event</a>
+ called <code title="event-DOMContentLoaded">DOMContentLoaded</code> at the
+ <code>Document</code>.
+
+ </p><p>Once everything that <dfn id="delays" title="delay the load event">delays
+ the load event</dfn> has completed, the user agent must <a href="section-scripting.html#firing4" title="fire a load event">fire a <code title="event-load">load</code>
+ event</a> at <a href="section-dom-tree.html#the-body0">the <code>body</code> element</a>.</p>
+ <!-- XXX make sure things "delay the load event" -->
+ <!--XXX need to handle
+http://lxr.mozilla.org/mozilla/source/parser/htmlparser/src/CNavDTD.cpp#2354
+2354 // Don't open transient styles if it makes the stack deep, bug 58917.
+-->
+ <!--XXX
+http://lxr.mozilla.org/mozilla/source/parser/htmlparser/src/nsHTMLTokenizer.cpp#749
+-->
+ <!--
+see also CTextToken::ConsumeCharacterData() for CDATA parsing?
+
+1212 1 Here's a tricky case from bug 22596: <h5><li><h5>
+1213 How do we know that the 2nd <h5> should close the <LI> rather than nest inside the <LI>?
+1214 (Afterall, the <h5> is a legal child of the <LI>).
+1215
+1216 The way you know is that there is no root between the two, so the <h5> binds more
+1217 tightly to the 1st <h5> than to the <LI>.
+1218 2. Also, bug 6148 shows this case: <SPAN><DIV><SPAN>
+1219 From this case we learned not to execute this logic if the parent is a block.
+1220
+1221 3. Fix for 26583
+1222 Ex. <A href=foo.html><B>foo<A href-bar.html>bar</A></B></A> <- A legal HTML
+1223 In the above example clicking on "foo" or "bar" should link to
+1224 foo.html or bar.html respectively. That is, the inner <A> should be informed
+1225 about the presence of an open <A> above <B>..so that the inner <A> can close out
+1226 the outer <A>. The following code does it for us.
+1227
+1228 4. Fix for 27865 [ similer to 22596 ]. Ex: <DL><DD><LI>one<DD><LI>two
+ - http://lxr.mozilla.org/mozilla/source/parser/htmlparser/src/CNavDTD.cpp#1211
+
+815 // Here's a problem. If theTag is legal in here, we don't move it
+816 // out. So if we're moving stuff out of here, the parent of theTag
+817 // gets closed at this point. But some things are legal
+818 // _everywhere_ and hence would effectively close out misplaced
+819 // content in tables. This is undesirable, so treat them as
+820 // illegal here so they'll be shipped out with their parents and
+821 // siblings. See bug 40855 for an explanation (that bug was for
+822 // comments, but the same issues arise with whitespace, newlines,
+823 // noscript, etc). Script is special, though. Shipping it out
+824 // breaks document.write stuff. See bug 243064.
+ - http://lxr.mozilla.org/mozilla/source/parser/htmlparser/src/CNavDTD.cpp#825
+
+
+1326 /**************************************************************************************
+1327 *
+1328 * Now a little code to deal with bug #49687 (crash when layout stack gets too deep)
+1329 * I've also opened this up to any container (not just inlines): re bug 55095
+1330 * Improved to handle bug 55980 (infinite loop caused when DEPTH is exceeded and
+1331 * </P> is encountered by itself (<P>) is continuously produced.
+1332 *
+1333 **************************************************************************************/
+
+1912 // Oh boy!! we found a "stray" tag. Nav4.x and IE introduce line break in
+1913 // such cases. So, let's simulate that effect for compatibility.
+1914 // Ex. <html><body>Hello</P>There</body></html>
+http://lxr.mozilla.org/mozilla/source/parser/htmlparser/src/CNavDTD.cpp#1912
+
+http://lxr.mozilla.org/seamonkey/search?string=nested
+/parser/htmlparser/src/CNavDTD.cpp, line 791 - * 2. <CENTER><DL><DT><A><CENTER> allow nested <CENTER>
+/parser/htmlparser/src/CNavDTD.cpp, line 792 - * 3. <TABLE><TR><TD><TABLE>... allow nested <TABLE>
+/parser/htmlparser/src/CNavDTD.cpp, line 2562 - // Discard nested forms - bug 72639
+/parser/htmlparser/src/nsElementTable.cpp, line 1453 - * 2. <CENTER><DL><DT><A><CENTER> allow nested <CENTER>
+/parser/htmlparser/src/nsElementTable.cpp, line 1454 - * 3. <TABLE><TR><TD><TABLE>... allow nested <TABLE>
+/parser/htmlparser/src/nsElementTable.cpp, line 1901 - // Ex: <H1><LI><H1><LI>. Inner LI has the potential of getting nested
+-->
+
+ <script src="http://status.whatwg.org/annotate-web-apps.js" type="text/javascript"></script></body></html> \ No newline at end of file
diff --git a/test/data/html/web-apps.html b/test/data/html/web-apps.html
new file mode 100644
index 0000000..d685320
--- /dev/null
+++ b/test/data/html/web-apps.html
@@ -0,0 +1,41271 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">
+
+<html lang=en-GB-hixie>
+ <head>
+ <title>HTML 5</title>
+ <link href="/style/specification" rel=stylesheet type="text/css">
+ <link href="/images/icon" rel=icon>
+
+ <style type="text/css">
+ h4 + .element { margin-top: -2.5em; padding-top: 2em; }
+ h4 + p + .element { margin-top: -5em; padding-top: 4em; }
+ .element { background: #EEFFEE; color: black; margin: 0 0 1em -1em; padding: 0 1em 0.25em 0.75em; border-left: solid #99FF99 0.25em; -padding: 0; /* that last decl is for IE6. Try removing it, it's hilarious! */ }
+ .proposal { border: blue solid; padding: 1em; }
+ table.matrix, table.matrix td { border: none; text-align: right; }
+ table.matrix { margin-left: 2em; }
+ </style>
+
+ <body class=draft>
+ <div class=head>
+ <p><a class=logo href="http://www.whatwg.org/" rel=home><img alt=WHATWG
+ src="/images/logo"></a></p>
+
+ <h1 id=html-5>HTML 5</h1>
+
+ <h2 class="no-num no-toc" id=working>Working Draft &mdash; 14 June 2007</h2>
+
+ <p>You can take part in this work. <a
+ href="http://www.whatwg.org/mailing-list">Join the working group's
+ discussion list.</a></p>
+
+ <p><strong>Web designers!</strong> We have a <a
+ href="http://blog.whatwg.org/faq/">FAQ</a>, a <a
+ href="http://forums.whatwg.org/">forum</a>, and a <a
+ href="http://www.whatwg.org/mailing-list#help">help mailing list</a> for
+ you!</p>
+
+ <dl>
+ <dt>One-page version:
+
+ <dd><a
+ href="http://www.whatwg.org/specs/web-apps/current-work/">http://www.whatwg.org/specs/web-apps/current-work/</a>
+
+ <dt>Multiple-page version:
+
+ <dd><a
+ href="http://www.whatwg.org/specs/web-apps/current-work/multipage/">http://www.whatwg.org/specs/web-apps/current-work/multipage/</a>
+
+ <dt>Version history:
+
+ <dd>Twitter messages (non-editorial changes only): <a
+ href="http://twitter.com/WHATWG">http://twitter.com/WHATWG</a>
+
+ <dd>Commit-Watchers mailing list: <a
+ href="http://lists.whatwg.org/listinfo.cgi/commit-watchers-whatwg.org">http://lists.whatwg.org/listinfo.cgi/commit-watchers-whatwg.org</a>
+
+ <dd>Interactive Web interface: <a
+ href="http://html5.org/tools/web-apps-tracker">http://html5.org/tools/web-apps-tracker</a>
+
+ <dd>Subversion interface: <a
+ href="http://svn.whatwg.org/">http://svn.whatwg.org/</a>
+
+ <dt>Editor:
+
+ <dd>Ian Hickson, Google, ian@hixie.ch
+ </dl>
+
+ <p class=copyright>&copy; Copyright 2004-2007 Apple Computer, Inc.,
+ Mozilla Foundation, and Opera Software ASA.</p>
+
+ <p class=copyright>You are granted a license to use, reproduce and create
+ derivative works of this document.</p>
+ </div>
+
+ <hr>
+
+ <h2 class="no-num no-toc" id=abstract>Abstract</h2>
+
+ <p>This specification introduces features to HTML and the DOM that ease the
+ authoring of Web-based applications. Additions include context menus,
+ a direct-mode graphics canvas, inline popup windows, and server-sent
+ events.
+
+ <h2 class="no-num no-toc" id=status>Status of this document</h2>
+
+ <p><strong>This is a work in progress!</strong> This document is changing
+ on a daily if not hourly basis in response to comments and as a general
+ part of its development process. Comments are very welcome; please send
+ them to <a href="mailto:whatwg@whatwg.org">whatwg@whatwg.org</a>. Thank
+ you.
+
+ <p>Implementors should be aware that this specification is not stable.
+ <strong>Implementors who are not taking part in the discussions are likely
+ to find the specification changing out from under them in incompatible
+ ways.</strong> Vendors interested in implementing this specification
+ before it eventually reaches the call for implementations should join the
+ <a href="/mailing-list">WHATWG mailing list</a> and take part in the
+ discussions.
+
+ <p>This specification is also being produced by the <a
+ href="http://www.w3.org/html/wg">W3C HTML WG</a>. The two specifications
+ are identical from the table of contents onwards.
+
+ <p>This specification is intended to replace (be the new version of) what
+ was previously the HTML4, XHTML 1.x, and DOM2 HTML specifications.
+
+ <h3 class="no-num no-toc" id=stability0>Stability</h3>
+
+ <p>Different parts of this specification are at different levels of
+ maturity.
+
+ <div id=stability></div>
+
+ <p class=big-issue>Known issues are usually marked like this. There are
+ some spec-wide issues that have not yet been addressed: case-sensitivity
+ is a very poorly handled topic right now, and the firing of events needs
+ to be unified (right now some bubble, some don't, they all use different
+ text to fire events, etc).
+
+ <h2 class="no-num no-toc" id=contents>Table of contents</h2>
+ <!--begin-toc-->
+
+ <ul class=toc>
+ <li><a href="#introduction"><span class=secno>1. </span>Introduction</a>
+ <ul class=toc>
+ <li><a href="#scope"><span class=secno>1.1. </span>Scope</a>
+ <ul class=toc>
+ <li><a href="#relationship"><span class=secno>1.1.1.
+ </span>Relationship to HTML 4.01, XHTML 1.1, DOM2 HTML</a>
+
+ <li><a href="#relationship0"><span class=secno>1.1.2.
+ </span>Relationship to XHTML2</a>
+
+ <li><a href="#relationship1"><span class=secno>1.1.3.
+ </span>Relationship to XUL, Flash, Silverlight, and other proprietary
+ UI languages</a>
+ </ul>
+
+ <li><a href="#structure"><span class=secno>1.2. </span>Structure of this
+ specification</a>
+ <ul class=toc>
+ <li><a href="#how-to"><span class=secno>1.2.1. </span>How to read this
+ specification</a>
+ </ul>
+
+ <li><a href="#conformance"><span class=secno>1.3. </span>Conformance
+ requirements</a>
+ <ul class=toc>
+ <li><a href="#common"><span class=secno>1.3.1. </span>Common
+ conformance requirements for APIs exposed to JavaScript</a>
+
+ <li><a href="#dependencies"><span class=secno>1.3.2.
+ </span>Dependencies</a>
+
+ <li><a href="#features"><span class=secno>1.3.3. </span>Features
+ defined in other specifications</a>
+ </ul>
+
+ <li><a href="#terminology"><span class=secno>1.4. </span>Terminology</a>
+
+ <ul class=toc>
+ <li><a href="#html-vs"><span class=secno>1.4.1. </span>HTML vs
+ XHTML</a>
+ </ul>
+ </ul>
+
+ <li><a href="#dom"><span class=secno>2. </span>The Document Object
+ Model</a>
+ <ul class=toc>
+ <li><a href="#documents"><span class=secno>2.1. </span>Documents</a>
+ <ul class=toc>
+ <li><a href="#security"><span class=secno>2.1.1. </span>Security</a>
+
+ <li><a href="#resource"><span class=secno>2.1.2. </span>Resource
+ metadata management</a>
+ </ul>
+
+ <li><a href="#elements"><span class=secno>2.2. </span>Elements</a>
+ <ul class=toc>
+ <li><a href="#reflecting"><span class=secno>2.2.1. </span>Reflecting
+ content attributes in DOM attributes</a>
+ </ul>
+
+ <li><a href="#common0"><span class=secno>2.3. </span>Common DOM
+ interfaces</a>
+ <ul class=toc>
+ <li><a href="#collections"><span class=secno>2.3.1.
+ </span>Collections</a>
+ <ul class=toc>
+ <li><a href="#htmlcollection"><span class=secno>2.3.1.1.
+ </span>HTMLCollection</a>
+
+ <li><a href="#htmlformcontrolscollection"><span class=secno>2.3.1.2.
+ </span>HTMLFormControlsCollection</a>
+
+ <li><a href="#htmloptionscollection"><span class=secno>2.3.1.3.
+ </span>HTMLOptionsCollection</a>
+ </ul>
+
+ <li><a href="#domtokenlist"><span class=secno>2.3.2.
+ </span>DOMTokenList</a>
+
+ <li><a href="#dom-feature"><span class=secno>2.3.3. </span>DOM feature
+ strings</a>
+ </ul>
+
+ <li><a href="#dom-tree"><span class=secno>2.4. </span>DOM tree
+ accessors</a>
+
+ <li><a href="#dynamic"><span class=secno>2.5. </span>Dynamic markup
+ insertion</a>
+ <ul class=toc>
+ <li><a href="#controlling"><span class=secno>2.5.1. </span>Controlling
+ the input stream</a>
+
+ <li><a href="#dynamic0"><span class=secno>2.5.2. </span>Dynamic markup
+ insertion in HTML</a>
+
+ <li><a href="#dynamic1"><span class=secno>2.5.3. </span>Dynamic markup
+ insertion in XML</a>
+ </ul>
+
+ <li><a href="#apis-in"><span class=secno>2.6. </span>APIs in HTML
+ documents</a>
+ </ul>
+
+ <li><a href="#semantics"><span class=secno>3. </span>Semantics and
+ structure of HTML elements</a>
+ <ul class=toc>
+ <li><a href="#semantics-intro"><span class=secno>3.1.
+ </span>Introduction</a>
+
+ <li><a href="#common1"><span class=secno>3.2. </span>Common
+ microsyntaxes</a>
+ <ul class=toc>
+ <li><a href="#common2"><span class=secno>3.2.1. </span>Common parser
+ idioms</a>
+
+ <li><a href="#boolean"><span class=secno>3.2.2. </span>Boolean
+ attributes</a>
+
+ <li><a href="#numbers"><span class=secno>3.2.3. </span>Numbers</a>
+ <ul class=toc>
+ <li><a href="#unsigned"><span class=secno>3.2.3.1. </span>Unsigned
+ integers</a>
+
+ <li><a href="#signed"><span class=secno>3.2.3.2. </span>Signed
+ integers</a>
+
+ <li><a href="#real-numbers"><span class=secno>3.2.3.3. </span>Real
+ numbers</a>
+
+ <li><a href="#ratios"><span class=secno>3.2.3.4. </span>Ratios</a>
+
+ <li><a href="#percentages-and-dimensions"><span class=secno>3.2.3.5.
+ </span>Percentages and dimensions</a>
+
+ <li><a href="#lists"><span class=secno>3.2.3.6. </span>Lists of
+ integers</a>
+ </ul>
+
+ <li><a href="#dates"><span class=secno>3.2.4. </span>Dates and
+ times</a>
+ <ul class=toc>
+ <li><a href="#specific"><span class=secno>3.2.4.1. </span>Specific
+ moments in time</a>
+
+ <li><a href="#vaguer"><span class=secno>3.2.4.2. </span>Vaguer
+ moments in time</a>
+ </ul>
+
+ <li><a href="#time-offsets"><span class=secno>3.2.5. </span>Time
+ offsets</a>
+
+ <li><a href="#tokens"><span class=secno>3.2.6. </span>Tokens</a>
+
+ <li><a href="#keywords"><span class=secno>3.2.7. </span>Keywords and
+ enumerated attributes</a>
+
+ <li><a href="#syntax-references"><span class=secno>3.2.8.
+ </span>References</a>
+ </ul>
+
+ <li><a href="#documents0"><span class=secno>3.3. </span>Documents and
+ document fragments</a>
+ <ul class=toc>
+ <li><a href="#semantics0"><span class=secno>3.3.1.
+ </span>Semantics</a>
+
+ <li><a href="#structure0"><span class=secno>3.3.2.
+ </span>Structure</a>
+
+ <li><a href="#kinds"><span class=secno>3.3.3. </span>Kinds of
+ elements</a>
+ <ul class=toc>
+ <li><a href="#block-level"><span class=secno>3.3.3.1.
+ </span>Block-level elements</a>
+
+ <li><a href="#inline-level"><span class=secno>3.3.3.2.
+ </span>Inline-level content</a>
+
+ <li><a href="#transparent"><span class=secno>3.3.3.3.
+ </span>Transparent content models</a>
+
+ <li><a href="#determining"><span class=secno>3.3.3.4.
+ </span>Determining if a particular element contains block-level
+ elements or inline-level content</a>
+
+ <li><a href="#interactive0"><span class=secno>3.3.3.5.
+ </span>Interactive elements</a>
+
+ <li><a href="#paragraphs"><span class=secno>3.3.3.6.
+ </span>Paragraphs</a>
+ </ul>
+ </ul>
+
+ <li><a href="#global"><span class=secno>3.4. </span>Global
+ attributes</a>
+ <ul class=toc>
+ <li><a href="#the-id"><span class=secno>3.4.1. </span>The
+ <code>id</code> attribute</a>
+
+ <li><a href="#the-title"><span class=secno>3.4.2. </span>The
+ <code>title</code> attribute</a>
+
+ <li><a href="#the-lang"><span class=secno>3.4.3. </span>The
+ <code>lang</code> (HTML only) and <code>xml:lang</code> (XML only)
+ attributes</a>
+
+ <li><a href="#the-dir"><span class=secno>3.4.4. </span>The
+ <code>dir</code> attribute</a>
+
+ <li><a href="#classes"><span class=secno>3.4.5. </span>The
+ <code>class</code> attribute</a>
+
+ <li><a href="#the-irrelevant"><span class=secno>3.4.6. </span>The
+ <code>irrelevant</code> attribute</a>
+ </ul>
+
+ <li><a href="#interaction"><span class=secno>3.5. </span>Interaction</a>
+
+ <ul class=toc>
+ <li><a href="#activation"><span class=secno>3.5.1.
+ </span>Activation</a>
+
+ <li><a href="#focus"><span class=secno>3.5.2. </span>Focus</a>
+ <ul class=toc>
+ <li><a href="#focus-management"><span class=secno>3.5.2.1.
+ </span>Focus management</a>
+
+ <li><a href="#sequential"><span class=secno>3.5.2.2.
+ </span>Sequential focus navigation</a>
+ </ul>
+
+ <li><a href="#scrolling"><span class=secno>3.5.3. </span>Scrolling
+ elements into view</a>
+ </ul>
+
+ <li><a href="#the-root"><span class=secno>3.6. </span>The root
+ element</a>
+ <ul class=toc>
+ <li><a href="#the-html"><span class=secno>3.6.1. </span>The
+ <code>html</code> element</a>
+ </ul>
+
+ <li><a href="#document"><span class=secno>3.7. </span>Document
+ metadata</a>
+ <ul class=toc>
+ <li><a href="#the-head"><span class=secno>3.7.1. </span>The
+ <code>head</code> element</a>
+
+ <li><a href="#the-title0"><span class=secno>3.7.2. </span>The
+ <code>title</code> element</a>
+
+ <li><a href="#the-base"><span class=secno>3.7.3. </span>The
+ <code>base</code> element</a>
+
+ <li><a href="#the-link"><span class=secno>3.7.4. </span>The
+ <code>link</code> element</a>
+
+ <li><a href="#meta"><span class=secno>3.7.5. </span>The
+ <code>meta</code> element</a>
+ <ul class=toc>
+ <li><a href="#standard"><span class=secno>3.7.5.1. </span>Standard
+ metadata names</a>
+
+ <li><a href="#other"><span class=secno>3.7.5.2. </span>Other
+ metadata names</a>
+
+ <li><a href="#pragma"><span class=secno>3.7.5.3. </span>Pragma
+ directives</a>
+
+ <li><a href="#charset"><span class=secno>3.7.5.4. </span>Specifying
+ and establishing the document's character encoding</a>
+ </ul>
+
+ <li><a href="#the-style"><span class=secno>3.7.6. </span>The
+ <code>style</code> element</a>
+
+ <li><a href="#styling"><span class=secno>3.7.7. </span>Styling</a>
+ </ul>
+
+ <li><a href="#sections"><span class=secno>3.8. </span>Sections</a>
+ <ul class=toc>
+ <li><a href="#the-body"><span class=secno>3.8.1. </span>The
+ <code>body</code> element</a>
+
+ <li><a href="#the-section"><span class=secno>3.8.2. </span>The
+ <code>section</code> element</a>
+
+ <li><a href="#the-nav"><span class=secno>3.8.3. </span>The
+ <code>nav</code> element</a>
+
+ <li><a href="#the-article"><span class=secno>3.8.4. </span>The
+ <code>article</code> element</a>
+
+ <li><a href="#the-blockquote"><span class=secno>3.8.5. </span>The
+ <code>blockquote</code> element</a>
+
+ <li><a href="#the-aside"><span class=secno>3.8.6. </span>The
+ <code>aside</code> element</a>
+
+ <li><a href="#the-h1"><span class=secno>3.8.7. </span>The
+ <code>h1</code>, <code>h2</code>, <code>h3</code>, <code>h4</code>,
+ <code>h5</code>, and <code>h6</code> elements</a>
+
+ <li><a href="#the-header"><span class=secno>3.8.8. </span>The
+ <code>header</code> element</a>
+
+ <li><a href="#the-footer"><span class=secno>3.8.9. </span>The
+ <code>footer</code> element</a>
+
+ <li><a href="#the-address"><span class=secno>3.8.10. </span>The
+ <code>address</code> element</a>
+
+ <li><a href="#headings"><span class=secno>3.8.11. </span>Headings and
+ sections</a>
+ <ul class=toc>
+ <li><a href="#outlines"><span class=secno>3.8.11.1. </span>Creating
+ an outline</a>
+
+ <li><a href="#associatedSection"><span class=secno>3.8.11.2.
+ </span>Determining which heading and section applies to a
+ particular node</a>
+
+ <li><a href="#distinguishing"><span class=secno>3.8.11.3.
+ </span>Distinguishing site-wide headers from page headers</a>
+ </ul>
+ </ul>
+
+ <li><a href="#prose"><span class=secno>3.9. </span>Prose</a>
+ <ul class=toc>
+ <li><a href="#the-p"><span class=secno>3.9.1. </span>The
+ <code>p</code> element</a>
+
+ <li><a href="#the-hr"><span class=secno>3.9.2. </span>The
+ <code>hr</code> element</a>
+
+ <li><a href="#the-br"><span class=secno>3.9.3. </span>The
+ <code>br</code> element</a>
+
+ <li><a href="#the-dialog"><span class=secno>3.9.4. </span>The
+ <code>dialog</code> element</a>
+ </ul>
+
+ <li><a href="#preformatted"><span class=secno>3.10. </span>Preformatted
+ text</a>
+ <ul class=toc>
+ <li><a href="#the-pre"><span class=secno>3.10.1. </span>The
+ <code>pre</code> element</a>
+ </ul>
+
+ <li><a href="#lists0"><span class=secno>3.11. </span>Lists</a>
+ <ul class=toc>
+ <li><a href="#the-ol"><span class=secno>3.11.1. </span>The
+ <code>ol</code> element</a>
+
+ <li><a href="#the-ul"><span class=secno>3.11.2. </span>The
+ <code>ul</code> element</a>
+
+ <li><a href="#the-li"><span class=secno>3.11.3. </span>The
+ <code>li</code> element</a>
+
+ <li><a href="#the-dl"><span class=secno>3.11.4. </span>The
+ <code>dl</code> element</a>
+
+ <li><a href="#the-dt"><span class=secno>3.11.5. </span>The
+ <code>dt</code> element</a>
+
+ <li><a href="#the-dd"><span class=secno>3.11.6. </span>The
+ <code>dd</code> element</a>
+ </ul>
+
+ <li><a href="#phrase"><span class=secno>3.12. </span>Phrase elements</a>
+
+ <ul class=toc>
+ <li><a href="#the-a"><span class=secno>3.12.1. </span>The
+ <code>a</code> element</a>
+
+ <li><a href="#the-q"><span class=secno>3.12.2. </span>The
+ <code>q</code> element</a>
+
+ <li><a href="#the-cite"><span class=secno>3.12.3. </span>The
+ <code>cite</code> element</a>
+
+ <li><a href="#the-em"><span class=secno>3.12.4. </span>The
+ <code>em</code> element</a>
+
+ <li><a href="#the-strong"><span class=secno>3.12.5. </span>The
+ <code>strong</code> element</a>
+
+ <li><a href="#the-small"><span class=secno>3.12.6. </span>The
+ <code>small</code> element</a>
+
+ <li><a href="#the-m"><span class=secno>3.12.7. </span>The
+ <code>m</code> element</a>
+
+ <li><a href="#the-dfn"><span class=secno>3.12.8. </span>The
+ <code>dfn</code> element</a>
+
+ <li><a href="#the-abbr"><span class=secno>3.12.9. </span>The
+ <code>abbr</code> element</a>
+
+ <li><a href="#the-time"><span class=secno>3.12.10. </span>The
+ <code>time</code> element</a>
+
+ <li><a href="#the-meter"><span class=secno>3.12.11. </span>The
+ <code>meter</code> element</a>
+
+ <li><a href="#the-progress"><span class=secno>3.12.12. </span>The
+ <code>progress</code> element</a>
+
+ <li><a href="#the-code"><span class=secno>3.12.13. </span>The
+ <code>code</code> element</a>
+
+ <li><a href="#the-var"><span class=secno>3.12.14. </span>The
+ <code>var</code> element</a>
+
+ <li><a href="#the-samp"><span class=secno>3.12.15. </span>The
+ <code>samp</code> element</a>
+
+ <li><a href="#the-kbd"><span class=secno>3.12.16. </span>The
+ <code>kbd</code> element</a>
+
+ <li><a href="#the-sup"><span class=secno>3.12.17. </span>The
+ <code>sup</code> and <code>sub</code> elements</a>
+
+ <li><a href="#the-span"><span class=secno>3.12.18. </span>The
+ <code>span</code> element</a>
+
+ <li><a href="#the-i"><span class=secno>3.12.19. </span>The
+ <code>i</code> element</a>
+
+ <li><a href="#the-b"><span class=secno>3.12.20. </span>The
+ <code>b</code> element</a>
+
+ <li><a href="#the-bdo"><span class=secno>3.12.21. </span>The
+ <code>bdo</code> element</a>
+ </ul>
+
+ <li><a href="#edits"><span class=secno>3.13. </span>Edits</a>
+ <ul class=toc>
+ <li><a href="#the-ins"><span class=secno>3.13.1. </span>The
+ <code>ins</code> element</a>
+
+ <li><a href="#the-del"><span class=secno>3.13.2. </span>The
+ <code>del</code> element</a>
+
+ <li><a href="#attributes"><span class=secno>3.13.3. </span>Attributes
+ common to <code>ins</code> and <code>del</code> elements</a>
+ </ul>
+
+ <li><a href="#embedded"><span class=secno>3.14. </span>Embedded
+ content</a>
+ <ul class=toc>
+ <li><a href="#the-figure"><span class=secno>3.14.1. </span>The
+ <code>figure</code> element</a>
+
+ <li><a href="#the-img"><span class=secno>3.14.2. </span>The
+ <code>img</code> element</a>
+
+ <li><a href="#the-iframe"><span class=secno>3.14.3. </span>The
+ <code>iframe</code> element</a>
+
+ <li><a href="#the-embed"><span class=secno>3.14.4. </span>The
+ <code>embed</code> element</a>
+
+ <li><a href="#the-object"><span class=secno>3.14.5. </span>The
+ <code>object</code> element</a>
+
+ <li><a href="#the-param"><span class=secno>3.14.6. </span>The
+ <code>param</code> element</a>
+
+ <li><a href="#video"><span class=secno>3.14.7. </span>The
+ <code>video</code> element</a>
+ <ul class=toc>
+ <li><a href="#video0"><span class=secno>3.14.7.1. </span>Video and
+ audio codecs for <code>video</code> elements</a>
+ </ul>
+
+ <li><a href="#audio"><span class=secno>3.14.8. </span>The
+ <code>audio</code> element</a>
+ <ul class=toc>
+ <li><a href="#audio0"><span class=secno>3.14.8.1. </span>Audio
+ codecs for <code>audio</code> elements</a>
+ </ul>
+
+ <li><a href="#media"><span class=secno>3.14.9. </span>Media
+ elements</a>
+ <ul class=toc>
+ <li><a href="#error"><span class=secno>3.14.9.1. </span>Error
+ codes</a>
+
+ <li><a href="#location"><span class=secno>3.14.9.2. </span>Location
+ of the media resource</a>
+
+ <li><a href="#network0"><span class=secno>3.14.9.3. </span>Network
+ states</a>
+
+ <li><a href="#loading"><span class=secno>3.14.9.4. </span>Loading
+ the media resource</a>
+
+ <li><a href="#offsets"><span class=secno>3.14.9.5. </span>Offsets
+ into the media resource</a>
+
+ <li><a href="#the-ready"><span class=secno>3.14.9.6. </span>The
+ ready states</a>
+
+ <li><a href="#playing"><span class=secno>3.14.9.7. </span>Playing
+ the media resource</a>
+
+ <li><a href="#seeking"><span class=secno>3.14.9.8.
+ </span>Seeking</a>
+
+ <li><a href="#cue-points"><span class=secno>3.14.9.9. </span>Cue
+ points</a>
+
+ <li><a href="#user-interface"><span class=secno>3.14.9.10.
+ </span>User interface</a>
+
+ <li><a href="#time-range"><span class=secno>3.14.9.11. </span>Time
+ range</a>
+
+ <li><a href="#mediaevents"><span class=secno>3.14.9.12. </span>Event
+ summary</a>
+
+ <li><a href="#security0"><span class=secno>3.14.9.13.
+ </span>Security and privacy considerations</a>
+ </ul>
+
+ <li><a href="#the-source"><span class=secno>3.14.10. </span>The
+ <code>source</code> element</a>
+
+ <li><a href="#the-canvas"><span class=secno>3.14.11. </span>The
+ <code>canvas</code> element</a>
+ <ul class=toc>
+ <li><a href="#the-2d"><span class=secno>3.14.11.1. </span>The 2D
+ context</a>
+ <ul class=toc>
+ <li><a href="#the-canvas0"><span class=secno>3.14.11.1.1.
+ </span>The canvas state</a>
+
+ <li><a href="#transformations"><span class=secno>3.14.11.1.2.
+ </span>Transformations</a>
+
+ <li><a href="#compositing"><span class=secno>3.14.11.1.3.
+ </span>Compositing</a>
+
+ <li><a href="#colors"><span class=secno>3.14.11.1.4. </span>Colors
+ and styles</a>
+
+ <li><a href="#line-styles"><span class=secno>3.14.11.1.5.
+ </span>Line styles</a>
+
+ <li><a href="#shadows"><span class=secno>3.14.11.1.6.
+ </span>Shadows</a>
+
+ <li><a href="#simple"><span class=secno>3.14.11.1.7. </span>Simple
+ shapes (rectangles)</a>
+
+ <li><a href="#complex"><span class=secno>3.14.11.1.8.
+ </span>Complex shapes (paths)</a>
+
+ <li><a href="#images"><span class=secno>3.14.11.1.9.
+ </span>Images</a>
+
+ <li><a href="#pixel"><span class=secno>3.14.11.1.10. </span>Pixel
+ manipulation</a>
+
+ <li><a href="#drawing"><span class=secno>3.14.11.1.11.
+ </span>Drawing model</a>
+ </ul>
+ </ul>
+
+ <li><a href="#the-map"><span class=secno>3.14.12. </span>The
+ <code>map</code> element</a>
+
+ <li><a href="#the-area"><span class=secno>3.14.13. </span>The
+ <code>area</code> element</a>
+
+ <li><a href="#image-maps"><span class=secno>3.14.14. </span>Image
+ maps</a>
+ </ul>
+
+ <li><a href="#tabular"><span class=secno>3.15. </span>Tabular data</a>
+ <ul class=toc>
+ <li><a href="#the-table"><span class=secno>3.15.1. </span>The
+ <code>table</code> element</a>
+
+ <li><a href="#the-caption"><span class=secno>3.15.2. </span>The
+ <code>caption</code> element</a>
+
+ <li><a href="#the-colgroup"><span class=secno>3.15.3. </span>The
+ <code>colgroup</code> element</a>
+
+ <li><a href="#the-col"><span class=secno>3.15.4. </span>The
+ <code>col</code> element</a>
+
+ <li><a href="#the-tbody"><span class=secno>3.15.5. </span>The
+ <code>tbody</code> element</a>
+
+ <li><a href="#the-thead"><span class=secno>3.15.6. </span>The
+ <code>thead</code> element</a>
+
+ <li><a href="#the-tfoot"><span class=secno>3.15.7. </span>The
+ <code>tfoot</code> element</a>
+
+ <li><a href="#the-tr"><span class=secno>3.15.8. </span>The
+ <code>tr</code> element</a>
+
+ <li><a href="#the-td"><span class=secno>3.15.9. </span>The
+ <code>td</code> element</a>
+
+ <li><a href="#the-th"><span class=secno>3.15.10. </span>The
+ <code>th</code> element</a>
+
+ <li><a href="#processing"><span class=secno>3.15.11. </span>Processing
+ model</a>
+ <ul class=toc>
+ <li><a href="#forming"><span class=secno>3.15.11.1. </span>Forming a
+ table</a>
+
+ <li><a href="#header-and-data-cell-semantics"><span
+ class=secno>3.15.11.2. </span>Forming relationships between data
+ cells and header cells</a>
+ </ul>
+ </ul>
+
+ <li><a href="#forms"><span class=secno>3.16. </span>Forms</a>
+ <ul class=toc>
+ <li><a href="#the-form"><span class=secno>3.16.1. </span>The
+ <code>form</code> element</a>
+
+ <li><a href="#the-fieldset"><span class=secno>3.16.2. </span>The
+ <code>fieldset</code> element</a>
+
+ <li><a href="#the-input"><span class=secno>3.16.3. </span>The
+ <code>input</code> element</a>
+
+ <li><a href="#the-button"><span class=secno>3.16.4. </span>The
+ <code>button</code> element</a>
+
+ <li><a href="#the-label"><span class=secno>3.16.5. </span>The
+ <code>label</code> element</a>
+
+ <li><a href="#the-select"><span class=secno>3.16.6. </span>The
+ <code>select</code> element</a>
+
+ <li><a href="#the-datalist"><span class=secno>3.16.7. </span>The
+ <code>datalist</code> element</a>
+
+ <li><a href="#the-optgroup"><span class=secno>3.16.8. </span>The
+ <code>optgroup</code> element</a>
+
+ <li><a href="#the-option"><span class=secno>3.16.9. </span>The
+ <code>option</code> element</a>
+
+ <li><a href="#the-textarea"><span class=secno>3.16.10. </span>The
+ <code>textarea</code> element</a>
+
+ <li><a href="#the-output"><span class=secno>3.16.11. </span>The
+ <code>output</code> element</a>
+
+ <li><a href="#processing0"><span class=secno>3.16.12.
+ </span>Processing model</a>
+ <ul class=toc>
+ <li><a href="#form-submission"><span class=secno>3.16.12.1.
+ </span>Form submission</a>
+ </ul>
+ </ul>
+
+ <li><a href="#scripting0"><span class=secno>3.17. </span>Scripting</a>
+ <ul class=toc>
+ <li><a href="#script"><span class=secno>3.17.1. </span>The
+ <code>script</code> element</a>
+ <ul class=toc>
+ <li><a href="#scriptingLanguages"><span class=secno>3.17.1.1.
+ </span>Scripting languages</a>
+ </ul>
+
+ <li><a href="#the-noscript"><span class=secno>3.17.2. </span>The
+ <code>noscript</code> element</a>
+
+ <li><a href="#the-event-source"><span class=secno>3.17.3. </span>The
+ <code>event-source</code> element</a>
+ </ul>
+
+ <li><a href="#interactive"><span class=secno>3.18. </span>Interactive
+ elements</a>
+ <ul class=toc>
+ <li><a href="#the-details"><span class=secno>3.18.1. </span>The
+ <code>details</code> element</a>
+
+ <li><a href="#datagrid"><span class=secno>3.18.2. </span>The
+ <code>datagrid</code> element</a>
+ <ul class=toc>
+ <li><a href="#the-datagrid"><span class=secno>3.18.2.1. </span>The
+ <code>datagrid</code> data model</a>
+
+ <li><a href="#how-rows"><span class=secno>3.18.2.2. </span>How rows
+ are identified</a>
+
+ <li><a href="#the-data"><span class=secno>3.18.2.3. </span>The data
+ provider interface</a>
+
+ <li><a href="#the-default"><span class=secno>3.18.2.4. </span>The
+ default data provider</a>
+ <ul class=toc>
+ <li><a href="#commonDefaultDataGridMethodDefinitions"><span
+ class=secno>3.18.2.4.1. </span>Common default data provider
+ method definitions for cells</a>
+ </ul>
+
+ <li><a href="#populating"><span class=secno>3.18.2.5.
+ </span>Populating the <code>datagrid</code> element</a>
+
+ <li><a href="#updating"><span class=secno>3.18.2.6. </span>Updating
+ the <code>datagrid</code></a>
+
+ <li><a href="#requirements"><span class=secno>3.18.2.7.
+ </span>Requirements for interactive user agents</a>
+
+ <li><a href="#the-selection"><span class=secno>3.18.2.8. </span>The
+ selection</a>
+
+ <li><a href="#columns"><span class=secno>3.18.2.9. </span>Columns
+ and captions</a>
+ </ul>
+
+ <li><a href="#the-command"><span class=secno>3.18.3. </span>The
+ <code>command</code> element</a>
+
+ <li><a href="#menus"><span class=secno>3.18.4. </span>The
+ <code>menu</code> element</a>
+ <ul class=toc>
+ <li><a href="#menus-intro"><span class=secno>3.18.4.1.
+ </span>Introduction</a>
+
+ <li><a href="#building"><span class=secno>3.18.4.2. </span>Building
+ menus</a>
+
+ <li><a href="#context"><span class=secno>3.18.4.3. </span>Context
+ menus</a>
+
+ <li><a href="#toolbars"><span class=secno>3.18.4.4.
+ </span>Toolbars</a>
+ </ul>
+
+ <li><a href="#commands"><span class=secno>3.18.5. </span>Commands</a>
+ <ul class=toc>
+ <li><a href="#using"><span class=secno>3.18.5.1. </span>Using the
+ <code>a</code> element to define a command</a>
+
+ <li><a href="#using0"><span class=secno>3.18.5.2. </span>Using the
+ <code>button</code> element to define a command</a>
+
+ <li><a href="#using1"><span class=secno>3.18.5.3. </span>Using the
+ <code>input</code> element to define a command</a>
+
+ <li><a href="#using2"><span class=secno>3.18.5.4. </span>Using the
+ <code>option</code> element to define a command</a>
+
+ <li><a href="#using3"><span class=secno>3.18.5.5. </span>Using the
+ <code>command</code> element to define a command</a>
+ </ul>
+ </ul>
+
+ <li><a href="#miscellaneous"><span class=secno>3.19.
+ </span>Miscellaneous elements</a>
+ <ul class=toc>
+ <li><a href="#the-legend"><span class=secno>3.19.1. </span>The
+ <code>legend</code> element</a>
+
+ <li><a href="#the-div"><span class=secno>3.19.2. </span>The
+ <code>div</code> element</a>
+ </ul>
+ </ul>
+
+ <li><a href="#web-browsers"><span class=secno>4. </span>Web browsers</a>
+ <ul class=toc>
+ <li><a href="#windows"><span class=secno>4.1. </span>Browsing
+ contexts</a>
+ <ul class=toc>
+ <li><a href="#nested"><span class=secno>4.1.1. </span>Nested browsing
+ contexts</a>
+
+ <li><a href="#auxiliary"><span class=secno>4.1.2. </span>Auxiliary
+ browsing contexts</a>
+
+ <li><a href="#secondary"><span class=secno>4.1.3. </span>Secondary
+ browsing contexts</a>
+
+ <li><a href="#threads"><span class=secno>4.1.4. </span>Threads</a>
+
+ <li><a href="#browsing"><span class=secno>4.1.5. </span>Browsing
+ context names</a>
+ </ul>
+
+ <li><a href="#the-default0"><span class=secno>4.2. </span>The default
+ view</a>
+ <ul class=toc>
+ <li><a href="#security1"><span class=secno>4.2.1. </span>Security</a>
+
+ <li><a href="#constructors"><span class=secno>4.2.2.
+ </span>Constructors</a>
+
+ <li><a href="#apis-for"><span class=secno>4.2.3. </span>APIs for
+ creating and navigating browsing contexts by name</a>
+
+ <li><a href="#accessing"><span class=secno>4.2.4. </span>Accessing
+ other browsing contexts</a>
+ </ul>
+
+ <li><a href="#history"><span class=secno>4.3. </span>Session history and
+ navigation</a>
+ <ul class=toc>
+ <li><a href="#the-session"><span class=secno>4.3.1. </span>The session
+ history of browsing contexts</a>
+
+ <li><a href="#the-history"><span class=secno>4.3.2. </span>The
+ <code>History</code> interface</a>
+
+ <li><a href="#activating"><span class=secno>4.3.3. </span>Activating
+ state objects</a>
+
+ <li><a href="#the-location"><span class=secno>4.3.4. </span>The
+ <code>Location</code> interface</a>
+ <ul class=toc>
+ <li><a href="#security2"><span class=secno>4.3.4.1.
+ </span>Security</a>
+ </ul>
+
+ <li><a href="#history-notes"><span class=secno>4.3.5.
+ </span>Implementation notes for session history</a>
+ </ul>
+
+ <li><a href="#links"><span class=secno>4.4. </span>Links</a>
+ <ul class=toc>
+ <li><a href="#hyperlink"><span class=secno>4.4.1. </span>Hyperlink
+ elements</a>
+
+ <li><a href="#following"><span class=secno>4.4.2. </span>Following
+ hyperlinks</a>
+ <ul class=toc>
+ <li><a href="#hyperlink0"><span class=secno>4.4.2.1.
+ </span>Hyperlink auditing</a>
+ </ul>
+
+ <li><a href="#linkTypes"><span class=secno>4.4.3. </span>Link
+ types</a>
+ <ul class=toc>
+ <li><a href="#link-type"><span class=secno>4.4.3.1. </span>Link type
+ "<code>alternate</code>"</a>
+
+ <li><a href="#link-type0"><span class=secno>4.4.3.2. </span>Link
+ type "<code>archives</code>"</a>
+
+ <li><a href="#link-type1"><span class=secno>4.4.3.3. </span>Link
+ type "<code>author</code>"</a>
+
+ <li><a href="#link-type2"><span class=secno>4.4.3.4. </span>Link
+ type "<code>bookmark</code>"</a>
+
+ <li><a href="#link-type3"><span class=secno>4.4.3.5. </span>Link
+ type "<code>contact</code>"</a>
+
+ <li><a href="#link-type4"><span class=secno>4.4.3.6. </span>Link
+ type "<code>external</code>"</a>
+
+ <li><a href="#link-type5"><span class=secno>4.4.3.7. </span>Link
+ type "<code>feed</code>"</a>
+
+ <li><a href="#link-type6"><span class=secno>4.4.3.8. </span>Link
+ type "<code>help</code>"</a>
+
+ <li><a href="#link-type7"><span class=secno>4.4.3.9. </span>Link
+ type "<code>icon</code>"</a>
+
+ <li><a href="#link-type8"><span class=secno>4.4.3.10. </span>Link
+ type "<code>license</code>"</a>
+
+ <li><a href="#link-type9"><span class=secno>4.4.3.11. </span>Link
+ type "<code>nofollow</code>"</a>
+
+ <li><a href="#link-type10"><span class=secno>4.4.3.12. </span>Link
+ type "<code>pingback</code>"</a>
+
+ <li><a href="#link-type11"><span class=secno>4.4.3.13. </span>Link
+ type "<code>prefetch</code>"</a>
+
+ <li><a href="#link-type12"><span class=secno>4.4.3.14. </span>Link
+ type "<code>search</code>"</a>
+
+ <li><a href="#link-type13"><span class=secno>4.4.3.15. </span>Link
+ type "<code>stylesheet</code>"</a>
+
+ <li><a href="#link-type14"><span class=secno>4.4.3.16. </span>Link
+ type "<code>sidebar</code>"</a>
+
+ <li><a href="#link-type15"><span class=secno>4.4.3.17. </span>Link
+ type "<code>tag</code>"</a>
+
+ <li><a href="#hierarchical"><span class=secno>4.4.3.18.
+ </span>Hierarchical link types</a>
+ <ul class=toc>
+ <li><a href="#link-type16"><span class=secno>4.4.3.18.1.
+ </span>Link type "<code>first</code>"</a>
+
+ <li><a href="#link-type17"><span class=secno>4.4.3.18.2.
+ </span>Link type "<code>index</code>"</a>
+
+ <li><a href="#link-type18"><span class=secno>4.4.3.18.3.
+ </span>Link type "<code>last</code>"</a>
+
+ <li><a href="#link-type19"><span class=secno>4.4.3.18.4.
+ </span>Link type "<code>next</code>"</a>
+
+ <li><a href="#link-type20"><span class=secno>4.4.3.18.5.
+ </span>Link type "<code>prev</code>"</a>
+
+ <li><a href="#link-type21"><span class=secno>4.4.3.18.6.
+ </span>Link type "<code>up</code>"</a>
+ </ul>
+
+ <li><a href="#other0"><span class=secno>4.4.3.19. </span>Other link
+ types</a>
+ </ul>
+ </ul>
+
+ <li><a href="#interfaces"><span class=secno>4.5. </span>Interfaces for
+ URI manipulation</a>
+
+ <li><a href="#navigating"><span class=secno>4.6. </span>Navigating
+ across documents</a>
+ <ul class=toc>
+ <li><a href="#read-html"><span class=secno>4.6.1. </span>Page load
+ processing model for HTML files</a>
+
+ <li><a href="#read-xml"><span class=secno>4.6.2. </span>Page load
+ processing model for XML files</a>
+
+ <li><a href="#read-text"><span class=secno>4.6.3. </span>Page load
+ processing model for text files</a>
+
+ <li><a href="#read-image"><span class=secno>4.6.4. </span>Page load
+ processing model for images</a>
+
+ <li><a href="#read-plugin"><span class=secno>4.6.5. </span>Page load
+ processing model for content that uses plugins</a>
+
+ <li><a href="#non-DOM-inline-content"><span class=secno>4.6.6.
+ </span>Page load processing model for inline content that doesn't
+ have a DOM</a>
+
+ <li><a href="#scroll-to-fragid"><span class=secno>4.6.7.
+ </span>Scrolling to a fragment identifier</a>
+ </ul>
+
+ <li><a href="#content-type-sniffing"><span class=secno>4.7.
+ </span>Determining the type of a new resource in a browsing context</a>
+
+ <ul class=toc>
+ <li><a href="#content-type0"><span class=secno>4.7.1.
+ </span>Content-Type sniffing: text or binary</a>
+
+ <li><a href="#content-type1"><span class=secno>4.7.2.
+ </span>Content-Type sniffing: unknown type</a>
+
+ <li><a href="#content-type2"><span class=secno>4.7.3.
+ </span>Content-Type sniffing: image</a>
+
+ <li><a href="#content-type3"><span class=secno>4.7.4.
+ </span>Content-Type sniffing: feed or HTML</a>
+
+ <li><a href="#content-type"><span class=secno>4.7.5.
+ </span>Content-Type metadata</a>
+ </ul>
+
+ <li><a href="#user-prompts"><span class=secno>4.8. </span>User
+ prompts</a>
+
+ <li><a href="#scripting"><span class=secno>4.9. </span>Scripting</a>
+ <ul class=toc>
+ <li><a href="#running"><span class=secno>4.9.1. </span>Running
+ executable code</a>
+
+ <li><a href="#origin"><span class=secno>4.9.2. </span>Origin</a>
+
+ <li><a href="#security3"><span class=secno>4.9.3. </span>Security
+ exceptions</a>
+
+ <li><a href="#javascript-protocol"><span class=secno>4.9.4. </span>The
+ <code title="">javascript:</code> protocol</a>
+
+ <li><a href="#events"><span class=secno>4.9.5. </span>Events</a>
+ <ul class=toc>
+ <li><a href="#event-handler-attributes"><span class=secno>4.9.5.1.
+ </span>Event handler attributes</a>
+
+ <li><a href="#event"><span class=secno>4.9.5.2. </span>Event
+ firing</a>
+
+ <li><a href="#events0"><span class=secno>4.9.5.3. </span>Events and
+ the <code>Window</code> object</a>
+
+ <li><a href="#runtime-script-errors"><span class=secno>4.9.5.4.
+ </span>Runtime script errors</a>
+ </ul>
+ </ul>
+
+ <li><a href="#browser"><span class=secno>4.10. </span>Browser state</a>
+ <ul class=toc>
+ <li><a href="#offline"><span class=secno>4.10.1. </span>Offline Web
+ applications</a>
+
+ <li><a href="#custom-handlers"><span class=secno>4.10.2. </span>Custom
+ protocol and content handlers</a>
+ <ul class=toc>
+ <li><a href="#security4"><span class=secno>4.10.2.1. </span>Security
+ and privacy</a>
+
+ <li><a href="#sample-handler-impl"><span class=secno>4.10.2.2.
+ </span>Sample user interface</a>
+ </ul>
+ </ul>
+
+ <li><a href="#storage"><span class=secno>4.11. </span>Client-side
+ session and persistent storage of name/value pairs</a>
+ <ul class=toc>
+ <li><a href="#introduction0"><span class=secno>4.11.1.
+ </span>Introduction</a>
+
+ <li><a href="#the-storage"><span class=secno>4.11.2. </span>The
+ <code>Storage</code> interface</a>
+
+ <li><a href="#the-storageitem"><span class=secno>4.11.3. </span>The
+ <code>StorageItem</code> interface</a>
+
+ <li><a href="#the-sessionstorage"><span class=secno>4.11.4. </span>The
+ <code title=dom-sessionStorage>sessionStorage</code> attribute</a>
+
+ <li><a href="#the-globalstorage"><span class=secno>4.11.5. </span>The
+ <code title=dom-globalStorage>globalStorage</code> attribute</a>
+
+ <li><a href="#the-storage0"><span class=secno>4.11.6. </span>The <code
+ title=event-storage>storage</code> event</a>
+
+ <li><a href="#miscellaneous0"><span class=secno>4.11.7.
+ </span>Miscellaneous implementation requirements for storage
+ areas</a>
+ <ul class=toc>
+ <li><a href="#disk-space"><span class=secno>4.11.7.1. </span>Disk
+ space</a>
+
+ <li><a href="#threads0"><span class=secno>4.11.7.2.
+ </span>Threads</a>
+ </ul>
+
+ <li><a href="#security5"><span class=secno>4.11.8. </span>Security and
+ privacy</a>
+ <ul class=toc>
+ <li><a href="#user-tracking"><span class=secno>4.11.8.1. </span>User
+ tracking</a>
+
+ <li><a href="#cookie"><span class=secno>4.11.8.2. </span>Cookie
+ resurrection</a>
+
+ <li><a href="#integrity"><span class=secno>4.11.8.3.
+ </span>Integrity of "public" storage areas</a>
+
+ <li><a href="#cross-protocol"><span class=secno>4.11.8.4.
+ </span>Cross-protocol and cross-port attacks</a>
+
+ <li><a href="#dns-spoofing"><span class=secno>4.11.8.5. </span>DNS
+ spoofing attacks</a>
+
+ <li><a href="#cross-directory"><span class=secno>4.11.8.6.
+ </span>Cross-directory attacks</a>
+
+ <li><a href="#public"><span class=secno>4.11.8.7. </span>Public
+ storage areas corresponding to hosts</a>
+
+ <li><a href="#storage0"><span class=secno>4.11.8.8. </span>Storage
+ areas in the face of untrusted higher-level domains that do not
+ correspond to public storage areas</a>
+
+ <li><a href="#storage1"><span class=secno>4.11.8.9. </span>Storage
+ areas in the face of untrusted subdomains</a>
+
+ <li><a href="#implementation"><span class=secno>4.11.8.10.
+ </span>Implementation risks</a>
+ </ul>
+ </ul>
+
+ <li><a href="#sql"><span class=secno>4.12. </span>Client-side database
+ storage</a>
+ <ul class=toc>
+ <li><a href="#introduction1"><span class=secno>4.12.1.
+ </span>Introduction</a>
+
+ <li><a href="#executing"><span class=secno>4.12.2. </span>Executing
+ SQL statements</a>
+
+ <li><a href="#database"><span class=secno>4.12.3. </span>Database
+ query results</a>
+
+ <li><a href="#privacy"><span class=secno>4.12.4. </span>Privacy</a>
+
+ <li><a href="#security6"><span class=secno>4.12.5. </span>Security</a>
+
+ <ul class=toc>
+ <li><a href="#user-agents"><span class=secno>4.12.5.1. </span>User
+ agents</a>
+
+ <li><a href="#sql-injection"><span class=secno>4.12.5.2. </span>SQL
+ injection</a>
+ </ul>
+ </ul>
+ </ul>
+
+ <li><a href="#editing"><span class=secno>5. </span>Editing</a>
+ <ul class=toc>
+ <li><a href="#editing-intro"><span class=secno>5.1.
+ </span>Introduction</a>
+
+ <li><a href="#contenteditable"><span class=secno>5.2. </span>The <code
+ title=attr-contenteditable>contenteditable</code> attribute</a>
+ <ul class=toc>
+ <li><a href="#user-editing"><span class=secno>5.2.1. </span>User
+ editing actions</a>
+
+ <li><a href="#making"><span class=secno>5.2.2. </span>Making entire
+ documents editable</a>
+ </ul>
+
+ <li><a href="#dnd"><span class=secno>5.3. </span>Drag and drop</a>
+ <ul class=toc>
+ <li><a href="#the-dragevent"><span class=secno>5.3.1. </span>The
+ <code>DragEvent</code> and <code>DataTransfer</code> interfaces</a>
+
+ <li><a href="#events1"><span class=secno>5.3.2. </span>Events fired
+ during a drag-and-drop action</a>
+
+ <li><a href="#drag-and-drop"><span class=secno>5.3.3.
+ </span>Drag-and-drop processing model</a>
+ <ul class=toc>
+ <li><a href="#when-the"><span class=secno>5.3.3.1. </span>When the
+ drag-and-drop operation starts or ends in another document</a>
+
+ <li><a href="#when-the0"><span class=secno>5.3.3.2. </span>When the
+ drag-and-drop operation starts or ends in another application</a>
+ </ul>
+
+ <li><a href="#the-draggable"><span class=secno>5.3.4. </span>The
+ <code>draggable</code> attribute</a>
+
+ <li><a href="#copy-and"><span class=secno>5.3.5. </span>Copy and
+ paste</a>
+ <ul class=toc>
+ <li><a href="#copy-to"><span class=secno>5.3.5.1. </span>Copy to
+ clipboard</a>
+
+ <li><a href="#cut-to"><span class=secno>5.3.5.2. </span>Cut to
+ clipboard</a>
+
+ <li><a href="#paste"><span class=secno>5.3.5.3. </span>Paste from
+ clipboard</a>
+
+ <li><a href="#paste0"><span class=secno>5.3.5.4. </span>Paste from
+ selection</a>
+ </ul>
+
+ <li><a href="#security7"><span class=secno>5.3.6. </span>Security
+ risks in the drag-and-drop model</a>
+ </ul>
+
+ <li><a href="#undo"><span class=secno>5.4. </span>Undo history</a>
+ <ul class=toc>
+ <li><a href="#the-undomanager"><span class=secno>5.4.1. </span>The
+ <code>UndoManager</code> interface</a>
+
+ <li><a href="#undo-moving"><span class=secno>5.4.2. </span>Undo:
+ moving back in the undo transaction history</a>
+
+ <li><a href="#redo-moving"><span class=secno>5.4.3. </span>Redo:
+ moving forward in the undo transaction history</a>
+
+ <li><a href="#the-undomanagerevent"><span class=secno>5.4.4.
+ </span>The <code>UndoManagerEvent</code> interface and the <code
+ title=event-undo>undo</code> and <code title=event-redo>redo</code>
+ events</a>
+
+ <li><a href="#implementation0"><span class=secno>5.4.5.
+ </span>Implementation notes</a>
+ </ul>
+
+ <li><a href="#command"><span class=secno>5.5. </span>Command APIs</a>
+
+ <li><a href="#selection"><span class=secno>5.6. </span>The text
+ selection APIs</a>
+ <ul class=toc>
+ <li><a href="#documentSelection"><span class=secno>5.6.1. </span>APIs
+ for the browsing context selection</a>
+
+ <li><a href="#textFieldSelection"><span class=secno>5.6.2. </span>APIs
+ for the text field selections</a>
+ </ul>
+ </ul>
+
+ <li><a href="#comms"><span class=secno>6. </span>Communication</a>
+ <ul class=toc>
+ <li><a href="#event0"><span class=secno>6.1. </span>Event
+ definitions</a>
+
+ <li><a href="#server-sent-events"><span class=secno>6.2.
+ </span>Server-sent DOM events</a>
+ <ul class=toc>
+ <li><a href="#the-remoteeventtarget"><span class=secno>6.2.1.
+ </span>The <code>RemoteEventTarget</code> interface</a>
+
+ <li><a href="#connecting"><span class=secno>6.2.2. </span>Connecting
+ to an event stream</a>
+
+ <li><a href="#parsing0"><span class=secno>6.2.3. </span>Parsing an
+ event stream</a>
+
+ <li><a href="#event-stream-interpretation"><span class=secno>6.2.4.
+ </span>Interpreting an event stream</a>
+
+ <li><a href="#notes"><span class=secno>6.2.5. </span>Notes</a>
+ </ul>
+
+ <li><a href="#network"><span class=secno>6.3. </span>Network
+ connections</a>
+ <ul class=toc>
+ <li><a href="#network-intro"><span class=secno>6.3.1.
+ </span>Introduction</a>
+
+ <li><a href="#the-connection"><span class=secno>6.3.2. </span>The
+ <code>Connection</code> interface</a>
+
+ <li><a href="#connection"><span class=secno>6.3.3. </span>Connection
+ Events</a>
+
+ <li><a href="#tcp-connections"><span class=secno>6.3.4. </span>TCP
+ connections</a>
+
+ <li><a href="#broadcast"><span class=secno>6.3.5. </span>Broadcast
+ connections</a>
+ <ul class=toc>
+ <li><a href="#broadcasting"><span class=secno>6.3.5.1.
+ </span>Broadcasting over TCP/IP</a>
+
+ <li><a href="#bluetooth-broadcast"><span class=secno>6.3.5.2.
+ </span>Broadcasting over Bluetooth</a>
+
+ <li><a href="#irda-broadcast"><span class=secno>6.3.5.3.
+ </span>Broadcasting over IrDA</a>
+ </ul>
+
+ <li><a href="#peer-to-peer"><span class=secno>6.3.6.
+ </span>Peer-to-peer connections</a>
+ <ul class=toc>
+ <li><a href="#peer-to-peer0"><span class=secno>6.3.6.1.
+ </span>Peer-to-peer connections over TCP/IP</a>
+
+ <li><a href="#bluetooth-peer"><span class=secno>6.3.6.2.
+ </span>Peer-to-peer connections over Bluetooth</a>
+
+ <li><a href="#irda-peer"><span class=secno>6.3.6.3.
+ </span>Peer-to-peer connections over IrDA</a>
+ </ul>
+
+ <li><a href="#the-common"><span class=secno>6.3.7. </span>The common
+ protocol for TCP-based connections</a>
+ <ul class=toc>
+ <li><a href="#clients"><span class=secno>6.3.7.1. </span>Clients
+ connecting over TCP</a>
+
+ <li><a href="#servers"><span class=secno>6.3.7.2. </span>Servers
+ accepting connections over TCP</a>
+
+ <li><a href="#sending"><span class=secno>6.3.7.3. </span>Sending and
+ receiving data over TCP</a>
+ </ul>
+
+ <li><a href="#network-security"><span class=secno>6.3.8.
+ </span>Security</a>
+
+ <li><a href="#network-other-specs"><span class=secno>6.3.9.
+ </span>Relationship to other standards</a>
+ </ul>
+
+ <li><a href="#crossDocumentMessages"><span class=secno>6.4.
+ </span>Cross-document messaging</a>
+ <ul class=toc>
+ <li><a href="#processing1"><span class=secno>6.4.1. </span>Processing
+ model</a>
+ </ul>
+ </ul>
+
+ <li><a href="#repetition"><span class=secno>7. </span>Repetition
+ templates</a>
+
+ <li><a href="#syntax"><span class=secno>8. </span>The HTML syntax</a>
+ <ul class=toc>
+ <li><a href="#writing"><span class=secno>8.1. </span>Writing HTML
+ documents</a>
+ <ul class=toc>
+ <li><a href="#the-doctype"><span class=secno>8.1.1. </span>The
+ DOCTYPE</a>
+
+ <li><a href="#elements0"><span class=secno>8.1.2. </span>Elements</a>
+ <ul class=toc>
+ <li><a href="#start"><span class=secno>8.1.2.1. </span>Start
+ tags</a>
+
+ <li><a href="#end-tags"><span class=secno>8.1.2.2. </span>End
+ tags</a>
+
+ <li><a href="#attributes0"><span class=secno>8.1.2.3.
+ </span>Attributes</a>
+
+ <li><a href="#optional"><span class=secno>8.1.2.4. </span>Optional
+ tags</a>
+
+ <li><a href="#restrictions"><span class=secno>8.1.2.5.
+ </span>Restrictions on content models</a>
+ </ul>
+
+ <li><a href="#text"><span class=secno>8.1.3. </span>Text</a>
+ <ul class=toc>
+ <li><a href="#newlines"><span class=secno>8.1.3.1.
+ </span>Newlines</a>
+ </ul>
+
+ <li><a href="#character"><span class=secno>8.1.4. </span>Character
+ entity references</a>
+
+ <li><a href="#comments"><span class=secno>8.1.5. </span>Comments</a>
+ </ul>
+
+ <li><a href="#parsing"><span class=secno>8.2. </span>Parsing HTML
+ documents</a>
+ <ul class=toc>
+ <li><a href="#overview"><span class=secno>8.2.1. </span>Overview of
+ the parsing model</a>
+
+ <li><a href="#the-input0"><span class=secno>8.2.2. </span>The input
+ stream</a>
+
+ <li><a href="#tokenisation"><span class=secno>8.2.3.
+ </span>Tokenisation</a>
+ <ul class=toc>
+ <li><a href="#tokenising"><span class=secno>8.2.3.1.
+ </span>Tokenising entities</a>
+ </ul>
+
+ <li><a href="#tree-construction"><span class=secno>8.2.4. </span>Tree
+ construction</a>
+ <ul class=toc>
+ <li><a href="#the-initial"><span class=secno>8.2.4.1. </span>The
+ initial phase</a>
+
+ <li><a href="#the-root0"><span class=secno>8.2.4.2. </span>The root
+ element phase</a>
+
+ <li><a href="#the-main"><span class=secno>8.2.4.3. </span>The main
+ phase</a>
+ <ul class=toc>
+ <li><a href="#the-stack"><span class=secno>8.2.4.3.1. </span>The
+ stack of open elements</a>
+
+ <li><a href="#the-list"><span class=secno>8.2.4.3.2. </span>The
+ list of active formatting elements</a>
+
+ <li><a href="#creating"><span class=secno>8.2.4.3.3.
+ </span>Creating and inserting HTML elements</a>
+
+ <li><a href="#closing"><span class=secno>8.2.4.3.4. </span>Closing
+ elements that have implied end tags</a>
+
+ <li><a href="#the-element"><span class=secno>8.2.4.3.5. </span>The
+ element pointers</a>
+
+ <li><a href="#the-insertion"><span class=secno>8.2.4.3.6.
+ </span>The insertion mode</a>
+
+ <li><a href="#how-to0"><span class=secno>8.2.4.3.7. </span>How to
+ handle tokens in the main phase</a>
+ </ul>
+
+ <li><a href="#the-trailing"><span class=secno>8.2.4.4. </span>The
+ trailing end phase</a>
+ </ul>
+
+ <li><a href="#the-end"><span class=secno>8.2.5. </span>The End</a>
+ </ul>
+
+ <li><a href="#namespaces"><span class=secno>8.3. </span>Namespaces</a>
+
+ <li><a href="#entities"><span class=secno>8.4. </span>Entities</a>
+ </ul>
+
+ <li><a href="#wysiwyg"><span class=secno>9. </span>WYSIWYG editors</a>
+ <ul class=toc>
+ <li><a href="#presentational"><span class=secno>9.1.
+ </span>Presentational markup</a>
+ <ul class=toc>
+ <li><a href="#wysiwyg0"><span class=secno>9.1.1. </span>WYSIWYG
+ signature</a>
+
+ <li><a href="#the-font"><span class=secno>9.1.2. </span>The
+ <code>font</code> element</a>
+ </ul>
+ </ul>
+
+ <li><a href="#rendering"><span class=secno>10. </span>Rendering</a>
+ <ul class=toc>
+ <li><a href="#rendering0"><span class=secno>10.1. </span>Rendering and
+ the DOM</a>
+ </ul>
+
+ <li><a href="#no"><span class=secno>11. </span>Things that you can't do
+ with this specification because they are better handled using other
+ technologies that are further described herein</a>
+ <ul class=toc>
+ <li><a href="#localisation"><span class=secno>11.1.
+ </span>Localisation</a>
+
+ <li><a href="#declarative"><span class=secno>11.2. </span>Declarative 2D
+ vector graphics and animation</a>
+
+ <li><a href="#declarative0"><span class=secno>11.3. </span>Declarative
+ 3D scenes</a>
+
+ <li><a href="#timers"><span class=secno>11.4. </span>Timers</a>
+
+ <li><a href="#events2"><span class=secno>11.5. </span>Events</a>
+ </ul>
+
+ <li class=no-num><a href="#references">References</a>
+
+ <li class=no-num><a href="#acknowledgements">Acknowledgements</a>
+ </ul>
+ <!--end-toc-->
+
+ <hr>
+
+ <h2 id=introduction><span class=secno>1. </span>Introduction</h2>
+
+ <p><em>This section is non-normative.</em>
+
+ <p>The World Wide Web's markup language has always been HTML. HTML was
+ primarily designed as a language for semantically describing scientific
+ documents, although its general design and adaptations over the years have
+ enabled it to be used to describe a number of other types of documents.
+
+ <p>The main area that has not been adequately addressed by HTML is a vague
+ subject referred to as Web Applications. This specification attempts to
+ rectify this, while at the same time updating the HTML specifications to
+ address issues raised in the past few years.
+
+ <h3 id=scope><span class=secno>1.1. </span>Scope</h3>
+
+ <p><em>This section is non-normative.</em>
+
+ <p>This specification is limited to providing a semantic-level markup
+ language and associated semantic-level scripting APIs for authoring
+ accessible pages on the Web ranging from static documents to dynamic
+ applications.
+
+ <p>The scope of this specification does not include addressing presentation
+ concerns (although default rendering rules for Web browsers are included
+ at the end of this specification).
+
+ <p>The scope of this specification does not include documenting every HTML
+ or DOM feature supported by Web browsers. Browsers support many features
+ that are considered to be very bad for accessibility or that are otherwise
+ inappropriate. For example, the <code>blink</code> element is clearly
+ presentational and authors wishing to cause text to blink should instead
+ use CSS.
+
+ <p>The scope of this specification is not to describe an entire operating
+ system. In particular, hardware configuration software, image manipulation
+ tools, and applications that users would be expected to use with high-end
+ workstations on a daily basis are out of scope. In terms of applications,
+ this specification is targeted specifically at applications that would be
+ expected to be used by users on an occasional basis, or regularly but from
+ disparate locations, with low CPU requirements. For instance, online
+ purchasing systems, searching systems, games (especially multiplayer
+ online games), public telephone books or address books, communications
+ software (e-mail clients, instant messaging clients, discussion software),
+ document editing software, etc.
+
+ <p>For sophisticated cross-platform applications, there already exist
+ several proprietary solutions (such as Mozilla's XUL and Macromedia's
+ Flash). These solutions are evolving faster than any standards process
+ could follow, and the requirements are evolving even faster. These systems
+ are also significantly more complicated to specify, and are orders of
+ magnitude more difficult to achieve interoperability with, than the
+ solutions described in this document. Platform-specific solutions for such
+ sophisticated applications (for example the MacOS X Core APIs) are even
+ further ahead.
+
+ <h4 id=relationship><span class=secno>1.1.1. </span>Relationship to HTML
+ 4.01, XHTML 1.1, DOM2 HTML</h4>
+
+ <p><em>This section is non-normative.</em>
+
+ <p>This specification represents a new version of HTML4 and XHTML1, along
+ with a new version of the associated DOM2 HTML API. Migration from HTML4
+ or XHTML1 to the format and APIs described in this specification should in
+ most cases be straightforward, as care has been taken to ensure that
+ backwards-compatibility is retained.</p>
+ <!-- XXX refs -->
+
+ <p>This specification will eventually supplant Web Forms 2.0 as well. <a
+ href="#refsWF2">[WF2]</a>
+
+ <h4 id=relationship0><span class=secno>1.1.2. </span>Relationship to XHTML2</h4>
+
+ <p><em>This section is non-normative.</em>
+
+ <p>XHTML2 <a href="#refsXHTML2">[XHTML2]</a> defines a new HTML vocabulary
+ with better features for hyperlinks, multimedia content, annotating
+ document edits, rich metadata, declarative interactive forms, and
+ describing the semantics of human literary works such as poems and
+ scientific papers.
+
+ <p>However, it lacks elements to express the semantics of many of the
+ non-document types of content often seen on the Web. For instance, forum
+ sites, auction sites, search engines, online shops, and the like, do not
+ fit the document metaphor well, and are not covered by XHTML2.
+
+ <p><em>This</em> specification aims to extend HTML so that it is also
+ suitable in these contexts.
+
+ <p>XHTML2 and this specification use different namespaces and therefore can
+ both be implemented in the same XML processor.
+
+ <h4 id=relationship1><span class=secno>1.1.3. </span>Relationship to XUL,
+ Flash, Silverlight, and other proprietary UI languages</h4>
+
+ <p><em>This section is non-normative.</em>
+
+ <p>This specification is independent of the various proprietary UI
+ languages that various vendors provide. As an open, vendor-neutral
+ language, HTML provides for a solution to the same problems without the
+ risk of vendor lock-in.
+
+ <h3 id=structure><span class=secno>1.2. </span>Structure of this
+ specification</h3>
+
+ <p><em>This section is non-normative.</em>
+
+ <p>This specification is divided into the following important sections:
+
+ <dl>
+ <dt><a href="#dom">The DOM</a>
+
+ <dd>The DOM, or Document Object Model, provides a base for the rest of the
+ specification.
+
+ <dt><a href="#semantics">The Semantics</a>
+
+ <dd>Documents are built from elements. These elements form a tree using
+ the DOM. Each element also has a predefined meaning, which is explained
+ in this section. User agent requirements for how to handle each element
+ are also given, along with rules for authors on how to use the element.
+
+ <dt><a href="#windows">Browsing Contexts</a>
+
+ <dd>HTML documents do not exist in a vacuum &mdash; this section defines
+ many of the features that affect environments that deal with multiple
+ pages, links between pages, and running scripts.
+
+ <dt>APIs
+
+ <dd><a href="#editing">The Editing APIs</a>: HTML documents can provide a
+ number of mechanisms for users to modify content, which are described in
+ this section.
+
+ <dd><a href="#comms">The Communication APIs</a>: Applications written in
+ HTML often require mechanisms to communicate with remote servers, as well
+ as communicating with other applications from different domains running
+ on the same client.
+
+ <dd><a href="#repetition">Repetition Templates</a>: A mechanism to support
+ repeating sections in forms.
+
+ <dt><a href="#syntax">The Language Syntax</a>
+
+ <dd>All of these features would be for naught if they couldn't be
+ represented in a serialised form and sent to other people, and so this
+ section defines the syntax of HTML, along with rules for how to parse
+ HTML.
+ </dl>
+
+ <p>There are also a couple of appendices, defining <a href="#wysiwyg">shims
+ for WYSIWYG editors</a>, <a href="#rendering">rendering rules</a> for Web
+ browsers, and listing <a href="#no">areas that are out of scope</a> for
+ this specification.
+
+ <h4 id=how-to><span class=secno>1.2.1. </span>How to read this
+ specification</h4>
+
+ <p>This specification should be read like all other specifications. First,
+ it should be read cover-to-cover, multiple times. Then, it should be read
+ backwards at least once. Then it should be read by picking random sections
+ from the contents list and following all the cross-references.
+
+ <h3 id=conformance><span class=secno>1.3. </span>Conformance requirements</h3>
+
+ <p>All diagrams, examples, and notes in this specification are
+ non-normative, as are all sections explicitly marked non-normative.
+ Everything else in this specification is normative.
+
+ <p>The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
+ "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in the
+ normative parts of this document are to be interpreted as described in
+ RFC2119. For readability, these words do not appear in all uppercase
+ letters in this specification. <a href="#refsRFC2119">[RFC2119]</a></p>
+ <!-- XXX but they should be marked up -->
+
+ <p>This specification describes the conformance criteria for user agents
+ (relevant to implementors) and documents (relevant to authors and
+ authoring tool implementors).
+
+ <p class=note>There is no implied relationship between document conformance
+ requirements and implementation conformance requirements. User agents are
+ not free to handle non-conformant documents as they please; the processing
+ model described in this specification applies to implementations
+ regardless of the conformity of the input documents.</p>
+ <!--XXX quite possible that
+ this is stated twice. check for whether this is a dupe. -->
+
+ <p>User agents fall into several (overlapping) categories with different
+ conformance requirements.
+
+ <dl>
+ <dt id=interactive>Web browsers and other interactive user agents
+
+ <dd>
+ <p>Web browsers that support <a href="#xhtml5">XHTML</a> must process
+ elements and attributes from the <a href="#html-namespace0">HTML
+ namespace</a> found in <a href="#xml-documents">XML documents</a> as
+ described in this specification, so that users can interact with them,
+ unless the semantics of those elements have been overridden by other
+ specifications.</p>
+
+ <p class=example>A conforming XHTML processor would, upon finding an
+ XHTML <code><a href="#script0">script</a></code> element in an XML
+ document, execute the script contained in that element. However, if the
+ element is found within an XSLT transformation sheet (assuming the UA
+ also supports XSLT), then the processor would instead treat the <code><a
+ href="#script0">script</a></code> element as an opaque element that
+ forms part of the transform.</p>
+
+ <p>Web browsers that support <a href="#html5" title=HTML5>HTML</a> must
+ process documents labelled as <code>text/html</code> as described in
+ this specification, so that users can interact with them.</p>
+
+ <dt id=non-interactive>Non-interactive presentation user agents
+
+ <dd>
+ <p>User agents that process HTML and XHTML documents purely to render
+ non-interactive versions of them must comply with the same conformance
+ criteria as Web browsers, except that they are exempt from requirements
+ regarding user interaction.</p>
+
+ <p class=note>Typical examples of non-interactive presentation user
+ agents are printers (static UAs) and overhead displays (dynamic UAs). It
+ is expected that most static non-interactive presentation user agents
+ will also opt to <a href="#non-scripted">lack scripting support</a>.</p>
+
+ <p class=example>A non-interactive but dynamic presentation UA would
+ still execute scripts, allowing forms to be dynamically submitted, and
+ so forth. However, since the concept of "focus" is irrelevant when the
+ user cannot interact with the document, the UA would not need to support
+ any of the focus-related DOM APIs.</p>
+
+ <dt><dfn id=non-scripted>User agents with no scripting support</dfn>
+
+ <dd>
+ <p>Implementations that do not support scripting (or which have their
+ scripting features <a href="#scripting1" title="scripting is
+ disabled">disabled</a>) are exempt from supporting the events and DOM
+ interfaces mentioned in this specification. For the parts of this
+ specification that are defined in terms of an events model or in terms
+ of the DOM, such user agents must still act as if events and the DOM
+ were supported.</p>
+
+ <p class=note>Scripting can form an integral part of an application. Web
+ browsers that do not support scripting, or that have scripting disabled,
+ might be unable to fully convey the author's intent.</p>
+
+ <dt>Conformance checkers
+
+ <dd id=conformance-checkers>
+ <p>Conformance checkers must verify that a document conforms to the
+ applicable conformance criteria described in this specification.
+ Conformance checkers are exempt from detecting errors that require
+ interpretation of the author's intent (for example, while a document is
+ non-conforming if the content of a <code><a
+ href="#blockquote">blockquote</a></code> element is not a quote,
+ conformance checkers do not have to check that <code><a
+ href="#blockquote">blockquote</a></code> elements only contain quoted
+ material).</p>
+
+ <p>Conformance checkers must check that the input document conforms when
+ <a href="#scripting1">scripting is disabled</a>, and should also check
+ that the input document conforms when <a href="#scripting2">scripting is
+ enabled</a>. (This is only a "SHOULD" and not a "MUST" requirement
+ because it has been proven to be impossible. <a
+ href="#refsHALTINGPROBLEM">[HALTINGPROBLEM]</a>)</p>
+ <!-- XXX
+ [Computable] On computable numbers, with an application to the
+ Entscheidungsproblem. Alan M. Turing. In Proceedings of the London
+ Mathematical Society, series 2, volume 42, pages 230-265. London
+ Mathematical Society,
+ 1937. http://www.turingarchive.org/browse.php/B/12 (referenced:
+ 2007-03-03)
+ -->
+
+ <p>The term "HTML5 validator" can be used to refer to a conformance
+ checker that itself conforms to the applicable requirements of this
+ specification.</p>
+
+ <div class=note>
+ <p>XML DTDs cannot express all the conformance requirements of this
+ specification. Therefore, a validating XML processor and a DTD cannot
+ constitute a conformance checker. Also, since neither of the two
+ authoring formats defined in this specification is an application of
+ SGML, a validating SGML system cannot constitute a conformance checker
+ either.</p>
+
+ <p>To put it another way, there are three types of conformance criteria:</p>
+
+ <ol>
+ <li>Criteria that can be expressed in a DTD.
+
+ <li>Criteria that cannot be expressed by a DTD, but can still be
+ checked by a machine.
+
+ <li>Criteria that can only be checked by a human.
+ </ol>
+
+ <p>A conformance checker must check for the first two. A simple
+ DTD-based validator only checks for the first class of errors and is
+ therefore not a conforming conformance checker according to this
+ specification.</p>
+ </div>
+
+ <dt>Data mining tools
+
+ <dd id=data-mining>
+ <p>Applications and tools that process HTML and XHTML documents for
+ reasons other than to either render the documents or check them for
+ conformance should act in accordance with the semantics of the documents
+ that they process.</p>
+
+ <p class=example>A tool that generates <span title="sections and
+ headings">document outlines</span> but increases the nesting level for
+ each paragraph and does not increase the nesting level for each section
+ would not be conforming.</p>
+
+ <dt id=editors>Authoring tools and markup generators
+
+ <dd>
+ <p>Authoring tools and markup generators must generate conforming
+ documents. Conformance criteria that apply to authors also apply to
+ authoring tools, where appropriate.</p>
+
+ <p>Authoring tools are exempt from the strict requirements of using
+ elements only for their specified purpose, but only to the extent that
+ authoring tools are not yet able to determine author intent.</p>
+
+ <p class=example>For example, it is not conforming to use an <code><a
+ href="#address">address</a></code> element for arbitrary contact
+ information; that element can only be used for marking up contact
+ information for the author of the document or section. However, since an
+ authoring tool is likely unable to determine the difference, an
+ authoring tool is exempt from that requirement.</p>
+
+ <p class=note>In terms of conformance checking, an editor is therefore
+ required to output documents that conform to the same extent that a
+ conformance checker will verify.</p>
+
+ <p>When an authoring tool is used to edit a non-conforming document, it
+ may preserve the conformance errors in sections of the document that
+ were not edited during the editing session (i.e. an editing tool is
+ allowed to round-trip erroneous content). However, an authoring tool
+ must not claim that the output is conformant if errors have been so
+ preserved.</p>
+
+ <p>Authoring tools are expected to come in two broad varieties: tools
+ that work from structure or semantic data, and tools that work on a
+ What-You-See-Is-What-You-Get media-specific editing basis (WYSIWYG).</p>
+
+ <p>The former is the preferred mechanism for tools that author HTML,
+ since the structure in the source information can be used to make
+ informed choices regarding which HTML elements and attributes are most
+ appropriate.</p>
+
+ <p>However, WYSIWYG tools are legitimate, and this specification <a
+ href="#wysiwyg1" title="WYSIWYG editors">makes certain concessions to
+ WYSIWYG editors</a>.</p>
+
+ <p>All authoring tools, whether WYSIWYG or not, should make a best effort
+ attempt at enabling users to create well-structured, semantically rich,
+ media-independent content.</p>
+ </dl>
+
+ <p>Some conformance requirements are phrased as requirements on elements,
+ attributes, methods or objects. Such requirements fall into two
+ categories; those describing content model restrictions, and those
+ describing implementation behaviour. The former category of requirements
+ are requirements on documents and authoring tools. The second category are
+ requirements on user agents.
+
+ <p>Conformance requirements phrased as algorithms or specific steps may be
+ implemented in any manner, so long as the end result is equivalent. (In
+ particular, the algorithms defined in this specification are intended to
+ be easy to follow, and not intended to be performant.)
+
+ <p id=hardwareLimitations>User agents may impose implementation-specific
+ limits on otherwise unconstrained inputs, e.g. to prevent denial of
+ service attacks, to guard against running out of memory, or to work around
+ platform-specific limitations.
+
+ <p>For compatibility with existing content and prior specifications, this
+ specification describes two authoring formats: one based on XML (referred
+ to as <dfn id=xhtml5 title=XHTML>XHTML5</dfn>), and one using a <a
+ href="#parsing">custom format</a> inspired by SGML (referred to as <dfn
+ id=html5>HTML5</dfn>). Implementations may support only one of these two
+ formats, although supporting both is encouraged.
+
+ <p id=authors-using-xhtml><a href="#xhtml5">XHTML</a> documents (<a
+ href="#xml-documents">XML documents</a> using elements from the <a
+ href="#html-namespace0">HTML namespace</a>) that use the new features
+ described in this specification and that are served over the wire (e.g. by
+ HTTP) must be sent using an XML MIME type such as
+ <code>application/xml</code> or <code>application/xhtml+xml</code> and
+ must not be served as <code>text/html</code>. <a
+ href="#refsRFC3023">[RFC3023]</a>
+
+ <p>Such XML documents may contain a <code>DOCTYPE</code> if desired, but
+ this is not required to conform to this specification.
+
+ <p class=note>According to the XML specification, XML processors are not
+ guaranteed to process the external DTD subset referenced in the DOCTYPE.
+ This means, for example, that using entities for characters in XHTML
+ documents is unsafe (except for &amp;lt;, &amp;gt;, &amp;amp;, &amp;quot;
+ and &amp;apos;). For interoperability, authors are advised to avoid
+ optional features of XML.
+
+ <p id=authors-using-html><a href="#html5" title=HTML5>HTML documents</a>,
+ if they are served over the wire (e.g. by HTTP) must be labelled with the
+ <code>text/html</code> MIME type.</p>
+ <!--
+ XXX update RFC 2854 -->
+
+ <p id=entity-references>The language in this specification assumes that the
+ user agent expands all entity references, and therefore does not include
+ entity reference nodes in the DOM. If user agents do include entity
+ reference nodes in the DOM, then user agents must handle them as if they
+ were fully expanded when implementing this specification. For example, if
+ a requirement talks about an element's child text nodes, then any text
+ nodes that are children of an entity reference that is a child of that
+ element would be used as well.</p>
+ <!-- XXX unexpandable entities? -->
+
+ <h4 id=common><span class=secno>1.3.1. </span>Common conformance
+ requirements for APIs exposed to JavaScript</h4>
+
+ <p class=big-issue>A lot of arrays/lists/<span>collection</span>s in this
+ spec assume zero-based indexes but use the term "<var
+ title="">index</var>th" liberally. We should define those to be zero-based
+ and be clearer about this.
+
+ <p>Unless otherwise specified, if a DOM attribute that is a floating point
+ number type (<code title="">float</code>) is assigned an Infinity or
+ Not-a-Number value, a <code title=big-issue>NOT_SUPPORTED_ERR</code>
+ exception must be raised.
+
+ <p>Unless otherwise specified, if a DOM attribute that is a signed numeric
+ type is assigned a negative value, a <code
+ title=big-issue>NOT_SUPPORTED_ERR</code> exception must be raised.
+
+ <p>Unless otherwise specified, if a method with an argument that is a floating
+ point number type (<code title="">float</code>) is passed an Infinity or
+ Not-a-Number value, a <code title=big-issue>NOT_SUPPORTED_ERR</code>
+ exception must be raised.
+
+ <p>Unless otherwise specified, if a method is passed fewer arguments than is
+ defined for that method in its IDL definition, a <code
+ title=big-issue>NOT_SUPPORTED_ERR</code> exception must be raised.
+
+ <p>Unless otherwise specified, if a method is passed more arguments than is
+ defined for that method in its IDL definition, the excess arguments must
+ be ignored.
+
+ <p>Unless otherwise specified, if a method is expecting, as one of its
+ arguments, as defined by its IDL definition, an object implementing a
+ particular interface <var title="">X</var>, and the argument passed is an
+ object whose [[Class]] property is neither that interface <var
+ title="">X</var>, nor the name of an interface <var title="">Y</var> where
+ this specification requires that all objects implementing interface <var
+ title="">Y</var> also implement interface <var title="">X</var>, nor the
+ name of an interface that inherits from the expected interface <var
+ title="">X</var>, then a <code title="">TYPE_MISMATCH_ERR</code> exception
+ must be raised.
+
+ <p class=big-issue>Anything else? Passing the wrong type of object, maybe?
+ Implied conversions to int/float?
+
+ <h4 id=dependencies><span class=secno>1.3.2. </span>Dependencies</h4>
+
+ <p>This specification relies on several other underlying specifications.
+
+ <dl>
+ <dt>XML
+
+ <dd>
+ <p>Implementations that support XHTML5 must support some version of XML,
+ as well as its corresponding namespaces specification, because XHTML5
+ uses an XML serialisation with namespaces. <a href="#refsXML">[XML]</a>
+ <a href="#refsXMLNAMES">[XMLNAMES]</a></p>
+
+ <dt>XML Base
+
+ <dd>
+ <p id=xmlBase>User agents must follow the rules given by XML Base to
+ resolve relative URIs in HTML and XHTML fragments. That is the mechanism
+ used in this specification for resolving relative URIs in DOM trees. <a
+ href="#refsXMLBASE">[XMLBASE]</a></p>
+
+ <p class=note>It is possible for <code
+ title=attr-xml-base>xml:base</code> attributes to be present even in
+ HTML fragments, as such attributes can be added dynamically using
+ script.</p>
+
+ <dt>DOM
+
+ <dd>
+ <p>Implementations must support some version of DOM Core and DOM Events,
+ because this specification is defined in terms of the DOM, and some of
+ the features are defined as extensions to the DOM Core interfaces. <a
+ href="#refsDOM3CORE">[DOM3CORE]</a> <a
+ href="#refsDOM3EVENTS">[DOM3EVENTS]</a></p>
+
+ <dt>ECMAScript
+
+ <dd>
+ <p>Implementations that use ECMAScript to implement the APIs defined in
+ this specification must implement them in a manner consistent with the
+ ECMAScript Bindings for DOM Specifications specification, as this
+ specification uses that specification's terminology. <a
+ href="#refsEBFD">[EBFD]</a></p>
+ </dl>
+
+ <p>This specification does not require support of any particular network
+ transport protocols, image formats, audio formats, video formats, style
+ sheet language, scripting language, or any of the DOM and WebAPI
+ specifications beyond those described above. However, the language
+ described by this specification is biased towards CSS as the styling
+ language, ECMAScript as the scripting language, and HTTP as the network
+ protocol, and several features assume that those languages and protocols
+ are in use.
+
+ <h4 id=features><span class=secno>1.3.3. </span>Features defined in other
+ specifications</h4>
+
+ <p>Some elements are defined in terms of their DOM <dfn
+ id=textcontent><code>textContent</code></dfn> attribute. This is an
+ attribute defined on the <code>Node</code> interface in DOM3 Core. <a
+ href="#refsDOM3CORE">[DOM3CORE]</a>
+
+ <p class=big-issue>Should textContent be defined differently for dir="" and
+ &lt;bdo>? Should we come up with an alternative to textContent that
+ handles those and other things, like alt=""?</p>
+ <!-- This section is currently here exclusively so that we crossref
+ to textContent. XXX also add event-click, event-change,
+ event-DOMActivate, etc, here, and just have the section be a general
+ "defined in other specifications" section -->
+
+ <p>The term <dfn id=activation0>activation behavior</dfn> is used as
+ defined in the DOM3 Events specification. <a
+ href="#refsDOM3EVENTS">[DOM3EVENTS]</a> <span class=big-issue>At the time
+ of writing, DOM3 Events hadn't yet been updated to define that
+ phrase.</span>
+
+ <p id=alternate-style-sheets>The rules for handling alternative style
+ sheets are defined in the CSS object model specification. <a
+ href="#refsCSSOM">[CSSOM]</a>
+
+ <p class=big-issue>See <a
+ href="http://dev.w3.org/cvsweb/~checkout~/csswg/cssom/Overview.html?rev=1.35&amp;content-type=text/html;%20charset=utf-8">http://dev.w3.org/cvsweb/~checkout~/csswg/cssom/Overview.html?rev=1.35&amp;content-type=text/html;%20charset=utf-8</a>
+
+ <p>Certain features are defined in terms of CSS &lt;color&gt; values. When
+ the CSS value <code title="">currentColor</code> is specified in this
+ context, the "computed value of the 'color' property" for the purposes of
+ determining the computed value of the <code title="">currentColor</code>
+ keyword is the computed value of the 'color' property on the element in
+ question. <a href="#refsCSS3COLOR">[CSS3COLOR]</a>
+
+ <p class=example>If a canvas gradient's <code
+ title=dom-canvasgradient-addColorStop><a
+ href="#addcolorstop">addColorStop()</a></code> method is called with the
+ <code title="">currentColor</code> keyword as the color, then the computed
+ value of the 'color' property on the <code><a
+ href="#canvas">canvas</a></code> element is the one that is used.
+
+ <h3 id=terminology><span class=secno>1.4. </span>Terminology</h3>
+
+ <p>This specification refers to both HTML and XML attributes and DOM
+ attributes, often in the same context. When it is not clear which is being
+ referred to, they are referred to as <dfn id=content>content
+ attributes</dfn> for HTML and XML attributes, and <dfn
+ id=dom-attributes>DOM attributes</dfn> for those from the DOM. Similarly,
+ the term "properties" is used for both ECMAScript object properties and
+ CSS properties. When these are ambiguous they are qualified as object
+ properties and CSS properties respectively.
+
+ <p id=html-namespace>To ease migration from HTML to XHTML, UAs conforming
+ to this specification will place elements in HTML in the
+ <code>http://www.w3.org/1999/xhtml</code> namespace, at least for the
+ purposes of the DOM and CSS. The term "<dfn id=elements1>elements in the
+ HTML namespace</dfn>", or "<dfn id=html-elements>HTML elements</dfn>" for
+ short, when used in this specification, thus refers to both HTML and XHTML
+ elements.
+
+ <p>Unless otherwise stated, all elements defined or mentioned in this
+ specification are in the <code>http://www.w3.org/1999/xhtml</code>
+ namespace, and all attributes defined or mentioned in this specification
+ have no namespace (they are in the per-element partition).
+
+ <p>The term <a href="#html-">HTML documents</a> is sometimes used in
+ contrast with <a href="#xml-documents">XML documents</a> to mean
+ specifically documents that were parsed using an <a href="#html-0">HTML
+ parser</a> (as opposed to using an XML parser or created purely through
+ the DOM).
+
+ <p>Generally, when the specification states that a feature applies to HTML
+ or XHTML, it also includes the other. When a feature specifically only
+ applies to one of the two languages, it is called out by explicitly
+ stating that it does not apply to the other format, as in "for HTML, ...
+ (this does not apply to XHTML)".
+
+ <p>This specification uses the term <em>document</em> to refer to any use
+ of HTML, ranging from short static documents to long essays or reports
+ with rich multimedia, as well as to fully-fledged interactive
+ applications.
+
+ <p>For readability, the term URI is used to refer to both ASCII URIs and
+ Unicode IRIs, as those terms are defined by <a
+ href="#refsRFC3986">[RFC3986]</a> and <a href="#refsRFC3987">[RFC3987]</a>
+ respectively. On the rare occasions where IRIs are not allowed but ASCII
+ URIs are, this is called out explicitly.
+
+ <p>T