From 7b30a5520cfb56e651f0eb4da85a3e07747da7dc Mon Sep 17 00:00:00 2001
From: John Mark Bell <jmb@netsurf-browser.org>
Date: Sat, 23 Jun 2007 22:40:25 +0000
Subject: Import hubbub -- an HTML parsing library. Plenty of work still to do
 (like tree generation ;)

svn path=/trunk/hubbub/; revision=3359
---
 test/data/tokeniser2/INDEX                  |   7 ++
 test/data/tokeniser2/contentModelFlags.test |  36 ++++++++
 test/data/tokeniser2/test1.test             | 136 ++++++++++++++++++++++++++++
 test/data/tokeniser2/test2.test             | 108 ++++++++++++++++++++++
 4 files changed, 287 insertions(+)
 create mode 100644 test/data/tokeniser2/INDEX
 create mode 100644 test/data/tokeniser2/contentModelFlags.test
 create mode 100644 test/data/tokeniser2/test1.test
 create mode 100644 test/data/tokeniser2/test2.test

(limited to 'test/data/tokeniser2')
diff --git a/test/data/tokeniser2/INDEX b/test/data/tokeniser2/INDEX
new file mode 100644
index 0000000..8539aeb
--- /dev/null
+++ b/test/data/tokeniser2/INDEX
@@ -0,0 +1,7 @@
+# Index file for tokeniser tests
+#
+# Test			Description
+
+test1.test		html5lib tests (part 1)
+test2.test		html5lib tests (part 2)
+contentModelFlags.test	html5lib content model tests
\ No newline at end of file
diff --git a/test/data/tokeniser2/contentModelFlags.test b/test/data/tokeniser2/contentModelFlags.test
new file mode 100644
index 0000000..84d41fc
--- /dev/null
+++ b/test/data/tokeniser2/contentModelFlags.test
@@ -0,0 +1,36 @@
+{"tests": [
+
+{"description":"PLAINTEXT content model flag",
+"contentModelFlags":["PLAINTEXT"],
+"input":"<head>&body;",
+"output":[["Character", "<head>&body;"]]},
+
+{"description":"End tag closing RCDATA or CDATA",
+"contentModelFlags":["RCDATA", "CDATA"],
+"lastStartTag":"bar",
+"input":"foo</bar>",
+"output":[["Character", "foo"], ["EndTag", "bar"]]},
+
+{"description":"End tag with incorrect name in RCDATA or CDATA",
+"contentModelFlags":["RCDATA", "CDATA"],
+"lastStartTag":"baz",
+"input":"</foo>bar</baz>",
+"output":["ParseError", ["Character", "</foo>bar"], ["EndTag", "baz"]]},
+
+{"description":"End tag closing RCDATA or CDATA, switching back to PCDATA",
+"contentModelFlags":["RCDATA", "CDATA"],
+"lastStartTag":"bar",
+"input":"foo</bar></baz>",
+"output":[["Character", "foo"], ["EndTag", "bar"], ["EndTag", "baz"]]},
+
+{"description":"CDATA w/ something looking like an entity",
+"contentModelFlags":["CDATA"],
+"input":"&foo;",
+"output":[["Character", "&foo;"]]},
+
+{"description":"RCDATA w/ an entity",
+"contentModelFlags":["RCDATA"],
+"input":"&lt;",
+"output":[["Character", "<"]]}
+
+]}
diff --git a/test/data/tokeniser2/test1.test b/test/data/tokeniser2/test1.test
new file mode 100644
index 0000000..c12ff5a
--- /dev/null
+++ b/test/data/tokeniser2/test1.test
@@ -0,0 +1,136 @@
+{"tests": [
+
+{"description":"Correct Doctype lowercase",
+"input":"<!DOCTYPE html>",
+"output":[["DOCTYPE", "HTML", false]]},
+
+{"description":"Correct Doctype uppercase",
+"input":"<!DOCTYPE HTML>", 
+"output":[["DOCTYPE", "HTML", false]]},
+
+{"description":"Correct Doctype mixed case",
+"input":"<!DOCTYPE HtMl>", 
+"output":[["DOCTYPE", "HTML", false]]},
+
+{"description":"Truncated doctype start",
+"input":"<!DOC>", 
+"output":["ParseError", ["Comment", "DOC"]]},
+
+{"description":"Doctype in error",
+"input":"<!DOCTYPE foo>", 
+"output":[["DOCTYPE", "FOO", true]]},
+
+{"description":"Single Start Tag",
+"input":"<h>",
+"output":[["StartTag", "h", {}]]},
+
+{"description":"Empty end tag",
+"input":"</>",
+"output":["ParseError"]},
+
+{"description":"Empty start tag",
+"input":"<>",
+"output":["ParseError", ["Character", "<>"]]},
+
+{"description":"Start Tag w/attribute",
+"input":"<h a='b'>",
+"output":[["StartTag", "h", {"a":"b"}]]},
+
+{"description":"Start Tag w/attribute no quotes",
+"input":"<h a=b>",
+"output":[["StartTag", "h", {"a":"b"}]]},
+
+{"description":"Start/End Tag",
+"input":"<h></h>",
+"output":[["StartTag", "h", {}], ["EndTag", "h"]]},
+
+{"description":"Two unclosed start tags",
+"input":"<p>One<p>Two",
+"output":[["StartTag", "p", {}], ["Character", "One"], ["StartTag", "p", {}], ["Character", "Two"]]},
+
+{"description":"End Tag w/attribute",
+"input":"<h></h a='b'>",
+"output":[["StartTag", "h", {}], "ParseError", ["EndTag", "h"]]},
+
+{"description":"Multiple atts",
+"input":"<h a='b' c='d'>",
+"output":[["StartTag", "h", {"a":"b", "c":"d"}]]},
+
+{"description":"Multiple atts no space",
+"input":"<h a='b'c='d'>",
+"output":[["StartTag", "h", {"a":"b", "c":"d"}]]},
+
+{"description":"Repeated attr",
+ "input":"<h a='b' a='d'>",
+ "output":["ParseError", ["StartTag", "h", {"a":"b"}]]},
+
+{"description":"Simple comment",
+ "input":"<!--comment-->",
+ "output":[["Comment", "comment"]]},
+
+{"description":"Comment, Central dash no space",
+ "input":"<!----->",
+ "output":["ParseError", ["Comment", "-"]]},
+
+{"description":"Comment, two central dashes",
+"input":"<!-- --comment -->",
+"output":["ParseError", ["Comment", " --comment "]]},
+
+{"description":"Unfinished comment",
+"input":"<!--comment",
+"output":["ParseError", ["Comment", "comment"]]},
+
+{"description":"Start of a comment",
+"input":"<!-",
+"output":["ParseError", ["Comment", "-"]]},
+
+{"description":"Ampersand only",
+"input":"&",
+"output":["ParseError", ["Character", "&"]]},
+
+{"description":"Unfinished entity",
+"input":"&f",
+"output":["ParseError", ["Character", "&"], ["Character", "f"]]},
+
+{"description":"Ampersand, number sign",
+"input":"&#",
+"output":["ParseError", ["Character", "&"], ["Character", "#"]]},
+
+{"description":"Unfinished numeric entity",
+"input":"&#x",
+"output":["ParseError", ["Character", "&#x"]]},
+
+{"description":"Entity with trailing semicolon (1)",
+"input":"I'm &not;it",
+"output":[["Character","I'm ¬it"]]},
+
+{"description":"Entity with trailing semicolon (2)",
+"input":"I'm &notin;",
+"output":[["Character","I'm ∉"]]},
+
+{"description":"Entity without trailing semicolon (1)",
+"input":"I'm &notit",
+"output":[["Character","I'm "], "ParseError", ["Character", "¬"],
+["Character", "it"]]},
+
+{"description":"Entity without trailing semicolon (2)",
+"input":"I'm &notin",
+"output":[["Character","I'm "], "ParseError", ["Character", "∉"]]},
+
+{"description":"Partial entity match at end of file",
+"input":"I'm &no",
+"output":[["Character","I'm "], "ParseError", ["Character", "&no"]]},
+
+{"description":"ASCII decimal entity",
+"input":"&#0036;",
+"output":[["Character","$"]]},
+
+{"description":"ASCII hexadecimal entity",
+"input":"&#x3f;",
+"output":[["Character","?"]]},
+
+{"description":"Hexadecimal entity in attribute",
+"input":"<h a='&#x3f;'></h>",
+"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]}
+
+]}
diff --git a/test/data/tokeniser2/test2.test b/test/data/tokeniser2/test2.test
new file mode 100644
index 0000000..32c0f99
--- /dev/null
+++ b/test/data/tokeniser2/test2.test
@@ -0,0 +1,108 @@
+{"tests": [
+
+{"description":"Doctype without a name",
+"input":"<!DOCTYPE>",
+"output":["ParseError", "ParseError", ["DOCTYPE", "", true]]},
+
+{"description":"Correct doctype without a space before name",
+"input":"<!DOCTYPEhtml>",
+"output":["ParseError", ["DOCTYPE", "HTML", false]]},
+
+{"description":"Incorrect doctype without a space before name",
+"input":"<!DOCTYPEfoo>",
+"output":["ParseError", ["DOCTYPE", "FOO", true]]},
+
+{"description":"Bogus doctype",
+"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\">",
+"output":["ParseError", ["DOCTYPE", "HTML", true]]},
+
+{"description":"Incomplete doctype",
+"input":"<!DOCTYPE html ",
+"output":["ParseError", ["DOCTYPE", "HTML", true]]},
+
+{"description":"Numeric entity representing the NUL character",
+"input":"&#0000;",
+"output":[["Character", "\uFFFD"]]},
+
+{"description":"Hexadecimal entity representing the NUL character",
+"input":"&#x0000;",
+"output":[["Character", "\uFFFD"]]},
+
+{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
+"input":"&#2225222;",
+"output":["ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
+"input":"&#x1010FFFF;",
+"output":["ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"Numeric entity representing a Windows-1252 'codepoint'",
+"input":"&#137;",
+"output":[["Character", "\u2030"]]},
+
+{"description":"Hexadecimal entity representing a Windows-1252 'codepoint'",
+"input":"&#x89;",
+"output":[["Character", "\u2030"]]},
+
+{"description":"Hexadecimal entity with mixed uppercase and lowercase",
+"input":"&#xaBcD;",
+"output":[["Character", "\uABCD"]]},
+
+{"description":"Entity without a name",
+"input":"&;",
+"output":["ParseError", ["Character", "&;"]]},
+
+{"description":"Unescaped ampersand in attribute value",
+"input":"<h a='&'>",
+"output":["ParseError", ["StartTag", "h", { "a":"&" }]]},
+
+{"description":"StartTag containing <",
+"input":"<a<b>",
+"output":["ParseError", ["StartTag", "a", { }], ["StartTag", "b", { }]]},
+
+{"description":"Non-void element containing trailing /",
+"input":"<h/>",
+"output":["ParseError", ["StartTag", "h", { }]]},
+
+{"description":"Void element with permitted slash",
+"input":"<br/>",
+"output":[["StartTag", "br", { }]]},
+
+{"description":"StartTag containing /",
+"input":"<h/a='b'>",
+"output":["ParseError", ["StartTag", "h", { "a":"b" }]]},
+
+{"description":"Double-quoted attribute value",
+"input":"<h a=\"b\">",
+"output":[["StartTag", "h", { "a":"b" }]]},
+
+{"description":"Unescaped </",
+"input":"</",
+"output":["ParseError", ["Character", "</"]]},
+
+{"description":"Illegal end tag name",
+"input":"</1>",
+"output":["ParseError", ["Comment", "1"]]},
+
+{"description":"Simili processing instruction",
+"input":"<?namespace>",
+"output":["ParseError", ["Comment", "?namespace"]]},
+
+{"description":"A bogus comment stops at >, even if preceeded by two dashes",
+"input":"<?foo-->",
+"output":["ParseError", ["Comment", "?foo--"]]},
+
+{"description":"Unescaped <",
+"input":"foo < bar",
+"output":[["Character", "foo "], "ParseError", ["Character", "< bar"]]},
+
+/* jmb -- libjson uses C strings internally, thus the input gets truncated before the
+ * data is fed to the input stream (and thus the tokeniser)
+{"description":"Null Byte Replacement",
+"input":"\u0000",
+"output":[["Character", "\ufffd"]]}
+*/
+
+]}
+
+
-- 
cgit v1.2.3