summaryrefslogtreecommitdiff
path: root/test/data/tokeniser2
diff options
context:
space:
mode:
Diffstat (limited to 'test/data/tokeniser2')
-rw-r--r-- test/data/tokeniser2/INDEX | 7
-rw-r--r-- test/data/tokeniser2/contentModelFlags.test | 36
-rw-r--r-- test/data/tokeniser2/test1.test | 136
-rw-r--r-- test/data/tokeniser2/test2.test | 108
4 files changed, 287 insertions, 0 deletions
diff --git a/test/data/tokeniser2/INDEX b/test/data/tokeniser2/INDEX
new file mode 100644
index 0000000..8539aeb
--- /dev/null
+++ b/test/data/tokeniser2/INDEX
@@ -0,0 +1,7 @@
+# Index file for tokeniser tests
+#
+# Test Description
+
+test1.test html5lib tests (part 1)
+test2.test html5lib tests (part 2)
+contentModelFlags.test html5lib content model tests
\ No newline at end of file
diff --git a/test/data/tokeniser2/contentModelFlags.test b/test/data/tokeniser2/contentModelFlags.test
new file mode 100644
index 0000000..84d41fc
--- /dev/null
+++ b/test/data/tokeniser2/contentModelFlags.test
@@ -0,0 +1,36 @@
+{"tests": [
+
+{"description":"PLAINTEXT content model flag",
+"contentModelFlags":["PLAINTEXT"],
+"input":"<head>&body;",
+"output":[["Character", "<head>&body;"]]},
+
+{"description":"End tag closing RCDATA or CDATA",
+"contentModelFlags":["RCDATA", "CDATA"],
+"lastStartTag":"bar",
+"input":"foo</bar>",
+"output":[["Character", "foo"], ["EndTag", "bar"]]},
+
+{"description":"End tag with incorrect name in RCDATA or CDATA",
+"contentModelFlags":["RCDATA", "CDATA"],
+"lastStartTag":"baz",
+"input":"</foo>bar</baz>",
+"output":["ParseError", ["Character", "</foo>bar"], ["EndTag", "baz"]]},
+
+{"description":"End tag closing RCDATA or CDATA, switching back to PCDATA",
+"contentModelFlags":["RCDATA", "CDATA"],
+"lastStartTag":"bar",
+"input":"foo</bar></baz>",
+"output":[["Character", "foo"], ["EndTag", "bar"], ["EndTag", "baz"]]},
+
+{"description":"CDATA w/ something looking like an entity",
+"contentModelFlags":["CDATA"],
+"input":"&foo;",
+"output":[["Character", "&foo;"]]},
+
+{"description":"RCDATA w/ an entity",
+"contentModelFlags":["RCDATA"],
+"input":"&lt;",
+"output":[["Character", "<"]]}
+
+]}
diff --git a/test/data/tokeniser2/test1.test b/test/data/tokeniser2/test1.test
new file mode 100644
index 0000000..c12ff5a
--- /dev/null
+++ b/test/data/tokeniser2/test1.test
@@ -0,0 +1,136 @@
+{"tests": [
+
+{"description":"Correct Doctype lowercase",
+"input":"<!DOCTYPE html>",
+"output":[["DOCTYPE", "HTML", false]]},
+
+{"description":"Correct Doctype uppercase",
+"input":"<!DOCTYPE HTML>",
+"output":[["DOCTYPE", "HTML", false]]},
+
+{"description":"Correct Doctype mixed case",
+"input":"<!DOCTYPE HtMl>",
+"output":[["DOCTYPE", "HTML", false]]},
+
+{"description":"Truncated doctype start",
+"input":"<!DOC>",
+"output":["ParseError", ["Comment", "DOC"]]},
+
+{"description":"Doctype in error",
+"input":"<!DOCTYPE foo>",
+"output":[["DOCTYPE", "FOO", true]]},
+
+{"description":"Single Start Tag",
+"input":"<h>",
+"output":[["StartTag", "h", {}]]},
+
+{"description":"Empty end tag",
+"input":"</>",
+"output":["ParseError"]},
+
+{"description":"Empty start tag",
+"input":"<>",
+"output":["ParseError", ["Character", "<>"]]},
+
+{"description":"Start Tag w/attribute",
+"input":"<h a='b'>",
+"output":[["StartTag", "h", {"a":"b"}]]},
+
+{"description":"Start Tag w/attribute no quotes",
+"input":"<h a=b>",
+"output":[["StartTag", "h", {"a":"b"}]]},
+
+{"description":"Start/End Tag",
+"input":"<h></h>",
+"output":[["StartTag", "h", {}], ["EndTag", "h"]]},
+
+{"description":"Two unclosed start tags",
+"input":"<p>One<p>Two",
+"output":[["StartTag", "p", {}], ["Character", "One"], ["StartTag", "p", {}], ["Character", "Two"]]},
+
+{"description":"End Tag w/attribute",
+"input":"<h></h a='b'>",
+"output":[["StartTag", "h", {}], "ParseError", ["EndTag", "h"]]},
+
+{"description":"Multiple atts",
+"input":"<h a='b' c='d'>",
+"output":[["StartTag", "h", {"a":"b", "c":"d"}]]},
+
+{"description":"Multiple atts no space",
+"input":"<h a='b'c='d'>",
+"output":[["StartTag", "h", {"a":"b", "c":"d"}]]},
+
+{"description":"Repeated attr",
+ "input":"<h a='b' a='d'>",
+ "output":["ParseError", ["StartTag", "h", {"a":"b"}]]},
+
+{"description":"Simple comment",
+ "input":"<!--comment-->",
+ "output":[["Comment", "comment"]]},
+
+{"description":"Comment, Central dash no space",
+ "input":"<!----->",
+ "output":["ParseError", ["Comment", "-"]]},
+
+{"description":"Comment, two central dashes",
+"input":"<!-- --comment -->",
+"output":["ParseError", ["Comment", " --comment "]]},
+
+{"description":"Unfinished comment",
+"input":"<!--comment",
+"output":["ParseError", ["Comment", "comment"]]},
+
+{"description":"Start of a comment",
+"input":"<!-",
+"output":["ParseError", ["Comment", "-"]]},
+
+{"description":"Ampersand only",
+"input":"&",
+"output":["ParseError", ["Character", "&"]]},
+
+{"description":"Unfinished entity",
+"input":"&f",
+"output":["ParseError", ["Character", "&"], ["Character", "f"]]},
+
+{"description":"Ampersand, number sign",
+"input":"&#",
+"output":["ParseError", ["Character", "&"], ["Character", "#"]]},
+
+{"description":"Unfinished numeric entity",
+"input":"&#x",
+"output":["ParseError", ["Character", "&#x"]]},
+
+{"description":"Entity with trailing semicolon (1)",
+"input":"I'm &not;it",
+"output":[["Character","I'm ¬it"]]},
+
+{"description":"Entity with trailing semicolon (2)",
+"input":"I'm &notin;",
+"output":[["Character","I'm ∉"]]},
+
+{"description":"Entity without trailing semicolon (1)",
+"input":"I'm &notit",
+"output":[["Character","I'm "], "ParseError", ["Character", "¬"],
+["Character", "it"]]},
+
+{"description":"Entity without trailing semicolon (2)",
+"input":"I'm &notin",
+"output":[["Character","I'm "], "ParseError", ["Character", "∉"]]},
+
+{"description":"Partial entity match at end of file",
+"input":"I'm &no",
+"output":[["Character","I'm "], "ParseError", ["Character", "&no"]]},
+
+{"description":"ASCII decimal entity",
+"input":"&#0036;",
+"output":[["Character","$"]]},
+
+{"description":"ASCII hexadecimal entity",
+"input":"&#x3f;",
+"output":[["Character","?"]]},
+
+{"description":"Hexadecimal entity in attribute",
+"input":"<h a='&#x3f;'></h>",
+"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]}
+
+]}
diff --git a/test/data/tokeniser2/test2.test b/test/data/tokeniser2/test2.test
new file mode 100644
index 0000000..32c0f99
--- /dev/null
+++ b/test/data/tokeniser2/test2.test
@@ -0,0 +1,108 @@
+{"tests": [
+
+{"description":"Doctype without a name",
+"input":"<!DOCTYPE>",
+"output":["ParseError", "ParseError", ["DOCTYPE", "", true]]},
+
+{"description":"Correct doctype without a space before name",
+"input":"<!DOCTYPEhtml>",
+"output":["ParseError", ["DOCTYPE", "HTML", false]]},
+
+{"description":"Incorrect doctype without a space before name",
+"input":"<!DOCTYPEfoo>",
+"output":["ParseError", ["DOCTYPE", "FOO", true]]},
+
+{"description":"Bogus doctype",
+"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\">",
+"output":["ParseError", ["DOCTYPE", "HTML", true]]},
+
+{"description":"Incomplete doctype",
+"input":"<!DOCTYPE html ",
+"output":["ParseError", ["DOCTYPE", "HTML", true]]},
+
+{"description":"Numeric entity representing the NUL character",
+"input":"&#0000;",
+"output":[["Character", "\uFFFD"]]},
+
+{"description":"Hexadecimal entity representing the NUL character",
+"input":"&#x0000;",
+"output":[["Character", "\uFFFD"]]},
+
+{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)",
+"input":"&#2225222;",
+"output":["ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)",
+"input":"&#x1010FFFF;",
+"output":["ParseError", ["Character", "\uFFFD"]]},
+
+{"description":"Numeric entity representing a Windows-1252 'codepoint'",
+"input":"&#137;",
+"output":[["Character", "\u2030"]]},
+
+{"description":"Hexadecimal entity representing a Windows-1252 'codepoint'",
+"input":"&#x89;",
+"output":[["Character", "\u2030"]]},
+
+{"description":"Hexadecimal entity with mixed uppercase and lowercase",
+"input":"&#xaBcD;",
+"output":[["Character", "\uABCD"]]},
+
+{"description":"Entity without a name",
+"input":"&;",
+"output":["ParseError", ["Character", "&;"]]},
+
+{"description":"Unescaped ampersand in attribute value",
+"input":"<h a='&'>",
+"output":["ParseError", ["StartTag", "h", { "a":"&" }]]},
+
+{"description":"StartTag containing <",
+"input":"<a<b>",
+"output":["ParseError", ["StartTag", "a", { }], ["StartTag", "b", { }]]},
+
+{"description":"Non-void element containing trailing /",
+"input":"<h/>",
+"output":["ParseError", ["StartTag", "h", { }]]},
+
+{"description":"Void element with permitted slash",
+"input":"<br/>",
+"output":[["StartTag", "br", { }]]},
+
+{"description":"StartTag containing /",
+"input":"<h/a='b'>",
+"output":["ParseError", ["StartTag", "h", { "a":"b" }]]},
+
+{"description":"Double-quoted attribute value",
+"input":"<h a=\"b\">",
+"output":[["StartTag", "h", { "a":"b" }]]},
+
+{"description":"Unescaped </",
+"input":"</",
+"output":["ParseError", ["Character", "</"]]},
+
+{"description":"Illegal end tag name",
+"input":"</1>",
+"output":["ParseError", ["Comment", "1"]]},
+
+{"description":"Simili processing instruction",
+"input":"<?namespace>",
+"output":["ParseError", ["Comment", "?namespace"]]},
+
+{"description":"A bogus comment stops at >, even if preceded by two dashes",
+"input":"<?foo-->",
+"output":["ParseError", ["Comment", "?foo--"]]},
+
+{"description":"Unescaped <",
+"input":"foo < bar",
+"output":[["Character", "foo "], "ParseError", ["Character", "< bar"]]},
+
+/* jmb -- disabled: libjson uses C strings internally, so the NUL character in the
+ * input truncates it before the data is fed to the input stream (and thus the tokeniser)
+{"description":"Null Byte Replacement",
+"input":"\u0000",
+"output":[["Character", "\ufffd"]]}
+*/
+
+]}
+
+