diff options
Diffstat (limited to 'test/data/tokeniser2')
-rw-r--r-- | test/data/tokeniser2/INDEX | 7 | ||||
-rw-r--r-- | test/data/tokeniser2/contentModelFlags.test | 36 | ||||
-rw-r--r-- | test/data/tokeniser2/test1.test | 136 | ||||
-rw-r--r-- | test/data/tokeniser2/test2.test | 108 |
4 files changed, 287 insertions, 0 deletions
diff --git a/test/data/tokeniser2/INDEX b/test/data/tokeniser2/INDEX new file mode 100644 index 0000000..8539aeb --- /dev/null +++ b/test/data/tokeniser2/INDEX @@ -0,0 +1,7 @@ +# Index file for tokeniser tests +# +# Test Description + +test1.test html5lib tests (part 1) +test2.test html5lib tests (part 2) +contentModelFlags.test html5lib content model tests
\ No newline at end of file diff --git a/test/data/tokeniser2/contentModelFlags.test b/test/data/tokeniser2/contentModelFlags.test new file mode 100644 index 0000000..84d41fc --- /dev/null +++ b/test/data/tokeniser2/contentModelFlags.test @@ -0,0 +1,36 @@ +{"tests": [ + +{"description":"PLAINTEXT content model flag", +"contentModelFlags":["PLAINTEXT"], +"input":"<head>&body;", +"output":[["Character", "<head>&body;"]]}, + +{"description":"End tag closing RCDATA or CDATA", +"contentModelFlags":["RCDATA", "CDATA"], +"lastStartTag":"bar", +"input":"foo</bar>", +"output":[["Character", "foo"], ["EndTag", "bar"]]}, + +{"description":"End tag with incorrect name in RCDATA or CDATA", +"contentModelFlags":["RCDATA", "CDATA"], +"lastStartTag":"baz", +"input":"</foo>bar</baz>", +"output":["ParseError", ["Character", "</foo>bar"], ["EndTag", "baz"]]}, + +{"description":"End tag closing RCDATA or CDATA, switching back to PCDATA", +"contentModelFlags":["RCDATA", "CDATA"], +"lastStartTag":"bar", +"input":"foo</bar></baz>", +"output":[["Character", "foo"], ["EndTag", "bar"], ["EndTag", "baz"]]}, + +{"description":"CDATA w/ something looking like an entity", +"contentModelFlags":["CDATA"], +"input":"&foo;", +"output":[["Character", "&foo;"]]}, + +{"description":"RCDATA w/ an entity", +"contentModelFlags":["RCDATA"], +"input":"<", +"output":[["Character", "<"]]} + +]} diff --git a/test/data/tokeniser2/test1.test b/test/data/tokeniser2/test1.test new file mode 100644 index 0000000..c12ff5a --- /dev/null +++ b/test/data/tokeniser2/test1.test @@ -0,0 +1,136 @@ +{"tests": [ + +{"description":"Correct Doctype lowercase", +"input":"<!DOCTYPE html>", +"output":[["DOCTYPE", "HTML", false]]}, + +{"description":"Correct Doctype uppercase", +"input":"<!DOCTYPE HTML>", +"output":[["DOCTYPE", "HTML", false]]}, + +{"description":"Correct Doctype mixed case", +"input":"<!DOCTYPE HtMl>", +"output":[["DOCTYPE", "HTML", false]]}, + +{"description":"Truncated doctype start", +"input":"<!DOC>", +"output":["ParseError", ["Comment", "DOC"]]}, + +{"description":"Doctype in error", +"input":"<!DOCTYPE foo>", +"output":[["DOCTYPE", "FOO", true]]}, + +{"description":"Single Start Tag", +"input":"<h>", +"output":[["StartTag", "h", {}]]}, + +{"description":"Empty end tag", +"input":"</>", +"output":["ParseError"]}, + +{"description":"Empty start tag", +"input":"<>", +"output":["ParseError", ["Character", "<>"]]}, + +{"description":"Start Tag w/attribute", +"input":"<h a='b'>", +"output":[["StartTag", "h", {"a":"b"}]]}, + +{"description":"Start Tag w/attribute no quotes", +"input":"<h a=b>", +"output":[["StartTag", "h", {"a":"b"}]]}, + +{"description":"Start/End Tag", +"input":"<h></h>", +"output":[["StartTag", "h", {}], ["EndTag", "h"]]}, + +{"description":"Two unclosed start tags", +"input":"<p>One<p>Two", +"output":[["StartTag", "p", {}], ["Character", "One"], ["StartTag", "p", {}], ["Character", "Two"]]}, + +{"description":"End Tag w/attribute", +"input":"<h></h a='b'>", +"output":[["StartTag", "h", {}], "ParseError", ["EndTag", "h"]]}, + +{"description":"Multiple atts", +"input":"<h a='b' c='d'>", +"output":[["StartTag", "h", {"a":"b", "c":"d"}]]}, + +{"description":"Multiple atts no space", +"input":"<h a='b'c='d'>", +"output":[["StartTag", "h", {"a":"b", "c":"d"}]]}, + +{"description":"Repeated attr", + "input":"<h a='b' a='d'>", + "output":["ParseError", ["StartTag", "h", {"a":"b"}]]}, + +{"description":"Simple comment", + "input":"<!--comment-->", + "output":[["Comment", "comment"]]}, + +{"description":"Comment, Central dash no space", + "input":"<!----->", + "output":["ParseError", ["Comment", "-"]]}, + +{"description":"Comment, two central dashes", +"input":"<!-- --comment -->", +"output":["ParseError", ["Comment", " --comment "]]}, + +{"description":"Unfinished comment", +"input":"<!--comment", +"output":["ParseError", ["Comment", "comment"]]}, + +{"description":"Start of a comment", +"input":"<!-", +"output":["ParseError", ["Comment", "-"]]}, + +{"description":"Ampersand only", +"input":"&", +"output":["ParseError", ["Character", "&"]]}, + +{"description":"Unfinished entity", +"input":"&f", +"output":["ParseError", ["Character", "&"], ["Character", "f"]]}, + +{"description":"Ampersand, number sign", +"input":"&#", +"output":["ParseError", ["Character", "&"], ["Character", "#"]]}, + +{"description":"Unfinished numeric entity", +"input":"&#x", +"output":["ParseError", ["Character", "&#x"]]}, + +{"description":"Entity with trailing semicolon (1)", +"input":"I'm ¬it", +"output":[["Character","I'm ¬it"]]}, + +{"description":"Entity with trailing semicolon (2)", +"input":"I'm ∉", +"output":[["Character","I'm ∉"]]}, + +{"description":"Entity without trailing semicolon (1)", +"input":"I'm ¬it", +"output":[["Character","I'm "], "ParseError", ["Character", "¬"], +["Character", "it"]]}, + +{"description":"Entity without trailing semicolon (2)", +"input":"I'm ¬in", +"output":[["Character","I'm "], "ParseError", ["Character", "∉"]]}, + +{"description":"Partial entity match at end of file", +"input":"I'm &no", +"output":[["Character","I'm "], "ParseError", ["Character", "&no"]]}, + +{"description":"ASCII decimal entity", +"input":"$", +"output":[["Character","$"]]}, + +{"description":"ASCII hexadecimal entity", +"input":"?", +"output":[["Character","?"]]}, + +{"description":"Hexadecimal entity in attribute", +"input":"<h a='?'></h>", +"output":[["StartTag", "h", {"a":"?"}], ["EndTag", "h"]]} + +]} diff --git a/test/data/tokeniser2/test2.test b/test/data/tokeniser2/test2.test new file mode 100644 index 0000000..32c0f99 --- /dev/null +++ b/test/data/tokeniser2/test2.test @@ -0,0 +1,108 @@ +{"tests": [ + +{"description":"Doctype without a name", +"input":"<!DOCTYPE>", +"output":["ParseError", "ParseError", ["DOCTYPE", "", true]]}, + +{"description":"Correct doctype without a space before name", +"input":"<!DOCTYPEhtml>", +"output":["ParseError", ["DOCTYPE", "HTML", false]]}, + +{"description":"Incorrect doctype without a space before name", +"input":"<!DOCTYPEfoo>", +"output":["ParseError", ["DOCTYPE", "FOO", true]]}, + +{"description":"Bogus doctype", +"input":"<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML Transitional 4.01//EN\">", +"output":["ParseError", ["DOCTYPE", "HTML", true]]}, + +{"description":"Incomplete doctype", +"input":"<!DOCTYPE html ", +"output":["ParseError", ["DOCTYPE", "HTML", true]]}, + +{"description":"Numeric entity representing the NUL character", +"input":"�", +"output":[["Character", "\uFFFD"]]}, + +{"description":"Hexadecimal entity representing the NUL character", +"input":"�", +"output":[["Character", "\uFFFD"]]}, + +{"description":"Numeric entity representing a codepoint after 1114111 (U+10FFFF)", +"input":"�", +"output":["ParseError", ["Character", "\uFFFD"]]}, + +{"description":"Hexadecimal entity representing a codepoint after 1114111 (U+10FFFF)", +"input":"�", +"output":["ParseError", ["Character", "\uFFFD"]]}, + +{"description":"Numeric entity representing a Windows-1252 'codepoint'", +"input":"‰", +"output":[["Character", "\u2030"]]}, + +{"description":"Hexadecimal entity representing a Windows-1252 'codepoint'", +"input":"‰", +"output":[["Character", "\u2030"]]}, + +{"description":"Hexadecimal entity with mixed uppercase and lowercase", +"input":"ꯍ", +"output":[["Character", "\uABCD"]]}, + +{"description":"Entity without a name", +"input":"&;", +"output":["ParseError", ["Character", "&;"]]}, + +{"description":"Unescaped ampersand in attribute value", +"input":"<h a='&'>", +"output":["ParseError", ["StartTag", "h", { "a":"&" }]]}, + +{"description":"StartTag containing <", +"input":"<a<b>", +"output":["ParseError", ["StartTag", "a", { }], ["StartTag", "b", { }]]}, + +{"description":"Non-void element containing trailing /", +"input":"<h/>", +"output":["ParseError", ["StartTag", "h", { }]]}, + +{"description":"Void element with permitted slash", +"input":"<br/>", +"output":[["StartTag", "br", { }]]}, + +{"description":"StartTag containing /", +"input":"<h/a='b'>", +"output":["ParseError", ["StartTag", "h", { "a":"b" }]]}, + +{"description":"Double-quoted attribute value", +"input":"<h a=\"b\">", +"output":[["StartTag", "h", { "a":"b" }]]}, + +{"description":"Unescaped </", +"input":"</", +"output":["ParseError", ["Character", "</"]]}, + +{"description":"Illegal end tag name", +"input":"</1>", +"output":["ParseError", ["Comment", "1"]]}, + +{"description":"Simili processing instruction", +"input":"<?namespace>", +"output":["ParseError", ["Comment", "?namespace"]]}, + +{"description":"A bogus comment stops at >, even if preceeded by two dashes", +"input":"<?foo-->", +"output":["ParseError", ["Comment", "?foo--"]]}, + +{"description":"Unescaped <", +"input":"foo < bar", +"output":[["Character", "foo "], "ParseError", ["Character", "< bar"]]}, + +/* jmb -- libjson uses C strings internally, thus the input gets truncated before the + * data is fed to the input stream (and thus the tokeniser) +{"description":"Null Byte Replacement", +"input":"\u0000", +"output":[["Character", "\ufffd"]]} +*/ + +]} + + |