From 7b30a5520cfb56e651f0eb4da85a3e07747da7dc Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sat, 23 Jun 2007 22:40:25 +0000 Subject: Import hubbub -- an HTML parsing library. Plenty of work still to do (like tree generation ;) svn path=/trunk/hubbub/; revision=3359 --- test/data/tokeniser2/INDEX | 7 ++ test/data/tokeniser2/contentModelFlags.test | 36 ++++++++ test/data/tokeniser2/test1.test | 136 ++++++++++++++++++++++++++++ test/data/tokeniser2/test2.test | 108 ++++++++++++++++++++++ 4 files changed, 287 insertions(+) create mode 100644 test/data/tokeniser2/INDEX create mode 100644 test/data/tokeniser2/contentModelFlags.test create mode 100644 test/data/tokeniser2/test1.test create mode 100644 test/data/tokeniser2/test2.test (limited to 'test/data/tokeniser2') diff --git a/test/data/tokeniser2/INDEX b/test/data/tokeniser2/INDEX new file mode 100644 index 0000000..8539aeb --- /dev/null +++ b/test/data/tokeniser2/INDEX @@ -0,0 +1,7 @@ +# Index file for tokeniser tests +# +# Test Description + +test1.test html5lib tests (part 1) +test2.test html5lib tests (part 2) +contentModelFlags.test html5lib content model tests \ No newline at end of file diff --git a/test/data/tokeniser2/contentModelFlags.test b/test/data/tokeniser2/contentModelFlags.test new file mode 100644 index 0000000..84d41fc --- /dev/null +++ b/test/data/tokeniser2/contentModelFlags.test @@ -0,0 +1,36 @@ +{"tests": [ + +{"description":"PLAINTEXT content model flag", +"contentModelFlags":["PLAINTEXT"], +"input":"&body;", +"output":[["Character", "&body;"]]}, + +{"description":"End tag closing RCDATA or CDATA", +"contentModelFlags":["RCDATA", "CDATA"], +"lastStartTag":"bar", +"input":"foo", +"output":[["Character", "foo"], ["EndTag", "bar"]]}, + +{"description":"End tag with incorrect name in RCDATA or CDATA", +"contentModelFlags":["RCDATA", "CDATA"], +"lastStartTag":"baz", +"input":"bar", +"output":["ParseError", ["Character", "bar"], ["EndTag", "baz"]]}, + +{"description":"End tag closing RCDATA or CDATA, switching back to PCDATA", +"contentModelFlags":["RCDATA", "CDATA"], +"lastStartTag":"bar", +"input":"foo", +"output":[["Character", "foo"], ["EndTag", "bar"], ["EndTag", "baz"]]}, + +{"description":"CDATA w/ something looking like an entity", +"contentModelFlags":["CDATA"], +"input":"&foo;", +"output":[["Character", "&foo;"]]}, + +{"description":"RCDATA w/ an entity", +"contentModelFlags":["RCDATA"], +"input":"<", +"output":[["Character", "<"]]} + +]} diff --git a/test/data/tokeniser2/test1.test b/test/data/tokeniser2/test1.test new file mode 100644 index 0000000..c12ff5a --- /dev/null +++ b/test/data/tokeniser2/test1.test @@ -0,0 +1,136 @@ +{"tests": [ + +{"description":"Correct Doctype lowercase", +"input":"", +"output":[["DOCTYPE", "HTML", false]]}, + +{"description":"Correct Doctype uppercase", +"input":"", +"output":[["DOCTYPE", "HTML", false]]}, + +{"description":"Correct Doctype mixed case", +"input":"", +"output":[["DOCTYPE", "HTML", false]]}, + +{"description":"Truncated doctype start", +"input":"", +"output":["ParseError", ["Comment", "DOC"]]}, + +{"description":"Doctype in error", +"input":"", +"output":[["DOCTYPE", "FOO", true]]}, + +{"description":"Single Start Tag", +"input":"", +"output":[["StartTag", "h", {}]]}, + +{"description":"Empty end tag", +"input":"", +"output":["ParseError"]}, + +{"description":"Empty start tag", +"input":"<>", +"output":["ParseError", ["Character", "<>"]]}, + +{"description":"Start Tag w/attribute", +"input":"", +"output":[["StartTag", "h", {"a":"b"}]]}, + +{"description":"Start Tag w/attribute no quotes", +"input":"", +"output":[["StartTag", "h", {"a":"b"}]]}, + +{"description":"Start/End Tag", +"input":"", +"output":[["StartTag", "h", {}], ["EndTag", "h"]]}, + +{"description":"Two unclosed start tags", +"input":"

One

Two", +"output":[["StartTag", "p", {}], ["Character", "One"], ["StartTag", "p", {}], ["Character", "Two"]]}, + +{"description":"End Tag w/attribute", +"input":"", +"output":[["StartTag", "h", {}], "ParseError", ["EndTag", "h"]]}, + +{"description":"Multiple atts", +"input":"", +"output":[["StartTag", "h", {"a":"b", "c":"d"}]]}, + +{"description":"Multiple atts no space", +"input":"", +"output":[["StartTag", "h", {"a":"b", "c":"d"}]]}, + +{"description":"Repeated attr", + "input":"", + "output":["ParseError", ["StartTag", "h", {"a":"b"}]]}, + +{"description":"Simple comment", + "input":"", + "output":[["Comment", "comment"]]}, + +{"description":"Comment, Central dash no space", + "input":"", + "output":["ParseError", ["Comment", "-"]]}, + +{"description":"Comment, two central dashes", +"input":"", +"output":["ParseError", ["Comment", " --comment "]]}, + +{"description":"Unfinished comment", +"input":"", +"output":["ParseError", ["Comment", "?foo--"]]}, + +{"description":"Unescaped <", +"input":"foo < bar", +"output":[["Character", "foo "], "ParseError", ["Character", "< bar"]]}, + +/* jmb -- libjson uses C strings internally, thus the input gets truncated before the + * data is fed to the input stream (and thus the tokeniser) +{"description":"Null Byte Replacement", +"input":"\u0000", +"output":[["Character", "\ufffd"]]} +*/ + +]} + + -- cgit v1.2.3