From 72c39e3522c5781d1e7dc8abad77d96141c5d49b Mon Sep 17 00:00:00 2001
From: John Mark Bell <jmb@netsurf-browser.org>
Date: Thu, 1 May 2008 16:36:27 +0000
Subject: Import beginnings of a CSS parsing library. Currently comprises a
 lexer.

svn path=/trunk/libcss/; revision=4112
---
 docs/Lexer  | 30 ++++++++++++++++++++++++++++
 docs/Tokens | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 95 insertions(+)
 create mode 100644 docs/Lexer
 create mode 100644 docs/Tokens

(limited to 'docs')

diff --git a/docs/Lexer b/docs/Lexer
new file mode 100644
index 0000000..8f8e1ea
--- /dev/null
+++ b/docs/Lexer
@@ -0,0 +1,30 @@
+Lexical analyser
+================
+
+This document contains various snippets of information about the lexer
+implementation.
+
+First sets
+----------
+
+IDENT 		[a-zA-Z] | '-' | '_' | [^#x0-#x7F] | '\'
+ATKEYWORD	'@'
+STRING		'"' | "'"
+HASH		'#'
+NUMBER		[0-9] | '.'
+PERCENTAGE	[0-9] | '.'
+DIMENSION	[0-9] | '.'
+URI		[Uu]
+UNICODE-RANGE	[Uu]
+CDO		'<'
+CDC		'-'
+S	 	#x9 | #xA | #xC | #xD | #x20
+COMMENT		'/'
+FUNCTION	[a-zA-Z] | '-' | '_' | [^#x0-#x7F] | '\'
+INCLUDES	'~'
+DASHMATCH	'|'
+PREFIXMATCH	'^'
+SUFFIXMATCH	'$'
+SUBSTRINGMATCH	'*'
+CHAR		anything except " or '
+
diff --git a/docs/Tokens b/docs/Tokens
new file mode 100644
index 0000000..21e09da
--- /dev/null
+++ b/docs/Tokens
@@ -0,0 +1,65 @@
+Production rules for lexical tokens
+===================================
+
+This file provides a complete set of production rules for the tokens generated
+by the lexer. In case of ambiguity, the longest match wins.
+
+Components
+----------
+
+ident      ::= '-'? nmstart nmchar*
+name       ::= nmchar+
+nmstart    ::= [a-zA-Z] | '_' | nonascii | escape
+nonascii   ::= [#x80-#xD7FF#xE000-#xFFFD#x10000-#x10FFFF]
+unicode    ::= '\' [0-9a-fA-F]{1,6} wc?
+escape     ::= unicode | '\' [^\n\r\f0-9a-fA-F]
+nmchar     ::= [a-zA-Z0-9] | '-' | '_' | nonascii | escape
+num        ::= [0-9]+ | [0-9]* '.' [0-9]+
+string     ::= '"' (stringchar | "'")* '"' | "'" (stringchar | '"')* "'"
+stringchar ::= urlchar | #x20 | #x29 | '\' nl
+urlchar    ::= [#x9#x21#x23-#x26#x28#x2A-#x7E] | nonascii | escape
+nl         ::= #xA | #xD #xA | #xD | #xC
+w          ::= wc*
+wc         ::= #x9 | #xA | #xC | #xD | #x20
+
+Tokens
+------
+
+IDENT          ::= ident
+ATKEYWORD      ::= '@' ident
+STRING         ::= string
+HASH           ::= '#' name
+NUMBER         ::= num
+PERCENTAGE     ::= num '%'
+DIMENSION      ::= num ident
+URI            ::= "url(" w (string | urlchar*) w ')'
+UNICODE-RANGE  ::= [Uu] '+' [0-9a-fA-F?]{1,6} ('-' [0-9a-fA-F]{1,6})?
+CDO            ::= "<!--"
+CDC            ::= "-->"
+S              ::= wc+
+COMMENT        ::= "/*" [^*]* '*'+ ([^/] [^*]* '*'+)* '/'
+FUNCTION       ::= ident '('
+INCLUDES       ::= "~="
+DASHMATCH      ::= "|="
+PREFIXMATCH    ::= "^="
+SUFFIXMATCH    ::= "$="
+SUBSTRINGMATCH ::= "*="
+CHAR           ::= any other character, except " or '
+
+Differences from the CSS3 Syntax module specification
+-----------------------------------------------------
+
+1) UNICODE-RANGE is case insensitive (it's uppercase only in the spec)
+2) escape follows CSS2.1. CSS3 defines it as:
+   escape ::= unicode | '\' [#x20-#x7E#x80-#xD7FF#xE000-#xFFFD#x10000-#x10FFFF]
+3) urlchar omits ' and ):
+   a) If ' is permitted verbatim then, as stringchar inherits from urlchar, 
+      single quoted strings may contain verbatim single quotes. This is 
+      clearly nonsense.
+   b) If ) is permitted verbatim then it becomes impossible to determine the
+      true end of URI. Thus, for sanity's sake, it's omitted here.
+4) stringchar explicitly includes ). See 3(b) for why it won't inherit it 
+   from urlchar as the spec implies.
+5) BOM ::= #xFEFF is omitted. It is assumed that any leading BOM will be 
+   stripped from the document before lexing occurs.
+
-- 
cgit v1.2.3