From 72c39e3522c5781d1e7dc8abad77d96141c5d49b Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Thu, 1 May 2008 16:36:27 +0000 Subject: Import beginnings of a CSS parsing library. Currently comprises a lexer. svn path=/trunk/libcss/; revision=4112 --- docs/Lexer | 30 ++++++++++++++++++++++++++++ docs/Tokens | 65 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 docs/Lexer create mode 100644 docs/Tokens (limited to 'docs') diff --git a/docs/Lexer b/docs/Lexer new file mode 100644 index 0000000..8f8e1ea --- /dev/null +++ b/docs/Lexer @@ -0,0 +1,30 @@ +Lexical analyser +================ + +This document contains various snippets of information about the lexer +implementation. + +First sets +---------- + +IDENT [a-zA-Z] | '-' | '_' | [^#x0-#x7F] | '\' +ATKEYWORD '@' +STRING '"' | "'" +HASH '#' +NUMBER [0-9] | '.' +PERCENTAGE [0-9] | '.' +DIMENSION [0-9] | '.' +URI [Uu] +UNICODE-RANGE [Uu] +CDO '<' +CDC '-' +S #x9 | #xA | #xC | #xD | #x20 +COMMENT '/' +FUNCTION [a-zA-Z] | '-' | '_' | [^#x0-#x7F] | '\' +INCLUDES '~' +DASHMATCH '|' +PREFIXMATCH '^' +SUFFIXMATCH '$' +SUBSTRINGMATCH '*' +CHAR anything except " or ' + diff --git a/docs/Tokens b/docs/Tokens new file mode 100644 index 0000000..21e09da --- /dev/null +++ b/docs/Tokens @@ -0,0 +1,65 @@ +Production rules for lexical tokens +=================================== + +This file provides a complete set of production rules for the tokens generated +by the lexer. In case of ambiguity, the longest match wins. + +Components +---------- + +ident ::= '-'? nmstart nmchar* +name ::= nmchar+ +nmstart ::= [a-zA-Z] | '_' | nonascii | escape +nonascii ::= [#x80-#xD7FF#xE000-#xFFFD#x10000-#x10FFFF] +unicode ::= '\' [0-9a-fA-F]{1,6} wc? +escape ::= unicode | '\' [^\n\r\f0-9a-fA-F] +nmchar ::= [a-zA-Z0-9] | '-' | '_' | nonascii | escape +num ::= [0-9]+ | [0-9]* '.' 
[0-9]+ +string ::= '"' (stringchar | "'")* '"' | "'" (stringchar | '"')* "'" +stringchar ::= urlchar | #x20 | #x29 | '\' nl +urlchar ::= [#x9#x21#x23-#x26#x28#x2A-#x7E] | nonascii | escape +nl ::= #xA | #xD #xA | #xD | #xC +w ::= wc* +wc ::= #x9 | #xA | #xC | #xD | #x20 + +Tokens +------ + +IDENT ::= ident +ATKEYWORD ::= '@' ident +STRING ::= string +HASH ::= '#' name +NUMBER ::= num +PERCENTAGE ::= num '%' +DIMENSION ::= num ident +URI ::= "url(" w (string | urlchar*) w ')' +UNICODE-RANGE ::= [Uu] '+' [0-9a-fA-F?]{1,6} ('-' [0-9a-fA-F]{1,6})? +CDO ::= "<!--" +CDC ::= "-->" +S ::= wc+ +COMMENT ::= "/*" [^*]* '*'+ ([^/] [^*]* '*'+)* '/' +FUNCTION ::= ident '(' +INCLUDES ::= "~=" +DASHMATCH ::= "|=" +PREFIXMATCH ::= "^=" +SUFFIXMATCH ::= "$=" +SUBSTRINGMATCH ::= "*=" +CHAR ::= any other character, except " or ' + +Differences from the CSS3 Syntax module specification +----------------------------------------------------- + +1) UNICODE-RANGE is case insensitive (it's uppercase only in the spec) +2) escape follows CSS2.1. CSS3 defines it as: + escape ::= unicode | '\' [#x20-#x7E#x80-#xD7FF#xE000-#xFFFD#x10000-#x10FFFF] +3) urlchar omits ' and ): + a) If ' is permitted verbatim then, as stringchar inherits from urlchar, + single quoted strings may contain verbatim single quotes. This is + clearly nonsense. + b) If ) is permitted verbatim then it becomes impossible to determine the + true end of URI. Thus, for sanity's sake, it's omitted here. +4) stringchar explicitly includes ). See 3(b) for why it won't inherit it + from urlchar as the spec implies. +5) BOM ::= #xFEFF is omitted. It is assumed that any leading BOM will be + stripped from the document before lexing occurs. + -- cgit v1.2.3