From 72c39e3522c5781d1e7dc8abad77d96141c5d49b Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Thu, 1 May 2008 16:36:27 +0000 Subject: Import beginnings of a CSS parsing library. Currently comprises a lexer. svn path=/trunk/libcss/; revision=4112 --- src/charset/Makefile | 46 +++++++++++++++ src/charset/detect.c | 161 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/charset/detect.h | 24 ++++++++ 3 files changed, 231 insertions(+) create mode 100644 src/charset/Makefile create mode 100644 src/charset/detect.c create mode 100644 src/charset/detect.h (limited to 'src/charset') diff --git a/src/charset/Makefile b/src/charset/Makefile new file mode 100644 index 0000000..dda58c1 --- /dev/null +++ b/src/charset/Makefile @@ -0,0 +1,46 @@ +# Child makefile fragment +# +# Toolchain is provided by top-level makefile +# +# Variables provided by top-level makefile +# +# COMPONENT The name of the component +# EXPORT The location of the export directory +# TOP The location of the source tree root +# RELEASEDIR The place to put release objects +# DEBUGDIR The place to put debug objects +# +# do_include Canned command sequence to include a child makefile +# +# Variables provided by parent makefile: +# +# DIR The name of the directory we're in, relative to $(TOP) +# +# Variables we can manipulate: +# +# ITEMS_CLEAN The list of items to remove for "make clean" +# ITEMS_DISTCLEAN The list of items to remove for "make distclean" +# TARGET_TESTS The list of target names to run for "make test" +# +# SOURCES The list of sources to build for $(COMPONENT) +# +# Plus anything from the toolchain + +# Push parent directory onto the directory stack +sp := $(sp).x +dirstack_$(sp) := $(d) +d := $(DIR) + +# Sources +SRCS_$(d) := detect.c + +# Append to sources for component +SOURCES += $(addprefix $(d), $(SRCS_$(d))) + +# Now include any children we may have +MAKE_INCLUDES := $(wildcard $(d)*/Makefile) +$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Finally, pop off the directory stack +d := $(dirstack_$(sp)) +sp := $(basename $(sp)) diff --git a/src/charset/detect.c b/src/charset/detect.c new file mode 100644 index 0000000..11ee699 --- /dev/null +++ b/src/charset/detect.c @@ -0,0 +1,161 @@ +/* + * This file is part of LibCSS. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2008 John-Mark Bell + */ + +#include +#include + +#include + +#include "charset/detect.h" +#include "utils/utils.h" + +static parserutils_error css_charset_read_bom_or_charset(const uint8_t *data, + size_t len, uint16_t *mibenum); + +/** + * Extract a charset from a chunk of data + * + * \param data Pointer to buffer containing data + * \param len Buffer length + * \param mibenum Pointer to location containing current MIB enum + * \param source Pointer to location containing current charset source + * \return PARSERUTILS_OK on success, appropriate error otherwise + * + * ::mibenum and ::source will be updated on exit + * + * CSS 2.1 $4.4 + */ +parserutils_error css_charset_extract(const uint8_t *data, size_t len, + uint16_t *mibenum, uint32_t *source) +{ + css_error error; + uint16_t charset = 0; + + if (data == NULL || mibenum == NULL || source == NULL) + return PARSERUTILS_BADPARM; + + /* If the charset was dictated by the client, we've nothing to detect */ + if (*source == CSS_CHARSET_DICTATED) + return PARSERUTILS_OK; + + /* We need at least 4 bytes of data */ + if (len < 4) + goto default_encoding; + + /* Look for a BOM and/or @charset */ + error = css_charset_read_bom_or_charset(data, len, &charset); + if (error != PARSERUTILS_OK) + return error; + + if (charset != 0) { + *mibenum = charset; + *source = CSS_CHARSET_DOCUMENT; + + return PARSERUTILS_OK; + } + + /* If we've already got a charset from the linking mechanism or + * referring document, then we've nothing further to do */ + if (*source != CSS_CHARSET_DEFAULT) + return PARSERUTILS_OK; + + /* We've not yet found a charset, so use the default fallback */ +default_encoding: + + charset = parserutils_charset_mibenum_from_name("UTF-8", SLEN("UTF-8")); + + *mibenum = charset; + *source = CSS_CHARSET_DEFAULT; + + return PARSERUTILS_OK; +} + + +/** + * Inspect the beginning of a buffer of data for the presence of a + * UTF Byte Order Mark and/or an @charset rule + * + * \param data Pointer to buffer containing data + * \param len Buffer length + * \param mibenum Pointer to location to receive MIB enum + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error css_charset_read_bom_or_charset(const uint8_t *data, + size_t len, uint16_t *mibenum) +{ + uint16_t charset = 0; + + if (data == NULL) + return PARSERUTILS_BADPARM; + + /* We require at least 4 bytes of data */ + if (len < 4) + return PARSERUTILS_NEEDDATA; + + + /* Look for BOM */ + if (data[0] == 0x00 && data[1] == 0x00 && + data[2] == 0xFE && data[3] == 0xFF) { + charset = parserutils_charset_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE")); + } else if (data[0] == 0xFF && data[1] == 0xFE && + data[2] == 0x00 && data[3] == 0x00) { + charset = parserutils_charset_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")); + } else if (data[0] == 0xFE && data[1] == 0xFF) { + charset = parserutils_charset_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")); + } else if (data[0] == 0xFF && data[1] == 0xFE) { + charset = parserutils_charset_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")); + } else if (data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) { + charset = parserutils_charset_mibenum_from_name("UTF-8", + SLEN("UTF-8")); + } + + /* BOM beats @charset. + * UAs differ here, but none appear to match the spec. + * The spec indicates that any @charset present in conjunction with a + * BOM, should match the BOM. In reality, it appears UAs just take the + * BOM as gospel and ignore any @charset rule. The w3c CSS validator + * appears to do the same (at the least, it doesn't complain about a + * mismatch). + */ + if (charset != 0) { + *mibenum = charset; + return PARSERUTILS_OK; + } + + /** \todo UTF-32 and UTF-16 @charset support */ + + /* Look for @charset, assuming ASCII-compatible source data */ + if (len > 10 && strncmp((const char *) data, "@charset \"", + SLEN("@charset \"")) == 0) { + const uint8_t *end; + + /* Look for "; at end of charset declaration */ + for (end = data + 10; end < data + len; end++) { + if (*end == '"' && end < data + len - 1 && + *(end + 1) == ';') + break; + } + + if (end == data + len) { + /* Ran out of input */ + return PARSERUTILS_NEEDDATA; + } + + /* Convert to MIB enum */ + charset = parserutils_charset_mibenum_from_name( + (char *) data + 10, end - data - 10); + } + + *mibenum = charset; + + return PARSERUTILS_OK; +} + diff --git a/src/charset/detect.h b/src/charset/detect.h new file mode 100644 index 0000000..d907921 --- /dev/null +++ b/src/charset/detect.h @@ -0,0 +1,24 @@ +/* + * This file is part of LibCSS. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#ifndef css_charset_detect_h_ +#define css_charset_detect_h_ + +#include + +#include +#include +#include + +#include + +/* Extract a charset from a chunk of data */ +parserutils_error css_charset_extract(const uint8_t *data, size_t len, + uint16_t *mibenum, uint32_t *source); + +#endif + -- cgit v1.2.3