From 7b30a5520cfb56e651f0eb4da85a3e07747da7dc Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sat, 23 Jun 2007 22:40:25 +0000 Subject: Import hubbub -- an HTML parsing library. Plenty of work still to do (like tree generation ;) svn path=/trunk/hubbub/; revision=3359 --- src/charset/detect.c | 673 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 673 insertions(+) create mode 100644 src/charset/detect.c (limited to 'src/charset/detect.c') diff --git a/src/charset/detect.c b/src/charset/detect.c new file mode 100644 index 0000000..8ff3b87 --- /dev/null +++ b/src/charset/detect.c @@ -0,0 +1,673 @@ +/* + * This file is part of Hubbub. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2007 John-Mark Bell + */ + +#include +#include + +#include "charset/aliases.h" +#include "utils/utils.h" + +#include "detect.h" + +static uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len); +static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len); +static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, + const uint8_t *end); +static uint16_t hubbub_charset_parse_content(const uint8_t *value, + uint32_t valuelen); +static bool hubbub_charset_get_attribute(const uint8_t **data, + const uint8_t *end, + const uint8_t **name, uint32_t *namelen, + const uint8_t **value, uint32_t *valuelen); + +/** + * Extract a charset from a chunk of data + * + * \param data Pointer to pointer to buffer containing data + * \param len Pointer to buffer length + * \param mibenum Pointer to location to store MIB enum representing charset + * \param source Pointer to location to receive charset source + * \return HUBBUB_OK on success, appropriate error otherwise + * + * The data pointer and length will be modified by this function if + * a byte order mark is encountered at the start of the buffer. The updated + * data pointer will point to the first byte in the buffer after the BOM. + * The length will be modified appropriately. + * + * The larger a chunk of data fed to this routine, the better, as it allows + * charset autodetection access to a larger dataset for analysis. + */ +hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len, + uint16_t *mibenum, hubbub_charset_source *source) +{ + uint16_t charset = 0; + + if (data == NULL || *data == NULL || len == NULL || + mibenum == NULL || source == NULL) + return HUBBUB_BADPARM; + + /* We need at least 4 bytes of data */ + if (*len < 4) + goto default_encoding; + + /* First, look for a BOM */ + charset = hubbub_charset_read_bom(data, len); + if (charset != 0) { + *mibenum = charset; + *source = HUBBUB_CHARSET_DOCUMENT; + + return HUBBUB_OK; + } + + /* No BOM was found, so we must look for a meta charset within + * the document itself. */ + charset = hubbub_charset_scan_meta(*data, *len); + if (charset != 0) { + /* ISO-8859-1 becomes Windows-1252 */ + if (charset == hubbub_mibenum_from_name("ISO-8859-1", + SLEN("ISO-8859-1"))) { + charset = hubbub_mibenum_from_name("Windows-1252", + SLEN("Windows-1252")); + /* Fallback to 8859-1 if that failed */ + if (charset == 0) + charset = hubbub_mibenum_from_name( + "ISO-8859-1", SLEN("ISO-8859-1")); + } + + /* If we've encountered a meta charset for a non-ASCII- + * compatible encoding, don't trust it. + * + * Firstly, it should have been sent with a BOM (and thus + * detected above). + * + * Secondly, we've just used an ASCII-only parser to + * extract the encoding from the document. Therefore, + * the document plainly isn't what the meta charset + * claims it is. + * + * What we do in this case is to ignore the meta charset's + * claims and leave the charset determination to the + * autodetection routines (or the fallback case if they + * fail). + */ + if (charset != hubbub_mibenum_from_name("UTF-16", + SLEN("UTF-16")) && + charset != hubbub_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")) && + charset != hubbub_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")) && + charset != hubbub_mibenum_from_name("UTF-32", + SLEN("UTF-32")) && + charset != hubbub_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")) && + charset != hubbub_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE"))) { + + *mibenum = charset; + *source = HUBBUB_CHARSET_DOCUMENT; + + return HUBBUB_OK; + } + } + + /* No charset was specified within the document, attempt to + * autodetect the encoding from the data that we have available. */ + + /** \todo Charset autodetection */ + + /* We failed to autodetect a charset, so use the default fallback */ +default_encoding: + + charset = hubbub_mibenum_from_name("Windows-1252", + SLEN("Windows-1252")); + if (charset == 0) + charset = hubbub_mibenum_from_name("ISO-8859-1", + SLEN("ISO-8859-1")); + + *mibenum = charset; + *source = HUBBUB_CHARSET_DEFAULT; + + return HUBBUB_OK; +} + + +/** + * Inspect the beginning of a buffer of data for the presence of a + * UTF Byte Order Mark. + * + * \param data Pointer to pointer to buffer containing data + * \param len Pointer to buffer length + * \return MIB enum representing encoding described by BOM, or 0 if not found + * + * If a BOM is found, the data pointer will be modified to point to the first + * byte in the buffer after the BOM. The length will also be modified + * appropriately. + */ +uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len) +{ + if (data == NULL || *data == NULL || len == NULL) + return 0; + + /* We require at least 4 bytes of data */ + if (*len < 4) + return 0; + +#define UTF32BOM_LEN (4) +#define UTF16BOM_LEN (2) +#define UTF8BOM_LEN (3) + + if ((*data)[0] == 0x00 && (*data)[1] == 0x00 && + (*data)[2] == 0xFE && (*data)[3] == 0xFF) { + *data += UTF32BOM_LEN; + *len -= UTF32BOM_LEN; + + return hubbub_mibenum_from_name("UTF-32BE", + SLEN("UTF-32BE")); + } else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE && + (*data)[2] == 0x00 && (*data)[3] == 0x00) { + *data += UTF32BOM_LEN; + *len -= UTF32BOM_LEN; + + return hubbub_mibenum_from_name("UTF-32LE", + SLEN("UTF-32LE")); + } else if ((*data)[0] == 0xFE && (*data)[1] == 0xFF) { + *data += UTF16BOM_LEN; + *len -= UTF16BOM_LEN; + + return hubbub_mibenum_from_name("UTF-16BE", + SLEN("UTF-16BE")); + } else if ((*data)[0] == 0xFF && (*data)[1] == 0xFE) { + *data += UTF16BOM_LEN; + *len -= UTF16BOM_LEN; + + return hubbub_mibenum_from_name("UTF-16LE", + SLEN("UTF-16LE")); + } else if ((*data)[0] == 0xEF && (*data)[1] == 0xBB && + (*data)[2] == 0xBF) { + *data += UTF8BOM_LEN; + *len -= UTF8BOM_LEN; + + return hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8")); + } + +#undef UTF32BOM_LEN +#undef UTF16BOM_LEN +#undef UTF8BOM_LEN + + return 0; +} + +#define PEEK(a) \ + (pos < end - SLEN(a) && \ + strncasecmp((const char *) pos, a, SLEN(a)) == 0) + +#define ADVANCE(a) \ + while (pos < end - SLEN(a)) { \ + if (PEEK(a)) \ + break; \ + pos++; \ + } \ + \ + if (pos == end - SLEN(a)) \ + return 0; + +#define ISSPACE(a) \ + (a == 0x09 || a == 0x0a || a == 0x0b || \ + a == 0x0c || a == 0x0d || a == 0x20) + +/** + * Search for a meta charset within a buffer of data + * + * \param data Pointer to buffer containing data + * \param len Length of buffer in data + * \return MIB enum representing encoding, or 0 if none found + */ +uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len) +{ + const uint8_t *pos = data; + const uint8_t *end; + uint16_t mibenum; + + if (data == NULL) + return 0; + + end = pos + min(512, len); + + /* 1. */ + while (pos < end) { + /* a */ + if (PEEK(""); + /* b */ + } else if (PEEK("= end - 1) + return 0; + + if (ISSPACE(*(pos + SLEN("= end) + return 0; + } + /* c */ + } else if ((PEEK("' || *pos == '<') + break; + pos++; + } + + if (pos >= end) + return 0; + + /* 3 */ + if (*pos != '<') { + const uint8_t *n; + const uint8_t *v; + uint32_t nl, vl; + + while (hubbub_charset_get_attribute(&pos, end, + &n, &nl, &v, &vl)) + ; /* do nothing */ + /* 2 */ + } else + continue; + /* d */ + } else if (PEEK(""); + } + + /* e - do nothing */ + + /* 2 */ + pos++; + } + + return 0; +} + +/** + * Parse attributes on a meta tag + * + * \param pos Pointer to pointer to current location (updated on exit) + * \param end Pointer to end of data stream + * \return MIB enum of detected encoding, or 0 if none found + */ +uint16_t hubbub_charset_parse_attributes(const uint8_t **pos, + const uint8_t *end) +{ + const uint8_t *name; + const uint8_t *value; + uint32_t namelen, valuelen; + uint16_t mibenum; + + if (pos == NULL || *pos == NULL || end == NULL) + return 0; + + /* 2 */ + while (hubbub_charset_get_attribute(pos, end, + &name, &namelen, &value, &valuelen)) { + /* 3 */ + /* a */ + if (namelen == SLEN("charset") && valuelen > 0 && + strncasecmp((const char *) name, "charset", + SLEN("charset")) == 0) { + /* strip value */ + while (ISSPACE(*value)) { + value++; + valuelen--; + } + + while (valuelen > 0 && ISSPACE(value[valuelen - 1])) + valuelen--; + + mibenum = hubbub_mibenum_from_name( + (const char *) value, valuelen); + if (mibenum != 0) + return mibenum; + /* b */ + } else if (namelen == SLEN("content") && valuelen > 0 && + strncasecmp((const char *) name, "content", + SLEN("content")) == 0) { + mibenum = hubbub_charset_parse_content(value, + valuelen); + if (mibenum != 0) + return mibenum; + } + + /* c - do nothing */ + + /* 1 */ + while (*pos < end) { + if (ISSPACE(**pos)) + break; + (*pos)++; + } + + if (*pos >= end) { + return 0; + } + } + + return 0; +} + +/** + * Parse a content= attribute's value + * + * \param value Attribute's value + * \param valuelen Length of value + * \return MIB enum of detected encoding, or 0 if none found + */ +uint16_t hubbub_charset_parse_content(const uint8_t *value, + uint32_t valuelen) +{ + const uint8_t *end; + const uint8_t *tentative = NULL; + uint32_t tentative_len = 0; + + if (value == NULL) + return 0; + + end = value + valuelen; + + /* 1 */ + while (value < end) { + if (*value == ';') { + value++; + break; + } + + value++; + } + + if (value >= end) + return 0; + + /* 2 */ + while (value < end && ISSPACE(*value)) { + value++; + } + + if (value >= end) + return 0; + + /* 3 */ + if (value < end - SLEN("charset") && + strncasecmp((const char *) value, + "charset", SLEN("charset")) != 0) + return 0; + + value += SLEN("charset"); + + /* 4 */ + while (value < end && ISSPACE(*value)) { + value++; + } + + if (value >= end) + return 0; + + /* 5 */ + if (*value != '=') + return 0; + /* skip '=' */ + value++; + + /* 6 */ + while (value < end && ISSPACE(*value)) { + value++; + } + + if (value >= end) + return 0; + + /* 7 */ + tentative = value; + + /* a */ + if (*value == '"') { + while (++value < end && *value != '"') { + tentative_len++; + } + + if (value < end) + tentative++; + else + tentative = NULL; + /* b */ + } else if (*value == '\'') { + while (++value < end && *value != '\'') { + tentative_len++; + } + + if (value < end) + tentative++; + else + tentative = NULL; + /* c */ + } else { + while (value < end && !ISSPACE(*value)) { + value++; + tentative_len++; + } + } + + /* 8 */ + if (tentative != NULL) { + return hubbub_mibenum_from_name((const char *) tentative, + tentative_len); + } + + /* 9 */ + return 0; +} + +/** + * Extract an attribute from the data stream + * + * \param data Pointer to pointer to current location (updated on exit) + * \param end Pointer to end of data stream + * \param name Pointer to location to receive attribute name + * \param namelen Pointer to location to receive attribute name length + * \param value Pointer to location to receive attribute value + * \param valuelen Pointer to location to receive attribute value langth + * \return true if attribute extracted, false otherwise. + * + * Note: The caller should heed the returned lengths; these are the only + * indicator that useful content resides in name or value. + */ +bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end, + const uint8_t **name, uint32_t *namelen, + const uint8_t **value, uint32_t *valuelen) +{ + const uint8_t *pos; + + if (data == NULL || *data == NULL || end == NULL || name == NULL || + namelen == NULL || value == NULL || valuelen == NULL) + return false; + + pos = *data; + + /* 1. Skip leading spaces or '/' characters */ + while (pos < end && (ISSPACE(*pos) || *pos == '/')) { + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* 2. Invalid element open character */ + if (*pos == '<') { + pos--; + *data = pos; + return false; + } + + /* 3. End of element */ + if (*pos == '>') { + *data = pos; + return false; + } + + /* 4. Initialise name & value to empty string */ + *name = pos; + *namelen = 0; + *value = (const uint8_t *) ""; + *valuelen = 0; + + /* 5. Extract name */ + while (pos < end) { + /* a */ + if (*pos == '=') { + break; + } + + /* b */ + if (ISSPACE(*pos)) { + break; + } + + /* c */ + if (*pos == '/' || *pos == '<' || *pos == '>') { + return true; + } + + /* d is handled by strncasecmp in _parse_attributes */ + + /* e */ + (*namelen)++; + + /* 6 */ + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + if (ISSPACE(*pos)) { + /* 7. Skip trailing spaces */ + while (pos < end && ISSPACE(*pos)) { + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* 8. Must be '=' */ + if (*pos != '=') { + pos--; + *data = pos; + return true; + } + } + + /* 9. Skip '=' */ + pos++; + + /* 10. Skip any spaces after '=' */ + while (pos < end && ISSPACE(*pos)) { + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* 11. Extract value, if quoted */ + /* a */ + if (*pos == '\'' || *pos == '"') { + /* 1 */ + const uint8_t *quote = pos; + + /* 2 */ + while (++pos < end) { + /* 3 */ + if (*pos == *quote) { + *value = (quote + 1); + *data = ++pos; + return true; + } + + /* 4 is handled by strncasecmp */ + + /* 5 */ + (*valuelen)++; + + /* 6 */ + } + + if (pos >= end) { + *data = pos; + return false; + } + } + + /* b */ + if (*pos == '<' || *pos == '>') { + *data = pos; + return true; + } + + /* c is handled by strncasecmp */ + + /* d */ + *value = pos; + + while (pos < end) { + /* 12. Extract unquoted value */ + /* a */ + if (ISSPACE(*pos) || *pos == '<' || *pos == '>') { + *data = pos; + return true; + } + + /* b is handled by strncasecmp */ + + /* c */ + (*valuelen)++; + + /* 13. Advance */ + pos++; + } + + if (pos >= end) { + *data = pos; + return false; + } + + /* should never be reached */ + abort(); + + return false; +} -- cgit v1.2.3