summaryrefslogtreecommitdiff
path: root/src/charset/detect.c
diff options
context:
space:
mode:
authorJohn Mark Bell <jmb@netsurf-browser.org>2007-06-23 22:40:25 +0000
committerJohn Mark Bell <jmb@netsurf-browser.org>2007-06-23 22:40:25 +0000
commit7b30a5520cfb56e651f0eb4da85a3e07747da7dc (patch)
tree5d6281c071c089e1e7a8ae6f8044cecaf6a7db16 /src/charset/detect.c
downloadlibhubbub-7b30a5520cfb56e651f0eb4da85a3e07747da7dc.tar.gz
libhubbub-7b30a5520cfb56e651f0eb4da85a3e07747da7dc.tar.bz2
Import hubbub -- an HTML parsing library.
Plenty of work still to do (like tree generation ;) svn path=/trunk/hubbub/; revision=3359
Diffstat (limited to 'src/charset/detect.c')
-rw-r--r--src/charset/detect.c673
1 files changed, 673 insertions, 0 deletions
diff --git a/src/charset/detect.c b/src/charset/detect.c
new file mode 100644
index 0000000..8ff3b87
--- /dev/null
+++ b/src/charset/detect.c
@@ -0,0 +1,673 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ * http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "utils/utils.h"
+
+#include "detect.h"
+
+/*
+ * Forward declarations of the helpers defined later in this file.
+ * All of them are declared static, so they have internal linkage.
+ */
+static uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len);
+static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len);
+static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
+ const uint8_t *end);
+static uint16_t hubbub_charset_parse_content(const uint8_t *value,
+ uint32_t valuelen);
+static bool hubbub_charset_get_attribute(const uint8_t **data,
+ const uint8_t *end,
+ const uint8_t **name, uint32_t *namelen,
+ const uint8_t **value, uint32_t *valuelen);
+
+/**
+ * Determine the character set of a chunk of document data
+ *
+ * \param data     Pointer to pointer to buffer containing data
+ * \param len      Pointer to buffer length
+ * \param mibenum  Pointer to location to receive MIB enum of the charset
+ * \param source   Pointer to location to receive the charset source
+ * \return HUBBUB_OK on success, HUBBUB_BADPARM if any argument (or the
+ *         buffer pointer itself) is NULL
+ *
+ * Detection proceeds in this order:
+ *   1. a byte order mark at the start of the buffer,
+ *   2. a meta charset found by scanning the start of the document,
+ *   3. the default (Windows-1252, or ISO-8859-1 if that lookup fails).
+ *
+ * Cases 1 and 2 report HUBBUB_CHARSET_DOCUMENT as the source, case 3
+ * reports HUBBUB_CHARSET_DEFAULT. Buffers shorter than 4 bytes go
+ * straight to the default.
+ *
+ * If a byte order mark is found, the data pointer is advanced past it and
+ * the length is reduced accordingly.
+ */
+hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len,
+		uint16_t *mibenum, hubbub_charset_source *source)
+{
+	uint16_t found = 0;
+	bool from_document = false;
+
+	if (data == NULL || *data == NULL || len == NULL ||
+			mibenum == NULL || source == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Detection needs at least 4 bytes to work with */
+	if (*len >= 4) {
+		/* A byte order mark takes priority over anything else */
+		found = hubbub_charset_read_bom(data, len);
+
+		if (found != 0) {
+			from_document = true;
+		} else {
+			/* No BOM: look for a meta charset in the document */
+			found = hubbub_charset_scan_meta(*data, *len);
+		}
+
+		if (!from_document && found != 0) {
+			/* Encodings that are not ASCII-compatible. A meta
+			 * charset naming one of these is not believed: such
+			 * a document should have carried a BOM, and the meta
+			 * element was located using an ASCII-only scan, so
+			 * the document cannot really be in that encoding. */
+			const struct {
+				const char *name;
+				size_t len;
+			} untrusted[] = {
+				{ "UTF-16", SLEN("UTF-16") },
+				{ "UTF-16LE", SLEN("UTF-16LE") },
+				{ "UTF-16BE", SLEN("UTF-16BE") },
+				{ "UTF-32", SLEN("UTF-32") },
+				{ "UTF-32LE", SLEN("UTF-32LE") },
+				{ "UTF-32BE", SLEN("UTF-32BE") }
+			};
+			size_t i;
+
+			/* ISO-8859-1 becomes Windows-1252 */
+			if (found == hubbub_mibenum_from_name("ISO-8859-1",
+					SLEN("ISO-8859-1"))) {
+				found = hubbub_mibenum_from_name(
+						"Windows-1252",
+						SLEN("Windows-1252"));
+				/* Fall back to 8859-1 if that failed */
+				if (found == 0)
+					found = hubbub_mibenum_from_name(
+							"ISO-8859-1",
+							SLEN("ISO-8859-1"));
+			}
+
+			from_document = true;
+			for (i = 0; i < sizeof(untrusted) /
+					sizeof(untrusted[0]); i++) {
+				if (found == hubbub_mibenum_from_name(
+						untrusted[i].name,
+						untrusted[i].len)) {
+					from_document = false;
+					break;
+				}
+			}
+		}
+
+		if (from_document) {
+			*mibenum = found;
+			*source = HUBBUB_CHARSET_DOCUMENT;
+
+			return HUBBUB_OK;
+		}
+
+		/* Nothing usable was declared in the document. This is
+		 * where autodetection from the data would take place. */
+
+		/** \todo Charset autodetection */
+	}
+
+	/* Use the default fallback */
+	found = hubbub_mibenum_from_name("Windows-1252",
+			SLEN("Windows-1252"));
+	if (found == 0)
+		found = hubbub_mibenum_from_name("ISO-8859-1",
+				SLEN("ISO-8859-1"));
+
+	*mibenum = found;
+	*source = HUBBUB_CHARSET_DEFAULT;
+
+	return HUBBUB_OK;
+}
+
+
+/**
+ * Check the start of a buffer for a UTF byte order mark and consume it.
+ *
+ * \param data  Pointer to pointer to buffer containing data
+ * \param len   Pointer to buffer length
+ * \return MIB enum of the encoding the BOM indicates, or 0 if no BOM
+ *         was recognised
+ *
+ * At least 4 bytes must be available, whichever BOM is present. When a
+ * BOM is recognised, the data pointer is moved to the first byte after
+ * it and the length is reduced by the size of the BOM.
+ */
+uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len)
+{
+	const uint8_t *b;
+	const char *name;
+	size_t namelen;
+	size_t bomlen;
+
+	if (data == NULL || *data == NULL || len == NULL)
+		return 0;
+
+	/* We require at least 4 bytes of data */
+	if (*len < 4)
+		return 0;
+
+	b = *data;
+
+	/* The 4 byte UTF-32 marks must be tried before the 2 byte UTF-16
+	 * ones: the UTF-32LE mark begins with the UTF-16LE mark. */
+	if (b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) {
+		name = "UTF-32BE";
+		namelen = SLEN("UTF-32BE");
+		bomlen = 4;
+	} else if (b[0] == 0xFF && b[1] == 0xFE &&
+			b[2] == 0x00 && b[3] == 0x00) {
+		name = "UTF-32LE";
+		namelen = SLEN("UTF-32LE");
+		bomlen = 4;
+	} else if (b[0] == 0xFE && b[1] == 0xFF) {
+		name = "UTF-16BE";
+		namelen = SLEN("UTF-16BE");
+		bomlen = 2;
+	} else if (b[0] == 0xFF && b[1] == 0xFE) {
+		name = "UTF-16LE";
+		namelen = SLEN("UTF-16LE");
+		bomlen = 2;
+	} else if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) {
+		name = "UTF-8";
+		namelen = SLEN("UTF-8");
+		bomlen = 3;
+	} else {
+		return 0;
+	}
+
+	/* Consume the mark before looking up the encoding */
+	*data += bomlen;
+	*len -= bomlen;
+
+	return hubbub_mibenum_from_name(name, namelen);
+}
+
+#define PEEK(a) \
+ (pos < end - SLEN(a) && \
+ strncasecmp((const char *) pos, a, SLEN(a)) == 0)
+
+#define ADVANCE(a) \
+ while (pos < end - SLEN(a)) { \
+ if (PEEK(a)) \
+ break; \
+ pos++; \
+ } \
+ \
+ if (pos == end - SLEN(a)) \
+ return 0;
+
+#define ISSPACE(a) \
+ (a == 0x09 || a == 0x0a || a == 0x0b || \
+ a == 0x0c || a == 0x0d || a == 0x20)
+
+/**
+ * Test whether a byte is an ASCII letter (A-Z or a-z)
+ *
+ * \param c  Byte to test
+ * \return true if c is an ASCII letter, false otherwise
+ */
+static bool hubbub_charset_is_ascii_alpha(uint8_t c)
+{
+	/* Clearing bit 5 folds the lower case letters onto upper case */
+	const int folded = c & ~0x20;
+
+	return 0x41 <= folded && folded <= 0x5A;
+}
+
+/**
+ * Scan the start of a document for a meta element declaring a charset
+ *
+ * \param data  Pointer to buffer containing data
+ * \param len   Length of buffer, in bytes
+ * \return MIB enum representing encoding, or 0 if none found
+ *
+ * At most the first 512 bytes of the buffer are examined. Comments,
+ * other tags (together with their attributes) and markup declarations
+ * are stepped over; the attributes of "<meta" tags are handed to
+ * hubbub_charset_parse_attributes().
+ */
+uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len)
+{
+	const uint8_t *pos = data;
+	const uint8_t *end;
+
+	if (data == NULL)
+		return 0;
+
+	end = pos + min(512, len);
+
+	while (pos < end) {
+		if (PEEK("<!--")) {
+			/* Comment: move on to the "-->" that closes it */
+			pos += SLEN("<!--");
+			ADVANCE("-->");
+		} else if (PEEK("<meta")) {
+			const uint8_t *after = pos + SLEN("<meta");
+
+			if (after >= end - 1)
+				return 0;
+
+			/* Only "<meta" followed by whitespace counts */
+			if (ISSPACE(*after)) {
+				uint16_t found;
+
+				pos = after;
+
+				found = hubbub_charset_parse_attributes(
+						&pos, end);
+				if (found != 0)
+					return found;
+
+				if (pos >= end)
+					return 0;
+			}
+		} else if ((PEEK("</") && pos < end - 3 &&
+				hubbub_charset_is_ascii_alpha(pos[2])) ||
+				(pos < end - 2 && *pos == '<' &&
+				hubbub_charset_is_ascii_alpha(pos[1]))) {
+			/* Some other start or end tag */
+			const uint8_t *attr_name;
+			const uint8_t *attr_value;
+			uint32_t attr_namelen, attr_valuelen;
+
+			/* Step over '<', then run to the end of the tag
+			 * name: whitespace, '>' or another '<' */
+			for (pos++; pos < end; pos++) {
+				if (ISSPACE(*pos) ||
+						*pos == '>' || *pos == '<')
+					break;
+			}
+
+			if (pos >= end)
+				return 0;
+
+			/* A '<' is looked at again as a possible tag
+			 * start, so it must not be stepped over here */
+			if (*pos == '<')
+				continue;
+
+			/* Consume, and discard, the tag's attributes */
+			while (hubbub_charset_get_attribute(&pos, end,
+					&attr_name, &attr_namelen,
+					&attr_value, &attr_valuelen))
+				; /* do nothing */
+		} else if (PEEK("<!") || PEEK("</") || PEEK("<?")) {
+			/* Declaration, processing instruction or bogus
+			 * end tag: skip to the next '>' */
+			pos++;
+			ADVANCE(">");
+		}
+
+		/* Anything else is of no interest; move to the next byte */
+		pos++;
+	}
+
+	return 0;
+}
+
+/**
+ * Parse the attributes of a meta tag, looking for a charset
+ *
+ * \param pos  Pointer to pointer to current location (updated on exit)
+ * \param end  Pointer to end of data stream
+ * \return MIB enum of detected encoding, or 0 if none found
+ *
+ * Attributes are read one at a time. A "charset" attribute has its
+ * value stripped of surrounding whitespace and looked up directly; a
+ * "content" attribute has its value passed to
+ * hubbub_charset_parse_content(). Attribute names are compared without
+ * regard to case. The first attribute that yields a known encoding ends
+ * the search.
+ */
+uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
+		const uint8_t *end)
+{
+	const uint8_t *name;
+	const uint8_t *value;
+	uint32_t namelen, valuelen;
+	uint16_t mibenum;
+
+	if (pos == NULL || *pos == NULL || end == NULL)
+		return 0;
+
+	/* 2 */
+	while (hubbub_charset_get_attribute(pos, end,
+			&name, &namelen, &value, &valuelen)) {
+		/* 3 */
+		/* a */
+		if (namelen == SLEN("charset") && valuelen > 0 &&
+				strncasecmp((const char *) name, "charset",
+					SLEN("charset")) == 0) {
+			/* Strip leading whitespace. The length bound keeps
+			 * the strip inside the value, rather than relying
+			 * on the byte after it being a non-space. */
+			while (valuelen > 0 && ISSPACE(*value)) {
+				value++;
+				valuelen--;
+			}
+
+			/* Strip trailing whitespace */
+			while (valuelen > 0 && ISSPACE(value[valuelen - 1]))
+				valuelen--;
+
+			mibenum = hubbub_mibenum_from_name(
+					(const char *) value, valuelen);
+			if (mibenum != 0)
+				return mibenum;
+		/* b */
+		} else if (namelen == SLEN("content") && valuelen > 0 &&
+				strncasecmp((const char *) name, "content",
+					SLEN("content")) == 0) {
+			mibenum = hubbub_charset_parse_content(value,
+					valuelen);
+			if (mibenum != 0)
+				return mibenum;
+		}
+
+		/* c - any other attribute is ignored */
+
+		/* 1 - move to the next whitespace before reading the
+		 * following attribute */
+		while (*pos < end) {
+			if (ISSPACE(**pos))
+				break;
+			(*pos)++;
+		}
+
+		if (*pos >= end) {
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Parse the value of a content= attribute, looking for a charset
+ *
+ * \param value     Attribute's value
+ * \param valuelen  Length of value, in bytes
+ * \return MIB enum of detected encoding, or 0 if none found
+ *
+ * The value is expected to look like "text/html; charset=utf-8":
+ * everything up to and including the first ';' is skipped, then
+ * "charset" (in any case), optional whitespace, '=', optional whitespace
+ * and finally the encoding name are read. The name may be enclosed in
+ * single or double quotes; unquoted, it runs to the next whitespace or
+ * to the end of the value.
+ */
+uint16_t hubbub_charset_parse_content(const uint8_t *value,
+		uint32_t valuelen)
+{
+	const uint8_t *end;
+	const uint8_t *tentative = NULL;
+	uint32_t tentative_len = 0;
+
+	if (value == NULL)
+		return 0;
+
+	end = value + valuelen;
+
+	/* 1 - skip everything up to and including the first ';' */
+	while (value < end) {
+		if (*value == ';') {
+			value++;
+			break;
+		}
+
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 2 */
+	while (value < end && ISSPACE(*value)) {
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 3 - "charset" must come next, with at least one byte after it
+	 * (for the '='). A remainder too short for that is rejected here,
+	 * before any comparison is made or the pointer is advanced. */
+	if ((size_t) (end - value) <= SLEN("charset") ||
+			strncasecmp((const char *) value,
+				"charset", SLEN("charset")) != 0)
+		return 0;
+
+	value += SLEN("charset");
+
+	/* 4 */
+	while (value < end && ISSPACE(*value)) {
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 5 */
+	if (*value != '=')
+		return 0;
+	/* skip '=' */
+	value++;
+
+	/* 6 */
+	while (value < end && ISSPACE(*value)) {
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 7 */
+	tentative = value;
+
+	if (*value == '"' || *value == '\'') {
+		/* a, b - quoted name: runs to the matching quote */
+		const uint8_t quote = *value;
+
+		while (++value < end && *value != quote) {
+			tentative_len++;
+		}
+
+		if (value < end)
+			tentative++;		/* step over opening quote */
+		else
+			tentative = NULL;	/* no closing quote */
+	} else {
+		/* c - unquoted name: runs to whitespace or end of value */
+		while (value < end && !ISSPACE(*value)) {
+			value++;
+			tentative_len++;
+		}
+	}
+
+	/* 8 */
+	if (tentative != NULL) {
+		return hubbub_mibenum_from_name((const char *) tentative,
+				tentative_len);
+	}
+
+	/* 9 */
+	return 0;
+}
+
+/**
+ * Extract an attribute from the data stream
+ *
+ * \param data      Pointer to pointer to current location (updated on exit)
+ * \param end       Pointer to end of data stream
+ * \param name      Pointer to location to receive attribute name
+ * \param namelen   Pointer to location to receive attribute name length
+ * \param value     Pointer to location to receive attribute value
+ * \param valuelen  Pointer to location to receive attribute value length
+ * \return true if attribute extracted, false otherwise.
+ *
+ * Note: The caller should heed the returned lengths; these are the only
+ * indicator that useful content resides in name or value. The name and
+ * value point into the data stream and are not NUL-terminated. They may
+ * have been written to even when false is returned.
+ *
+ * Every return that found a name stores the position reached in *data,
+ * and that position is never before the end of the name. Callers that
+ * call this function in a loop depend on this in order to terminate.
+ */
+bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end,
+		const uint8_t **name, uint32_t *namelen,
+		const uint8_t **value, uint32_t *valuelen)
+{
+	const uint8_t *pos;
+
+	if (data == NULL || *data == NULL || end == NULL || name == NULL ||
+			namelen == NULL || value == NULL || valuelen == NULL)
+		return false;
+
+	pos = *data;
+
+	/* 1. Skip leading spaces or '/' characters */
+	while (pos < end && (ISSPACE(*pos) || *pos == '/')) {
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	/* 2. Invalid element open character. Step back one byte so that
+	 * a caller which advances by one lands on the '<' again. */
+	if (*pos == '<') {
+		pos--;
+		*data = pos;
+		return false;
+	}
+
+	/* 3. End of element */
+	if (*pos == '>') {
+		*data = pos;
+		return false;
+	}
+
+	/* 4. Initialise name & value to empty string */
+	*name = pos;
+	*namelen = 0;
+	*value = (const uint8_t *) "";
+	*valuelen = 0;
+
+	/* 5. Extract name */
+	while (pos < end) {
+		/* a, b */
+		if (*pos == '=' || ISSPACE(*pos)) {
+			break;
+		}
+
+		/* c - attribute with a name but no value. The position must
+		 * be stored here: returning true without doing so leaves
+		 * *data where it started, and a caller looping on this
+		 * function would then see the same attribute forever. */
+		if (*pos == '/' || *pos == '<' || *pos == '>') {
+			*data = pos;
+			return true;
+		}
+
+		/* d is handled by strncasecmp in _parse_attributes */
+
+		/* e */
+		(*namelen)++;
+
+		/* 6 */
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	if (ISSPACE(*pos)) {
+		/* 7. Skip trailing spaces */
+		while (pos < end && ISSPACE(*pos)) {
+			pos++;
+		}
+
+		if (pos >= end) {
+			*data = pos;
+			return false;
+		}
+
+		/* 8. Must be '='. If not, the attribute has no value; back
+		 * up onto the last space so that the byte just examined
+		 * is read again as the start of the next attribute. */
+		if (*pos != '=') {
+			pos--;
+			*data = pos;
+			return true;
+		}
+	}
+
+	/* 9. Skip '=' */
+	pos++;
+
+	/* 10. Skip any spaces after '=' */
+	while (pos < end && ISSPACE(*pos)) {
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	/* 11. Extract value, if quoted */
+	/* a */
+	if (*pos == '\'' || *pos == '"') {
+		/* 1 */
+		const uint8_t *quote = pos;
+
+		/* 2 */
+		while (++pos < end) {
+			/* 3 */
+			if (*pos == *quote) {
+				*value = (quote + 1);
+				*data = ++pos;
+				return true;
+			}
+
+			/* 4 is handled by strncasecmp */
+
+			/* 5 */
+			(*valuelen)++;
+
+			/* 6 */
+		}
+
+		/* Ran out of data before finding the closing quote */
+		*data = pos;
+		return false;
+	}
+
+	/* b */
+	if (*pos == '<' || *pos == '>') {
+		*data = pos;
+		return true;
+	}
+
+	/* c is handled by strncasecmp */
+
+	/* d */
+	*value = pos;
+
+	while (pos < end) {
+		/* 12. Extract unquoted value */
+		/* a */
+		if (ISSPACE(*pos) || *pos == '<' || *pos == '>') {
+			*data = pos;
+			return true;
+		}
+
+		/* b is handled by strncasecmp */
+
+		/* c */
+		(*valuelen)++;
+
+		/* 13. Advance */
+		pos++;
+	}
+
+	/* The loop above only finishes by running out of data, so the
+	 * unquoted value was never terminated. */
+	*data = pos;
+	return false;
+}