Import hubbub -- an HTML parsing library.

Plenty of work still to do (like tree generation ;) svn path=/trunk/hubbub/; revision=3359
author: John Mark Bell <jmb@netsurf-browser.org> 2007-06-23 22:40:25 +0000
committer: John Mark Bell <jmb@netsurf-browser.org> 2007-06-23 22:40:25 +0000
commit: 7b30a5520cfb56e651f0eb4da85a3e07747da7dc (patch)
tree: 5d6281c071c089e1e7a8ae6f8044cecaf6a7db16 /src/charset/detect.c
download: libhubbub-7b30a5520cfb56e651f0eb4da85a3e07747da7dc.tar.gz
libhubbub-7b30a5520cfb56e651f0eb4da85a3e07747da7dc.tar.bz2
1 files changed, 673 insertions, 0 deletions
diff --git a/src/charset/detect.c b/src/charset/detect.c
new file mode 100644
index 0000000..8ff3b87
--- /dev/null
+++ b/src/charset/detect.c
@@ -0,0 +1,673 @@
+/*
+ * This file is part of Hubbub.
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ * Copyright 2007 John-Mark Bell <jmb@netsurf-browser.org>
+ */
+
+#include <stdbool.h>
+#include <string.h>
+
+#include "charset/aliases.h"
+#include "utils/utils.h"
+
+#include "detect.h"
+
+static uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len);
+static uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len);
+static uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
+		const uint8_t *end);
+static uint16_t hubbub_charset_parse_content(const uint8_t *value,
+		uint32_t valuelen);
+static bool hubbub_charset_get_attribute(const uint8_t **data,
+		const uint8_t *end,
+		const uint8_t **name, uint32_t *namelen,
+		const uint8_t **value, uint32_t *valuelen);
+
+/**
+ * Determine the character set of a chunk of document data
+ *
+ * \param data     Pointer to pointer to buffer containing data
+ * \param len      Pointer to buffer length
+ * \param mibenum  Pointer to location to store MIB enum representing charset
+ * \param source   Pointer to location to receive charset source
+ * \return HUBBUB_BADPARM if any argument (or the buffer pointer itself)
+ *         is NULL, HUBBUB_OK otherwise
+ *
+ * Detection is attempted in this order: a byte order mark at the start
+ * of the buffer, then a meta charset declared within the document, then
+ * the Windows-1252 (or, failing that, ISO-8859-1) default.
+ *
+ * If a byte order mark is found, the data pointer is moved to the first
+ * byte following it and the length is reduced to match. Buffers of fewer
+ * than 4 bytes are not inspected at all and receive the default charset.
+ *
+ * The larger a chunk of data fed to this routine, the better, as it allows
+ * charset autodetection access to a larger dataset for analysis.
+ */
+hubbub_error hubbub_charset_extract(const uint8_t **data, size_t *len,
+		uint16_t *mibenum, hubbub_charset_source *source)
+{
+	uint16_t mib = 0;
+	bool untrusted;
+
+	/* Every argument is required */
+	if (data == NULL || *data == NULL || len == NULL ||
+			mibenum == NULL || source == NULL)
+		return HUBBUB_BADPARM;
+
+	/* Too little data to inspect: go straight to the fallback */
+	if (*len < 4)
+		goto default_encoding;
+
+	/* A byte order mark, when present, decides the matter */
+	mib = hubbub_charset_read_bom(data, len);
+	if (mib != 0) {
+		*mibenum = mib;
+		*source = HUBBUB_CHARSET_DOCUMENT;
+
+		return HUBBUB_OK;
+	}
+
+	/* Otherwise, search the document itself for a meta charset */
+	mib = hubbub_charset_scan_meta(*data, *len);
+	if (mib == 0)
+		goto default_encoding;
+
+	/* A declared ISO-8859-1 is treated as Windows-1252, provided that
+	 * lookup succeeds; otherwise it is left as ISO-8859-1 */
+	if (mib == hubbub_mibenum_from_name("ISO-8859-1",
+			SLEN("ISO-8859-1"))) {
+		mib = hubbub_mibenum_from_name("Windows-1252",
+				SLEN("Windows-1252"));
+		if (mib == 0) {
+			mib = hubbub_mibenum_from_name("ISO-8859-1",
+					SLEN("ISO-8859-1"));
+		}
+	}
+
+	/* A meta charset naming an ASCII-incompatible encoding cannot be
+	 * trusted: such a document should have carried a BOM (detected
+	 * above), and the ASCII-only meta parser could not have read the
+	 * declaration if the claim were true. In that case the declaration
+	 * is discarded and detection falls through to the steps below. */
+	untrusted =
+		mib == hubbub_mibenum_from_name("UTF-16",
+				SLEN("UTF-16")) ||
+		mib == hubbub_mibenum_from_name("UTF-16LE",
+				SLEN("UTF-16LE")) ||
+		mib == hubbub_mibenum_from_name("UTF-16BE",
+				SLEN("UTF-16BE")) ||
+		mib == hubbub_mibenum_from_name("UTF-32",
+				SLEN("UTF-32")) ||
+		mib == hubbub_mibenum_from_name("UTF-32LE",
+				SLEN("UTF-32LE")) ||
+		mib == hubbub_mibenum_from_name("UTF-32BE",
+				SLEN("UTF-32BE"));
+
+	if (!untrusted) {
+		*mibenum = mib;
+		*source = HUBBUB_CHARSET_DOCUMENT;
+
+		return HUBBUB_OK;
+	}
+
+	/* No usable charset was specified within the document; this is
+	 * where autodetection from the available data would take place. */
+
+	/** \todo Charset autodetection */
+
+	/* Nothing else succeeded, so use the default fallback: Windows-1252
+	 * if that lookup succeeds, ISO-8859-1 otherwise */
+default_encoding:
+
+	mib = hubbub_mibenum_from_name("Windows-1252",
+			SLEN("Windows-1252"));
+	if (mib == 0) {
+		mib = hubbub_mibenum_from_name("ISO-8859-1",
+				SLEN("ISO-8859-1"));
+	}
+
+	*mibenum = mib;
+	*source = HUBBUB_CHARSET_DEFAULT;
+
+	return HUBBUB_OK;
+}
+
+
+/**
+ * Look for a UTF Byte Order Mark at the very start of a buffer.
+ *
+ * \param data  Pointer to pointer to buffer containing data
+ * \param len   Pointer to buffer length
+ * \return MIB enum representing encoding described by BOM, or 0 if not found
+ *
+ * The UTF-32 marks are tested before the UTF-16 ones, because the
+ * UTF-32LE mark begins with the same two bytes as the UTF-16LE mark.
+ *
+ * When a BOM is recognised, the data pointer is advanced past it and the
+ * length is reduced by the size of the mark. Buffers shorter than 4 bytes
+ * are never inspected, whichever mark they might begin with.
+ */
+uint16_t hubbub_charset_read_bom(const uint8_t **data, size_t *len)
+{
+	/* Sizes, in bytes, of the marks recognised below */
+	enum { UTF32_MARK = 4, UTF16_MARK = 2, UTF8_MARK = 3 };
+
+	const uint8_t *b;
+	size_t skip = 0;
+	uint16_t mib = 0;
+
+	if (data == NULL || *data == NULL || len == NULL)
+		return 0;
+
+	/* All four leading bytes are examined, so they must exist */
+	if (*len < 4)
+		return 0;
+
+	b = *data;
+
+	if (b[0] == 0x00 && b[1] == 0x00 && b[2] == 0xFE && b[3] == 0xFF) {
+		/* 00 00 FE FF */
+		skip = UTF32_MARK;
+		mib = hubbub_mibenum_from_name("UTF-32BE",
+				SLEN("UTF-32BE"));
+	} else if (b[0] == 0xFF && b[1] == 0xFE &&
+			b[2] == 0x00 && b[3] == 0x00) {
+		/* FF FE 00 00 */
+		skip = UTF32_MARK;
+		mib = hubbub_mibenum_from_name("UTF-32LE",
+				SLEN("UTF-32LE"));
+	} else if (b[0] == 0xFE && b[1] == 0xFF) {
+		/* FE FF */
+		skip = UTF16_MARK;
+		mib = hubbub_mibenum_from_name("UTF-16BE",
+				SLEN("UTF-16BE"));
+	} else if (b[0] == 0xFF && b[1] == 0xFE) {
+		/* FF FE */
+		skip = UTF16_MARK;
+		mib = hubbub_mibenum_from_name("UTF-16LE",
+				SLEN("UTF-16LE"));
+	} else if (b[0] == 0xEF && b[1] == 0xBB && b[2] == 0xBF) {
+		/* EF BB BF */
+		skip = UTF8_MARK;
+		mib = hubbub_mibenum_from_name("UTF-8", SLEN("UTF-8"));
+	}
+
+	/* skip is still 0 when nothing matched, making this a no-op */
+	*data += skip;
+	*len -= skip;
+
+	return mib;
+}
+/* PEEK: true if literal a occurs at the caller's pos (case-insensitive) */
+#define PEEK(a)								\
+	(pos < end - SLEN(a) && 					\
+		strncasecmp((const char *) pos, a, SLEN(a)) == 0)
+/* ADVANCE: move pos to the next match of a; return 0 if there is none */
+#define ADVANCE(a)							\
+	while (pos < end - SLEN(a)) {					\
+		if (PEEK(a))						\
+			break;						\
+		pos++;							\
+	}								\
+									\
+	if (pos >= end - SLEN(a))					\
+		return 0;
+/* ISSPACE: tab, LF, VT, FF, CR or space (a is evaluated several times) */
+#define ISSPACE(a)							\
+	((a) == 0x09 || (a) == 0x0a || (a) == 0x0b || 			\
+		(a) == 0x0c || (a) == 0x0d || (a) == 0x20)
+
+/**
+ * Search for a meta charset within a buffer of data
+ *
+ * \param data  Pointer to buffer containing data
+ * \param len   Length of buffer, in bytes (at most 512 are searched)
+ * \return MIB enum representing encoding, or 0 if none found
+ */
+uint16_t hubbub_charset_scan_meta(const uint8_t *data, size_t len)
+{
+	const uint8_t *pos = data;
+	const uint8_t *end;
+	uint16_t mibenum;
+
+	if (data == NULL)
+		return 0;
+
+	end = pos + min(512, len);
+
+	/* 1. Classify whatever markup construct begins at pos */
+	while (pos < end) {
+		/* a: comment; ADVANCE returns 0 if no "-->" is found */
+		if (PEEK("<!--")) {
+			pos += SLEN("<!--");
+			ADVANCE("-->");
+		/* b: meta element; search its attributes for a charset */
+		} else if (PEEK("<meta")) {
+			if (pos + SLEN("<meta") >= end - 1)
+				return 0;
+
+			if (ISSPACE(*(pos + SLEN("<meta")))) {
+				/* 1: whitespace follows: a meta tag */
+				pos += SLEN("<meta");
+
+				mibenum = hubbub_charset_parse_attributes(
+						&pos, end);
+				if (mibenum != 0)
+					return mibenum;
+
+				if (pos >= end)
+					return 0;
+			}
+		/* c: "</" or "<" followed by an ASCII letter: some other tag */
+		} else if ((PEEK("</") && (pos < end - 3 &&
+				(0x41 <= (*(pos + 2) & ~ 0x20) &&
+				(*(pos + 2) & ~ 0x20) <= 0x5A))) ||
+				(pos < end - 2 && *pos == '<' &&
+				(0x41 <= (*(pos + 1) & ~ 0x20) &&
+				(*(pos + 1) & ~ 0x20) <= 0x5A))) {
+
+			/* skip '<' */
+			pos++;
+
+			/* 1. Skip the tag name, up to whitespace, '>' or '<' */
+			while (pos < end) {
+				if (ISSPACE(*pos) ||
+						*pos == '>' || *pos == '<')
+					break;
+				pos++;
+			}
+
+			if (pos >= end)
+				return 0;
+
+			/* 3: read the tag's attributes and discard them */
+			if (*pos != '<') {
+				const uint8_t *n;
+				const uint8_t *v;
+				uint32_t nl, vl;
+
+				while (hubbub_charset_get_attribute(&pos, end,
+						&n, &nl, &v, &vl))
+					; /* do nothing */
+			/* 2: a new '<' was hit; reclassify from there */
+			} else
+				continue;
+		/* d: "<!", "</" or "<?"; ADVANCE returns 0 if no '>' follows */
+		} else if (PEEK("<!") || PEEK("</") || PEEK("<?")) {
+			pos++;
+			ADVANCE(">");
+		}
+
+		/* e: anything else needs no special handling */
+
+		/* 2: step to the next byte */
+		pos++;
+	}
+
+	return 0;
+}
+
+/**
+ * Search the attributes of a meta tag for a charset declaration
+ *
+ * \param pos  Pointer to pointer to current location (updated on exit)
+ * \param end  Pointer to end of data stream
+ * \return MIB enum of first recognised encoding, or 0 if none found
+ */
+uint16_t hubbub_charset_parse_attributes(const uint8_t **pos,
+		const uint8_t *end)
+{
+	const uint8_t *name;
+	const uint8_t *value;
+	uint32_t namelen, valuelen;
+	uint16_t mibenum;
+
+	if (pos == NULL || *pos == NULL || end == NULL)
+		return 0;
+
+	/* 2: take attributes one at a time until none remain */
+	while (hubbub_charset_get_attribute(pos, end,
+			&name, &namelen, &value, &valuelen)) {
+		/* 3: only charset= and content= are of interest */
+		/* a: charset attribute; its value names the encoding */
+		if (namelen == SLEN("charset") && valuelen > 0 &&
+				strncasecmp((const char *) name, "charset",
+					SLEN("charset")) == 0) {
+			/* strip value; bounded, as it may be all whitespace */
+			while (valuelen > 0 && ISSPACE(*value)) {
+				value++;
+				valuelen--;
+			}
+
+			while (valuelen > 0 && ISSPACE(value[valuelen - 1]))
+				valuelen--;
+
+			mibenum = hubbub_mibenum_from_name(
+					(const char *) value, valuelen);
+			if (mibenum != 0)
+				return mibenum;
+		/* b: content attribute; may embed a charset parameter */
+		} else if (namelen == SLEN("content") && valuelen > 0 &&
+				strncasecmp((const char *) name, "content",
+					SLEN("content")) == 0) {
+			mibenum = hubbub_charset_parse_content(value,
+					valuelen);
+			if (mibenum != 0)
+				return mibenum;
+		}
+
+		/* c: any other attribute is ignored */
+
+		/* 1: move on to the next whitespace character, if any */
+		while (*pos < end) {
+			if (ISSPACE(**pos))
+				break;
+			(*pos)++;
+		}
+
+		if (*pos >= end) {
+			return 0;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Parse a content= attribute's value, looking for "; charset=<name>"
+ *
+ * \param value     Attribute's value (need not be NUL-terminated)
+ * \param valuelen  Length of value
+ * \return MIB enum of detected encoding, or 0 if none found
+ */
+uint16_t hubbub_charset_parse_content(const uint8_t *value,
+		uint32_t valuelen)
+{
+	const uint8_t *end;
+	const uint8_t *tentative = NULL;
+	uint32_t tentative_len = 0;
+
+	if (value == NULL)
+		return 0;
+
+	end = value + valuelen;
+
+	/* 1: skip up to and including the first ';' */
+	while (value < end) {
+		if (*value == ';') {
+			value++;
+			break;
+		}
+
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 2: skip whitespace after the ';' */
+	while (value < end && ISSPACE(*value)) {
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 3: need a full "charset" here; never step or read beyond end */
+	if ((size_t) (end - value) < SLEN("charset") ||
+			strncasecmp((const char *) value,
+					"charset", SLEN("charset")) != 0)
+		return 0;
+
+	value += SLEN("charset");
+
+	/* 4: skip whitespace before the '=' */
+	while (value < end && ISSPACE(*value)) {
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 5: anything other than '=' means there is no charset parameter */
+	if (*value != '=')
+		return 0;
+	/* skip '=' */
+	value++;
+
+	/* 6: skip whitespace after the '=' */
+	while (value < end && ISSPACE(*value)) {
+		value++;
+	}
+
+	if (value >= end)
+		return 0;
+
+	/* 7: what follows is tentatively the charset name */
+	tentative = value;
+
+	/* a: double-quoted; discarded if the closing quote is missing */
+	if (*value == '"') {
+		while (++value < end && *value != '"') {
+			tentative_len++;
+		}
+
+		if (value < end)
+			tentative++;
+		else
+			tentative = NULL;
+	/* b: single-quoted; discarded if the closing quote is missing */
+	} else if (*value == '\'') {
+		while (++value < end && *value != '\'') {
+			tentative_len++;
+		}
+
+		if (value < end)
+			tentative++;
+		else
+			tentative = NULL;
+	/* c: unquoted; runs to the next whitespace or the end of value */
+	} else {
+		while (value < end && !ISSPACE(*value)) {
+			value++;
+			tentative_len++;
+		}
+	}
+
+	/* 8: look the tentative name up */
+	if (tentative != NULL) {
+		return hubbub_mibenum_from_name((const char *) tentative,
+				tentative_len);
+	}
+
+	/* 9: no usable name was found */
+	return 0;
+}
+
+/**
+ * Extract an attribute from the data stream
+ *
+ * \param data      Pointer to pointer to current location (updated on exit)
+ * \param end       Pointer to end of data stream
+ * \param name      Pointer to location to receive attribute name
+ * \param namelen   Pointer to location to receive attribute name length
+ * \param value     Pointer to location to receive attribute value
+ * \param valuelen  Pointer to location to receive attribute value length
+ * \return true if attribute extracted, false otherwise.
+ *
+ * Note: name and value are not NUL-terminated; the returned lengths are
+ * the only indicator that useful content resides in name or value.
+ */
+bool hubbub_charset_get_attribute(const uint8_t **data, const uint8_t *end,
+		const uint8_t **name, uint32_t *namelen,
+		const uint8_t **value, uint32_t *valuelen)
+{
+	const uint8_t *pos;
+
+	if (data == NULL || *data == NULL || end == NULL || name == NULL ||
+			namelen == NULL || value == NULL || valuelen == NULL)
+		return false;
+
+	pos = *data;
+
+	/* 1. Skip leading spaces or '/' characters */
+	while (pos < end && (ISSPACE(*pos) || *pos == '/')) {
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	/* 2. Invalid element open character; report the byte before it,
+	 * presumably so that a caller stepping on by one lands on the '<' */
+	if (*pos == '<') {
+		pos--;
+		*data = pos;
+		return false;
+	}
+
+	/* 3. End of element */
+	if (*pos == '>') {
+		*data = pos;
+		return false;
+	}
+
+	/* 4. Initialise name & value to empty string */
+	*name = pos;
+	*namelen = 0;
+	*value = (const uint8_t *) "";
+	*valuelen = 0;
+
+	/* 5. Extract name */
+	while (pos < end) {
+		/* a */
+		if (*pos == '=') {
+			break;
+		}
+
+		/* b */
+		if (ISSPACE(*pos)) {
+			break;
+		}
+
+		/* c: valueless attribute; record position for the caller */
+		if (*pos == '/' || *pos == '<' || *pos == '>') {
+			*data = pos;
+			return true;
+		}
+		/* d is handled by strncasecmp in _parse_attributes */
+
+		/* e */
+		(*namelen)++;
+
+		/* 6 */
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	if (ISSPACE(*pos)) {
+		/* 7. Skip trailing spaces */
+		while (pos < end && ISSPACE(*pos)) {
+			pos++;
+		}
+
+		if (pos >= end) {
+			*data = pos;
+			return false;
+		}
+
+		/* 8. Must be '='; if not, the attribute has no value and
+		 * the byte just read belongs to whatever comes next */
+		if (*pos != '=') {
+			pos--;
+			*data = pos;
+			return true;
+		}
+	}
+
+	/* 9. Skip '=' */
+	pos++;
+
+	/* 10. Skip any spaces after '=' */
+	while (pos < end && ISSPACE(*pos)) {
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	/* 11. Extract value, if quoted */
+	/* a */
+	if (*pos == '\'' || *pos == '"') {
+		/* 1: remember which quote character opened the value */
+		const uint8_t *quote = pos;
+
+		/* 2 */
+		while (++pos < end) {
+			/* 3: matching quote closes the value */
+			if (*pos == *quote) {
+				*value = (quote + 1);
+				*data = ++pos;
+				return true;
+			}
+
+			/* 4 is handled by strncasecmp */
+
+			/* 5 */
+			(*valuelen)++;
+
+			/* 6 */
+		}
+
+		if (pos >= end) {
+			*data = pos;
+			return false;
+		}
+	}
+
+	/* b: the value is empty */
+	if (*pos == '<' || *pos == '>') {
+		*data = pos;
+		return true;
+	}
+
+	/* c is handled by strncasecmp */
+
+	/* d */
+	*value = pos;
+
+	while (pos < end) {
+		/* 12. Extract unquoted value */
+		/* a: whitespace, '<' or '>' ends the value */
+		if (ISSPACE(*pos) || *pos == '<' || *pos == '>') {
+			*data = pos;
+			return true;
+		}
+
+		/* b is handled by strncasecmp */
+
+		/* c */
+		(*valuelen)++;
+
+		/* 13. Advance */
+		pos++;
+	}
+
+	if (pos >= end) {
+		*data = pos;
+		return false;
+	}
+
+	/* should never be reached */
+	abort();
+
+	return false;
+}
author	John Mark Bell <jmb@netsurf-browser.org>	2007-06-23 22:40:25 +0000
committer	John Mark Bell <jmb@netsurf-browser.org>	2007-06-23 22:40:25 +0000
commit	7b30a5520cfb56e651f0eb4da85a3e07747da7dc (patch)
tree	5d6281c071c089e1e7a8ae6f8044cecaf6a7db16 /src/charset/detect.c
download	libhubbub-7b30a5520cfb56e651f0eb4da85a3e07747da7dc.tar.gz libhubbub-7b30a5520cfb56e651f0eb4da85a3e07747da7dc.tar.bz2