From 9b8208fca65e6ff52634bb5055d2566e1d2c82c1 Mon Sep 17 00:00:00 2001
From: John Mark Bell <jmb@netsurf-browser.org>
Date: Thu, 2 Apr 2009 00:19:22 +0000
Subject: Autodetect UTF-16 and UTF-32 charsets

svn path=/trunk/libcss/; revision=7026
---
 src/charset/detect.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 277 insertions(+), 14 deletions(-)

(limited to 'src/charset')

diff --git a/src/charset/detect.c b/src/charset/detect.c
index 11ee699..78b4663 100644
--- a/src/charset/detect.c
+++ b/src/charset/detect.c
@@ -15,6 +15,12 @@
 
 static parserutils_error css_charset_read_bom_or_charset(const uint8_t *data, 
 		size_t len, uint16_t *mibenum);
+static parserutils_error try_utf32_charset(const uint8_t *data, 
+		size_t len, uint16_t *result);
+static parserutils_error try_utf16_charset(const uint8_t *data, 
+		size_t len, uint16_t *result);
+static parserutils_error try_ascii_compatible_charset(const uint8_t *data, 
+		size_t len, uint16_t *result);
 
 /**
  * Extract a charset from a chunk of data
@@ -32,7 +38,7 @@ static parserutils_error css_charset_read_bom_or_charset(const uint8_t *data,
 parserutils_error css_charset_extract(const uint8_t *data, size_t len,
 		uint16_t *mibenum, uint32_t *source)
 {
-	css_error error;
+	parserutils_error error;
 	uint16_t charset = 0;
 
 	if (data == NULL || mibenum == NULL || source == NULL)
@@ -42,10 +48,6 @@ parserutils_error css_charset_extract(const uint8_t *data, size_t len,
 	if (*source == CSS_CHARSET_DICTATED)
 		return PARSERUTILS_OK;
 
-	/* We need at least 4 bytes of data */
-	if (len < 4)
-		goto default_encoding;
-
 	/* Look for a BOM and/or @charset */
 	error = css_charset_read_bom_or_charset(data, len, &charset);
 	if (error != PARSERUTILS_OK)
@@ -64,8 +66,6 @@ parserutils_error css_charset_extract(const uint8_t *data, size_t len,
 		return PARSERUTILS_OK;
 
 	/* We've not yet found a charset, so use the default fallback */
-default_encoding:
-
 	charset = parserutils_charset_mibenum_from_name("UTF-8", SLEN("UTF-8"));
 
 	*mibenum = charset;
@@ -87,6 +87,7 @@ default_encoding:
 parserutils_error css_charset_read_bom_or_charset(const uint8_t *data, 
 		size_t len, uint16_t *mibenum)
 {
+	parserutils_error error;
 	uint16_t charset = 0;
 
 	if (data == NULL)
@@ -130,15 +131,276 @@ parserutils_error css_charset_read_bom_or_charset(const uint8_t *data,
 		return PARSERUTILS_OK;
 	}
 
-	/** \todo UTF-32 and UTF-16 @charset support */
+	error = try_utf32_charset(data, len, &charset);
+	if (error == PARSERUTILS_OK && charset != 0) {
+		*mibenum = charset;
+		return PARSERUTILS_OK;
+	}
+
+	error = try_utf16_charset(data, len, &charset);
+	if (error == PARSERUTILS_OK && charset != 0) {
+		*mibenum = charset;
+		return PARSERUTILS_OK;
+	}
+
+	error = try_ascii_compatible_charset(data, len, &charset);
+	if (error != PARSERUTILS_OK)
+		return error;
+
+	*mibenum = charset;
+
+	return PARSERUTILS_OK;
+}
+
+static parserutils_error try_utf32_charset(const uint8_t *data, 
+		size_t len, uint16_t *result)
+{
+	uint16_t charset = 0;
+
+#define CHARSET_BE "\0\0\0@\0\0\0c\0\0\0h\0\0\0a\0\0\0r\0\0\0s\0\0\0e\0\0\0t\0\0\0 \0\0\0\""
+#define CHARSET_LE "@\0\0\0c\0\0\0h\0\0\0a\0\0\0r\0\0\0s\0\0\0e\0\0\0t\0\0\0 \0\0\0\"\0\0\0"
+
+	if (len <= SLEN(CHARSET_LE))
+		return PARSERUTILS_NEEDDATA;
+
+	/* Look for @charset, assuming UTF-32 source data */
+	if (memcmp(data, CHARSET_LE, SLEN(CHARSET_LE)) == 0) {
+		const uint8_t *start = data + SLEN(CHARSET_LE);
+		const uint8_t *end;
+		char buf[8];
+		char *ptr = buf;
+
+		/* Look for "; at end of charset declaration */
+		for (end = start; end < data + len - 4; end += 4) {
+			uint32_t c = end[0] | (end[1] << 8) | 
+				     (end[2] << 16) | (end[3] << 24);
+
+			/* Bail if non-ASCII */
+			if (c > 0x007f)
+				break;
+
+			/* Reached the end? */
+			if (c == '"' && end < data + len - 8) {
+				uint32_t d = end[4] | (end[5] << 8) |
+				    (end[6] << 16) | (end[7] << 24);
+
+				if (d == ';')
+					break;
+			}
+
+			/* Append to buf, if there's space */
+			if ((size_t) (ptr - buf) < sizeof(buf)) {
+				/* Uppercase */
+				if ('a' <= c && c <= 'z')
+					*ptr++ = c & ~0x20;
+				else
+					*ptr++ = c;
+			}
+		}
+
+		if (end == data + len - 4) {
+			/* Ran out of input */
+			return PARSERUTILS_NEEDDATA;
+		}
+
+		/* Ensure we have something that looks like UTF-32(LE)? */
+		if ((ptr - buf == SLEN("UTF-32LE") && 
+				memcmp(buf, "UTF-32LE", ptr - buf) == 0) ||
+				(ptr - buf == SLEN("UTF-32") &&
+				memcmp(buf, "UTF-32", ptr - buf) == 0)) {
+			/* Convert to MIB enum */
+			charset = parserutils_charset_mibenum_from_name(
+					"UTF-32LE", SLEN("UTF-32LE"));
+		}
+	} else if (memcmp(data, CHARSET_BE, SLEN(CHARSET_BE)) == 0) {
+		const uint8_t *start = data + SLEN(CHARSET_BE);
+		const uint8_t *end;
+		char buf[8];
+		char *ptr = buf;
+
+		/* Look for "; at end of charset declaration */
+		for (end = start; end < data + len - 4; end += 4) {
+			uint32_t c = end[3] | (end[2] << 8) | 
+				     (end[1] << 16) | (end[0] << 24);
+
+			/* Bail if non-ASCII */
+			if (c > 0x007f)
+				break;
+
+			/* Reached the end? */
+			if (c == '"' && end < data + len - 8) {
+				uint32_t d = end[7] | (end[6] << 8) |
+				    (end[5] << 16) | (end[4] << 24);
+
+				if (d == ';')
+					break;
+			}
+
+			/* Append to buf, if there's space */
+			if ((size_t) (ptr - buf) < sizeof(buf)) {
+				/* Uppercase */
+				if ('a' <= c && c <= 'z')
+					*ptr++ = c & ~0x20;
+				else
+					*ptr++ = c;
+			}
+		}
+
+		if (end == data + len - 4) {
+			/* Ran out of input */
+			return PARSERUTILS_NEEDDATA;
+		}
+
+		/* Ensure we have something that looks like UTF-32(BE)? */
+		if ((ptr - buf == SLEN("UTF-32BE") && 
+				memcmp(buf, "UTF-32BE", ptr - buf) == 0) ||
+				(ptr - buf == SLEN("UTF-32") &&
+				memcmp(buf, "UTF-32", ptr - buf) == 0)) {
+			/* Convert to MIB enum */
+			charset = parserutils_charset_mibenum_from_name(
+					"UTF-32BE", SLEN("UTF-32BE"));
+		}
+	}
+
+#undef CHARSET_LE
+#undef CHARSET_BE
+
+	*result = charset;
+
+	return PARSERUTILS_OK;
+}
+
+static parserutils_error try_utf16_charset(const uint8_t *data, 
+		size_t len, uint16_t *result)
+{
+	uint16_t charset = 0;
+
+#define CHARSET_BE "\0@\0c\0h\0a\0r\0s\0e\0t\0 \0\""
+#define CHARSET_LE "@\0c\0h\0a\0r\0s\0e\0t\0 \0\"\0"
+
+	if (len <= SLEN(CHARSET_LE))
+		return PARSERUTILS_NEEDDATA;
+
+	/* Look for @charset, assuming UTF-16 source data */
+	if (memcmp(data, CHARSET_LE, SLEN(CHARSET_LE)) == 0) {
+		const uint8_t *start = data + SLEN(CHARSET_LE);
+		const uint8_t *end;
+		char buf[8];
+		char *ptr = buf;
+
+		/* Look for "; at end of charset declaration */
+		for (end = start; end < data + len - 2; end += 2) {
+			uint32_t c = end[0] | (end[1] << 8);
+
+			/* Bail if non-ASCII */
+			if (c > 0x007f)
+				break;
+
+			/* Reached the end? */
+			if (c == '"' && end < data + len - 4) {
+				uint32_t d = end[2] | (end[3] << 8);
+
+				if (d == ';')
+					break;
+			}
+
+			/* Append to buf, if there's space */
+			if ((size_t) (ptr - buf) < sizeof(buf)) {
+				/* Uppercase */
+				if ('a' <= c && c <= 'z')
+					*ptr++ = c & ~0x20;
+				else
+					*ptr++ = c;
+			}
+		}
+
+		if (end == data + len - 2) {
+			/* Ran out of input */
+			return PARSERUTILS_NEEDDATA;
+		}
+
+		/* Ensure we have something that looks like UTF-16(LE)? */
+		if ((ptr - buf == SLEN("UTF-16LE") && 
+				memcmp(buf, "UTF-16LE", ptr - buf) == 0) ||
+				(ptr - buf == SLEN("UTF-16") &&
+				memcmp(buf, "UTF-16", ptr - buf) == 0)) {
+			/* Convert to MIB enum */
+			charset = parserutils_charset_mibenum_from_name(
+					"UTF-16LE", SLEN("UTF-16LE"));
+		}
+	} else if (memcmp(data, CHARSET_BE, SLEN(CHARSET_BE)) == 0) {
+		const uint8_t *start = data + SLEN(CHARSET_BE);
+		const uint8_t *end;
+		char buf[8];
+		char *ptr = buf;
+
+		/* Look for "; at end of charset declaration */
+		for (end = start; end < data + len - 2; end += 2) {
+			uint32_t c = end[1] | (end[0] << 8);
+
+			/* Bail if non-ASCII */
+			if (c > 0x007f)
+				break;
+
+			/* Reached the end? */
+			if (c == '"' && end < data + len - 4) {
+				uint32_t d = end[3] | (end[2] << 8);
+
+				if (d == ';')
+					break;
+			}
+
+			/* Append to buf, if there's space */
+			if ((size_t) (ptr - buf) < sizeof(buf)) {
+				/* Uppercase */
+				if ('a' <= c && c <= 'z')
+					*ptr++ = c & ~0x20;
+				else
+					*ptr++ = c;
+			}
+		}
+
+		if (end == data + len - 2) {
+			/* Ran out of input */
+			return PARSERUTILS_NEEDDATA;
+		}
+
+		/* Ensure we have something that looks like UTF-16(BE)? */
+		if ((ptr - buf == SLEN("UTF-16BE") && 
+				memcmp(buf, "UTF-16BE", ptr - buf) == 0) ||
+				(ptr - buf == SLEN("UTF-16") &&
+				memcmp(buf, "UTF-16", ptr - buf) == 0)) {
+			/* Convert to MIB enum */
+			charset = parserutils_charset_mibenum_from_name(
+					"UTF-16BE", SLEN("UTF-16BE"));
+		}
+	}
+
+#undef CHARSET_LE
+#undef CHARSET_BE
+
+	*result = charset;
+
+	return PARSERUTILS_OK;
+}
+
+parserutils_error try_ascii_compatible_charset(const uint8_t *data, size_t len,
+		uint16_t *result)
+{
+	uint16_t charset = 0;
+
+#define CHARSET "@charset \""
+
+	if (len <= SLEN(CHARSET))
+		return PARSERUTILS_NEEDDATA;
 
 	/* Look for @charset, assuming ASCII-compatible source data */
-	if (len > 10 &&	strncmp((const char *) data, "@charset \"", 
-			SLEN("@charset \"")) == 0) {
+	if (memcmp(data, CHARSET, SLEN(CHARSET)) == 0) {
+		const uint8_t *start = data + SLEN(CHARSET);
 		const uint8_t *end;
 
 		/* Look for "; at end of charset declaration */
-		for (end = data + 10; end < data + len; end++) {
+		for (end = start; end < data + len; end++) {
 			if (*end == '"' && end < data + len - 1 && 
 					*(end + 1) == ';')
 				break;
@@ -151,11 +413,12 @@ parserutils_error css_charset_read_bom_or_charset(const uint8_t *data,
 
 		/* Convert to MIB enum */
 		charset = parserutils_charset_mibenum_from_name(
-				(char *) data + 10,  end - data - 10);
+				(const char *) start,  end - start);
 	}
 
-	*mibenum = charset;
+#undef CHARSET
+
+	*result = charset;
 
 	return PARSERUTILS_OK;
 }
-
-- 
cgit v1.2.3