From ea3f09a72433be165cc4ec09052b3d636e64ac0c Mon Sep 17 00:00:00 2001
From: John-Mark Bell <jmb@netsurf-browser.org>
Date: Sun, 30 Dec 2012 23:33:27 +0000
Subject: Correctly emit entities when serialising to HTML.

---
 utils/utf8.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 138 insertions(+), 1 deletion(-)

(limited to 'utils/utf8.c')

diff --git a/utils/utf8.c b/utils/utf8.c
index 5c27fa7c6..c0f6b106a 100644
--- a/utils/utf8.c
+++ b/utils/utf8.c
@@ -297,7 +297,7 @@ utf8_convert_ret utf8_convert(const char *string, size_t len,
 	}
 
 	slen = len ? len : strlen(string);
-	/* Worst case = ACSII -> UCS4, so allocate an output buffer
+	/* Worst case = ASCII -> UCS4, so allocate an output buffer
 	 * 4 times larger than the input buffer, and add 4 bytes at
 	 * the end for the NULL terminator
 	 */
@@ -337,3 +337,140 @@ utf8_convert_ret utf8_convert(const char *string, size_t len,
 
 	return UTF8_CONVERT_OK;
 }
+
+/* Convert one UTF-8 chunk through cd, advancing *out and reducing *outlen.
+ * A character iconv rejects with EILSEQ is written as "&#xYYYYYY;" instead;
+ * any other iconv failure is reported as UTF8_CONVERT_NOMEM. */
+static utf8_convert_ret utf8_convert_html_chunk(iconv_t cd, const char *chunk,
+		size_t inlen, char **out, size_t *outlen)
+{
+	size_t ret, esclen;
+	uint32_t ucs4;
+	char *pescape, escape[11];
+
+	while (inlen > 0) {
+		ret = iconv(cd, (void *) &chunk, &inlen, (void *) out, outlen);
+		if (ret != (size_t) -1)
+			break;
+		if (errno != EILSEQ)
+			return UTF8_CONVERT_NOMEM;
+		/* esclen must stay within escape[]: map >U+10FFFF to U+FFFD */
+		ucs4 = utf8_to_ucs4(chunk, inlen);
+		esclen = snprintf(escape, sizeof(escape), "&#x%06x;",
+				ucs4 > 0x10FFFF ? 0xFFFD : ucs4);
+		pescape = escape;
+		ret = iconv(cd, (void *) &pescape, &esclen,
+				(void *) out, outlen);
+		if (ret == (size_t) -1)
+			return UTF8_CONVERT_NOMEM;
+		esclen = utf8_next(chunk, inlen, 0);
+		chunk += esclen;
+		inlen -= esclen;
+	}
+	return UTF8_CONVERT_OK;
+}
+
+/**
+ * Convert a UTF-8 encoded string into a string of the given encoding,
+ * applying HTML escape sequences where necessary.
+ *
+ * '&', '<' and '>' are always emitted as numeric character references, as
+ * is any character that iconv rejects with EILSEQ. On success *result
+ * receives a heap buffer ending in four zero bytes, which the caller must
+ * free(); on failure *result is left untouched.
+ *
+ * \param string   String to convert (NUL-terminated)
+ * \param encname  Name of encoding to convert to
+ * \param len      Length, in bytes, of the input string, or 0
+ * \param result   Pointer to location to receive result
+ * \return Appropriate utf8_convert_ret value
+ */
+utf8_convert_ret utf8_to_html(const char *string, const char *encname,
+		size_t len, char **result)
+{
+	iconv_t cd;
+	char *out, *origout, *shrunk;
+	size_t off, prev_off, outlen, origoutlen, esclen;
+	utf8_convert_ret ret = UTF8_CONVERT_OK;
+	char escape[11];
+
+	if (len == 0)
+		len = strlen(string);
+
+	/* Refuse lengths whose worst-case output size would overflow size_t */
+	if (len > ((size_t) -1 - 20) / 40)
+		return UTF8_CONVERT_NOMEM;
+
+	cd = iconv_open(encname, "UTF-8");
+	if (cd == (iconv_t) -1) {
+		if (errno == EINVAL)
+			return UTF8_CONVERT_BADENC;
+		/* default to no memory */
+		return UTF8_CONVERT_NOMEM;
+	}
+
+	/* Worst case is ASCII -> UCS4, with all characters escaped:
+	 * "&#xYYYYYY;", thus each input character may become a string
+	 * of 10 UCS4 characters, each 4 bytes in length. Allow 16 bytes
+	 * more for a byte order mark or closing shift sequence, and hold
+	 * a further 4 in reserve for the terminator. As the request is
+	 * never zero bytes, a NULL return from malloc() always means
+	 * failure, even for empty input. */
+	origoutlen = outlen = len * 10 * 4 + 16;
+	origout = out = malloc(origoutlen + 4);
+	if (out == NULL) {
+		iconv_close(cd);
+		return UTF8_CONVERT_NOMEM;
+	}
+
+	/* Process input in chunks between characters we must escape */
+	prev_off = off = 0;
+	while (off < len && ret == UTF8_CONVERT_OK) {
+		/* Must escape '&', '<', and '>' */
+		if (string[off] != '&' && string[off] != '<' &&
+				string[off] != '>') {
+			off = utf8_next(string, len, off);
+			continue;
+		}
+
+		/* Emit the chunk preceding the mandatory escape, if any */
+		if (off > prev_off)
+			ret = utf8_convert_html_chunk(cd, string + prev_off,
+					off - prev_off, &out, &outlen);
+
+		/* Emit mandatory escape */
+		if (ret == UTF8_CONVERT_OK) {
+			esclen = snprintf(escape, sizeof(escape), "&#x%06x;",
+					(unsigned int) string[off]);
+			ret = utf8_convert_html_chunk(cd, escape, esclen,
+					&out, &outlen);
+		}
+
+		prev_off = off = utf8_next(string, len, off);
+	}
+
+	/* Process final chunk */
+	if (ret == UTF8_CONVERT_OK && prev_off < len)
+		ret = utf8_convert_html_chunk(cd, string + prev_off,
+				len - prev_off, &out, &outlen);
+
+	/* Return stateful encodings to their initial shift state */
+	if (ret == UTF8_CONVERT_OK &&
+			iconv(cd, NULL, NULL, &out, &outlen) == (size_t) -1)
+		ret = UTF8_CONVERT_NOMEM;
+
+	iconv_close(cd);
+
+	if (ret != UTF8_CONVERT_OK) {
+		free(origout);
+		return ret;
+	}
+
+	/* Terminate in the reserved space, then shrink-wrap. Should the
+	 * shrink fail, the original allocation is still valid: return it. */
+	memset(out, 0, 4);
+	shrunk = realloc(origout, origoutlen - outlen + 4);
+	*result = (shrunk != NULL) ? shrunk : origout;
+
+	return UTF8_CONVERT_OK;
+}
-- 
cgit v1.2.3