6 files changed, 253 insertions, 16 deletions
diff --git a/desktop/save_complete.c b/desktop/save_complete.c
index 7137f0ba9..d9bd507b8 100644
--- a/desktop/save_complete.c
+++ b/desktop/save_complete.c
@@ -39,8 +39,10 @@
 #include "desktop/save_complete.h"
 #include "render/box.h"
 #include "render/html.h"
+#include "utils/corestrings.h"
 #include "utils/log.h"
 #include "utils/nsurl.h"
+#include "utils/utf8.h"
 #include "utils/utils.h"
 
 regex_t save_complete_import_re;
@@ -551,7 +553,9 @@ static bool save_complete_rewrite_url_value(save_complete_ctx *ctx,
 {
 	nsurl *url;
 	hlcache_handle *content;
+	char *escaped;
 	nserror error;
+	utf8_convert_ret ret;
 
 	error = nsurl_join(ctx->base, value, &url);
 	if (error == NSERROR_NOMEM)
@@ -566,11 +570,25 @@ static bool save_complete_rewrite_url_value(save_complete_ctx *ctx,
 			fprintf(ctx->fp, "\"%p\"", content);
 		} else {
 			/* no match found */
-			fprintf(ctx->fp, "\"%s\"", nsurl_access(url));
+			ret = utf8_to_html(nsurl_access(url), "UTF-8",
+					nsurl_length(url), &escaped);
 			nsurl_unref(url);
+
+			if (ret != UTF8_CONVERT_OK)
+				return false;
+
+			fprintf(ctx->fp, "\"%s\"", escaped);
+
+			free(escaped);
 		}
 	} else {
-		fprintf(ctx->fp, "\"%.*s\"", (int) value_len, value);
+		ret = utf8_to_html(value, "UTF-8", value_len, &escaped);
+		if (ret != UTF8_CONVERT_OK)
+			return false;
+
+		fprintf(ctx->fp, "\"%s\"", escaped);
+
+		free(escaped);
 	}
 
 	return true;
@@ -579,7 +597,16 @@ static bool save_complete_rewrite_url_value(save_complete_ctx *ctx,
 static bool save_complete_write_value(save_complete_ctx *ctx,
 		const char *value, size_t value_len)
 {
-	fprintf(ctx->fp, "\"%.*s\"", (int) value_len, value);
+	char *escaped; /* converted copy of value; freed below */
+	utf8_convert_ret ret;
+
+	ret = utf8_to_html(value, "UTF-8", value_len, &escaped);
+	if (ret != UTF8_CONVERT_OK)
+		return false; /* nothing has been written to ctx->fp */
+
+	fprintf(ctx->fp, "\"%s\"", escaped); /* double-quoted output */
+
+	free(escaped);
 
 	return true;
 }
@@ -728,7 +755,7 @@ static bool save_complete_handle_attrs(save_complete_ctx *ctx,
 	for (i = 0; i < length; i++) {
 		dom_attr *attr;
 
-		error = dom_namednodemap_item(attrs, i, &attr);
+		error = dom_namednodemap_item(attrs, i, (void *) &attr);
 		if (error != DOM_NO_ERR)
 			return false;
 
@@ -753,6 +780,7 @@ static bool save_complete_handle_element(save_complete_ctx *ctx,
 	dom_namednodemap *attrs;
 	const char *name_data;
 	size_t name_len;
+	bool process = true;
 	dom_exception error;
 
 	ctx->iter_state = STATE_NORMAL;
@@ -767,9 +795,56 @@ static bool save_complete_handle_element(save_complete_ctx *ctx,
 	name_data = dom_string_data(name);
 	name_len = dom_string_byte_length(name);
 
-	/* Elide BASE elements from the output */
 	if (name_len == SLEN("base") &&
 			strncasecmp(name_data, "base", name_len) == 0) {
+		/* Elide BASE elements from the output */
+		process = false;
+	} else if (name_len == SLEN("meta") && 
+			strncasecmp(name_data, "meta", name_len) == 0) {
+		/* Don't emit close tags for META elements */
+		if (event_type == EVENT_LEAVE) {
+			process = false;
+		} else {
+			/* Elide meta charsets */
+			dom_string *value;
+			error = dom_element_get_attribute(node,
+					corestring_dom_http_equiv, &value);
+			if (error != DOM_NO_ERR) {
+				dom_string_unref(name);
+				return false;
+			}
+
+			if (value != NULL) {
+				if (dom_string_length(value) ==
+					SLEN("Content-Type") &&
+					strncasecmp(dom_string_data(value),
+						"Content-Type",
+						SLEN("Content-Type")) == 0)
+					process = false;
+
+				dom_string_unref(value);
+			} else {
+				bool yes;
+
+				error = dom_element_has_attribute(node,
+						corestring_dom_charset, &yes);
+				if (error != DOM_NO_ERR) {
+					dom_string_unref(name);
+					return false;
+				}
+
+				if (yes)
+					process = false;
+			}
+		}
+	} else if (event_type == EVENT_LEAVE && 
+			((name_len == SLEN("link") && 
+			strncasecmp(name_data, "link", name_len) == 0))) {
+		/* Don't emit close tags for void elements */
+		process = false;
+	}
+
+	if (process == false) {
 		dom_string_unref(name);
 		return true;
 	}
@@ -833,6 +908,12 @@ static bool save_complete_handle_element(save_complete_ctx *ctx,
 		}
 
 		ctx->iter_state = STATE_IN_STYLE;
+	} else if (event_type == EVENT_ENTER && name_len == SLEN("head") &&
+			strncasecmp(name_data, "head", name_len) == 0) {
+		/* If this is a HEAD element, insert a meta charset */
+		fputs("<META http-equiv=\"Content-Type\" "
+				"content=\"text/html; charset=utf-8\">",
+				ctx->fp);
 	}
 
 	dom_string_unref(name);
@@ -846,6 +927,7 @@ static bool save_complete_node_handler(dom_node *node,
 	save_complete_ctx *ctx = ctxin;
 	dom_node_type type;
 	dom_exception error;
+	utf8_convert_ret ret;
 
 	error = dom_node_get_node_type(node, &type);
 	if (error != DOM_NO_ERR)
@@ -872,11 +954,24 @@ static bool save_complete_node_handler(dom_node *node,
 				fwrite("<!--", 1, sizeof("<!--") - 1, ctx->fp);
 
 			if (text != NULL) {
+				char *escaped;
+
 				text_data = dom_string_data(text);
 				text_len = dom_string_byte_length(text);
 
-				fwrite(text_data, sizeof(*text_data), 
-						text_len, ctx->fp);
+				/* Write the text with markup escaped */
+				ret = utf8_to_html(text_data, "UTF-8",
+						text_len, &escaped);
+				if (ret != UTF8_CONVERT_OK) {
+					/* Drop our reference before bailing */
+					dom_string_unref(text);
+					return false;
+				}
+
+				fwrite(escaped, sizeof(*escaped),
+						strlen(escaped), ctx->fp);
+
+				free(escaped);
 
 				dom_string_unref(text);
 			}
@@ -917,8 +1008,9 @@ static bool save_complete_node_handler(dom_node *node,
 			name_data = dom_string_data(name);
 			name_len = dom_string_byte_length(name);
 
-			fprintf(ctx->fp, " PUBLIC \"%.*s\"",
-					(int) name_len, name_data);
+			if (name_len > 0)
+				fprintf(ctx->fp, " PUBLIC \"%.*s\"",
+						(int) name_len, name_data);
 
 			dom_string_unref(name);
 		}
@@ -931,8 +1023,9 @@ static bool save_complete_node_handler(dom_node *node,
 			name_data = dom_string_data(name);
 			name_len = dom_string_byte_length(name);
 
-			fprintf(ctx->fp, " \"%.*s\"",
-					(int) name_len, name_data);
+			if (name_len > 0)
+				fprintf(ctx->fp, " \"%.*s\"",
+						(int) name_len, name_data);
 
 			dom_string_unref(name);
 		}
diff --git a/desktop/tree_url_node.c b/desktop/tree_url_node.c
index d2701e5b1..5305fa11d 100644
--- a/desktop/tree_url_node.c
+++ b/desktop/tree_url_node.c
@@ -817,11 +817,11 @@ static bool tree_url_save_entry(struct node *entry, FILE *fp)
 	if (href == NULL)
 		return false;
 
-	ret = utf8_to_enc(text, "iso-8859-1", strlen(text), &latin1_text);
+	ret = utf8_to_html(text, "iso-8859-1", strlen(text), &latin1_text);
 	if (ret != UTF8_CONVERT_OK)
 		return false;
 
-	ret = utf8_to_enc(href, "iso-8859-1", strlen(href), &latin1_href);
+	ret = utf8_to_html(href, "iso-8859-1", strlen(href), &latin1_href);
 	if (ret != UTF8_CONVERT_OK) {
 		free(latin1_text);
 		return false;
@@ -872,7 +872,7 @@ static bool tree_url_save_directory(struct node *directory, FILE *fp)
 			if (text == NULL)
 				return false;
 
-			ret = utf8_to_enc(text, "iso-8859-1",
+			ret = utf8_to_html(text, "iso-8859-1",
 					strlen(text), &latin1_text);
 			if (ret != UTF8_CONVERT_OK)
 				return false;
@@ -919,7 +919,7 @@ bool tree_urlfile_save(struct tree *tree, const char *filename,
 	fputs("<meta http-equiv=\"Content-Type\" "
 		"content=\"text/html; charset=iso-8859-1\">\n", fp);
 	fprintf(fp, "<title>%s</title>\n", page_title);
-	fputs("<body>", fp);
+	fputs("</head>\n<body>", fp);
 
 	if (tree_url_save_directory(tree_get_root(tree), fp) == false) {
 		warn_user("HotlistSaveError", 0);
diff --git a/utils/corestrings.c b/utils/corestrings.c
index f970c107b..9fee96d6e 100644
--- a/utils/corestrings.c
+++ b/utils/corestrings.c
@@ -133,6 +133,7 @@ dom_string *corestring_dom_canplaythrough;
 dom_string *corestring_dom_cellpadding;
 dom_string *corestring_dom_cellspacing;
 dom_string *corestring_dom_change;
+dom_string *corestring_dom_charset;
 dom_string *corestring_dom_click;
 dom_string *corestring_dom_close;
 dom_string *corestring_dom_color;
@@ -355,6 +356,7 @@ void corestrings_fini(void)
 	CSS_DOM_STRING_UNREF(cellpadding);
 	CSS_DOM_STRING_UNREF(cellspacing);
 	CSS_DOM_STRING_UNREF(change);
+	CSS_DOM_STRING_UNREF(charset);
 	CSS_DOM_STRING_UNREF(click);
 	CSS_DOM_STRING_UNREF(close);
 	CSS_DOM_STRING_UNREF(color);
@@ -608,6 +610,7 @@ nserror corestrings_init(void)
 	CSS_DOM_STRING_INTERN(cellpadding);
 	CSS_DOM_STRING_INTERN(cellspacing);
 	CSS_DOM_STRING_INTERN(change);
+	CSS_DOM_STRING_INTERN(charset);
 	CSS_DOM_STRING_INTERN(click);
 	CSS_DOM_STRING_INTERN(close);
 	CSS_DOM_STRING_INTERN(color);
diff --git a/utils/corestrings.h b/utils/corestrings.h
index 61771c178..08d254501 100644
--- a/utils/corestrings.h
+++ b/utils/corestrings.h
@@ -139,6 +139,7 @@ extern struct dom_string *corestring_dom_canplaythrough;
 extern struct dom_string *corestring_dom_cellpadding;
 extern struct dom_string *corestring_dom_cellspacing;
 extern struct dom_string *corestring_dom_change;
+extern struct dom_string *corestring_dom_charset;
 extern struct dom_string *corestring_dom_click;
 extern struct dom_string *corestring_dom_close;
 extern struct dom_string *corestring_dom_color;
diff --git a/utils/utf8.c b/utils/utf8.c
index 5c27fa7c6..c0f6b106a 100644
--- a/utils/utf8.c
+++ b/utils/utf8.c
@@ -297,7 +297,7 @@ utf8_convert_ret utf8_convert(const char *string, size_t len,
 	}
 
 	slen = len ? len : strlen(string);
-	/* Worst case = ACSII -> UCS4, so allocate an output buffer
+	/* Worst case = ASCII -> UCS4, so allocate an output buffer
 	 * 4 times larger than the input buffer, and add 4 bytes at
 	 * the end for the NULL terminator
 	 */
@@ -337,3 +337,183 @@ utf8_convert_ret utf8_convert(const char *string, size_t len,
 
 	return UTF8_CONVERT_OK;
 }
+
+/**
+ * Size of the scratch buffer for one numeric character reference:
+ * "&#x", up to 8 hex digits (any 32-bit value), ";" and a NUL need 13
+ * bytes; this leaves a little headroom.
+ */
+#define UTF8_HTML_ESCAPE_LEN 16
+
+/**
+ * Convert a run of UTF-8 text through an open iconv descriptor, writing
+ * any character iconv rejects as a numeric character reference instead.
+ *
+ * \param cd      Conversion descriptor, from UTF-8 to the target encoding
+ * \param chunk   Start of the UTF-8 input run
+ * \param inlen   Length of the input run, in bytes
+ * \param out     Output cursor; advanced past the bytes written
+ * \param outlen  Space left at *out, in bytes; reduced to match
+ * \return UTF8_CONVERT_OK on success, UTF8_CONVERT_NOMEM on any failure
+ */
+static utf8_convert_ret utf8_convert_html_chunk(iconv_t cd,
+		const char *chunk, size_t inlen,
+		char **out, size_t *outlen)
+{
+	size_t ret, esclen;
+	uint32_t ucs4;
+	int written;
+	char *pescape, escape[UTF8_HTML_ESCAPE_LEN];
+
+	while (inlen > 0) {
+		ret = iconv(cd, (void *) &chunk, &inlen, (void *) out, outlen);
+		if (ret != (size_t) -1)
+			break;
+
+		/* Only an unconvertible character can be worked around */
+		if (errno != EILSEQ)
+			return UTF8_CONVERT_NOMEM;
+
+		/* Emit the offending character as "&#xYYYYYY;". The result
+		 * of snprintf is checked so that esclen can never exceed
+		 * what was actually stored in escape[] */
+		ucs4 = utf8_to_ucs4(chunk, inlen);
+		written = snprintf(escape, sizeof(escape), "&#x%06x;",
+				(unsigned int) ucs4);
+		if (written < 0 || (size_t) written >= sizeof(escape))
+			return UTF8_CONVERT_NOMEM;
+
+		esclen = (size_t) written;
+		pescape = escape;
+		ret = iconv(cd, (void *) &pescape, &esclen,
+				(void *) out, outlen);
+		if (ret == (size_t) -1)
+			return UTF8_CONVERT_NOMEM;
+
+		/* Step over the character just escaped and carry on */
+		esclen = utf8_next(chunk, inlen, 0);
+		chunk += esclen;
+		inlen -= esclen;
+	}
+
+	return UTF8_CONVERT_OK;
+}
+
+/**
+ * Convert a UTF-8 encoded string into a string of the given encoding,
+ * applying HTML escape sequences where necessary.
+ *
+ * '&', '<', '>' and '"' are always written as numeric character
+ * references, so the result can be used as element content or inside a
+ * double-quoted attribute value. Characters the target encoding cannot
+ * represent are written the same way.
+ *
+ * \param string   String to convert (NUL-terminated)
+ * \param encname  Name of encoding to convert to
+ * \param len      Length, in bytes, of the input string, or 0 to have
+ *                 it measured with strlen()
+ * \param result   Pointer to location to receive result. On success this
+ *                 is an allocated buffer, terminated by four zero bytes,
+ *                 which the caller must free(). Untouched on failure.
+ * \return Appropriate utf8_convert_ret value
+ */
+utf8_convert_ret utf8_to_html(const char *string, const char *encname,
+		size_t len, char **result)
+{
+	iconv_t cd;
+	const char *in;
+	char *out, *origout, *shrunk;
+	size_t off, prev_off, inlen, outlen, origoutlen, used;
+	utf8_convert_ret ret;
+	int written;
+	char escape[UTF8_HTML_ESCAPE_LEN];
+
+	if (len == 0)
+		len = strlen(string);
+
+	/* Worst case is ASCII -> UCS4, with all characters escaped:
+	 * "&#xYYYYYY;", thus each input character may become a string
+	 * of 10 UCS4 characters, each 4 bytes in length. Reject input so
+	 * long that this, plus the terminator, would overflow size_t. */
+	if (len > ((size_t) -1 - 4) / (10 * 4))
+		return UTF8_CONVERT_NOMEM;
+
+	cd = iconv_open(encname, "UTF-8");
+	if (cd == (iconv_t) -1) {
+		if (errno == EINVAL)
+			return UTF8_CONVERT_BADENC;
+		/* default to no memory */
+		return UTF8_CONVERT_NOMEM;
+	}
+
+	/* Room for the terminator is reserved up front, so the request
+	 * is never for zero bytes, even when the input is empty */
+	origoutlen = outlen = len * 10 * 4;
+	origout = out = malloc(origoutlen + 4);
+	if (out == NULL) {
+		iconv_close(cd);
+		return UTF8_CONVERT_NOMEM;
+	}
+
+	/* Process input in chunks between characters we must escape */
+	prev_off = off = 0;
+	while (off < len) {
+		/* Must escape '&', '<', '>' and '"' */
+		if (string[off] != '&' && string[off] != '<' &&
+				string[off] != '>' && string[off] != '"') {
+			off = utf8_next(string, len, off);
+			continue;
+		}
+
+		if (off - prev_off > 0) {
+			/* Emit chunk */
+			in = string + prev_off;
+			inlen = off - prev_off;
+			ret = utf8_convert_html_chunk(cd, in, inlen,
+					&out, &outlen);
+			if (ret != UTF8_CONVERT_OK)
+				goto error;
+		}
+
+		/* Emit mandatory escape */
+		written = snprintf(escape, sizeof(escape), "&#x%06x;",
+				(unsigned int) (unsigned char) string[off]);
+		if (written < 0 || (size_t) written >= sizeof(escape)) {
+			ret = UTF8_CONVERT_NOMEM;
+			goto error;
+		}
+
+		ret = utf8_convert_html_chunk(cd, escape, (size_t) written,
+				&out, &outlen);
+		if (ret != UTF8_CONVERT_OK)
+			goto error;
+
+		prev_off = off = utf8_next(string, len, off);
+	}
+
+	/* Process final chunk */
+	if (prev_off < len) {
+		in = string + prev_off;
+		inlen = len - prev_off;
+		ret = utf8_convert_html_chunk(cd, in, inlen, &out, &outlen);
+		if (ret != UTF8_CONVERT_OK)
+			goto error;
+	}
+
+	iconv_close(cd);
+
+	used = origoutlen - outlen;
+	memset(origout + used, 0, 4);
+
+	/* Shrink-wrap. Should that fail, the original allocation is
+	 * still valid and already terminated, so hand that back. */
+	shrunk = realloc(origout, used + 4);
+	*result = (shrunk != NULL) ? shrunk : origout;
+
+	return UTF8_CONVERT_OK;
+
+error:
+	free(origout);
+	iconv_close(cd);
+	return ret;
+}
diff --git a/utils/utf8.h b/utils/utf8.h
index 9d8ec74fa..22aee1afa 100644
--- a/utils/utf8.h
+++ b/utils/utf8.h
@@ -47,6 +47,9 @@ utf8_convert_ret utf8_to_enc(const char *string, const char *encname,
 utf8_convert_ret utf8_from_enc(const char *string, const char *encname,
 		size_t len, char **result);
 
+utf8_convert_ret utf8_to_html(const char *string, const char *encname,
+		size_t len, char **result);
+
 /* These two are platform specific */
 utf8_convert_ret utf8_to_local_encoding(const char *string, size_t len,
 		char **result);