diff options
-rw-r--r-- | desktop/save_complete.c | 115 | ||||
-rw-r--r-- | desktop/tree_url_node.c | 8 | ||||
-rw-r--r-- | utils/corestrings.c | 3 | ||||
-rw-r--r-- | utils/corestrings.h | 1 | ||||
-rw-r--r-- | utils/utf8.c | 139 | ||||
-rw-r--r-- | utils/utf8.h | 3 |
6 files changed, 253 insertions, 16 deletions
diff --git a/desktop/save_complete.c b/desktop/save_complete.c index 7137f0ba9..d9bd507b8 100644 --- a/desktop/save_complete.c +++ b/desktop/save_complete.c @@ -39,8 +39,10 @@ #include "desktop/save_complete.h" #include "render/box.h" #include "render/html.h" +#include "utils/corestrings.h" #include "utils/log.h" #include "utils/nsurl.h" +#include "utils/utf8.h" #include "utils/utils.h" regex_t save_complete_import_re; @@ -551,7 +553,9 @@ static bool save_complete_rewrite_url_value(save_complete_ctx *ctx, { nsurl *url; hlcache_handle *content; + char *escaped; nserror error; + utf8_convert_ret ret; error = nsurl_join(ctx->base, value, &url); if (error == NSERROR_NOMEM) @@ -566,11 +570,25 @@ static bool save_complete_rewrite_url_value(save_complete_ctx *ctx, fprintf(ctx->fp, "\"%p\"", content); } else { /* no match found */ - fprintf(ctx->fp, "\"%s\"", nsurl_access(url)); + ret = utf8_to_html(nsurl_access(url), "UTF-8", + nsurl_length(url), &escaped); nsurl_unref(url); + + if (ret != UTF8_CONVERT_OK) + return false; + + fprintf(ctx->fp, "\"%s\"", escaped); + + free(escaped); } } else { - fprintf(ctx->fp, "\"%.*s\"", (int) value_len, value); + ret = utf8_to_html(value, "UTF-8", value_len, &escaped); + if (ret != UTF8_CONVERT_OK) + return false; + + fprintf(ctx->fp, "\"%s\"", escaped); + + free(escaped); } return true; @@ -579,7 +597,16 @@ static bool save_complete_rewrite_url_value(save_complete_ctx *ctx, static bool save_complete_write_value(save_complete_ctx *ctx, const char *value, size_t value_len) { - fprintf(ctx->fp, "\"%.*s\"", (int) value_len, value); + char *escaped; + utf8_convert_ret ret; + + ret = utf8_to_html(value, "UTF-8", value_len, &escaped); + if (ret != UTF8_CONVERT_OK) + return false; + + fprintf(ctx->fp, "\"%s\"", escaped); + + free(escaped); return true; } @@ -728,7 +755,7 @@ static bool save_complete_handle_attrs(save_complete_ctx *ctx, for (i = 0; i < length; i++) { dom_attr *attr; - error = dom_namednodemap_item(attrs, i, &attr); + error = dom_namednodemap_item(attrs, i, (void *) &attr); if (error != DOM_NO_ERR) return false; @@ -753,6 +780,7 @@ static bool save_complete_handle_element(save_complete_ctx *ctx, dom_namednodemap *attrs; const char *name_data; size_t name_len; + bool process = true; dom_exception error; ctx->iter_state = STATE_NORMAL; @@ -767,9 +795,56 @@ static bool save_complete_handle_element(save_complete_ctx *ctx, name_data = dom_string_data(name); name_len = dom_string_byte_length(name); - /* Elide BASE elements from the output */ if (name_len == SLEN("base") && strncasecmp(name_data, "base", name_len) == 0) { + /* Elide BASE elements from the output */ + process = false; + } else if (name_len == SLEN("meta") && + strncasecmp(name_data, "meta", name_len) == 0) { + /* Don't emit close tags for META elements */ + if (event_type == EVENT_LEAVE) { + process = false; + } else { + /* Elide meta charsets */ + dom_string *value; + error = dom_element_get_attribute(node, + corestring_dom_http_equiv, &value); + if (error != DOM_NO_ERR) { + dom_string_unref(name); + return false; + } + + if (value != NULL) { + if (dom_string_length(value) == + SLEN("Content-Type") && + strncasecmp(dom_string_data(value), + "Content-Type", + SLEN("Content-Type")) == 0) + process = false; + + dom_string_unref(value); + } else { + bool yes; + + error = dom_element_has_attribute(node, + corestring_dom_charset, &yes); + if (error != DOM_NO_ERR) { + dom_string_unref(name); + return false; + } + + if (yes) + process = false; + } + } + } else if (event_type == EVENT_LEAVE && + ((name_len == SLEN("link") && + strncasecmp(name_data, "link", name_len) == 0))) { + /* Don't emit close tags for void elements */ + process = false; + } + + if (process == false) { dom_string_unref(name); return true; } @@ -833,6 +908,12 @@ static bool save_complete_handle_element(save_complete_ctx *ctx, } ctx->iter_state = STATE_IN_STYLE; + } else if (event_type == EVENT_ENTER && name_len == SLEN("head") && + strncasecmp(name_data, "head", name_len) == 0) { + /* If this is a HEAD element, insert a meta charset */ + fputs("<META http-equiv=\"Content-Type\" " + "content=\"text/html; charset=utf-8\">", + ctx->fp); } dom_string_unref(name); @@ -846,6 +927,7 @@ static bool save_complete_node_handler(dom_node *node, save_complete_ctx *ctx = ctxin; dom_node_type type; dom_exception error; + utf8_convert_ret ret; error = dom_node_get_node_type(node, &type); if (error != DOM_NO_ERR) @@ -872,11 +954,20 @@ static bool save_complete_node_handler(dom_node *node, fwrite("<!--", 1, sizeof("<!--") - 1, ctx->fp); if (text != NULL) { + char *escaped; + text_data = dom_string_data(text); text_len = dom_string_byte_length(text); - fwrite(text_data, sizeof(*text_data), - text_len, ctx->fp); + ret = utf8_to_html(text_data, "UTF-8", + text_len, &escaped); + if (ret != UTF8_CONVERT_OK) + return false; + + fwrite(escaped, sizeof(*escaped), + strlen(escaped), ctx->fp); + + free(escaped); dom_string_unref(text); } @@ -917,8 +1008,9 @@ static bool save_complete_node_handler(dom_node *node, name_data = dom_string_data(name); name_len = dom_string_byte_length(name); - fprintf(ctx->fp, " PUBLIC \"%.*s\"", - (int) name_len, name_data); + if (name_len > 0) + fprintf(ctx->fp, " PUBLIC \"%.*s\"", + (int) name_len, name_data); dom_string_unref(name); } @@ -931,8 +1023,9 @@ static bool save_complete_node_handler(dom_node *node, name_data = dom_string_data(name); name_len = dom_string_byte_length(name); - fprintf(ctx->fp, " \"%.*s\"", - (int) name_len, name_data); + if (name_len > 0) + fprintf(ctx->fp, " \"%.*s\"", + (int) name_len, name_data); dom_string_unref(name); } diff --git a/desktop/tree_url_node.c b/desktop/tree_url_node.c index d2701e5b1..5305fa11d 100644 --- a/desktop/tree_url_node.c +++ b/desktop/tree_url_node.c @@ -817,11 +817,11 @@ static bool tree_url_save_entry(struct node *entry, FILE *fp) if (href == NULL) return false; - ret = utf8_to_enc(text, "iso-8859-1", strlen(text), &latin1_text); + ret = utf8_to_html(text, "iso-8859-1", strlen(text), &latin1_text); if (ret != UTF8_CONVERT_OK) return false; - ret = utf8_to_enc(href, "iso-8859-1", strlen(href), &latin1_href); + ret = utf8_to_html(href, "iso-8859-1", strlen(href), &latin1_href); if (ret != UTF8_CONVERT_OK) { free(latin1_text); return false; @@ -872,7 +872,7 @@ static bool tree_url_save_directory(struct node *directory, FILE *fp) if (text == NULL) return false; - ret = utf8_to_enc(text, "iso-8859-1", + ret = utf8_to_html(text, "iso-8859-1", strlen(text), &latin1_text); if (ret != UTF8_CONVERT_OK) return false; @@ -919,7 +919,7 @@ bool tree_urlfile_save(struct tree *tree, const char *filename, fputs("<meta http-equiv=\"Content-Type\" " "content=\"text/html; charset=iso-8859-1\">\n", fp); fprintf(fp, "<title>%s</title>\n", page_title); - fputs("<body>", fp); + fputs("</head>\n<body>", fp); if (tree_url_save_directory(tree_get_root(tree), fp) == false) { warn_user("HotlistSaveError", 0); diff --git a/utils/corestrings.c b/utils/corestrings.c index f970c107b..9fee96d6e 100644 --- a/utils/corestrings.c +++ b/utils/corestrings.c @@ -133,6 +133,7 @@ dom_string *corestring_dom_canplaythrough; dom_string *corestring_dom_cellpadding; dom_string *corestring_dom_cellspacing; dom_string *corestring_dom_change; +dom_string *corestring_dom_charset; dom_string *corestring_dom_click; dom_string *corestring_dom_close; dom_string *corestring_dom_color; @@ -355,6 +356,7 @@ void corestrings_fini(void) CSS_DOM_STRING_UNREF(cellpadding); CSS_DOM_STRING_UNREF(cellspacing); CSS_DOM_STRING_UNREF(change); + CSS_DOM_STRING_UNREF(charset); CSS_DOM_STRING_UNREF(click); CSS_DOM_STRING_UNREF(close); CSS_DOM_STRING_UNREF(color); @@ -608,6 +610,7 @@ nserror corestrings_init(void) CSS_DOM_STRING_INTERN(cellpadding); CSS_DOM_STRING_INTERN(cellspacing); CSS_DOM_STRING_INTERN(change); + CSS_DOM_STRING_INTERN(charset); CSS_DOM_STRING_INTERN(click); CSS_DOM_STRING_INTERN(close); CSS_DOM_STRING_INTERN(color); diff --git a/utils/corestrings.h b/utils/corestrings.h index 61771c178..08d254501 100644 --- a/utils/corestrings.h +++ b/utils/corestrings.h @@ -139,6 +139,7 @@ extern struct dom_string *corestring_dom_canplaythrough; extern struct dom_string *corestring_dom_cellpadding; extern struct dom_string *corestring_dom_cellspacing; extern struct dom_string *corestring_dom_change; +extern struct dom_string *corestring_dom_charset; extern struct dom_string *corestring_dom_click; extern struct dom_string *corestring_dom_close; extern struct dom_string *corestring_dom_color; diff --git a/utils/utf8.c b/utils/utf8.c index 5c27fa7c6..c0f6b106a 100644 --- a/utils/utf8.c +++ b/utils/utf8.c @@ -297,7 +297,7 @@ utf8_convert_ret utf8_convert(const char *string, size_t len, } slen = len ? len : strlen(string); - /* Worst case = ACSII -> UCS4, so allocate an output buffer + /* Worst case = ASCII -> UCS4, so allocate an output buffer * 4 times larger than the input buffer, and add 4 bytes at * the end for the NULL terminator */ @@ -337,3 +337,140 @@ utf8_convert_ret utf8_convert(const char *string, size_t len, return UTF8_CONVERT_OK; } + +static utf8_convert_ret utf8_convert_html_chunk(iconv_t cd, + const char *chunk, size_t inlen, + char **out, size_t *outlen) +{ + size_t ret, esclen; + uint32_t ucs4; + char *pescape, escape[11]; + + while (inlen > 0) { + ret = iconv(cd, (void *) &chunk, &inlen, (void *) out, outlen); + if (ret != (size_t) -1) + break; + + if (errno != EILSEQ) + return UTF8_CONVERT_NOMEM; + + ucs4 = utf8_to_ucs4(chunk, inlen); + esclen = snprintf(escape, sizeof(escape), "&#x%06x;", ucs4); + pescape = escape; + ret = iconv(cd, (void *) &pescape, &esclen, + (void *) out, outlen); + if (ret == (size_t) -1) + return UTF8_CONVERT_NOMEM; + + esclen = utf8_next(chunk, inlen, 0); + chunk += esclen; + inlen -= esclen; + } + + return UTF8_CONVERT_OK; +} + +/** + * Convert a UTF-8 encoded string into a string of the given encoding, + * applying HTML escape sequences where necessary. + * + * \param string String to convert (NUL-terminated) + * \param encname Name of encoding to convert to + * \param len Length, in bytes, of the input string, or 0 + * \param result Pointer to location to receive result + * \return Appropriate utf8_convert_ret value + */ +utf8_convert_ret utf8_to_html(const char *string, const char *encname, + size_t len, char **result) +{ + iconv_t cd; + const char *in; + char *out, *origout; + size_t off, prev_off, inlen, outlen, origoutlen, esclen; + utf8_convert_ret ret; + char *pescape, escape[11]; + + if (len == 0) + len = strlen(string); + + cd = iconv_open(encname, "UTF-8"); + if (cd == (iconv_t) -1) { + if (errno == EINVAL) + return UTF8_CONVERT_BADENC; + /* default to no memory */ + return UTF8_CONVERT_NOMEM; + } + + /* Worst case is ASCII -> UCS4, with all characters escaped: + * "&#xYYYYYY;", thus each input character may become a string + * of 10 UCS4 characters, each 4 bytes in length */ + origoutlen = outlen = len * 10 * 4; + origout = out = malloc(outlen); + if (out == NULL) { + iconv_close(cd); + return UTF8_CONVERT_NOMEM; + } + + /* Process input in chunks between characters we must escape */ + prev_off = off = 0; + while (off < len) { + /* Must escape '&', '<', and '>' */ + if (string[off] == '&' || string[off] == '<' || + string[off] == '>') { + if (off - prev_off > 0) { + /* Emit chunk */ + in = string + prev_off; + inlen = off - prev_off; + ret = utf8_convert_html_chunk(cd, in, inlen, + &out, &outlen); + if (ret != UTF8_CONVERT_OK) { + free(origout); + iconv_close(cd); + return ret; + } + } + + /* Emit mandatory escape */ + esclen = snprintf(escape, sizeof(escape), + "&#x%06x;", string[off]); + pescape = escape; + ret = utf8_convert_html_chunk(cd, pescape, esclen, + &out, &outlen); + if (ret != UTF8_CONVERT_OK) { + free(origout); + iconv_close(cd); + return ret; + } + + prev_off = off = utf8_next(string, len, off); + } else { + off = utf8_next(string, len, off); + } + } + + /* Process final chunk */ + if (prev_off < len) { + in = string + prev_off; + inlen = len - prev_off; + ret = utf8_convert_html_chunk(cd, in, inlen, &out, &outlen); + if (ret != UTF8_CONVERT_OK) { + free(origout); + iconv_close(cd); + return ret; + } + } + + iconv_close(cd); + + /* Shrink-wrap */ + *result = realloc(origout, origoutlen - outlen + 4); + if (*result == NULL) { + free(origout); + return UTF8_CONVERT_NOMEM; + } + memset(*result + (origoutlen - outlen), 0, 4); + + return UTF8_CONVERT_OK; +} + + diff --git a/utils/utf8.h b/utils/utf8.h index 9d8ec74fa..22aee1afa 100644 --- a/utils/utf8.h +++ b/utils/utf8.h @@ -47,6 +47,9 @@ utf8_convert_ret utf8_to_enc(const char *string, const char *encname, utf8_convert_ret utf8_from_enc(const char *string, const char *encname, size_t len, char **result); +utf8_convert_ret utf8_to_html(const char *string, const char *encname, + size_t len, char **result); + /* These two are platform specific */ utf8_convert_ret utf8_to_local_encoding(const char *string, size_t len, char **result); |