From f824ab8af4d3d2e34bd59b860b9c6d5568c3bb44 Mon Sep 17 00:00:00 2001 From: John-Mark Bell Date: Sat, 3 Nov 2012 23:19:28 +0000 Subject: Port save complete to libdom. --- desktop/save_complete.c | 1506 ++++++++++++++++++++++++++++------------------- desktop/save_complete.h | 29 +- 2 files changed, 912 insertions(+), 623 deletions(-) (limited to 'desktop') diff --git a/desktop/save_complete.c b/desktop/save_complete.c index 0ac64b515..42da86752 100644 --- a/desktop/save_complete.c +++ b/desktop/save_complete.c @@ -1,5 +1,5 @@ /* - * Copyright 2004 John M Bell + * Copyright 2012 John-Mark Bell * Copyright 2004-2007 James Bursa * * This file is part of NetSurf, http://www.netsurf-browser.org/ @@ -30,492 +30,251 @@ #include #include #include + #include -#include "utils/config.h" + #include "content/content.h" #include "content/hlcache.h" #include "css/css.h" -#include "render/box.h" #include "desktop/save_complete.h" +#include "render/box.h" +#include "render/html.h" #include "utils/log.h" -#include "utils/url.h" +#include "utils/nsurl.h" #include "utils/utils.h" -#include "render/html.h" regex_t save_complete_import_re; /** An entry in save_complete_list. */ -struct save_complete_entry { +typedef struct save_complete_entry { hlcache_handle *content; struct save_complete_entry *next; /**< Next entry in list */ -}; - -static bool save_complete_html(hlcache_handle *c, const char *path, - bool index, struct save_complete_entry **list); -static bool save_imported_sheets(struct nscss_import *imports, uint32_t count, - const char *path, struct save_complete_entry **list); -static char * rewrite_stylesheet_urls(const char *source, unsigned int size, - int *osize, const char *base, - struct save_complete_entry *list); -static bool rewrite_document_urls(xmlDoc *doc, const char *base, - struct save_complete_entry *list); -static bool rewrite_urls(xmlNode *n, const char *base, - struct save_complete_entry *list); -static bool rewrite_url(xmlNode *n, const char *attr, const char *base, - struct save_complete_entry *list); -static bool save_complete_list_add(hlcache_handle *content, - struct save_complete_entry **list); -static hlcache_handle * save_complete_list_find(const char *url, - struct save_complete_entry *list); -static bool save_complete_list_check(hlcache_handle *content, - struct save_complete_entry *list); -/* static void save_complete_list_dump(void); */ -static bool save_complete_inventory(const char *path, - struct save_complete_entry *list); - -/** - * Save an HTML page with all dependencies. - * - * \param c CONTENT_HTML to save - * \param path directory to save to (must exist) - * \return true on success, false on error and error reported - */ +} save_complete_entry; -bool save_complete(hlcache_handle *c, const char *path) -{ - bool result; - struct save_complete_entry *list = NULL; - - result = save_complete_html(c, path, true, &list); +typedef struct save_complete_ctx { + const char *path; + save_complete_entry *list; + save_complete_set_type_cb set_type; - if (result) - result = save_complete_inventory(path, list); + nsurl *base; + FILE *fp; + enum { STATE_NORMAL, STATE_IN_STYLE } iter_state; +} save_complete_ctx; - /* free save_complete_list */ - while (list) { - struct save_complete_entry *next = list->next; - free(list); - list = next; - } +typedef enum { + EVENT_ENTER, + EVENT_LEAVE +} save_complete_event_type; - return result; -} +static bool save_complete_save_html(save_complete_ctx *ctx, hlcache_handle *c, + bool index); +static bool save_complete_save_imported_sheets(save_complete_ctx *ctx, + struct nscss_import *imports, uint32_t import_count); -/** - * Save an HTML page with all dependencies, recursing through imported pages. - * - * \param c CONTENT_HTML to save - * \param path directory to save to (must exist) - * \param index true to save as "index" - * \return true on success, false on error and error reported - */ -bool save_complete_html(hlcache_handle *c, const char *path, bool index, - struct save_complete_entry **list) +static void save_complete_ctx_initialise(save_complete_ctx *ctx, + const char *path, save_complete_set_type_cb set_type) { - struct html_stylesheet *sheets; - struct content_html_object *object; - char filename[256]; - unsigned int i, count; - xmlDocPtr doc = NULL; - bool res; - - if (content_get_type(c) != CONTENT_HTML) - return false; - - if (save_complete_list_check(c, *list)) - return true; - - /* save stylesheets, ignoring the base and adblocking sheets */ - sheets = html_get_stylesheets(c, &count); - - for (i = STYLESHEET_START; i != count; i++) { - hlcache_handle *css; - const char *css_data; - unsigned long css_size; - char *source; - int source_len; - struct nscss_import *imports; - uint32_t import_count; - lwc_string *type; - - if (sheets[i].type == HTML_STYLESHEET_INTERNAL) { - if (save_imported_sheets( - sheets[i].data.internal->imports, - sheets[i].data.internal->import_count, - path, list) == false) - return false; - - continue; - } - - css = sheets[i].data.external; - - if (!css) - continue; - if (save_complete_list_check(css, *list)) - continue; - - if (!save_complete_list_add(css, list)) { - warn_user("NoMemory", 0); - return false; - } - - imports = nscss_get_imports(css, &import_count); - if (!save_imported_sheets(imports, import_count, path, list)) - return false; - - snprintf(filename, sizeof filename, "%p", css); - - css_data = content_get_source_data(css, &css_size); - - source = rewrite_stylesheet_urls(css_data, css_size, - &source_len, nsurl_access(hlcache_handle_get_url(css)), - *list); - if (!source) { - warn_user("NoMemory", 0); - return false; - } - - type = content_get_mime_type(css); - if (type == NULL) { - free(source); - return false; - } - - res = save_complete_gui_save(path, filename, source_len, - source, type); - - lwc_string_unref(type); - free(source); - - if (res == false) - return false; - } - - /* save objects */ - object = html_get_objects(c, &count); - - for (; object != NULL; object = object->next) { - hlcache_handle *obj = object->content; - const char *obj_data; - unsigned long obj_size; - lwc_string *type; - - if (obj == NULL || content_get_type(obj) == CONTENT_NONE) - continue; - - obj_data = content_get_source_data(obj, &obj_size); - - if (obj_data == NULL) - continue; - - if (save_complete_list_check(obj, *list)) - continue; - - if (!save_complete_list_add(obj, list)) { - warn_user("NoMemory", 0); - return false; - } - - if (content_get_type(obj) == CONTENT_HTML) { - if (!save_complete_html(obj, path, false, list)) - return false; - continue; - } - - snprintf(filename, sizeof filename, "%p", obj); - - type = content_get_mime_type(obj); - if (type == NULL) - return false; - - res = save_complete_gui_save(path, filename, - obj_size, obj_data, type); - - lwc_string_unref(type); - - if(res == false) - return false; - } + ctx->path = path; + ctx->list = NULL; + ctx->set_type = set_type; +} - /* create shiny XML document from the content source */ - - { - unsigned long html_size; - const char *html_source; - xmlChar *terminated_html_source; - html_source = content_get_source_data(c, &html_size); - - terminated_html_source = malloc(html_size + 1); - if (terminated_html_source != NULL) { - memcpy(terminated_html_source, html_source, html_size); - terminated_html_source[html_size] = '\0'; - doc = htmlParseDoc(terminated_html_source, NULL); - free(terminated_html_source); - } - - } - - if (doc == NULL) { - warn_user("NoMemory", 0); - return false; - } +static void save_complete_ctx_finalise(save_complete_ctx *ctx) +{ + save_complete_entry *list = ctx->list; - /* rewrite all urls we know about */ - if (!rewrite_document_urls(doc, nsurl_access(html_get_base_url(c)), - *list)) { - xmlFreeDoc(doc); - warn_user("NoMemory", 0); - return false; + while (list != NULL) { + save_complete_entry *next = list->next; + free(list); + list = next; } +} - /* save the html file out last of all */ - if (index) - snprintf(filename, sizeof filename, "index"); - else - snprintf(filename, sizeof filename, "%p", c); - - errno = 0; - if (save_complete_htmlSaveFileFormat(path, filename, doc, 0, 0) == -1) { - if (errno) - warn_user("SaveError", strerror(errno)); - else - warn_user("SaveError", "htmlSaveFileFormat failed"); +static bool save_complete_ctx_add_content(save_complete_ctx *ctx, + hlcache_handle *content) +{ + save_complete_entry *entry; - xmlFreeDoc(doc); + entry = malloc(sizeof (*entry)); + if (entry == NULL) return false; - } - xmlFreeDoc(doc); + entry->content = content; + entry->next = ctx->list; + ctx->list = entry; return true; } -/** - * Save stylesheets imported by a CONTENT_CSS. - * - * \param imports Array of imports - * \param count Number of imports in list - * \param path Path to save to - * \return true on success, false on error and error reported - */ -bool save_imported_sheets(struct nscss_import *imports, uint32_t count, - const char *path, struct save_complete_entry **list) +static hlcache_handle *save_complete_ctx_find_content(save_complete_ctx *ctx, + const nsurl *url) { - char filename[256]; - unsigned int j; - char *source; - int source_len; - bool res; - - for (j = 0; j != count; j++) { - hlcache_handle *css = imports[j].c; - const char *css_data; - unsigned long css_size; - struct nscss_import *child_imports; - uint32_t child_import_count; - lwc_string *type; - - if (css == NULL) - continue; - if (save_complete_list_check(css, *list)) - continue; + save_complete_entry *entry; - if (!save_complete_list_add(css, list)) { - warn_user("NoMemory", 0); - return false; - } + for (entry = ctx->list; entry != NULL; entry = entry->next) + if (nsurl_compare(url, + hlcache_handle_get_url(entry->content), + NSURL_COMPLETE)) + return entry->content; - child_imports = nscss_get_imports(css, &child_import_count); - if (!save_imported_sheets(child_imports, child_import_count, - path, list)) - return false; + return NULL; +} - snprintf(filename, sizeof filename, "%p", css); - css_data = content_get_source_data(css, &css_size); +static bool save_complete_ctx_has_content(save_complete_ctx *ctx, + hlcache_handle *content) +{ + save_complete_entry *entry; - source = rewrite_stylesheet_urls(css_data, css_size, - &source_len, nsurl_access(hlcache_handle_get_url(css)), - *list); - if (!source) { - warn_user("NoMemory", 0); - return false; - } + for (entry = ctx->list; entry != NULL; entry = entry->next) + if (entry->content == content) + return true; - if (lwc_intern_string("text/css", SLEN("text/css"), &type) != - lwc_error_ok) { - free(source); - warn_user("NoMemory", 0); - return false; - } + return false; +} - res = save_complete_gui_save(path, filename, source_len, - source, type); +static bool save_complete_save_buffer(save_complete_ctx *ctx, + const char *leafname, const char *data, size_t data_len, + lwc_string *mime_type) +{ + FILE *fp; + bool error; + char fullpath[PATH_MAX]; - lwc_string_unref(type); - free(source); + strncpy(fullpath, ctx->path, sizeof fullpath); + error = path_add_part(fullpath, sizeof fullpath, leafname); + if (error == false) { + warn_user("NoMemory", NULL); + return false; + } - if (res == false) - return false; + fp = fopen(fullpath, "wb"); + if (fp == NULL) { + LOG(("fopen(): errno = %i", errno)); + warn_user("SaveError", strerror(errno)); + return false; } - return true; -} + fwrite(data, sizeof(*data), data_len, fp); + fclose(fp); -/** - * Initialise the save_complete module. - */ + if (ctx->set_type != NULL) + ctx->set_type(fullpath, mime_type); -void save_complete_init(void) -{ - /* Match an @import rule - see CSS 2.1 G.1. */ - regcomp_wrapper(&save_complete_import_re, - "@import" /* IMPORT_SYM */ - "[ \t\r\n\f]*" /* S* */ - /* 1 */ - "(" /* [ */ - /* 2 3 */ - "\"(([^\"]|[\\]\")*)\"" /* STRING (approximated) */ - "|" - /* 4 5 */ - "'(([^']|[\\]')*)'" - "|" /* | */ - "url\\([ \t\r\n\f]*" /* URI (approximated) */ - /* 6 7 */ - "\"(([^\"]|[\\]\")*)\"" - "[ \t\r\n\f]*\\)" - "|" - "url\\([ \t\r\n\f]*" - /* 8 9 */ - "'(([^']|[\\]')*)'" - "[ \t\r\n\f]*\\)" - "|" - "url\\([ \t\r\n\f]*" - /* 10 */ - "([^) \t\r\n\f]*)" - "[ \t\r\n\f]*\\)" - ")", /* ] */ - REG_EXTENDED | REG_ICASE); + return true; } - /** * Rewrite stylesheet \@import rules for save complete. * - * @param source stylesheet source - * @param size size of source - * @param osize updated with the size of the result - * @param base url of stylesheet - * @return converted source, or 0 on out of memory + * \param source stylesheet source + * \param size size of source + * \param base url of stylesheet + * \param osize updated with the size of the result + * \return converted source, or NULL on out of memory */ -char * rewrite_stylesheet_urls(const char *source, unsigned int size, - int *osize, const char *base, - struct save_complete_entry *list) +static char *save_complete_rewrite_stylesheet_urls(save_complete_ctx *ctx, + const char *source, unsigned long size, const nsurl *base, + unsigned long *osize) { - char *res; - const char *url; - char *url2; - char buf[20]; - unsigned int offset = 0; - int url_len = 0; - hlcache_handle *content; - int m; - unsigned int i; + char *rewritten; + unsigned long offset = 0; unsigned int imports = 0; - regmatch_t match[11]; - url_func_result result; + nserror error; - /* count number occurences of @import to (over)estimate result size */ + /* count number occurrences of @import to (over)estimate result size */ /* can't use strstr because source is not 0-terminated string */ - for (i = 0; 7 < size && i != size - 7; i++) { - if (source[i] == '@' && - tolower(source[i + 1]) == 'i' && - tolower(source[i + 2]) == 'm' && - tolower(source[i + 3]) == 'p' && - tolower(source[i + 4]) == 'o' && - tolower(source[i + 5]) == 'r' && - tolower(source[i + 6]) == 't') + for (offset = 0; SLEN("@import") < size && + offset <= size - SLEN("@import"); offset++) { + if (source[offset] == '@' && + tolower(source[offset + 1]) == 'i' && + tolower(source[offset + 2]) == 'm' && + tolower(source[offset + 3]) == 'p' && + tolower(source[offset + 4]) == 'o' && + tolower(source[offset + 5]) == 'r' && + tolower(source[offset + 6]) == 't') imports++; } - res = malloc(size + imports * 20); - if (!res) - return 0; + rewritten = malloc(size + imports * 20); + if (rewritten == NULL) + return NULL; *osize = 0; + offset = 0; while (offset < size) { - m = regexec(&save_complete_import_re, source + offset, + const char *import_url = NULL; + char *import_url_copy; + int import_url_len = 0; + nsurl *url = NULL; + regmatch_t match[11]; + int m = regexec(&save_complete_import_re, source + offset, 11, match, 0); if (m) break; - /*for (unsigned int i = 0; i != 11; i++) { - if (match[i].rm_so == -1) - continue; - fprintf(stderr, "%i: '%.*s'\n", i, - match[i].rm_eo - match[i].rm_so, - source + offset + match[i].rm_so); - }*/ - - url = 0; if (match[2].rm_so != -1) { - url = source + offset + match[2].rm_so; - url_len = match[2].rm_eo - match[2].rm_so; + import_url = source + offset + match[2].rm_so; + import_url_len = match[2].rm_eo - match[2].rm_so; } else if (match[4].rm_so != -1) { - url = source + offset + match[4].rm_so; - url_len = match[4].rm_eo - match[4].rm_so; + import_url = source + offset + match[4].rm_so; + import_url_len = match[4].rm_eo - match[4].rm_so; } else if (match[6].rm_so != -1) { - url = source + offset + match[6].rm_so; - url_len = match[6].rm_eo - match[6].rm_so; + import_url = source + offset + match[6].rm_so; + import_url_len = match[6].rm_eo - match[6].rm_so; } else if (match[8].rm_so != -1) { - url = source + offset + match[8].rm_so; - url_len = match[8].rm_eo - match[8].rm_so; + import_url = source + offset + match[8].rm_so; + import_url_len = match[8].rm_eo - match[8].rm_so; } else if (match[10].rm_so != -1) { - url = source + offset + match[10].rm_so; - url_len = match[10].rm_eo - match[10].rm_so; + import_url = source + offset + match[10].rm_so; + import_url_len = match[10].rm_eo - match[10].rm_so; } - assert(url); + assert(import_url != NULL); - url2 = strndup(url, url_len); - if (!url2) { - free(res); - return 0; + import_url_copy = strndup(import_url, import_url_len); + if (import_url_copy == NULL) { + free(rewritten); + return NULL; } - result = url_join(url2, base, (char**)&url); - free(url2); - if (result == URL_FUNC_NOMEM) { - free(res); - return 0; + + error = nsurl_join(base, import_url_copy, &url); + free(import_url_copy); + if (error == NSERROR_NOMEM) { + free(rewritten); + return NULL; } /* copy data before match */ - memcpy(res + *osize, source + offset, match[0].rm_so); + memcpy(rewritten + *osize, source + offset, match[0].rm_so); *osize += match[0].rm_so; - if (result == URL_FUNC_OK) { - content = save_complete_list_find(url, list); - if (content) { + if (url != NULL) { + hlcache_handle *content; + content = save_complete_ctx_find_content(ctx, url); + if (content != NULL) { /* replace import */ + char buf[64]; snprintf(buf, sizeof buf, "@import '%p'", content); - memcpy(res + *osize, buf, strlen(buf)); + memcpy(rewritten + *osize, buf, strlen(buf)); *osize += strlen(buf); } else { /* copy import */ - memcpy(res + *osize, source + offset + match[0].rm_so, + memcpy(rewritten + *osize, + source + offset + match[0].rm_so, match[0].rm_eo - match[0].rm_so); *osize += match[0].rm_eo - match[0].rm_so; } - } - else { + nsurl_unref(url); + } else { /* copy import */ - memcpy(res + *osize, source + offset + match[0].rm_so, + memcpy(rewritten + *osize, + source + offset + match[0].rm_so, match[0].rm_eo - match[0].rm_so); *osize += match[0].rm_eo - match[0].rm_so; } @@ -526,308 +285,771 @@ char * rewrite_stylesheet_urls(const char *source, unsigned int size, /* copy rest of source */ if (offset < size) { - memcpy(res + *osize, source + offset, size - offset); + memcpy(rewritten + *osize, source + offset, size - offset); *osize += size - offset; } - return res; + return rewritten; } - -/** - * Rewrite URLs in a HTML document to be relative. - * - * \param doc root of the document tree - * \param base base url of document - * \return true on success, false on out of memory - */ - -bool rewrite_document_urls(xmlDoc *doc, const char *base, - struct save_complete_entry *list) +static bool save_complete_save_stylesheet(save_complete_ctx *ctx, + hlcache_handle *css) { - xmlNode *node; + const char *css_data; + unsigned long css_size; + char *source; + unsigned long source_len; + struct nscss_import *imports; + uint32_t import_count; + lwc_string *type; + char filename[32]; + bool result; - for (node = doc->children; node; node = node->next) - if (node->type == XML_ELEMENT_NODE) - if (!rewrite_urls(node, base, list)) - return false; + if (save_complete_ctx_has_content(ctx, css)) + return true; - return true; -} + if (save_complete_ctx_add_content(ctx, css) == false) { + warn_user("NoMemory", 0); + return false; + } + imports = nscss_get_imports(css, &import_count); + if (save_complete_save_imported_sheets(ctx, + imports, import_count) == false) + return false; -/** - * Traverse tree, rewriting URLs as we go. - * - * \param n xmlNode of type XML_ELEMENT_NODE to rewrite - * \param base base url of document - * \return true on success, false on out of memory - * - * URLs in the tree rooted at element n are rewritten. - */ + css_data = content_get_source_data(css, &css_size); + source = save_complete_rewrite_stylesheet_urls(ctx, css_data, css_size, + hlcache_handle_get_url(css), &source_len); + if (source == NULL) { + warn_user("NoMemory", 0); + return false; + } -bool rewrite_urls(xmlNode *n, const char *base, - struct save_complete_entry *list) -{ - xmlNode *child; + type = content_get_mime_type(css); + if (type == NULL) { + free(source); + return false; + } - assert(n->type == XML_ELEMENT_NODE); + snprintf(filename, sizeof filename, "%p", css); - /** - * We only need to consider the following cases: - * - * Attribute: Elements: - * - * 1) data - * 2) href - * 3) src