diff options
author | John Mark Bell <jmb@netsurf-browser.org> | 2009-12-17 23:55:02 +0000 |
---|---|---|
committer | John Mark Bell <jmb@netsurf-browser.org> | 2009-12-17 23:55:02 +0000 |
commit | 355799ce0bbb078237dfc1ae9874bbc5342acbc4 (patch) | |
tree | 7ca980c01c0d4d1d55a3b7b15418c95c5618afae /riscos/save_complete.c | |
parent | 4346b2b62b940182575e6612e46234355afa083c (diff) | |
download | netsurf-355799ce0bbb078237dfc1ae9874bbc5342acbc4.tar.gz netsurf-355799ce0bbb078237dfc1ae9874bbc5342acbc4.tar.bz2 |
Merge branches/MarkieB/gtkmain to trunk.
svn path=/trunk/netsurf/; revision=9729
Diffstat (limited to 'riscos/save_complete.c')
-rw-r--r-- | riscos/save_complete.c | 759 |
1 files changed, 0 insertions, 759 deletions
diff --git a/riscos/save_complete.c b/riscos/save_complete.c deleted file mode 100644 index 3ac559784..000000000 --- a/riscos/save_complete.c +++ /dev/null @@ -1,759 +0,0 @@ -/* - * Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk> - * Copyright 2004-2007 James Bursa <bursa@users.sourceforge.net> - * - * This file is part of NetSurf, http://www.netsurf-browser.org/ - * - * NetSurf is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; version 2 of the License. - * - * NetSurf is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/** \file - * Save HTML document with dependencies (implementation). - */ - -#include "utils/config.h" - -#define _GNU_SOURCE /* for strndup */ -#include <assert.h> -#include <ctype.h> -#include <errno.h> -#include <stdio.h> -#include <string.h> -#include <sys/types.h> -#include <regex.h> -#include <libxml/HTMLtree.h> -#include <libxml/parserInternals.h> -#include "oslib/osfile.h" -#include "utils/config.h" -#include "content/content.h" -#include "css/css.h" -#include "render/box.h" -#include "riscos/gui.h" -#include "riscos/save_complete.h" -#include "utils/log.h" -#include "utils/url.h" -#include "utils/utils.h" - -regex_t save_complete_import_re; - -/** An entry in save_complete_list. */ -struct save_complete_entry { - struct content *content; - struct save_complete_entry *next; /**< Next entry in list */ -}; - -/** List of urls seen and saved so far. */ -static struct save_complete_entry *save_complete_list = 0; - -static bool save_complete_html(struct content *c, const char *path, - bool index); -static bool save_imported_sheets(struct content *c, const char *path); -static char * rewrite_stylesheet_urls(const char *source, unsigned int size, - int *osize, const char *base); -static bool rewrite_document_urls(xmlDoc *doc, const char *base); -static bool rewrite_urls(xmlNode *n, const char *base); -static bool rewrite_url(xmlNode *n, const char *attr, const char *base); -static bool save_complete_list_add(struct content *content); -static struct content * save_complete_list_find(const char *url); -static bool save_complete_list_check(struct content *content); -/* static void save_complete_list_dump(void); */ -static bool save_complete_inventory(const char *path); - -/** - * Save an HTML page with all dependencies. - * - * \param c CONTENT_HTML to save - * \param path directory to save to (must exist) - * \return true on success, false on error and error reported - */ - -bool save_complete(struct content *c, const char *path) -{ - bool result; - - result = save_complete_html(c, path, true); - - if (result) - result = save_complete_inventory(path); - - /* free save_complete_list */ - while (save_complete_list) { - struct save_complete_entry *next = save_complete_list->next; - free(save_complete_list); - save_complete_list = next; - } - - return result; -} - - -/** - * Save an HTML page with all dependencies, recursing through imported pages. - * - * \param c CONTENT_HTML to save - * \param path directory to save to (must exist) - * \param index true to save as "index" - * \return true on success, false on error and error reported - */ - -bool save_complete_html(struct content *c, const char *path, bool index) -{ - char spath[256]; - unsigned int i; - xmlDocPtr doc; - os_error *error; - - if (c->type != CONTENT_HTML) - return false; - - if (save_complete_list_check(c)) - return true; - - /* save stylesheets, ignoring the base and adblocking sheets */ - for (i = STYLESHEET_START; i != c->data.html.stylesheet_count; i++) { - struct content *css = c->data.html.stylesheets[i].c; - char *source; - int source_len; - bool is_style; - - if (!css) - continue; - if (save_complete_list_check(css)) - continue; - - is_style = (strcmp(css->url, c->data.html.base_url) == 0); - - if (is_style == false) { - if (!save_complete_list_add(css)) { - warn_user("NoMemory", 0); - return false; - } - } - - if (!save_imported_sheets(css, path)) - return false; - - if (is_style) - continue; /* don't save <style> elements */ - - snprintf(spath, sizeof spath, "%s.%x", path, - (unsigned int) css); - source = rewrite_stylesheet_urls(css->source_data, - css->source_size, &source_len, css->url); - if (!source) { - warn_user("NoMemory", 0); - return false; - } - - error = xosfile_save_stamped(spath, 0xf79, - (byte *) source, (byte *) source + source_len); - free(source); - if (error) { - LOG(("xosfile_save_stamped: 0x%x: %s", - error->errnum, error->errmess)); - warn_user("SaveError", error->errmess); - return false; - } - } - - /* save objects */ - for (i = 0; i != c->data.html.object_count; i++) { - struct content *obj = c->data.html.object[i].content; - - /* skip difficult content types */ - if (!obj || obj->type >= CONTENT_OTHER || !obj->source_data) - continue; - if (save_complete_list_check(obj)) - continue; - - if (!save_complete_list_add(obj)) { - warn_user("NoMemory", 0); - return false; - } - - if (obj->type == CONTENT_HTML) { - if (!save_complete_html(obj, path, false)) - return false; - continue; - } - - snprintf(spath, sizeof spath, "%s.%x", path, - (unsigned int) obj); - error = xosfile_save_stamped(spath, - ro_content_filetype(obj), - (byte *) obj->source_data, - (byte *) obj->source_data + obj->source_size); - if (error) { - LOG(("xosfile_save_stamped: 0x%x: %s", - error->errnum, error->errmess)); - warn_user("SaveError", error->errmess); - return false; - } - } - - /*save_complete_list_dump();*/ - - /* copy document */ - doc = xmlCopyDoc(c->data.html.document, 1); - if (doc == NULL) { - warn_user("NoMemory", 0); - return false; - } - - /* rewrite all urls we know about */ - if (!rewrite_document_urls(doc, c->data.html.base_url)) { - xmlFreeDoc(doc); - warn_user("NoMemory", 0); - return false; - } - - /* save the html file out last of all */ - if (index) - snprintf(spath, sizeof spath, "%s.index", path); - else - snprintf(spath, sizeof spath, "%s.%x", path, (unsigned int)c); - - errno = 0; - if (htmlSaveFileFormat(spath, doc, 0, 0) == -1) { - if (errno) - warn_user("SaveError", strerror(errno)); - else - warn_user("SaveError", "htmlSaveFileFormat failed"); - - xmlFreeDoc(doc); - return false; - } - - xmlFreeDoc(doc); - - error = xosfile_set_type(spath, 0xfaf); - if (error) { - LOG(("xosfile_set_type: 0x%x: %s", - error->errnum, error->errmess)); - warn_user("SaveError", error->errmess); - return false; - } - - return true; -} - - -/** - * Save stylesheets imported by a CONTENT_CSS. - * - * \param c a CONTENT_CSS - * \param path path to save to - * \return true on success, false on error and error reported - */ - -bool save_imported_sheets(struct content *c, const char *path) -{ - char spath[256]; - unsigned int j; - char *source; - int source_len; - os_error *error; - - for (j = 0; j != c->data.css.import_count; j++) { - struct content *css = c->data.css.imports[j].c; - - if (!css) - continue; - if (save_complete_list_check(css)) - continue; - - if (!save_complete_list_add(css)) { - warn_user("NoMemory", 0); - return false; - } - - if (!save_imported_sheets(css, path)) - return false; - - snprintf(spath, sizeof spath, "%s.%x", path, - (unsigned int) css); - source = rewrite_stylesheet_urls(css->source_data, - css->source_size, &source_len, css->url); - if (!source) { - warn_user("NoMemory", 0); - return false; - } - - error = xosfile_save_stamped(spath, 0xf79, - (byte *) source, (byte *) source + source_len); - free(source); - if (error) { - LOG(("xosfile_save_stamped: 0x%x: %s", - error->errnum, error->errmess)); - warn_user("SaveError", error->errmess); - return false; - } - } - - return true; -} - - -/** - * Initialise the save_complete module. - */ - -void save_complete_init(void) -{ - /* Match an @import rule - see CSS 2.1 G.1. */ - regcomp_wrapper(&save_complete_import_re, - "@import" /* IMPORT_SYM */ - "[ \t\r\n\f]*" /* S* */ - /* 1 */ - "(" /* [ */ - /* 2 3 */ - "\"(([^\"]|[\\]\")*)\"" /* STRING (approximated) */ - "|" - /* 4 5 */ - "'(([^']|[\\]')*)'" - "|" /* | */ - "url\\([ \t\r\n\f]*" /* URI (approximated) */ - /* 6 7 */ - "\"(([^\"]|[\\]\")*)\"" - "[ \t\r\n\f]*\\)" - "|" - "url\\([ \t\r\n\f]*" - /* 8 9 */ - "'(([^']|[\\]')*)'" - "[ \t\r\n\f]*\\)" - "|" - "url\\([ \t\r\n\f]*" - /* 10 */ - "([^) \t\r\n\f]*)" - "[ \t\r\n\f]*\\)" - ")", /* ] */ - REG_EXTENDED | REG_ICASE); -} - - -/** - * Rewrite stylesheet \@import rules for save complete. - * - * @param source stylesheet source - * @param size size of source - * @param osize updated with the size of the result - * @param base url of stylesheet - * @return converted source, or 0 on out of memory - */ - -char * rewrite_stylesheet_urls(const char *source, unsigned int size, - int *osize, const char *base) -{ - char *res; - const char *url; - char *url2; - char buf[20]; - unsigned int offset = 0; - int url_len = 0; - struct content *content; - int m; - unsigned int i; - unsigned int imports = 0; - regmatch_t match[11]; - url_func_result result; - - /* count number occurences of @import to (over)estimate result size */ - /* can't use strstr because source is not 0-terminated string */ - for (i = 0; 7 < size && i != size - 7; i++) { - if (source[i] == '@' && - tolower(source[i + 1]) == 'i' && - tolower(source[i + 2]) == 'm' && - tolower(source[i + 3]) == 'p' && - tolower(source[i + 4]) == 'o' && - tolower(source[i + 5]) == 'r' && - tolower(source[i + 6]) == 't') - imports++; - } - - res = malloc(size + imports * 20); - if (!res) - return 0; - *osize = 0; - - while (offset < size) { - m = regexec(&save_complete_import_re, source + offset, - 11, match, 0); - if (m) - break; - - /*for (unsigned int i = 0; i != 11; i++) { - if (match[i].rm_so == -1) - continue; - fprintf(stderr, "%i: '%.*s'\n", i, - match[i].rm_eo - match[i].rm_so, - source + offset + match[i].rm_so); - }*/ - - url = 0; - if (match[2].rm_so != -1) { - url = source + offset + match[2].rm_so; - url_len = match[2].rm_eo - match[2].rm_so; - } else if (match[4].rm_so != -1) { - url = source + offset + match[4].rm_so; - url_len = match[4].rm_eo - match[4].rm_so; - } else if (match[6].rm_so != -1) { - url = source + offset + match[6].rm_so; - url_len = match[6].rm_eo - match[6].rm_so; - } else if (match[8].rm_so != -1) { - url = source + offset + match[8].rm_so; - url_len = match[8].rm_eo - match[8].rm_so; - } else if (match[10].rm_so != -1) { - url = source + offset + match[10].rm_so; - url_len = match[10].rm_eo - match[10].rm_so; - } - assert(url); - - url2 = strndup(url, url_len); - if (!url2) { - free(res); - return 0; - } - result = url_join(url2, base, (char**)&url); - free(url2); - if (result == URL_FUNC_NOMEM) { - free(res); - return 0; - } - - /* copy data before match */ - memcpy(res + *osize, source + offset, match[0].rm_so); - *osize += match[0].rm_so; - - if (result == URL_FUNC_OK) { - content = save_complete_list_find(url); - if (content) { - /* replace import */ - snprintf(buf, sizeof buf, "@import '%x'", - (unsigned int) content); - memcpy(res + *osize, buf, strlen(buf)); - *osize += strlen(buf); - } else { - /* copy import */ - memcpy(res + *osize, source + offset + match[0].rm_so, - match[0].rm_eo - match[0].rm_so); - *osize += match[0].rm_eo - match[0].rm_so; - } - } - else { - /* copy import */ - memcpy(res + *osize, source + offset + match[0].rm_so, - match[0].rm_eo - match[0].rm_so); - *osize += match[0].rm_eo - match[0].rm_so; - } - - assert(0 < match[0].rm_eo); - offset += match[0].rm_eo; - } - - /* copy rest of source */ - if (offset < size) { - memcpy(res + *osize, source + offset, size - offset); - *osize += size - offset; - } - - return res; -} - - -/** - * Rewrite URLs in a HTML document to be relative. - * - * \param doc root of the document tree - * \param base base url of document - * \return true on success, false on out of memory - */ - -bool rewrite_document_urls(xmlDoc *doc, const char *base) -{ - xmlNode *node; - - for (node = doc->children; node; node = node->next) - if (node->type == XML_ELEMENT_NODE) - if (!rewrite_urls(node, base)) - return false; - - return true; -} - - -/** - * Traverse tree, rewriting URLs as we go. - * - * \param n xmlNode of type XML_ELEMENT_NODE to rewrite - * \param base base url of document - * \return true on success, false on out of memory - * - * URLs in the tree rooted at element n are rewritten. - */ - -bool rewrite_urls(xmlNode *n, const char *base) -{ - xmlNode *child; - - assert(n->type == XML_ELEMENT_NODE); - - /** - * We only need to consider the following cases: - * - * Attribute: Elements: - * - * 1) data <object> - * 2) href <a> <area> <link> - * 3) src <script> <input> <frame> <iframe> <img> - * 4) n/a <style> - * 5) n/a any <base> tag - * 6) background any (except those above) - */ - if (!n->name) { - /* ignore */ - } - /* 1 */ - else if (strcmp((const char *) n->name, "object") == 0) { - if (!rewrite_url(n, "data", base)) - return false; - } - /* 2 */ - else if (strcmp((const char *) n->name, "a") == 0 || - strcmp((const char *) n->name, "area") == 0 || - strcmp((const char *) n->name, "link") == 0) { - if (!rewrite_url(n, "href", base)) - return false; - } - /* 3 */ - else if (strcmp((const char *) n->name, "frame") == 0 || - strcmp((const char *) n->name, "iframe") == 0 || - strcmp((const char *) n->name, "input") == 0 || - strcmp((const char *) n->name, "img") == 0 || - strcmp((const char *) n->name, "script") == 0) { - if (!rewrite_url(n, "src", base)) - return false; - } - /* 4 */ - else if (strcmp((const char *) n->name, "style") == 0) { - unsigned int len; - xmlChar *content; - - for (child = n->children; child != 0; child = child->next) { - /* Get current content */ - content = xmlNodeGetContent(child); - if (!content) - /* unfortunately we don't know if this is - * due to memory exhaustion, or because - * there is no content for this node */ - continue; - - /* Rewrite @import rules */ - char *rewritten = rewrite_stylesheet_urls( - (const char *) content, - strlen((const char *) content), - (int *) &len, base); - xmlFree(content); - if (!rewritten) - return false; - - /* set new content */ - xmlNodeSetContentLen(child, - (const xmlChar*)rewritten, - len); - } - - return true; - } - /* 5 */ - else if (strcmp((const char *) n->name, "base") == 0) { - /* simply remove any <base> tags from the document */ - xmlUnlinkNode(n); - xmlFreeNode(n); - /* base tags have no content, so there's no point recursing - * additionally, we've just destroyed this node, so trying - * to recurse would result in bad things happening */ - return true; - } - /* 6 */ - else { - if (!rewrite_url(n, "background", base)) - return false; - } - - /* now recurse */ - for (child = n->children; child;) { - /* we must extract the next child now, as if the current - * child is a <base> element, it will be removed from the - * tree (see 5, above), thus preventing extraction of the - * next child */ - xmlNode *next = child->next; - if (child->type == XML_ELEMENT_NODE) { - if (!rewrite_urls(child, base)) - return false; - } - child = next; - } - - return true; -} - - -/** - * Rewrite an URL in a HTML document. - * - * \param n The node to modify - * \param attr The html attribute to modify - * \param base base url of document - * \return true on success, false on out of memory - */ - -bool rewrite_url(xmlNode *n, const char *attr, const char *base) -{ - char *url, *data; - char rel[20]; - struct content *content; - url_func_result res; - - if (!xmlHasProp(n, (const xmlChar *) attr)) - return true; - - data = (char *) xmlGetProp(n, (const xmlChar *) attr); - if (!data) - return false; - - res = url_join(data, base, &url); - xmlFree(data); - if (res == URL_FUNC_NOMEM) - return false; - else if (res == URL_FUNC_OK) { - content = save_complete_list_find(url); - if (content) { - /* found a match */ - free(url); - snprintf(rel, sizeof rel, "%x", - (unsigned int) content); - if (!xmlSetProp(n, (const xmlChar *) attr, - (xmlChar *) rel)) - return false; - } else { - /* no match found */ - if (!xmlSetProp(n, (const xmlChar *) attr, - (xmlChar *) url)) { - free(url); - return false; - } - free(url); - } - } - - return true; -} - - -/** - * Add a content to the save_complete_list. - * - * \param content content to add - * \return true on success, false on out of memory - */ - -bool save_complete_list_add(struct content *content) -{ - struct save_complete_entry *entry; - entry = malloc(sizeof (*entry)); - if (!entry) - return false; - entry->content = content; - entry->next = save_complete_list; - save_complete_list = entry; - return true; -} - - -/** - * Look up a url in the save_complete_list. - * - * \param url url to find - * \return content if found, 0 otherwise - */ - -struct content * save_complete_list_find(const char *url) -{ - struct save_complete_entry *entry; - for (entry = save_complete_list; entry; entry = entry->next) - if (strcmp(url, entry->content->url) == 0) - return entry->content; - return 0; -} - - -/** - * Look up a content in the save_complete_list. - * - * \param content pointer to content - * \return true if the content is in the save_complete_list - */ - -bool save_complete_list_check(struct content *content) -{ - struct save_complete_entry *entry; - for (entry = save_complete_list; entry; entry = entry->next) - if (entry->content == content) - return true; - return false; -} - - -#if 0 -/** - * Dump save complete list to stderr - */ -void save_complete_list_dump(void) -{ - struct save_complete_entry *entry; - for (entry = save_complete_list; entry; entry = entry->next) - fprintf(stderr, "%p : %s\n", entry->content, - entry->content->url); -} -#endif - - -/** - * Create the inventory file listing original URLs. - */ - -bool save_complete_inventory(const char *path) -{ - char spath[256]; - FILE *fp; - - snprintf(spath, sizeof spath, "%s.Inventory", path); - - fp = fopen(spath, "w"); - if (!fp) { - LOG(("fopen(): errno = %i", errno)); - warn_user("SaveError", strerror(errno)); - return false; - } - - struct save_complete_entry *entry; - for (entry = save_complete_list; entry; entry = entry->next) - fprintf(fp, "%x %s\n", - (unsigned int) entry->content, - entry->content->url); - - fclose(fp); - - return true; -} - |