diff options
-rw-r--r-- | render/html.c | 8 | ||||
-rw-r--r-- | render/html.h | 1 | ||||
-rw-r--r-- | riscos/save_complete.c | 578 |
3 files changed, 564 insertions, 23 deletions
diff --git a/render/html.c b/render/html.c index e028e523d..7e3fb834c 100644 --- a/render/html.c +++ b/render/html.c @@ -69,6 +69,7 @@ void html_create(struct content *c, const char *params[]) } html->parser = htmlCreatePushParserCtxt(0, 0, "", 0, 0, html->encoding); + html->document = 0; html->base_url = xstrdup(c->url); html->layout = 0; html->background_colour = TRANSPARENT; @@ -139,7 +140,7 @@ int html_convert(struct content *c, unsigned int width, unsigned int height) /* finish parsing */ htmlParseChunk(c->data.html.parser, "", 0, 1); - document = c->data.html.parser->myDoc; + document = c->data.html.document = c->data.html.parser->myDoc; /*xmlDebugDumpDocument(stderr, c->data.html.parser->myDoc);*/ htmlFreeParserCtxt(c->data.html.parser); c->data.html.parser = 0; @@ -181,7 +182,7 @@ int html_convert(struct content *c, unsigned int width, unsigned int height) /*box_dump(c->data.html.layout->children, 0);*/ /* XML tree not required past this point */ - xmlFreeDoc(document); + //xmlFreeDoc(document); /* layout the box tree */ sprintf(c->status_message, messages_get("Formatting")); @@ -755,6 +756,9 @@ void html_destroy(struct content *c) if (c->data.html.parser) htmlFreeParserCtxt(c->data.html.parser); + if (c->data.html.document) + xmlFreeDoc(c->data.html.document); + free(c->data.html.base_url); if (c->data.html.layout) diff --git a/render/html.h b/render/html.h index d029a7554..673e57317 100644 --- a/render/html.h +++ b/render/html.h @@ -36,6 +36,7 @@ struct box_position { /** Data specific to CONTENT_HTML. */ struct content_html_data { htmlParserCtxt *parser; /**< HTML parser context. */ + xmlDoc *document; /**< the XML document tree */ xmlCharEncoding encoding; /**< Encoding of source. */ bool getenc; /**< Need to get the encoding from the document, as server is broken. */ diff --git a/riscos/save_complete.c b/riscos/save_complete.c index fca18e987..4740b2dac 100644 --- a/riscos/save_complete.c +++ b/riscos/save_complete.c @@ -5,8 +5,12 @@ * Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk> */ +#include <assert.h> #include <string.h> #include <unixlib/local.h> /* for __riscosify */ + +#include "libxml/HTMLtree.h" + #include "oslib/osfile.h" #include "netsurf/utils/config.h" #include "netsurf/content/content.h" @@ -16,16 +20,33 @@ #include "netsurf/riscos/gui.h" #include "netsurf/riscos/save_complete.h" #include "netsurf/utils/log.h" +#include "netsurf/utils/url.h" #include "netsurf/utils/utils.h" #ifdef WITH_SAVE_COMPLETE -/** \todo URL rewriting +/** \todo URL rewriting for @import rules * Objects used by embedded html pages * GUI */ -void save_imported_sheets(struct content *c, int parent, int level, char *p, char* fn); +struct url_entry { + char *url; /**< Fully qualified URL, as per url_join output */ + char *par; /**< Base URL of parent object */ + int ptr; /**< Pointer to object's location in memory */ + struct url_entry *next; /**< Next entry in list */ +}; + +static void save_imported_sheets(struct content *c, char *p, char* fn, + struct url_entry *imports); +/*static char *leafname(const char *url); +static int rewrite_stylesheet_urls(const char* sheet, int isize, char* buffer, + int osize, struct url_entry *head);*/ +static int rewrite_document_urls(xmlDoc *doc, struct url_entry *head, char *fname); +static int rewrite_urls(xmlNode *n, struct url_entry *head, char *fname); +static void rewrite_url_data(xmlNode *n, struct url_entry *head, char *fname); +static void rewrite_url_href(xmlNode *n, struct url_entry *head, char *fname); +static void rewrite_url_src(xmlNode *n, struct url_entry *head, char *fname); /* this is temporary. */ const char * const SAVE_PATH = "<NetSurf$Dir>.savetest."; @@ -36,6 +57,9 @@ void save_complete(struct content *c) { char *fname = 0, *spath; unsigned int i; + struct url_entry urls = {0, 0, 0, 0}; /* sentinel at head */ + struct url_entry *object; + xmlDoc *toSave; if (c->type != CONTENT_HTML) return; @@ -50,17 +74,41 @@ void save_complete(struct content *c) { /* save stylesheets, ignoring the base sheet and <style> elements */ for (i = 2; i != c->data.html.stylesheet_count; i++) { - struct content *css = c->data.html.object[i].content; + struct content *css = c->data.html.stylesheet_content[i]; + struct url_entry imports = {0, 0, 0, 0}; + //char *source; + //int source_len; if (!css) continue; - save_imported_sheets(css, (int)i, 0, spath, fname); + save_imported_sheets(css, spath, fname, &imports); + + /*source = xcalloc(css->source_size+100, sizeof(char)); + source_len = rewrite_stylesheet_urls(css->source_data, + css->source_size, + source, + css->source_size+100, + &imports);*/ - sprintf(spath, "%s%s%s.%d/css", SAVE_PATH, fname, OBJ_DIR, i); - xosfile_save_stamped(spath, 0xf79, - css->source_data, - css->source_data + css->source_size); + sprintf(spath, "%s%s%s.%p", SAVE_PATH, fname, OBJ_DIR, css); + /*if (source_len > 0) { + xosfile_save_stamped(spath, 0xf79, source, + source + source_len); + } + xfree(source);*/ + xosfile_save_stamped(spath, 0xf79, css->source_data, + css->source_data + css->source_size); + + /* Now add the url of this sheet to the list + * of objects imported by the parent page + */ + object = xcalloc(1, sizeof(struct url_entry)); + object->url = css->url; + object->par = url_normalize(c->data.html.base_url); + object->ptr = (int)css; + object->next = urls.next; + urls.next = object; } /* save objects */ @@ -72,42 +120,530 @@ void save_complete(struct content *c) { continue; } - sprintf(spath, "%s%s%s.%d", SAVE_PATH, fname, OBJ_DIR, i); + sprintf(spath, "%s%s%s.%p", SAVE_PATH, fname, OBJ_DIR, c->data.html.object[i].content); xosfile_save_stamped(spath, ro_content_filetype(obj), obj->source_data, obj->source_data + obj->source_size); + + /* Add to list, as for stylesheets */ + object = xcalloc(1, sizeof(struct url_entry)); + object->url = obj->url; + object->par = url_normalize(c->data.html.base_url); + object->ptr = (int)obj; + object->next = urls.next; + urls.next = object; } - /** \todo URL rewriting */ + /* make a copy of the document tree */ + toSave = xmlCopyDoc(c->data.html.document, 1); - /* save the html file out last of all (allows url rewriting first) */ + if (!toSave) { + xfree(spath); + return; + } + + /* rewrite all urls we know about */ + if (rewrite_document_urls(toSave, &urls, fname) == 0) { + xfree(spath); + xmlFreeDoc(toSave); + return; + } + + /* save the html file out last of all */ sprintf(spath, "%s%s", SAVE_PATH, fname); - xosfile_save_stamped(spath, 0xfaf, - c->source_data, - c->source_data + c->source_size); + htmlSaveFile(spath, toSave); + xosfile_set_type(spath, 0xfaf); + xmlFreeDoc(toSave); xfree(spath); - xfree(fname); + //xfree(fname); } -void save_imported_sheets(struct content *c, int parent, int level, char *p, char *fn) +void save_imported_sheets(struct content *c, char *p, char *fn, + struct url_entry *imports) { unsigned int j; + //struct url_entry *this; + //char *source; + //int source_len; for (j = 0; j != c->data.css.import_count; j++) { struct content *css = c->data.css.import_content[j]; + struct url_entry imp = {0, 0, 0, 0}; if (!css) continue; - save_imported_sheets(css, parent, level+1, p, fn); - sprintf(p, "%s%s%s.%d%c%d/css", SAVE_PATH, fn, OBJ_DIR, parent, 'a'+level, j); - xosfile_save_stamped(p, 0xf79, - css->source_data, - css->source_data + css->source_size); + save_imported_sheets(css, p, fn, &imp); + + /*source = xcalloc(css->source_size+100, sizeof(char)); + source_len = rewrite_stylesheet_urls(css->source_data, + css->source_size, + source, + css->source_size+100, + &imp);*/ + + sprintf(p, "%s%s%s.%p", SAVE_PATH, fn, OBJ_DIR, css); + /*if (source_len > 0) { + xosfile_save_stamped(p, 0xf79, source, source + source_len); + } + xfree(source);*/ + xosfile_save_stamped(p, 0xf79, css->source_data, + css->source_data + css->source_size); + + /* now add the url of this sheet to the list of + * sheets imported by the parent sheet. + */ + /*this = xcalloc(1, sizeof(struct url_entry)); + this->url = css->url; + this->par = url_normalize(c->url); + this->ptr = (int)css; + this->next = imports->next; + imports->next = this;*/ + } +} + +#if 0 + +char *leafname(const char *url) +{ + char *res, *temp; + + /* the input URL is that produced by url_join, + * therefore, we can assume that this will work. + */ + temp = strrchr(url, '/'); + + if ((temp - url) == (int)(strlen(url)-1)) { /* root dir */ + res = xstrdup("index/html"); + return res; + } + + temp += 1; + res = xcalloc(strlen(temp), sizeof(char)); + if (__riscosify_std(temp, 0, res, strlen(temp), 0)) { + return res; + } + + return NULL; +} + +/** + * Rewrite stylesheet @import rules to use relative urls. + * + * @param sheet The source of the stylesheet + * @param isize The size of the input buffer + * @param buffer The buffer into which to write the modified sheet + * @param osize The size of the output buffer + * @param head Pointer to the head of the list containing imported urls + * @return The length of the output buffer, or 0 on error. + */ +int rewrite_stylesheet_urls(const char *sheet, int isize, char *buffer, + int osize, struct url_entry *head) +{ + struct url_entry *item, *next; + char *rule, *input = sheet, *temp, *end; + int out_size = 0, out = 0; + + assert(head); + + while (input < sheet+isize) { + /* find next occurrence of @import in input buffer */ + rule = strstr(input, "@import"); + + if (!rule) { + if (out_size + ((sheet+isize)-input) > osize) { + /* not enough space in buffer -> exit */ + return 0; + } + memcpy(buffer, input, ((sheet+isize)-input)); + out_size += ((sheet+isize)-input); + break; + } + + /* find end of this rule */ + end = strchr(rule, ';'); + if (!end) { /* rule not closed - give up */ + if (out_size + ((sheet+isize)-input) > osize) { + /* not enough space in buffer -> exit */ + return 0; + } + memcpy(buffer, input, ((sheet+isize)-input)); + out_size += ((sheet+isize)-input); + break; + } + + /* skip until after first set of double quotes */ + temp = strchr(rule, '"'); + if (!temp) { /* no quotes - try parentheses */ + temp = strchr(rule, '('); + if (!temp) { /* no parentheses - give up */ + if (out_size + (rule-input+1) > osize) { + /* not enough space in buffer -> exit */ + return 0; + } + memcpy(buffer, input, (rule-input+1)); + buffer += rule-input+1; + out_size += rule-input+1; + input = rule + 1; + continue; + } + } + /* check we haven't gone past the end */ + if (temp > end && *temp == '(') { /* tested both */ + if (out_size + (end-input+1) > osize) { + /* not enough space in buffer -> exit */ + return 0; + } + memcpy(buffer, input, (end-input+1)); + buffer += end-input+1; + out_size += end-input+1; + input = end + 1; + continue; + } + else if (temp > end) { /* not done parentheses yet */ + temp = strchr(rule, '('); + if (!temp) { /* no parentheses - give up */ + if (out_size + (rule-input+1) > osize) { + /* not enough space in buffer -> exit */ + return 0; + } + memcpy(buffer, input, (rule-input+1)); + buffer += rule-input+1; + out_size += rule-input+1; + input = rule + 1; + continue; + } + } + if (temp > end) { + if (out_size + (end-input+1) > osize) { + /* not enough space in buffer -> exit */ + return 0; + } + memcpy(buffer, input, (end-input+1)); + buffer += end-input+1; + out_size += end-input+1; + input = end + 1; + continue; + } + rule = temp + 1; + + /* pointer to end of url */ + temp = strchr(rule, '"'); + if (!temp) { + temp = strchr(rule, ')'); + if (!temp) { + if (out_size + (rule-input+1) > osize) { + /* not enough space in buffer -> exit */ + return 0; + } + memcpy(buffer, input, (rule-input+1)); + buffer += rule-input+1; + out_size += rule-input+1; + input = rule + 1; + continue; + } + } + /* check we haven't gone past the end */ + if (temp > end && *temp == ')') { /* tested both */ + if (out_size + (end-input+1) > osize) { + /* not enough space in buffer -> exit */ + return 0; + } + memcpy(buffer, input, (end-input+1)); + buffer += end-input+1; + out_size += end-input+1; + input = end + 1; + continue; + } + else if (temp > end) { /* not done parentheses yet */ + temp = strchr(rule, ')'); + if (!temp) { /* no parentheses - give up */ + if (out_size + (rule-input+1) > osize) { + /* not enough space in buffer -> exit */ + return 0; + } + memcpy(buffer, input, (rule-input+1)); + buffer += rule-input+1; + out_size += rule-input+1; + input = rule + 1; + continue; + } + } + if (temp > end) { + if (out_size + (end-input+1) > osize) { + /* not enough space in buffer -> exit */ + return 0; + } + memcpy(buffer, input, (end-input+1)); + buffer += end-input+1; + out_size += end-input+1; + input = end + 1; + continue; + } + end = temp; + + /* copy input up to @import rule to output buffer */ + if (out_size + (rule-input) > osize) { + /* not enough space in buffer -> exit */ + return 0; + } + memcpy(buffer, input, (out = (rule-input))); + input = rule; + buffer += out; + out_size += out; + + /* copy url into temporary buffer */ + temp = xcalloc((end-rule), sizeof(char)); + strncpy(temp, rule, (end-rule)); + + /* iterate over list, looking for url */ + /** \todo make url detection more accurate */ + for (item=head; item->next; item=item->next) { + + if (strstr(item->next->url, temp) != 0) { + /* url found -> rewrite it */ + int len = 12; + char *url = xcalloc(len, sizeof(char)); + sprintf(url, "./0x%x", item->next->ptr); + if (out_size + len > osize) { + return 0; + } + memcpy(buffer, url, len); + xfree(url); + out = len; + break; + } + } + + if (item->next == 0) { + /* url not found -> write temp to buffer */ + if ((int)(out_size + strlen(temp)) > osize) { + return 0; + } + memcpy(buffer, temp, strlen(temp)); + out = strlen(temp); + } + + /* free url */ + xfree(temp); + + input = end; + buffer += out; + out_size += out; + } + + /* free list */ + for (item = head; item->next; item = item->next) { + + next = item->next; + item->next = item->next->next; + xfree(next->par); + xfree(next); + + if (item->next == 0) { + break; + } + } + + return out_size; +} + +#endif + +/** + * Rewrite URLs in a HTML document to be relative + * + * @param doc The root of the document tree + * @param head The head of the list of known URLs + * @param fname The name of the file to save as + * @return 0 on error. >0 otherwise + */ +int rewrite_document_urls(xmlDoc *doc, struct url_entry *head, char *fname) +{ + xmlNode *html; + struct url_entry *item, *next; + + /* find the html element */ + for (html = doc->children; + html!=0 && html->type != XML_ELEMENT_NODE; + html = html->next) + ; + if (html == 0 || strcmp((const char*)html->name, "html") != 0) { + return 0; + } + + rewrite_urls(html, head, fname); + + /* free list */ + for (item = head; item->next; item = item->next) { + + next = item->next; + item->next = item->next->next; + xfree(next->par); + xfree(next); + + if (item->next == 0) { + break; + } } + + return 1; } +/** + * Traverse tree, rewriting URLs as we go. + * + * @param n The root of the tree + * @param head The head of the list of known URLs + * @param fname The name of the file to save as + * @return 0 on error. >0 otherwise + */ +int rewrite_urls(xmlNode *n, struct url_entry *head, char *fname) +{ + xmlNode *this; + + /** + * We only need to consider the following cases: + * + * Attribute: Elements: + * + * 1) data <object> + * 2) href <a> <area> <link> <base> + * 3) src <script> <input> <frame> <iframe> <img> + */ + if (n->type == XML_ELEMENT_NODE) { + LOG(("%s", n->name)); + /* 1 */ + if (strcmp(n->name, "object") == 0) { + rewrite_url_data(n, head, fname); + } + /* 2 */ + else if (strcmp(n->name, "a") == 0 || + strcmp(n->name, "area") == 0 || + strcmp(n->name, "link") == 0 || + strcmp(n->name, "base") == 0) { + rewrite_url_href(n, head, fname); + } + /* 3 */ + else if (strcmp(n->name, "frame") == 0 || + strcmp(n->name, "iframe") == 0 || + strcmp(n->name, "input") == 0 || + strcmp(n->name, "img") == 0 || + strcmp(n->name, "script") == 0) { + rewrite_url_src(n, head, fname); + } + } + else { + return 0; + } + + /* now recurse */ + for (this = n->children; this != 0; this = this->next) { + rewrite_urls(this, head, fname); + } + + return 1; +} + +void rewrite_url_data(xmlNode *n, struct url_entry *head, char *fname) +{ + char *url, *data, *rel; + struct url_entry *item; + int len = strlen(fname) + strlen(OBJ_DIR) + 13; + + data = xmlGetProp(n, (const xmlChar*)"data"); + + if (!data) return; + + url = url_join(data, head->next->par); + if (!url) { + xmlFree(data); + return; + } + + for (item=head; item->next; item=item->next) { + + if (strcmp(url, item->next->url) == 0) { /* found a match */ + rel = xcalloc(len, sizeof(char)); + sprintf(rel, "./%s%s/0x%x", fname, OBJ_DIR, + item->next->ptr); + xmlSetProp(n, (const xmlChar*)"data", + (const xmlChar*) rel); + xfree(rel); + break; + } + } + + xfree(url); + xmlFree(data); +} + +void rewrite_url_href(xmlNode *n, struct url_entry *head, char *fname) +{ + char *url, *href, *rel; + struct url_entry *item; + int len = strlen(fname) + strlen(OBJ_DIR) + 13; + + href = xmlGetProp(n, (const xmlChar*)"href"); + + if (!href) return; + + url = url_join(href, head->next->par); + if (!url) { + xmlFree(href); + return; + } + + for (item=head; item->next; item=item->next) { + + if (strcmp(url, item->next->url) == 0) { /* found a match */ + rel = xcalloc(len, sizeof(char)); + sprintf(rel, "./%s%s/0x%x", fname, OBJ_DIR, + item->next->ptr); + xmlSetProp(n, (const xmlChar*)"href", + (const xmlChar*) rel); + xfree(rel); + break; + } + } + + xfree(url); + xmlFree(href); +} + +void rewrite_url_src(xmlNode *n, struct url_entry *head, char *fname) +{ + char *url, *src, *rel; + struct url_entry *item; + int len = strlen(fname) + strlen(OBJ_DIR) + 13; + + src = xmlGetProp(n, (const xmlChar*)"src"); + + if (!src) return; + + url = url_join(src, head->next->par); + if (!url) { + xmlFree(src); + return; + } + + for (item=head; item->next; item=item->next) { + + if (strcmp(url, item->next->url) == 0) { /* found a match */ + rel = xcalloc(len, sizeof(char)); + sprintf(rel, "./%s%s/0x%x", fname, OBJ_DIR, + item->next->ptr); + xmlSetProp(n, (const xmlChar*)"src", + (const xmlChar*) rel); + xfree(rel); + break; + } + } + + xfree(url); + xmlFree(src); +} #endif |