/* * This file is part of NetSurf, http://netsurf.sourceforge.net/ * Licensed under the GNU General Public License, * http://www.opensource.org/licenses/gpl-license * Copyright 2004 John M Bell */ #include #include #include "libxml/HTMLtree.h" #include "oslib/osfile.h" #include "netsurf/utils/config.h" #include "netsurf/content/content.h" #include "netsurf/riscos/save_text.h" #include "netsurf/utils/log.h" #include "netsurf/utils/utils.h" #ifdef WITH_TEXT_EXPORT static void extract_text(xmlDoc *doc); static void extract_text_from_tree(xmlNode *n); static char *buffer = 0; static int output_size = 0; void save_as_text(struct content *c, char *path) { htmlParserCtxtPtr toSave; if (c->type != CONTENT_HTML) { return; } /* allocate a buffer the same size as the source * the output is guaranteed to be less than this */ buffer = xcalloc(c->source_size, sizeof(char)); toSave = htmlCreateMemoryParserCtxt(c->source_data, c->source_size); htmlParseDocument(toSave); extract_text(toSave->myDoc); if (output_size > 0) { xosfile_save_stamped(path, 0xfff, (byte*)buffer, (byte*)buffer+output_size); } xmlFreeDoc(toSave->myDoc); htmlFreeParserCtxt(toSave); xfree(buffer); } void extract_text(xmlDoc *doc) { xmlNode *html; /* find the html element */ for (html = doc->children; html!=0 && html->type != XML_ELEMENT_NODE; html = html->next) ; if (html == 0 || strcmp((const char*)html->name, "html") != 0) { return; } extract_text_from_tree(html); } void extract_text_from_tree(xmlNode *n) { xmlNode *this; char *text; int len = 0; int need_nl = 0; if (n->type == XML_ELEMENT_NODE) { if (strcmp(n->name, "dl") == 0 || strcmp(n->name, "h1") == 0 || strcmp(n->name, "h2") == 0 || strcmp(n->name, "h3") == 0 || strcmp(n->name, "ol") == 0 || strcmp(n->name, "title") == 0 || strcmp(n->name, "ul") == 0) { need_nl = 2; } else if (strcmp(n->name, "applet") == 0 || strcmp(n->name, "br") == 0 || strcmp(n->name, "div") == 0 || strcmp(n->name, "dt") == 0 || strcmp(n->name, "h4") == 0 || strcmp(n->name, "h5") == 0 || strcmp(n->name, "h6") == 0 || strcmp(n->name, "li") == 0 || strcmp(n->name, "object") == 0 || strcmp(n->name, "p") == 0 || strcmp(n->name, "tr") == 0) { need_nl = 1; } /* do nothing, we just recurse through these nodes */ } else if (n->type == XML_TEXT_NODE) { text = squash_tolat1(n->content); len = strlen(text); strcat(buffer, text); output_size += len; xfree(text); return; } else { return; } /* now recurse */ for (this = n->children; this != 0; this = this->next) { extract_text_from_tree(this); } if (need_nl) { for (len = 0; len != need_nl; len++) { strcat(buffer, "\n"); output_size += 1; } } } #endif