summaryrefslogtreecommitdiff
path: root/desktop/save_text.c
diff options
context:
space:
mode:
authorJohn Mark Bell <jmb@netsurf-browser.org>2004-03-25 00:31:45 +0000
committerJohn Mark Bell <jmb@netsurf-browser.org>2004-03-25 00:31:45 +0000
commitc18775f4fb469dc1e60b345a9fd0f83788ffcf50 (patch)
treed93e1054f960a5a3c7af33086e95f2316d3106c1 /desktop/save_text.c
parent7d0b1443916345947ee72ff07ebb56493d09598e (diff)
downloadnetsurf-c18775f4fb469dc1e60b345a9fd0f83788ffcf50.tar.gz
netsurf-c18775f4fb469dc1e60b345a9fd0f83788ffcf50.tar.bz2
[project @ 2004-03-25 00:31:45 by jmb]
Make text export use stdio. Move save_text.[ch] to desktop. svn path=/import/netsurf/; revision=666
Diffstat (limited to 'desktop/save_text.c')
-rw-r--r--desktop/save_text.c117
1 files changed, 117 insertions, 0 deletions
diff --git a/desktop/save_text.c b/desktop/save_text.c
new file mode 100644
index 000000000..916567717
--- /dev/null
+++ b/desktop/save_text.c
@@ -0,0 +1,117 @@
+/*
+ * This file is part of NetSurf, http://netsurf.sourceforge.net/
+ * Licensed under the GNU General Public License,
+ * http://www.opensource.org/licenses/gpl-license
+ * Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk>
+ */
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "libxml/HTMLtree.h"
+
+#include "netsurf/utils/config.h"
+#include "netsurf/content/content.h"
+#include "netsurf/desktop/save_text.h"
+#include "netsurf/utils/log.h"
+#include "netsurf/utils/utils.h"
+
+#ifdef WITH_TEXT_EXPORT
+
+/* Forward declarations of the file-local tree walkers defined below. */
+static void extract_text(xmlDoc *doc);
+static void extract_text_from_tree(xmlNode *n);
+
+/* Output stream shared by the functions in this file: opened and closed by
+ * save_as_text(), written to by extract_text_from_tree(). */
+static FILE *out;
+
+/**
+ * Export the text of an HTML content to a file.
+ *
+ * The content's source data is parsed with libxml's HTML parser and the text
+ * found beneath the root "html" element is written to the file through the
+ * file-scope stream `out`. Contents whose type is not CONTENT_HTML are
+ * ignored. Failures are not reported: the function simply returns.
+ *
+ * \param c    content to export
+ * \param path pathname of the file to write (opened with mode "w+", so an
+ *             existing file is truncated)
+ */
+void save_as_text(struct content *c, char *path) {
+	htmlParserCtxtPtr toSave;
+
+	if (c == NULL || path == NULL || c->type != CONTENT_HTML) {
+		return;
+	}
+
+	/* Parse before touching the file system, so that a failure to set up
+	 * the parser does not leave an empty or truncated file behind. */
+	toSave = htmlCreateMemoryParserCtxt(c->source_data, c->source_size);
+	if (toSave == NULL) {
+		return;
+	}
+	htmlParseDocument(toSave);
+
+	/* Only create the file when the parser actually produced a document. */
+	if (toSave->myDoc != NULL) {
+		out = fopen(path, "w+");
+		if (out != NULL) {
+			extract_text(toSave->myDoc);
+			fclose(out);
+			/* don't leave a dangling stream in the file-scope pointer */
+			out = NULL;
+		}
+	}
+
+	/* The document is released separately from the parser context. */
+	xmlFreeDoc(toSave->myDoc);
+	htmlFreeParserCtxt(toSave);
+}
+
+/**
+ * Write the text of a parsed HTML document to the output stream.
+ *
+ * Locates the first element node among the document's top-level children
+ * and, if that element is named "html", passes it to
+ * extract_text_from_tree(). Nothing is written when the document is NULL or
+ * has no such root element.
+ *
+ * \param doc parsed document, may be NULL
+ */
+void extract_text(xmlDoc *doc)
+{
+	xmlNode *html;
+
+	/* the parser may not have produced a document at all */
+	if (doc == NULL) {
+		return;
+	}
+
+	/* find the html element: skip non-element nodes ahead of it */
+	html = doc->children;
+	while (html != NULL && html->type != XML_ELEMENT_NODE) {
+		html = html->next;
+	}
+
+	if (html == NULL || strcmp((const char *) html->name, "html") != 0) {
+		return;
+	}
+
+	extract_text_from_tree(html);
+}
+
+/**
+ * Number of newlines to write after the contents of an element.
+ *
+ * \param name element name
+ * \return 2 for dl, h1, h2, h3, ol, title and ul; 1 for applet, br, div,
+ *         dt, h4, h5, h6, li, object, p and tr; 0 for anything else
+ */
+static int newlines_after_element(const char *name)
+{
+	static const char * const double_break[] = {
+		"dl", "h1", "h2", "h3", "ol", "title", "ul"
+	};
+	static const char * const single_break[] = {
+		"applet", "br", "div", "dt", "h4", "h5", "h6", "li",
+		"object", "p", "tr"
+	};
+	size_t i;
+
+	for (i = 0; i != sizeof double_break / sizeof double_break[0]; i++) {
+		if (strcmp(name, double_break[i]) == 0) {
+			return 2;
+		}
+	}
+
+	for (i = 0; i != sizeof single_break / sizeof single_break[0]; i++) {
+		if (strcmp(name, single_break[i]) == 0) {
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Recursively write the text beneath a node to the output stream `out`.
+ *
+ * Text nodes are converted with squash_tolat1() and written as they are
+ * met. Element nodes have their children visited in document order and are
+ * then followed by zero, one or two newlines depending on the element name
+ * (see newlines_after_element()). Nodes of any other type are ignored.
+ *
+ * \param n node to process
+ */
+void extract_text_from_tree(xmlNode *n)
+{
+	xmlNode *child;
+	char *text;
+	int i, need_nl;
+
+	if (n->type == XML_TEXT_NODE) {
+		/* NOTE(review): guards assume squash_tolat1() neither accepts
+		 * nor is guaranteed not to return NULL - confirm */
+		if (n->content != NULL) {
+			text = squash_tolat1(n->content);
+			if (text != NULL) {
+				fputs(text, out);
+				xfree(text);
+			}
+		}
+		return;
+	}
+
+	if (n->type != XML_ELEMENT_NODE) {
+		return;
+	}
+
+	/* node names are xmlChar strings; cast once for the str* functions */
+	need_nl = newlines_after_element((const char *) n->name);
+
+	/* now recurse; elements not listed above are simply passed through */
+	for (child = n->children; child != NULL; child = child->next) {
+		extract_text_from_tree(child);
+	}
+
+	/* the line breaks follow the element's own content */
+	for (i = 0; i != need_nl; i++) {
+		fputc('\n', out);
+	}
+}
+
+#endif