summaryrefslogtreecommitdiff
path: root/desktop/save_text.c
blob: 18e5654899ac7a4653837860177111479c8cfc84 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/*
 * This file is part of NetSurf, http://netsurf.sourceforge.net/
 * Licensed under the GNU General Public License,
 *                http://www.opensource.org/licenses/gpl-license
 * Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk>
 */

#include <stdbool.h>
#include <string.h>

#include "libxml/HTMLtree.h"

#include "netsurf/utils/config.h"
#include "netsurf/content/content.h"
#include "netsurf/desktop/save_text.h"
#include "netsurf/utils/log.h"
#include "netsurf/utils/utils.h"

#ifdef WITH_TEXT_EXPORT

/* Locate the <html> root element of a parsed document and export its text. */
static void extract_text(xmlDoc *doc);
/* Recursively write the text found beneath a node to the output stream. */
static void extract_text_from_tree(xmlNode *n);

/*
 * Output stream shared by the export helpers in this file.  It is opened
 * and closed by save_as_text() and written to by extract_text_from_tree().
 */
static FILE *out;

/**
 * Export the text of an HTML content to a file.
 *
 * The content's source data is re-parsed with the libxml HTML parser and
 * the text of the resulting tree is written to the file at path.
 *
 * Nothing is done if the content is not CONTENT_HTML or if the file cannot
 * be opened for writing. If the parser context cannot be created, or
 * parsing yields no document, the file is still created but left empty.
 *
 * \param c     content to export; must be non-NULL
 * \param path  path of the file to write (opened with mode "w")
 */
void save_as_text(struct content *c, char *path) {

	htmlParserCtxtPtr toSave;

	if (c->type != CONTENT_HTML) {
		return;
	}

	out = fopen(path, "w");
	if (!out) return;

	toSave = htmlCreateMemoryParserCtxt(c->source_data, c->source_size);
	if (toSave != NULL) {
		htmlParseDocument(toSave);

		/* the parser may fail to produce a document at all */
		if (toSave->myDoc != NULL) {
			extract_text(toSave->myDoc);

			/* the document is released here, explicitly, before
			 * the parser context is freed */
			xmlFreeDoc(toSave->myDoc);
			toSave->myDoc = NULL;
		}

		htmlFreeParserCtxt(toSave);
	}

	fclose(out);
	/* don't leave a dangling stream pointer in the file-scope variable */
	out = NULL;
}

/**
 * Export the text of a parsed document.
 *
 * Finds the first element node among the document's top-level children.
 * If that element is named "html", its subtree is passed to
 * extract_text_from_tree(); otherwise nothing is written.
 *
 * \param doc  parsed document; a NULL document is ignored
 */
static void extract_text(xmlDoc *doc)
{
	xmlNode *html;

	if (doc == NULL) {
		return;
	}

	/* find the html element: skip any leading non-element nodes */
	html = doc->children;
	while (html != NULL && html->type != XML_ELEMENT_NODE) {
		html = html->next;
	}

	if (html == NULL || strcmp((const char *) html->name, "html") != 0) {
		return;
	}

	extract_text_from_tree(html);
}

/**
 * Number of newlines to write after the contents of an element.
 *
 * \param name  element name, or NULL
 * \return 2 or 1 for the elements listed in the tables below, else 0
 */
static int newlines_after_element(const char *name)
{
	/* elements followed by a blank line */
	static const char * const two_newlines[] = {
		"dl", "h1", "h2", "h3", "ol", "title", "ul"
	};
	/* elements followed by a single line break */
	static const char * const one_newline[] = {
		"applet", "br", "div", "dt", "h4", "h5", "h6",
		"li", "object", "p", "tr"
	};
	size_t i;

	if (name == NULL) {
		return 0;
	}

	for (i = 0; i != sizeof two_newlines / sizeof two_newlines[0]; i++) {
		if (strcmp(name, two_newlines[i]) == 0) {
			return 2;
		}
	}

	for (i = 0; i != sizeof one_newline / sizeof one_newline[0]; i++) {
		if (strcmp(name, one_newline[i]) == 0) {
			return 1;
		}
	}

	return 0;
}

/**
 * Recursively write the text beneath a node to the file-scope stream out.
 *
 * Text nodes have their content converted with squash_tolat1() and
 * written. Element nodes are recursed into, and certain elements are
 * followed by one or two newlines once their children have been written.
 * Nodes of any other type are ignored, together with their children.
 *
 * \param n  node to process; out must already be open for writing
 */
static void extract_text_from_tree(xmlNode *n)
{
	xmlNode *child;
	char *text;
	int i, need_nl;

	if (n->type == XML_TEXT_NODE) {
		/* guard against a text node that carries no content */
		if (n->content != NULL) {
			text = squash_tolat1(n->content);
			fputs(text, out);
			xfree(text);
		}
		return;
	}

	if (n->type != XML_ELEMENT_NODE) {
		return;
	}

	/* xmlChar is unsigned char, so cast for the char-based string API */
	need_nl = newlines_after_element((const char *) n->name);

	/* now recurse */
	for (child = n->children; child != NULL; child = child->next) {
		extract_text_from_tree(child);
	}

	/* the separating newlines follow the element's contents */
	for (i = 0; i != need_nl; i++) {
		fputc('\n', out);
	}
}

#endif