desktop/save_text.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126

/*
 * Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk>
 *
 * This file is part of NetSurf, http://www.netsurf-browser.org/
 *
 * NetSurf is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; version 2 of the License.
 *
 * NetSurf is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <stdbool.h>
#include <string.h>

#include <libxml/HTMLtree.h>

#include "utils/config.h"
#include "content/content.h"
#include "desktop/save_text.h"
#include "utils/log.h"
#include "utils/utils.h"

#ifdef WITH_TEXT_EXPORT

/* Find the top-level "html" element of a parsed document and write the
 * text beneath it to the output file. */
static void extract_text(xmlDoc *doc);
/* Recursively write the text of a node and its descendants to the output
 * file, adding line breaks after certain elements. */
static void extract_text_from_tree(xmlNode *n);

/* Output stream shared by the functions in this file: opened and closed by
 * save_as_text(), written to by extract_text_from_tree(). */
static FILE *out;

/**
 * Save the text of an HTML content to a file.
 *
 * The content's source data is parsed again with libxml's HTML parser and
 * the text found in the resulting tree is written to the file at path.
 * The function returns nothing, so every failure (wrong content type, file
 * cannot be opened, parser context cannot be created, no document produced)
 * is silent.
 *
 * \param  c     content to save; anything other than CONTENT_HTML is ignored
 * \param  path  path of the file to write, opened with mode "w"
 */
void save_as_text(struct content *c, char *path) {

	htmlParserCtxtPtr toSave;

	if (c->type != CONTENT_HTML) {
		return;
	}

	out = fopen(path, "w");
	if (!out)
		return;

	toSave = htmlCreateMemoryParserCtxt(c->source_data, c->source_size);
	if (toSave == NULL) {
		/* no parser context: nothing can be extracted, so release the
		 * file rather than dereferencing a null context below */
		fclose(out);
		out = NULL;
		return;
	}

	/* The return value is deliberately not used to abort: the parser may
	 * still have built a (partial) document, which is checked for below. */
	htmlParseDocument(toSave);

	if (toSave->myDoc != NULL) {
		extract_text(toSave->myDoc);
	}

	fclose(out);
	/* don't leave the file-scope stream pointing at a closed FILE */
	out = NULL;

	/* the document is not released with the context, free it explicitly */
	if (toSave->myDoc != NULL) {
		xmlFreeDoc(toSave->myDoc);
		toSave->myDoc = NULL;
	}
	htmlFreeParserCtxt(toSave);
}

/**
 * Write the text of a parsed HTML document to the output file.
 *
 * The first element node among the document's top-level children is
 * located; if it is the "html" element, the text beneath it is extracted.
 * Nothing is written if there is no document, no top-level element, or the
 * first top-level element is not "html".
 *
 * \param  doc  parsed document, may be NULL
 */
void extract_text(xmlDoc *doc)
{
	xmlNode *html;

	/* the parser may not have produced a document at all */
	if (doc == NULL)
		return;

	/* find the html element: skip any non-element nodes that precede
	 * the first element at the top level */
	for (html = doc->children; html != NULL; html = html->next) {
		if (html->type == XML_ELEMENT_NODE)
			break;
	}

	if (html == NULL || html->name == NULL ||
	    strcmp((const char *) html->name, "html") != 0) {
		return;
	}

	extract_text_from_tree(html);
}

void extract_text_from_tree(xmlNode *n)
{
	xmlNode *this_node;
	char *text;
	int need_nl = 0;

	if (n->type == XML_ELEMENT_NODE) {
		if (strcmp(n->name, "dl") == 0 ||
		    strcmp(n->name, "h1") == 0 ||
		    strcmp(n->name, "h2") == 0 ||
		    strcmp(n->name, "h3") == 0 ||
		    strcmp(n->name, "ol") == 0 ||
		    strcmp(n->name, "title") == 0 ||
		    strcmp(n->name, "ul") == 0) {
			need_nl = 2;
		}
		else if (strcmp(n->name, "applet") == 0 ||
			 strcmp(n->name, "br") == 0 ||
			 strcmp(n->name, "div") == 0 ||
			 strcmp(n->name, "dt") == 0 ||
			 strcmp(n->name, "h4") == 0 ||
			 strcmp(n->name, "h5") == 0 ||
			 strcmp(n->name, "h6") == 0 ||
			 strcmp(n->name, "li") == 0 ||
			 strcmp(n->name, "object") == 0 ||
			 strcmp(n->name, "p") == 0 ||
			 strcmp(n->name, "tr") == 0) {
			need_nl = 1;
		}
		/* do nothing, we just recurse through these nodes */
	}
	else if (n->type == XML_TEXT_NODE) {
		if ((text = squash_whitespace(n->content)) != NULL) {
			fputs(text, out);
			free(text);
		}
		return;
	}
	else {
		return;
	}

	/* now recurse */
	for (this_node = n->children; this_node != 0; this_node = this_node->next) {
		extract_text_from_tree(this_node);
	}

	while (need_nl--)
		fputc('\n', out);
}

#endif