summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Drake <tlsa@netsurf-browser.org>2008-04-13 18:21:22 +0000
committerMichael Drake <tlsa@netsurf-browser.org>2008-04-13 18:21:22 +0000
commit5c8406bf8b9eec4a216a92cc72de53c8ee23ba7c (patch)
treeb0f86cd2787c8af1df5bf5f41036d5cdc791a4b1
parent78c22d4171bc07c66252d4b6464bf000c6ee59a7 (diff)
downloadnetsurf-5c8406bf8b9eec4a216a92cc72de53c8ee23ba7c.tar.gz
netsurf-5c8406bf8b9eec4a216a92cc72de53c8ee23ba7c.tar.bz2
Rewrite text export to use box tree. Improves output formatting. Simplify selection saving.
svn path=/trunk/netsurf/; revision=4094
-rw-r--r--desktop/save_text.c310
-rw-r--r--desktop/save_text.h24
-rw-r--r--desktop/selection.c105
-rw-r--r--desktop/selection.h2
4 files changed, 277 insertions, 164 deletions
diff --git a/desktop/save_text.c b/desktop/save_text.c
index 6fea9664d..3258a75ee 100644
--- a/desktop/save_text.c
+++ b/desktop/save_text.c
@@ -1,5 +1,6 @@
/*
* Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk>
+ * Copyright 2008 Michael Drake <tlsa@netsurf-browser.org>
*
* This file is part of NetSurf, http://www.netsurf-browser.org/
*
@@ -16,111 +17,280 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+/** \file
+ * Text export of HTML (implementation).
+ */
+
+#include <assert.h>
#include <stdbool.h>
#include <string.h>
-#include <libxml/HTMLtree.h>
-
#include "utils/config.h"
#include "content/content.h"
#include "desktop/save_text.h"
+#include "render/box.h"
#include "utils/log.h"
+#include "utils/utf8.h"
#include "utils/utils.h"
#ifdef WITH_TEXT_EXPORT
-static void extract_text(xmlDoc *doc);
-static void extract_text_from_tree(xmlNode *n);
+static void extract_text(struct box *box, bool *first,
+ save_text_whitespace *before, struct save_text_state *save);
+static bool save_text_add_to_buffer(const char *text, size_t length,
+ struct box *box, const char *whitespace_text,
+ size_t whitespace_length, struct save_text_state *save);
-static FILE *out;
-void save_as_text(struct content *c, char *path) {
+/**
+ * Extract the text from an HTML content and save it was a text file. Text is
+ * converted to local encoding.
+ *
+ * \param c An HTML content.
+ * \param path Path to save text file too.
+ */
- htmlParserCtxtPtr toSave;
+void save_as_text(struct content *c, char *path)
+{
+ FILE *out;
+ struct save_text_state save = { NULL, 0, 0 };
+ save_text_whitespace before = WHITESPACE_NONE;
+ bool first = true;
+ utf8_convert_ret ret;
+ char *result;
- if (c->type != CONTENT_HTML) {
+ if (c && c->type != CONTENT_HTML) {
return;
}
- out = fopen(path, "w");
- if (!out) return;
-
- toSave = htmlCreateMemoryParserCtxt(c->source_data, c->source_size);
- htmlParseDocument(toSave);
+ extract_text(c->data.html.layout, &first, &before, &save);
+ if (!save.block)
+ return;
- extract_text(toSave->myDoc);
+ ret = utf8_to_local_encoding(save.block, save.length, &result);
+ free(save.block);
- fclose(out);
+ if (ret != UTF8_CONVERT_OK) {
+ LOG(("failed to convert to local encoding, return %d", ret));
+ return;
+ }
- xmlFreeDoc(toSave->myDoc);
- htmlFreeParserCtxt(toSave);
+ out = fopen(path, "w");
+ if (out) {
+ int res = fputs(result, out);
+ res = fputs("\n", out);
+ fclose(out);
+ }
+ free(result);
}
-void extract_text(xmlDoc *doc)
+
+/**
+ * Decide what whitespace to place before the next bit of content-related text
+ * that is saved. Any existing whitespace is overridden with if the whitespace
+ * for this box is more "significant".
+ *
+ * \param box Pointer to box.
+ * \param first Whether this is before the first bit of content-related
+ * text to be saved.
+ * \param before Type of whitespace currently intended to be placed
+ * before the next bit of content-related text to be saved.
+ * Updated if this box is worthy of more significant
+ * whitespace.
+ * \param whitespace_text Whitespace to place before next bit of
+ * content-related text to be saved.
+ * Updated if this box is worthy of more significant
+ * whitespace.
+ * \param whitespace_length Length of whitespace_text.
+ * Updated if this box is worthy of more significant
+ * whitespace.
+ */
+
+void save_text_solve_whitespace(struct box *box, bool *first,
+ save_text_whitespace *before, const char **whitespace_text,
+ size_t *whitespace_length)
{
- xmlNode *html;
-
- /* find the html element */
- for (html = doc->children;
- html!=0 && html->type != XML_ELEMENT_NODE;
- html = html->next)
- ;
- if (html == 0 || strcmp((const char*)html->name, "html") != 0) {
- return;
+ /* work out what whitespace should be placed before the next bit of
+ * text */
+ if (*before < WHITESPACE_TWO_NEW_LINES &&
+ /* significant box type */
+ (box->type == BOX_BLOCK ||
+ box->type == BOX_TABLE ||
+ box->type == BOX_FLOAT_LEFT ||
+ box->type == BOX_FLOAT_RIGHT) &&
+ /* and not a list element */
+ !box->list_marker &&
+ /* and not a marker... */
+ (!(box->parent && box->parent->list_marker == box) ||
+ /* ...unless marker follows WHITESPACE_TAB */
+ ((box->parent && box->parent->list_marker == box) &&
+ *before == WHITESPACE_TAB))) {
+ *before = WHITESPACE_TWO_NEW_LINES;
+ }
+ else if (*before <= WHITESPACE_ONE_NEW_LINE &&
+ (box->type == BOX_TABLE_ROW ||
+ box->type == BOX_BR ||
+ (box->type != BOX_INLINE &&
+ (box->parent && box->parent->list_marker == box)) ||
+ (box->parent->style &&
+ (box->parent->style->white_space ==
+ CSS_WHITE_SPACE_PRE ||
+ box->parent->style->white_space ==
+ CSS_WHITE_SPACE_PRE_WRAP) &&
+ box->type == BOX_INLINE_CONTAINER))) {
+ if (*before == WHITESPACE_ONE_NEW_LINE)
+ *before = WHITESPACE_TWO_NEW_LINES;
+ else
+ *before = WHITESPACE_ONE_NEW_LINE;
+ }
+ else if (*before < WHITESPACE_TAB &&
+ (box->type == BOX_TABLE_CELL ||
+ box->list_marker)) {
+ *before = WHITESPACE_TAB;
}
- extract_text_from_tree(html);
+ if (*first) {
+ /* before the first bit of text to be saved; there is
+ * no preceding whitespace */
+ *whitespace_text = "";
+ *whitespace_length = 0;
+ } else {
+ /* set the whitespace that has been decided on */
+ switch (*before) {
+ case WHITESPACE_TWO_NEW_LINES:
+ *whitespace_text = "\n\n";
+ *whitespace_length = 2;
+ break;
+ case WHITESPACE_ONE_NEW_LINE:
+ *whitespace_text = "\n";
+ *whitespace_length = 1;
+ break;
+ case WHITESPACE_TAB:
+ *whitespace_text = "\t";
+ *whitespace_length = 1;
+ break;
+ case WHITESPACE_NONE:
+ *whitespace_text = "";
+ *whitespace_length = 0;
+ break;
+ default:
+ *whitespace_text = "";
+ *whitespace_length = 0;
+ break;
+ }
+ }
}
-void extract_text_from_tree(xmlNode *n)
+
+/**
+ * Traverse though the box tree and add all text to a save buffer.
+ *
+ * \param box Pointer to box.
+ * \param first Whether this is before the first bit of content-related
+ * text to be saved.
+ * \param before Type of whitespace currently intended to be placed
+ * before the next bit of content-related text to be saved.
+ * Updated if this box is worthy of more significant
+ * whitespace.
+ * \param save our save_text_state workspace pointer
+ * \return true iff the file writing succeeded and traversal should continue.
+ */
+
+void extract_text(struct box *box, bool *first, save_text_whitespace *before,
+ struct save_text_state *save)
{
- xmlNode *this_node;
- char *text;
- int need_nl = 0;
-
- if (n->type == XML_ELEMENT_NODE) {
- if (strcmp(n->name, "dl") == 0 ||
- strcmp(n->name, "h1") == 0 ||
- strcmp(n->name, "h2") == 0 ||
- strcmp(n->name, "h3") == 0 ||
- strcmp(n->name, "ol") == 0 ||
- strcmp(n->name, "title") == 0 ||
- strcmp(n->name, "ul") == 0) {
- need_nl = 2;
- }
- else if (strcmp(n->name, "applet") == 0 ||
- strcmp(n->name, "br") == 0 ||
- strcmp(n->name, "div") == 0 ||
- strcmp(n->name, "dt") == 0 ||
- strcmp(n->name, "h4") == 0 ||
- strcmp(n->name, "h5") == 0 ||
- strcmp(n->name, "h6") == 0 ||
- strcmp(n->name, "li") == 0 ||
- strcmp(n->name, "object") == 0 ||
- strcmp(n->name, "p") == 0 ||
- strcmp(n->name, "tr") == 0) {
- need_nl = 1;
- }
- /* do nothing, we just recurse through these nodes */
+ struct box *child;
+ const char *whitespace_text = "";
+ size_t whitespace_length = 0;
+
+ assert(box);
+
+ /* If box has a list marker */
+ if (box->list_marker) {
+ /* do the marker box before continuing with the rest of the
+ * list element */
+ extract_text(box->list_marker, first, before, save);
}
- else if (n->type == XML_TEXT_NODE) {
- if ((text = squash_whitespace(n->content)) != NULL) {
- fputs(text, out);
- free(text);
- }
- return;
+
+ /* read before calling the handler in case it modifies the tree */
+ child = box->children;
+
+ save_text_solve_whitespace(box, first, before, &whitespace_text,
+ &whitespace_length);
+
+ if (box->type != BOX_BR && !((box->type == BOX_FLOAT_LEFT ||
+ box->type == BOX_FLOAT_RIGHT) && !box->text) &&
+ box->length > 0 && box->text) {
+ /* Box meets criteria for export; add text to buffer */
+ save_text_add_to_buffer(box->text, box->length, box,
+ whitespace_text, whitespace_length, save);
+ *first = false;
+ *before = WHITESPACE_NONE;
}
- else {
- return;
+
+ /* Work though the children of this box, extracting any text */
+ while (child) {
+ extract_text(child, first, before, save);
+ child = child->next;
}
- /* now recurse */
- for (this_node = n->children; this_node != 0; this_node = this_node->next) {
- extract_text_from_tree(this_node);
+ return;
+}
+
+
+/**
+ * Add text to save text buffer. Any preceding whitespace or following space is
+ * also added to the buffer.
+ *
+ * \param text Pointer to text being added.
+ * \param length Length of text to be appended (bytes).
+ * \param box Pointer to text box.
+ * \param whitespace_text Whitespace to place before text for formatting
+ * may be NULL.
+ * \param whitespace_length Length of whitespace_text.
+ * \param save Our save_text_state workspace pointer.
+ * \return true iff the file writing succeeded and traversal should continue.
+ */
+
+bool save_text_add_to_buffer(const char *text, size_t length, struct box *box,
+ const char *whitespace_text, size_t whitespace_length,
+ struct save_text_state *save)
+{
+ size_t new_length;
+ int space = 0;
+
+ assert(save);
+
+ if (box->space > 0)
+ space = 1;
+
+ if (whitespace_text)
+ length += whitespace_length;
+
+ new_length = save->length + whitespace_length + length + space;
+ if (new_length >= save->alloc) {
+ size_t new_alloc = save->alloc + (save->alloc / 4);
+ char *new_block;
+
+ if (new_alloc < new_length) new_alloc = new_length;
+
+ new_block = realloc(save->block, new_alloc);
+ if (!new_block) return false;
+
+ save->block = new_block;
+ save->alloc = new_alloc;
+ }
+ if (whitespace_text) {
+ memcpy(save->block + save->length, whitespace_text,
+ whitespace_length);
}
+ memcpy(save->block + save->length + whitespace_length, text, length);
+ save->length += length;
+
+ if (space == 1)
+ save->block[save->length++] = ' ';
- while (need_nl--)
- fputc('\n', out);
+ return true;
}
#endif
diff --git a/desktop/save_text.h b/desktop/save_text.h
index 9ae839b95..b452f7353 100644
--- a/desktop/save_text.h
+++ b/desktop/save_text.h
@@ -1,5 +1,6 @@
/*
* Copyright 2004 John M Bell <jmb202@ecs.soton.ac.uk>
+ * Copyright 2008 Michael Drake <tlsa@netsurf-browser.org>
*
* This file is part of NetSurf, http://www.netsurf-browser.org/
*
@@ -16,11 +17,34 @@
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+/** \file
+ * Text export of HTML (interface).
+ */
+
#ifndef _NETSURF_DESKTOP_SAVE_TEXT_H_
#define _NETSURF_DESKTOP_SAVE_TEXT_H_
+#include "render/box.h"
+
struct content;
+/* text currently being saved */
+struct save_text_state {
+ char *block;
+ size_t length;
+ size_t alloc;
+};
+
+typedef enum {
+ WHITESPACE_NONE,
+ WHITESPACE_TAB,
+ WHITESPACE_ONE_NEW_LINE,
+ WHITESPACE_TWO_NEW_LINES
+} save_text_whitespace;
+
void save_as_text(struct content *c, char *path);
+void save_text_solve_whitespace(struct box *box, bool *first,
+ save_text_whitespace *before, const char **whitespace_text,
+ size_t *whitespace_length);
#endif
diff --git a/desktop/selection.c b/desktop/selection.c
index 4d675cc0c..31c5def60 100644
--- a/desktop/selection.c
+++ b/desktop/selection.c
@@ -18,8 +18,7 @@
*/
/** \file
- * Text selection within browser windows,
- * (implementation, platform independent)
+ * Text selection within browser windows (implementation).
*/
#include <assert.h>
@@ -29,6 +28,7 @@
#include "desktop/gui.h"
#include "desktop/plotters.h"
+#include "desktop/save_text.h"
#include "desktop/selection.h"
#include "render/box.h"
#include "render/font.h"
@@ -64,21 +64,6 @@ struct rdw_info {
struct rect r;
};
-
-/* text selection currently being saved */
-struct save_state {
- char *block;
- size_t length;
- size_t alloc;
-};
-
-typedef enum {
- WHITESPACE_NONE,
- WHITESPACE_TAB,
- WHITESPACE_ONE_NEW_LINE,
- WHITESPACE_TWO_NEW_LINES
-} seln_whitespace;
-
static bool redraw_handler(const char *text, size_t length, struct box *box,
void *handle, const char *whitespace_text,
size_t whitespace_length);
@@ -92,7 +77,7 @@ static bool selected_part(struct box *box, unsigned start_idx, unsigned end_idx,
unsigned *start_offset, unsigned *end_offset);
static bool traverse_tree(struct box *box, unsigned start_idx, unsigned end_idx,
unsigned int num_space, seln_traverse_handler handler,
- void *handle, seln_whitespace *before, bool *first,
+ void *handle, save_text_whitespace *before, bool *first,
bool do_marker);
static struct box *get_box(struct box *b, unsigned offset, int *pidx);
@@ -451,7 +436,7 @@ bool selected_part(struct box *box, unsigned start_idx, unsigned end_idx,
bool traverse_tree(struct box *box, unsigned start_idx, unsigned end_idx,
unsigned int num_space, seln_traverse_handler handler,
- void *handle, seln_whitespace *before, bool *first,
+ void *handle, save_text_whitespace *before, bool *first,
bool do_marker)
{
struct box *child;
@@ -486,64 +471,8 @@ bool traverse_tree(struct box *box, unsigned start_idx, unsigned end_idx,
/* If nicely formatted output of the selected text is required, work
* out what whitespace should be placed before the next bit of text */
if (before) {
- if (*before < WHITESPACE_TWO_NEW_LINES &&
- /* significant box type */
- (box->type == BOX_BLOCK ||
- box->type == BOX_TABLE ||
- box->type == BOX_FLOAT_LEFT ||
- box->type == BOX_FLOAT_RIGHT) &&
- /* and not a list element */
- !box->list_marker &&
- /* and not a marker... */
- (!do_marker ||
- /* ...unless marker follows WHITESPACE_TAB */
- (do_marker && *before == WHITESPACE_TAB))) {
- *before = WHITESPACE_TWO_NEW_LINES;
- }
- else if (*before <= WHITESPACE_ONE_NEW_LINE &&
- (box->type == BOX_TABLE_ROW ||
- box->type == BOX_BR ||
- (box->type != BOX_INLINE && do_marker) ||
- (box->parent->style &&
- (box->parent->style->white_space ==
- CSS_WHITE_SPACE_PRE ||
- box->parent->style->white_space ==
- CSS_WHITE_SPACE_PRE_WRAP) &&
- box->type == BOX_INLINE_CONTAINER))) {
- if (*before == WHITESPACE_ONE_NEW_LINE)
- *before = WHITESPACE_TWO_NEW_LINES;
- else
- *before = WHITESPACE_ONE_NEW_LINE;
- }
- else if (*before < WHITESPACE_TAB &&
- (box->type == BOX_TABLE_CELL ||
- box->list_marker)) {
- *before = WHITESPACE_TAB;
- }
- if (*first) {
- whitespace_text = "";
- } else {
- switch (*before) {
- case WHITESPACE_TWO_NEW_LINES:
- whitespace_text = "\n\n";
- whitespace_length = 2;
- break;
- case WHITESPACE_ONE_NEW_LINE:
- whitespace_text = "\n";
- whitespace_length = 1;
- break;
- case WHITESPACE_TAB:
- whitespace_text = "\t";
- whitespace_length = 1;
- break;
- case WHITESPACE_NONE:
- whitespace_text = "";
- break;
- default:
- whitespace_text = "";
- break;
- }
- }
+ save_text_solve_whitespace(box, first, before, &whitespace_text,
+ &whitespace_length);
}
else {
whitespace_text = NULL;
@@ -612,7 +541,7 @@ bool selection_traverse(struct selection *s, seln_traverse_handler handler,
void *handle)
{
struct content *c;
- seln_whitespace before = WHITESPACE_NONE;
+ save_text_whitespace before = WHITESPACE_NONE;
bool first = true;
if (!selection_defined(s))
@@ -952,7 +881,7 @@ bool save_handler(const char *text, size_t length, struct box *box,
void *handle, const char *whitespace_text,
size_t whitespace_length)
{
- struct save_state *sv = handle;
+ struct save_text_state *sv = handle;
size_t new_length;
int space = 0;
@@ -1002,7 +931,7 @@ bool save_handler(const char *text, size_t length, struct box *box,
bool selection_save_text(struct selection *s, const char *path)
{
struct content *c = s->bw->current_content;
- struct save_state sv = { NULL, 0, 0 };
+ struct save_text_state sv = { NULL, 0, 0 };
utf8_convert_ret ret;
char *result;
FILE *out;
@@ -1014,19 +943,6 @@ bool selection_save_text(struct selection *s, const char *path)
return false;
}
- /* add newline at end */
- sv.length++;
- if (sv.length > sv.alloc) {
- char *new_block;
-
- new_block = realloc(sv.block, sv.length);
- if (!new_block) return false;
-
- sv.block = new_block;
- sv.alloc = sv.length;
- }
- sv.block[sv.length - 1] = '\n';
-
if (!sv.block)
return false;
@@ -1041,9 +957,12 @@ bool selection_save_text(struct selection *s, const char *path)
out = fopen(path, "w");
if (out) {
int res = fputs(result, out);
+ res = fputs("\n", out);
fclose(out);
+ free(result);
return (res != EOF);
}
+ free(result);
return false;
}
diff --git a/desktop/selection.h b/desktop/selection.h
index 4520ac9f6..c04e503f3 100644
--- a/desktop/selection.h
+++ b/desktop/selection.h
@@ -17,7 +17,7 @@
*/
/** \file
- * Text selection within browser windows, platform-independent part (interface)
+ * Text selection within browser windows (interface).
*/
#ifndef _NETSURF_DESKTOP_SELECTION_H_