From d3892dbb6c4ed4e325de60846cdaa27fc3fc34cf Mon Sep 17 00:00:00 2001
From: John Mark Bell
Date: Mon, 18 Aug 2008 18:33:27 +0000
Subject: Test driver for pre-specified chunk size tests

svn path=/trunk/hubbub/; revision=5151
---
 test/INDEX                      |    1 +
 test/Makefile                   |    2 +-
 test/data/tree-chunks/INDEX     |    5 +
 test/data/tree-chunks/basic.dat |    4 +
 test/tree-buf.c                 | 1207 +++++++++++++++++++++++++++++++++++++++
 5 files changed, 1218 insertions(+), 1 deletion(-)
 create mode 100644 test/data/tree-chunks/INDEX
 create mode 100644 test/data/tree-chunks/basic.dat
 create mode 100644 test/tree-buf.c

diff --git a/test/INDEX b/test/INDEX
index 7614c5d..94ba83b 100644
--- a/test/INDEX
+++ b/test/INDEX
@@ -12,3 +12,4 @@ tokeniser2	HTML tokeniser (again)		tokeniser2
 tokeniser3	HTML tokeniser (byte-by-byte)	tokeniser2
 tree		Treebuilding API		html
 tree2		Treebuilding API		tree-construction
+tree-buf	Treebuilder (specified chunks)	tree-chunks
diff --git a/test/Makefile b/test/Makefile
index 8092460..a26df39 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -45,7 +45,7 @@ endif
 
 # Tests
 TESTS_$(d) := csdetect dict entities hubbub parser \
-	tokeniser tokeniser2 tokeniser3 tree tree2
+	tokeniser tokeniser2 tokeniser3 tree tree2 tree-buf
 
 # Items for top-level makefile to use
 ITEMS_CLEAN := $(ITEMS_CLEAN) \
diff --git a/test/data/tree-chunks/INDEX b/test/data/tree-chunks/INDEX
new file mode 100644
index 0000000..cce0e47
--- /dev/null
+++ b/test/data/tree-chunks/INDEX
@@ -0,0 +1,5 @@
+# Index file for chunked treebuilder tests
+#
+# Test		Description
+
+basic.dat	Basic test of test driver
diff --git a/test/data/tree-chunks/basic.dat b/test/data/tree-chunks/basic.dat
new file mode 100644
index 0000000..aa5647d
--- /dev/null
+++ b/test/data/tree-chunks/basic.dat
@@ -0,0 +1,4 @@
+#chunks 1
+76
+#data
+foobar
diff --git a/test/tree-buf.c b/test/tree-buf.c
new file mode 100644
index 0000000..cc7e345
--- /dev/null
+++ b/test/tree-buf.c
@@ -0,0 +1,1207 @@
+/*
+ * Tree construction tester.
+ *
+ * Feeds a document to the parser in chunks of the sizes given by the test
+ * file, builds a tree from the treebuilder callbacks and prints it.
+ */
+
+#define _GNU_SOURCE
+
+/* NOTE(review): the header names were missing from the copy of this patch
+ * that was reviewed; they have been filled in from what the code uses --
+ * confirm against the tree. */
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <hubbub/hubbub.h>
+#include <hubbub/parser.h>
+#include <hubbub/tree.h>
+
+#include "utils/utils.h"
+
+#include "testutils.h"
+
+typedef struct attr_t attr_t;
+typedef struct node_t node_t;
+typedef struct buf_t buf_t;
+
+/** An attribute of an element node */
+struct attr_t {
+	hubbub_ns ns;
+	char *name;
+	char *value;
+};
+
+/** A node of the tree built from the treebuilder callbacks */
+struct node_t {
+	enum { DOCTYPE, COMMENT, ELEMENT, CHARACTER } type;
+
+	union {
+		struct {
+			char *name;
+			char *public_id;
+			char *system_id;
+		} doctype;
+
+		struct {
+			hubbub_ns ns;
+			char *name;
+			attr_t *attrs;
+			size_t n_attrs;
+		} element;
+
+		char *content;		/**< For comments, characters **/
+	} data;
+
+	node_t *next;
+	node_t *prev;
+
+	node_t *child;
+	node_t *parent;
+
+	uint32_t refcnt;
+};
+
+/** Growable string buffer */
+struct buf_t {
+	char *buf;	/**< Storage, or NULL if nothing has been added */
+	size_t len;	/**< Allocated size of buf */
+	size_t pos;	/**< Length of the string held in buf */
+};
+
+
+#define NUM_NAMESPACES 7
+const char *const ns_names[NUM_NAMESPACES] =
+	{ NULL, NULL /*html*/, "math", "svg", "xlink", "xml", "xmlns" };
+
+/** Value given to the parser to stand for the document node */
+#define DOCUMENT_NODE ((void *) 1)
+
+/** First of the top-level nodes, i.e. the children of the document */
+node_t *Document;
+
+
+
+static void node_print(buf_t *buf, node_t *node, unsigned depth);
+
+static int create_comment(void *ctx, const hubbub_string *data, void **result);
+static int create_doctype(void *ctx, const hubbub_doctype *doctype,
+		void **result);
+static int create_element(void *ctx, const hubbub_tag *tag, void **result);
+static int create_text(void *ctx, const hubbub_string *data, void **result);
+static int ref_node(void *ctx, void *node);
+static int unref_node(void *ctx, void *node);
+static int append_child(void *ctx, void *parent, void *child, void **result);
+static int insert_before(void *ctx, void *parent, void *child, void *ref_child,
+		void **result);
+static int remove_child(void *ctx, void *parent, void *child, void **result);
+static int clone_node(void *ctx, void *node, bool deep, void **result);
+static int reparent_children(void *ctx, void *node, void *new_parent);
+static int get_parent(void *ctx, void *node, bool element_only, void **result);
+static int has_children(void *ctx, void *node, bool *result);
+static int form_associate(void *ctx, void *form, void *node);
+static int add_attributes(void *ctx, void *node,
+		const hubbub_attribute *attributes, uint32_t n_attributes);
+static int set_quirks_mode(void *ctx, hubbub_quirks_mode mode);
+
+static void delete_node(node_t *node);
+static void delete_attr(attr_t *attr);
+
+static hubbub_tree_handler tree_handler = {
+	create_comment,
+	create_doctype,
+	create_element,
+	create_text,
+	ref_node,
+	unref_node,
+	append_child,
+	insert_before,
+	remove_child,
+	clone_node,
+	reparent_children,
+	get_parent,
+	has_children,
+	form_associate,
+	add_attributes,
+	set_quirks_mode,
+	NULL,
+	NULL
+};
+
+/**
+ * Allocator handed to hubbub.
+ *
+ * \param ptr  Block to resize, or NULL to allocate a new one
+ * \param len  Requested size, in bytes
+ * \param pw   Unused
+ * \return Pointer to the block, or NULL on failure
+ */
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	void *ret;
+
+	UNUSED(pw);
+
+	/* A half-arsed attempt at filling freshly allocated space with junk. */
+	if (ptr == NULL) {
+		ret = malloc(len);
+		if (ret != NULL)
+			memset(ret, 0xdf, len);
+	} else {
+		ret = realloc(ptr, len);
+	}
+
+	return ret;
+}
+
+
+/**
+ * Create, initialise, and return, a parser instance.
+ *
+ * The parser uses tree_handler for its tree callbacks and DOCUMENT_NODE as
+ * its document node.
+ */
+static hubbub_parser *setup_parser(void)
+{
+	hubbub_parser *parser;
+	hubbub_parser_optparams params;
+	int error;
+
+	parser = hubbub_parser_create("UTF-8", myrealloc, NULL);
+	assert(parser != NULL);
+
+	/* Calls are made outside assert() so that they still happen if
+	 * assertions are compiled out. */
+	params.tree_handler = &tree_handler;
+	error = hubbub_parser_setopt(parser, HUBBUB_PARSER_TREE_HANDLER,
+			&params);
+	assert(error == HUBBUB_OK);
+
+	params.document_node = DOCUMENT_NODE;
+	error = hubbub_parser_setopt(parser, HUBBUB_PARSER_DOCUMENT_NODE,
+			&params);
+	assert(error == HUBBUB_OK);
+	UNUSED(error);
+
+	/* Don't enable scripting (HUBBUB_PARSER_ENABLE_SCRIPTING) -- we want
+	 * the same behaviour as NetSurf. */
+
+	return parser;
+}
+
+
+
+/*** Buffer handling bits ***/
+
+/**
+ * Append a string to a buffer, growing the buffer as needed.
+ *
+ * \param buf  Buffer to append to, or NULL to print str to stdout instead
+ * \param str  String to append
+ */
+static void buf_add(buf_t *buf, const char *str)
+{
+	size_t len = strlen(str);
+
+	if (!buf) {
+		printf("%s", str);
+		return;
+	}
+
+	/* Start at 1024 bytes, then double until str and its NUL fit */
+	if (buf->buf == NULL || buf->pos + len + 1 > buf->len) {
+		size_t new_len = (buf->buf == NULL) ? 1024 : buf->len;
+		char *new_buf;
+
+		while (buf->pos + len + 1 > new_len)
+			new_len *= 2;
+
+		/* Only replace buf->buf once realloc() is known to have
+		 * worked, so the old block is not lost on failure */
+		new_buf = realloc(buf->buf, new_len);
+		assert(new_buf != NULL);
+
+		buf->buf = new_buf;
+		buf->len = new_len;
+	}
+
+	/* The end of the string is known, so there is no need for strcat()
+	 * to rescan the whole buffer on every call */
+	memcpy(buf->buf + buf->pos, str, len + 1);
+	buf->pos += len;
+}
+
+
+/**
+ * Read one line of the test file; failing to read one is fatal.
+ *
+ * \param buf   Where to store the line
+ * \param size  Size of buf, in bytes
+ * \param fp    File to read from
+ */
+static void read_line(char *buf, size_t size, FILE *fp)
+{
+	char *line = fgets(buf, (int) size, fp);
+
+	assert(line != NULL);
+	UNUSED(line);
+}
+
+/**
+ * Parse an unsigned decimal number from the start of a string.
+ *
+ * \param str    String to parse; leading blanks and trailing text are ignored
+ * \param value  Updated with the number on success
+ * \return true on success, false if there are no digits or the number is
+ *         out of range
+ */
+static bool parse_size(const char *str, size_t *value)
+{
+	unsigned long parsed;
+
+	while (*str == ' ' || *str == '\t')
+		str++;
+
+	/* Insist on a digit: strtoul() would accept a sign, and quietly
+	 * turn a negative number into a huge positive one */
+	if (*str < '0' || *str > '9')
+		return false;
+
+	errno = 0;
+	parsed = strtoul(str, NULL, 10);
+	if (errno == ERANGE)
+		return false;
+
+	*value = parsed;
+
+	return true;
+}
+
+
+/**
+ * Run the test described by a test file.
+ *
+ * \param argc  Number of arguments; must be 3
+ * \param argv  argv[1] is passed to hubbub_initialise() and argv[2] names
+ *              the test file
+ * \return 0 on success, 1 if the arguments are wrong or the test file
+ *         cannot be opened
+ */
+int main(int argc, char **argv)
+{
+	FILE *fp;
+	char buf[4096];
+	size_t *chunks;
+	size_t n_chunks = 0;
+	hubbub_parser *parser;
+	int error;
+	bool ok;
+
+	buf_t got = { NULL, 0, 0 };
+
+	if (argc != 3) {
+		printf("Usage: %s <aliases_file> <filename>\n", argv[0]);
+		return 1;
+	}
+
+	/* Initialise library */
+	error = hubbub_initialise(argv[1], myrealloc, NULL);
+	assert(error == HUBBUB_OK);
+
+	fp = fopen(argv[2], "rb");
+	if (fp == NULL) {
+		printf("Failed opening %s\n", argv[2]);
+		return 1;
+	}
+
+	/* Format:
+	 * #chunks <n>
+	 * <n> lines, each the size in bytes of one chunk
+	 * #data
+	 * <the data to parse>
+	 */
+
+	read_line(buf, sizeof(buf), fp);
+	assert(strncmp(buf, "#chunks ", sizeof("#chunks ") - 1) == 0);
+	ok = parse_size(buf + sizeof("#chunks ") - 1, &n_chunks);
+	assert(ok);
+
+	/* calloc() checks the size multiplication for overflow; ask for at
+	 * least one element so that NULL always means failure */
+	chunks = calloc(n_chunks > 0 ? n_chunks : 1, sizeof *chunks);
+	assert(chunks != NULL);
+
+	for (size_t i = 0; i < n_chunks; i++) {
+		read_line(buf, sizeof(buf), fp);
+		ok = parse_size(buf, &chunks[i]);
+		assert(ok);
+	}
+
+	read_line(buf, sizeof(buf), fp);
+	assert(strcmp(buf, "#data\n") == 0);
+
+	parser = setup_parser();
+
+	for (size_t i = 0; i < n_chunks; i++) {
+		size_t n_read;
+
+		assert(chunks[i] <= sizeof(buf));
+
+		/* Pass on only what was read: after a short read the rest of
+		 * buf still holds stale data */
+		n_read = fread(buf, 1, chunks[i], fp);
+		assert(ferror(fp) == 0);
+		if (n_read == 0)
+			continue;
+
+		error = hubbub_parser_parse_chunk(parser, (uint8_t *) buf,
+				n_read);
+		assert(error == HUBBUB_OK);
+	}
+
+	error = hubbub_parser_completed(parser);
+	assert(error == HUBBUB_OK);
+
+	node_print(&got, Document, 0);
+	/* got.buf is still NULL if no nodes were created */
+	if (got.buf != NULL)
+		printf("%s", got.buf);
+
+	hubbub_parser_destroy(parser);
+	while (Document) {
+		node_t *victim = Document;
+		Document = victim->next;
+		delete_node(victim);
+	}
+
+	printf("PASS\n");
+
+	fclose(fp);
+
+	free(chunks);
+	free(got.buf);
+
+	error = hubbub_finalise(myrealloc, NULL);
+	assert(error == HUBBUB_OK);
+	UNUSED(error);
+	UNUSED(ok);
+
+	return 0;
+}
+
+
+/*** Tree construction functions ***/
+
+/**
+ * Copy at most len bytes from ptr into a new NUL-terminated string.
+ * Running out of memory is fatal.
+ */
+static char *dup_string(const void *ptr, size_t len)
+{
+	char *copy = strndup(ptr, len);
+
+	assert(copy != NULL);
+
+	return copy;
+}
+
+/**
+ * Append the content of one character node to that of another.
+ *
+ * \param dst  Node to extend
+ * \param src  Node to take the extra text from; not modified
+ */
+static void append_text(node_t *dst, const node_t *src)
+{
+	size_t dst_len = strlen(dst->data.content);
+	size_t src_len = strlen(src->data.content);
+	char *joined = realloc(dst->data.content, dst_len + src_len + 1);
+
+	assert(joined != NULL);
+
+	memcpy(joined + dst_len, src->data.content, src_len + 1);
+	dst->data.content = joined;
+}
+
+/**
+ * Create a comment node.
+ *
+ * \param ctx     Unused
+ * \param data    Text of the comment
+ * \param result  Updated with the new node, which has one reference
+ * \return 0
+ */
+int create_comment(void *ctx, const hubbub_string *data, void **result)
+{
+	node_t *node = calloc(1, sizeof *node);
+
+	assert(node != NULL);
+
+	node->type = COMMENT;
+	node->data.content = dup_string(data->ptr, data->len);
+	node->refcnt = 1;
+
+	*result = node;
+
+	return 0;
+}
+
+/**
+ * Create a doctype node.
+ *
+ * \param ctx      Unused
+ * \param doctype  Doctype to copy; an identifier flagged as missing is
+ *                 left NULL in the node
+ * \param result   Updated with the new node, which has one reference
+ * \return 0
+ */
+int create_doctype(void *ctx, const hubbub_doctype *doctype, void **result)
+{
+	node_t *node = calloc(1, sizeof *node);
+
+	assert(node != NULL);
+
+	node->type = DOCTYPE;
+	node->data.doctype.name = dup_string(doctype->name.ptr,
+			doctype->name.len);
+
+	if (!doctype->public_missing) {
+		node->data.doctype.public_id = dup_string(
+				doctype->public_id.ptr,
+				doctype->public_id.len);
+	}
+
+	if (!doctype->system_missing) {
+		node->data.doctype.system_id = dup_string(
+				doctype->system_id.ptr,
+				doctype->system_id.len);
+	}
+	node->refcnt = 1;
+
+	*result = node;
+
+	return 0;
+}
+
+/**
+ * Create an element node, copying the tag's name and attributes.
+ *
+ * \param ctx     Unused
+ * \param tag     Tag to create the element from
+ * \param result  Updated with the new node, which has one reference
+ * \return 0
+ */
+int create_element(void *ctx, const hubbub_tag *tag, void **result)
+{
+	node_t *node = calloc(1, sizeof *node);
+
+	assert(node != NULL);
+	assert(tag->ns < NUM_NAMESPACES);
+
+	node->type = ELEMENT;
+	node->data.element.ns = tag->ns;
+	node->data.element.name = dup_string(tag->name.ptr, tag->name.len);
+	node->data.element.n_attrs = tag->n_attributes;
+
+	node->data.element.attrs = calloc(node->data.element.n_attrs,
+			sizeof *node->data.element.attrs);
+	/* calloc() may legitimately return NULL for zero elements */
+	assert(node->data.element.attrs != NULL || tag->n_attributes == 0);
+
+	for (size_t i = 0; i < tag->n_attributes; i++) {
+		attr_t *attr = &node->data.element.attrs[i];
+
+		assert(tag->attributes[i].ns < NUM_NAMESPACES);
+
+		attr->ns = tag->attributes[i].ns;
+
+		attr->name = dup_string(tag->attributes[i].name.ptr,
+				tag->attributes[i].name.len);
+
+		attr->value = dup_string(tag->attributes[i].value.ptr,
+				tag->attributes[i].value.len);
+	}
+	node->refcnt = 1;
+
+	*result = node;
+
+	return 0;
+}
+
+/**
+ * Create a character (text) node.
+ *
+ * \param ctx     Unused
+ * \param data    Text for the node
+ * \param result  Updated with the new node, which has one reference
+ * \return 0
+ */
+int create_text(void *ctx, const hubbub_string *data, void **result)
+{
+	node_t *node = calloc(1, sizeof *node);
+
+	assert(node != NULL);
+
+	node->type = CHARACTER;
+	node->data.content = dup_string(data->ptr, data->len);
+	node->refcnt = 1;
+
+	*result = node;
+
+	return 0;
+}
+
+/**
+ * Take a reference on a node. DOCUMENT_NODE is not reference counted.
+ *
+ * \return 0
+ */
+int ref_node(void *ctx, void *node)
+{
+	node_t *n = node;
+
+	if (node != DOCUMENT_NODE)
+		n->refcnt++;
+
+	return 0;
+}
+
+/**
+ * Release a reference on a node. The node is deleted when its last
+ * reference goes, provided that it has no parent.
+ *
+ * \return 0
+ */
+int unref_node(void *ctx, void *node)
+{
+	node_t *n = node;
+
+	if (n != DOCUMENT_NODE) {
+		assert(n->refcnt > 0);
+
+		n->refcnt--;
+
+		/* refcnt is a uint32_t, so must not be printed with %d */
+		printf("Unreferencing node %p (%u)\n", node,
+				(unsigned) n->refcnt);
+
+		if (n->refcnt == 0 && n->parent == NULL) {
+			delete_node(n);
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Add a node to the end of a parent's list of children.
+ *
+ * If the child and the parent's current last child are both character
+ * nodes, the child's text is appended to the existing node instead.
+ *
+ * \param ctx     Passed on to ref_node()
+ * \param parent  Node to append to, or DOCUMENT_NODE
+ * \param child   Node to append
+ * \param result  Updated with the node now in the tree: child, or the
+ *                text node that it was merged into. A reference is
+ *                taken on it.
+ * \return 0
+ */
+int append_child(void *ctx, void *parent, void *child, void **result)
+{
+	node_t *tparent = parent;
+	node_t *tchild = child;
+
+	node_t *insert = NULL;
+
+	tchild->next = tchild->prev = NULL;
+
+#ifndef NDEBUG
+	printf("appending (%p):\n", (void *) tchild);
+	node_print(NULL, tchild, 0);
+	printf("to:\n");
+	if (parent != DOCUMENT_NODE)
+		node_print(NULL, tparent, 0);
+#endif
+
+	*result = child;
+
+	if (parent == DOCUMENT_NODE) {
+		if (Document) {
+			insert = Document;
+		} else {
+			Document = tchild;
+		}
+	} else {
+		if (tparent->child == NULL) {
+			tparent->child = tchild;
+		} else {
+			insert = tparent->child;
+		}
+	}
+
+	if (insert) {
+		while (insert->next != NULL) {
+			insert = insert->next;
+		}
+
+		if (tchild->type == CHARACTER && insert->type == CHARACTER) {
+			append_text(insert, tchild);
+			*result = insert;
+		} else {
+			insert->next = tchild;
+			tchild->prev = insert;
+		}
+	}
+
+	if (*result == child)
+		tchild->parent = tparent;
+
+	ref_node(ctx, *result);
+
+	return 0;
+}
+
+/**
+ * Insert a node before one of a parent's existing children.
+ *
+ * If the child and the node before ref_child are both character nodes,
+ * the child's text is appended to that node instead.
+ *
+ * \param ctx        Passed on to ref_node()
+ * \param parent     Parent of ref_child, or DOCUMENT_NODE
+ * \param child      Node to insert
+ * \param ref_child  Node to insert before
+ * \param result     Updated with the node now in the tree: child, or the
+ *                   text node that it was merged into. A reference is
+ *                   taken on it.
+ * \return 0
+ */
+int insert_before(void *ctx, void *parent, void *child, void *ref_child,
+		void **result)
+{
+	node_t *tparent = parent;
+	node_t *tchild = child;
+	node_t *tref = ref_child;
+
+#ifndef NDEBUG
+	printf("inserting (%p):\n", (void *) tchild);
+	node_print(NULL, tchild, 0);
+	printf("before:\n");
+	node_print(NULL, tref, 0);
+	printf("under:\n");
+	if (parent != DOCUMENT_NODE)
+		node_print(NULL, tparent, 0);
+#endif
+
+	if (tchild->type == CHARACTER && tref->prev &&
+			tref->prev->type == CHARACTER) {
+		append_text(tref->prev, tchild);
+
+		*result = tref->prev;
+	} else {
+		tchild->parent = parent;
+
+		tchild->prev = tref->prev;
+		tchild->next = tref;
+		tref->prev = tchild;
+
+		/* The sentinel must not be dereferenced: the first top-level
+		 * node is recorded in Document instead */
+		if (tchild->prev)
+			tchild->prev->next = tchild;
+		else if (parent == DOCUMENT_NODE)
+			Document = tchild;
+		else
+			tparent->child = tchild;
+
+		*result = child;
+	}
+
+	ref_node(ctx, *result);
+
+	return 0;
+}
+
+/**
+ * Detach a node from its parent.
+ *
+ * \param ctx     Passed on to ref_node()
+ * \param parent  Current parent of child, or DOCUMENT_NODE
+ * \param child   Node to detach
+ * \param result  Updated with child, on which a reference is taken
+ * \return 0
+ */
+int remove_child(void *ctx, void *parent, void *child, void **result)
+{
+	node_t *tparent = parent;
+	node_t *tchild = child;
+
+	assert(tchild->parent == tparent);
+
+	printf("Removing child %p\n", child);
+
+	/* The sentinel must not be dereferenced: top-level nodes are
+	 * listed from Document instead */
+	if (parent == DOCUMENT_NODE) {
+		assert(Document != NULL);
+
+		if (Document == tchild)
+			Document = tchild->next;
+	} else {
+		assert(tparent->child);
+
+		if (tparent->child == tchild)
+			tparent->child = tchild->next;
+	}
+
+	if (tchild->prev)
+		tchild->prev->next = tchild->next;
+
+	if (tchild->next)
+		tchild->next->prev = tchild->prev;
+
+	/* now reset all the child's pointers */
+	tchild->next = tchild->prev = tchild->parent = NULL;
+
+	*result = child;
+
+	ref_node(ctx, *result);
+
+	return 0;
+}
+
+/**
+ * Copy a node.
+ *
+ * \param ctx     Passed on when copying children
+ * \param node    Node to copy
+ * \param deep    Whether to copy the node's descendants too
+ * \param result  Updated with the copy, which has one reference and no
+ *                parent or siblings
+ * \return 0
+ */
+int clone_node(void *ctx, void *node, bool deep, void **result)
+{
+	node_t *old_node = node;
+	node_t *new_node = calloc(1, sizeof *new_node);
+
+	assert(new_node != NULL);
+
+	new_node->type = old_node->type;
+
+	switch (old_node->type) {
+	case DOCTYPE:
+		new_node->data.doctype.name =
+				strdup(old_node->data.doctype.name);
+		if (old_node->data.doctype.public_id)
+			new_node->data.doctype.public_id =
+				strdup(old_node->data.doctype.public_id);
+		if (old_node->data.doctype.system_id)
+			new_node->data.doctype.system_id =
+				strdup(old_node->data.doctype.system_id);
+		break;
+	case COMMENT:
+	case CHARACTER:
+		new_node->data.content = strdup(old_node->data.content);
+		break;
+	case ELEMENT:
+		new_node->data.element.ns = old_node->data.element.ns;
+		new_node->data.element.name =
+				strdup(old_node->data.element.name);
+		new_node->data.element.attrs =
+				calloc(old_node->data.element.n_attrs,
+				sizeof *new_node->data.element.attrs);
+		for (size_t i = 0; i < old_node->data.element.n_attrs; i++) {
+			attr_t *attr = &new_node->data.element.attrs[i];
+
+			attr->ns = old_node->data.element.attrs[i].ns;
+			attr->name =
+				strdup(old_node->data.element.attrs[i].name);
+			attr->value =
+				strdup(old_node->data.element.attrs[i].value);
+		}
+		new_node->data.element.n_attrs = old_node->data.element.n_attrs;
+		break;
+	}
+
+	*result = new_node;
+
+	new_node->child = new_node->parent =
+			new_node->next = new_node->prev =
+			NULL;
+
+	new_node->refcnt = 1;
+
+	if (deep == false)
+		return 0;
+
+	node_t *last = NULL;
+
+	for (node_t *child = old_node->child; child != NULL;
+			child = child->next) {
+		void *copy;
+		node_t *n;
+
+		/* Receive the copy through a real void *: writing to a
+		 * node_t * through a void ** cast is not valid C */
+		clone_node(ctx, child, true, &copy);
+		n = copy;
+
+		/* Nothing holds a reference to the copied children */
+		n->refcnt = 0;
+
+		if (last == NULL) {
+			new_node->child = n;
+		} else {
+			last->next = n;
+			n->prev = last;
+		}
+
+		n->parent = new_node;
+		last = n;
+	}
+
+	return 0;
+}
+
+/**
+ * Take all of the child nodes of "node" and append them to "new_parent".
+ *
+ * \param ctx         Unused
+ * \param node        Node to take the children from; must not be
+ *                    DOCUMENT_NODE
+ * \param new_parent  Node to give the children to; must not be
+ *                    DOCUMENT_NODE
+ * \return 0
+ */
+int reparent_children(void *ctx, void *node, void *new_parent)
+{
+	node_t *parent = new_parent;
+	node_t *old_parent = node;
+
+	node_t *insert;
+	node_t *kids;
+
+	/* Both nodes are dereferenced below, so neither may be the sentinel */
+	assert(node != DOCUMENT_NODE && new_parent != DOCUMENT_NODE);
+
+	kids = old_parent->child;
+	if (!kids) return 0;
+
+	old_parent->child = NULL;
+
+	insert = parent->child;
+	if (!insert) {
+		parent->child = kids;
+	} else {
+		while (insert->next != NULL) {
+			insert = insert->next;
+		}
+
+		insert->next = kids;
+		kids->prev = insert;
+	}
+
+	while (kids) {
+		kids->parent = parent;
+		kids = kids->next;
+	}
+
+	return 0;
+}
+
+/**
+ * Get the parent of a node.
+ *
+ * \param ctx           Passed on to ref_node()
+ * \param node          Node to get the parent of
+ * \param element_only  Ignored
+ * \param result        Updated with the parent, on which a reference is
+ *                      taken, or NULL if the node has none
+ * \return 0
+ */
+int get_parent(void *ctx, void *node, bool element_only, void **result)
+{
+	*result = ((node_t *)node)->parent;
+
+	if (*result != NULL)
+		ref_node(ctx, *result);
+
+	return 0;
+}
+
+/**
+ * Find out whether a node has any children.
+ *
+ * \param ctx     Unused
+ * \param node    Node to look at
+ * \param result  Updated with true if the node has children, else false
+ * \return 0
+ */
+int has_children(void *ctx, void *node, bool *result)
+{
+	*result = ((node_t *)node)->child ? true : false;
+
+	return 0;
+}
+
+/** Associate a node with a form. Does nothing; returns 0. */
+int form_associate(void *ctx, void *form, void *node)
+{
+	return 0;
+}
+
+/**
+ * Add attributes to an element.
+ *
+ * \param ctx           Unused
+ * \param vnode         Element to add the attributes to
+ * \param attributes    Attributes to copy onto the element
+ * \param n_attributes  Number of entries in attributes
+ * \return 0
+ */
+int add_attributes(void *ctx, void *vnode,
+		const hubbub_attribute *attributes, uint32_t n_attributes)
+{
+	node_t *node = vnode;
+	size_t old_elems = node->data.element.n_attrs;
+	size_t new_elems = old_elems + n_attributes;
+	attr_t *attrs;
+
+	/* Nothing to add; also avoids a zero-sized realloc() */
+	if (n_attributes == 0)
+		return 0;
+
+	/* Keep the old array until realloc() is known to have worked */
+	attrs = realloc(node->data.element.attrs, new_elems * sizeof *attrs);
+	assert(attrs != NULL);
+
+	node->data.element.attrs = attrs;
+	node->data.element.n_attrs = new_elems;
+
+	for (size_t i = 0; i < n_attributes; i++) {
+		attr_t *attr = &node->data.element.attrs[old_elems + i];
+
+		assert(attributes[i].ns < NUM_NAMESPACES);
+
+		attr->ns = attributes[i].ns;
+
+		attr->name = dup_string(attributes[i].name.ptr,
+				attributes[i].name.len);
+
+		attr->value = dup_string(attributes[i].value.ptr,
+				attributes[i].value.len);
+	}
+
+	return 0;
+}
+
+/** Set the quirks mode of the document. Does nothing; returns 0. */
+int set_quirks_mode(void *ctx, hubbub_quirks_mode mode)
+{
+	return 0;
+}
+
+
+
+/*** Serialising bits ***/
+
+/** qsort() comparator that orders attributes by name */
+static int compare_attrs(const void *a, const void *b) {
+	const attr_t *first = a;
+	const attr_t *second = b;
+
+	return strcmp(first->name, second->name);
+}
+
+/** Add the prefix that starts the line for a node at the given depth */
+static void indent(buf_t *buf, unsigned depth)
+{
+	buf_add(buf, "| ");
+	for (unsigned i = 0; i < depth; i++) {
+		buf_add(buf, " ");
+	}
+}
+
+/** Add the name of a namespace, and a space, if the namespace has a name */
+static void print_ns(buf_t *buf, hubbub_ns ns)
+{
+	if (ns_names[ns] != NULL) {
+		buf_add(buf, ns_names[ns]);
+		buf_add(buf, " ");
+	}
+}
+
+/**
+ * Serialise a single node, without its children or siblings.
+ *
+ * NOTE(review): the "<!DOCTYPE ", "<!-- " and " -->" markers were missing
+ * from the reviewed copy of this patch and have been restored -- confirm.
+ *
+ * \param buf    Buffer to add to, or NULL to print to stdout
+ * \param node   Node to serialise
+ * \param depth  Depth of the node in the tree
+ */
+static void node_print_one(buf_t *buf, node_t *node, unsigned depth)
+{
+	indent(buf, depth);
+
+	switch (node->type)
+	{
+	case DOCTYPE:
+		buf_add(buf, "<!DOCTYPE ");
+		buf_add(buf, node->data.doctype.name);
+
+		if (node->data.doctype.public_id ||
+				node->data.doctype.system_id) {
+			if (node->data.doctype.public_id) {
+				buf_add(buf, " \"");
+				buf_add(buf, node->data.doctype.public_id);
+				buf_add(buf, "\" ");
+			} else {
+				buf_add(buf, "\"\" ");
+			}
+
+			if (node->data.doctype.system_id) {
+				buf_add(buf, " \"");
+				buf_add(buf, node->data.doctype.system_id);
+				buf_add(buf, "\"");
+			} else {
+				buf_add(buf, "\"\"");
+			}
+		}
+
+		buf_add(buf, ">\n");
+		break;
+	case ELEMENT:
+		buf_add(buf, "<");
+		print_ns(buf, node->data.element.ns);
+		buf_add(buf, node->data.element.name);
+		buf_add(buf, ">\n");
+
+		/* qsort() must be given a valid array, which attrs need not
+		 * be when there are no attributes */
+		if (node->data.element.n_attrs > 1) {
+			qsort(node->data.element.attrs,
+					node->data.element.n_attrs,
+					sizeof *node->data.element.attrs,
+					compare_attrs);
+		}
+
+		for (size_t i = 0; i < node->data.element.n_attrs; i++) {
+			indent(buf, depth + 1);
+			print_ns(buf, node->data.element.attrs[i].ns);
+			buf_add(buf, node->data.element.attrs[i].name);
+			buf_add(buf, "=");
+			buf_add(buf, "\"");
+			buf_add(buf, node->data.element.attrs[i].value);
+			buf_add(buf, "\"\n");
+		}
+
+		break;
+	case CHARACTER:
+		buf_add(buf, "\"");
+		buf_add(buf, node->data.content);
+		buf_add(buf, "\"\n");
+		break;
+	case COMMENT:
+		buf_add(buf, "<!-- ");
+		buf_add(buf, node->data.content);
+		buf_add(buf, " -->\n");
+		break;
+	default:
+		printf("Unexpected node type %d\n", node->type);
+		assert(0);
+	}
+}
+
+/**
+ * Serialise a node, its descendants, and the siblings that follow it.
+ *
+ * \param buf    Buffer to add to, or NULL to print to stdout
+ * \param node   First node to serialise; may be NULL
+ * \param depth  Depth of node in the tree
+ */
+static void node_print(buf_t *buf, node_t *node, unsigned depth)
+{
+	/* Siblings are walked in a loop so that only the depth of the tree,
+	 * not the length of a sibling list, adds to the recursion depth */
+	for (; node != NULL; node = node->next) {
+		node_print_one(buf, node, depth);
+
+		if (node->child)
+			node_print(buf, node->child, depth + 1);
+	}
+}
+
+/**
+ * Free a node and all of its descendants.
+ *
+ * \param node  Node to free, or NULL. It, and each of its descendants,
+ *              must have no references left.
+ */
+static void delete_node(node_t *node)
+{
+	if (node == NULL)
+		return;
+
+	if (node->refcnt != 0) {
+		printf("Node %p has non-zero refcount %u\n",
+				(void *) node, (unsigned) node->refcnt);
+		assert(0);
+	}
+
+	switch (node->type) {
+	case DOCTYPE:
+		free(node->data.doctype.name);
+		free(node->data.doctype.public_id);
+		free(node->data.doctype.system_id);
+		break;
+	case COMMENT:
+	case CHARACTER:
+		free(node->data.content);
+		break;
+	case ELEMENT:
+		free(node->data.element.name);
+		for (size_t i = 0; i < node->data.element.n_attrs; i++)
+			delete_attr(&node->data.element.attrs[i]);
+		free(node->data.element.attrs);
+		break;
+	}
+
+	node_t *c, *d;
+
+	for (c = node->child; c != NULL; c = d) {
+		d = c->next;
+
+		delete_node(c);
+	}
+
+	/* Fill the node with junk to help show up use after free */
+	memset(node, 0xdf, sizeof(node_t));
+
+	free(node);
+}
+
+/**
+ * Free the strings held by an attribute. The attribute itself is not freed.
+ *
+ * \param attr  Attribute to empty, or NULL
+ */
+static void delete_attr(attr_t *attr)
+{
+	if (attr == NULL)
+		return;
+
+	free(attr->name);
+	free(attr->value);
+
+	memset(attr, 0xdf, sizeof(attr_t));
+}
+
-- 
cgit v1.2.3