From 7b30a5520cfb56e651f0eb4da85a3e07747da7dc Mon Sep 17 00:00:00 2001
From: John Mark Bell
Date: Sat, 23 Jun 2007 22:40:25 +0000
Subject: Import hubbub -- an HTML parsing library.

Plenty of work still to do (like tree generation ;)

svn path=/trunk/hubbub/; revision=3359
---
 test/tokeniser.c | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 174 insertions(+)
 create mode 100644 test/tokeniser.c

diff --git a/test/tokeniser.c b/test/tokeniser.c
new file mode 100644
index 0000000..271b986
--- /dev/null
+++ b/test/tokeniser.c
@@ -0,0 +1,174 @@
+#include <stdio.h>
+#include <stdlib.h>
+
+#include <hubbub/hubbub.h>
+
+#include "utils/utils.h"
+
+#include "input/inputstream.h"
+#include "tokeniser/tokeniser.h"
+
+#include "testutils.h"
+
+static const uint8_t *pbuffer;
+
+static void buffer_handler(const uint8_t *buffer, size_t len, void *pw);
+static void token_handler(const hubbub_token *token, void *pw);
+
+static void *myrealloc(void *ptr, size_t len, void *pw)
+{
+	UNUSED(pw);
+
+	return realloc(ptr, len);
+}
+
+int main(int argc, char **argv)
+{
+	hubbub_inputstream *stream;
+	hubbub_tokeniser *tok;
+	hubbub_tokeniser_optparams params;
+	FILE *fp;
+	size_t len, origlen;
+#define CHUNK_SIZE (4096)
+	uint8_t buf[CHUNK_SIZE];
+
+	if (argc != 3) {
+		printf("Usage: %s <aliases_file> <filename>\n", argv[0]);
+		return 1;
+	}
+
+	/* Initialise library */
+	assert(hubbub_initialise(argv[1], myrealloc, NULL) == HUBBUB_OK);
+
+	stream = hubbub_inputstream_create("UTF-8", "UTF-8", myrealloc, NULL);
+	assert(stream != NULL);
+
+	tok = hubbub_tokeniser_create(stream, myrealloc, NULL);
+	assert(tok != NULL);
+
+	params.buffer_handler.handler = buffer_handler;
+	params.buffer_handler.pw = NULL;
+	assert(hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_BUFFER_HANDLER,
+			&params) == HUBBUB_OK);
+
+	params.token_handler.handler = token_handler;
+	params.token_handler.pw = NULL;
+	assert(hubbub_tokeniser_setopt(tok, HUBBUB_TOKENISER_TOKEN_HANDLER,
+			&params) == HUBBUB_OK);
+
+	fp = fopen(argv[2], "rb");
+	if (fp == NULL) {
+		printf("Failed opening %s\n", argv[2]);
+		return 1;
+	}
+
+	fseek(fp, 0, SEEK_END);
+	origlen = len = ftell(fp);
+	fseek(fp, 0, SEEK_SET);
+
+	while (len >= CHUNK_SIZE) {
+		fread(buf, 1, CHUNK_SIZE, fp);
+
+		assert(hubbub_inputstream_append(stream,
+				buf, CHUNK_SIZE) == HUBBUB_OK);
+
+		len -= CHUNK_SIZE;
+
+		assert(hubbub_tokeniser_run(tok) == HUBBUB_OK);
+	}
+
+	if (len > 0) {
+		fread(buf, 1, len, fp);
+
+		assert(hubbub_inputstream_append(stream,
+				buf, len) == HUBBUB_OK);
+
+		len = 0;
+
+		assert(hubbub_inputstream_append(stream, NULL, 0) ==
+				HUBBUB_OK);
+
+		assert(hubbub_tokeniser_run(tok) == HUBBUB_OK);
+	}
+
+	fclose(fp);
+
+	hubbub_tokeniser_destroy(tok);
+
+	hubbub_inputstream_destroy(stream);
+
+	assert(hubbub_finalise(myrealloc, NULL) == HUBBUB_OK);
+
+	printf("PASS\n");
+
+	return 0;
+}
+
+void buffer_handler(const uint8_t *buffer, size_t len, void *pw)
+{
+	UNUSED(len);
+	UNUSED(pw);
+
+	pbuffer = buffer;
+}
+
+void token_handler(const hubbub_token *token, void *pw)
+{
+	static const char *token_names[] = {
+		"DOCTYPE", "START TAG", "END TAG",
+		"COMMENT", "CHARACTERS", "EOF"
+	};
+	size_t i;
+
+	UNUSED(pw);
+
+	printf("%s: ", token_names[token->type]);
+
+	switch (token->type) {
+	case HUBBUB_TOKEN_DOCTYPE:
+		printf("'%.*s' (%svalid)\n",
+				(int) token->data.doctype.name.len,
+				pbuffer + token->data.doctype.name.data_off,
+				token->data.doctype.correct ? "" : "in");
+		break;
+	case HUBBUB_TOKEN_START_TAG:
+		printf("'%.*s' %s\n",
+				(int) token->data.tag.name.len,
+				pbuffer + token->data.tag.name.data_off,
+				(token->data.tag.n_attributes > 0) ?
						"attributes:" : "");
+		for (i = 0; i < token->data.tag.n_attributes; i++) {
+			printf("\t'%.*s' = '%.*s'\n",
+					(int) token->data.tag.attributes[i].name.len,
+					pbuffer + token->data.tag.attributes[i].name.data_off,
+					(int) token->data.tag.attributes[i].value.len,
+					pbuffer + token->data.tag.attributes[i].value.data_off);
+		}
+		break;
+	case HUBBUB_TOKEN_END_TAG:
+		printf("'%.*s' %s\n",
+				(int) token->data.tag.name.len,
+				pbuffer + token->data.tag.name.data_off,
+				(token->data.tag.n_attributes > 0) ?
						"attributes:" : "");
+		for (i = 0; i < token->data.tag.n_attributes; i++) {
+			printf("\t'%.*s' = '%.*s'\n",
+					(int) token->data.tag.attributes[i].name.len,
+					pbuffer + token->data.tag.attributes[i].name.data_off,
+					(int) token->data.tag.attributes[i].value.len,
+					pbuffer + token->data.tag.attributes[i].value.data_off);
+		}
+		break;
+	case HUBBUB_TOKEN_COMMENT:
+		printf("'%.*s'\n", (int) token->data.comment.len,
+				pbuffer + token->data.comment.data_off);
+		break;
+	case HUBBUB_TOKEN_CHARACTER:
+		printf("'%.*s'\n", (int) token->data.character.len,
+				pbuffer + token->data.character.data_off);
+		break;
+	case HUBBUB_TOKEN_EOF:
+		printf("\n");
+		break;
+	}
+}
--
cgit v1.2.3