From e8dbf1fa8049169e6918cce20e98e309a793cffe Mon Sep 17 00:00:00 2001 From: Vincent Sanders Date: Sat, 6 Jan 2018 23:44:17 +0000 Subject: split out cross reference table handling to separate module --- include/nspdf/page.h | 4 +- src/Makefile | 2 +- src/cos_object.c | 35 +++++++--- src/cos_object.h | 23 ++++++- src/document.c | 134 +------------------------------------ src/page.c | 31 ++++++++- src/pdf_doc.c | 64 +++++++----------- src/pdf_doc.h | 35 +++++----- src/xref.c | 186 +++++++++++++++++++++++++++++++++++++++++++++++++++ test/parsepdf.c | 7 +- 10 files changed, 318 insertions(+), 203 deletions(-) create mode 100644 src/xref.c diff --git a/include/nspdf/page.h b/include/nspdf/page.h index 119ef22..8c1d7fc 100644 --- a/include/nspdf/page.h +++ b/include/nspdf/page.h @@ -19,6 +19,8 @@ struct nspdf_doc; -nspdferror nspdf_count_pages(struct nspdf_doc *doc, unsigned int *pages_out); +nspdferror nspdf_page_count(struct nspdf_doc *doc, unsigned int *pages_out); + +nspdferror nspdf_page_render(struct nspdf_doc *doc, unsigned int page_num); #endif /* NSPDF_META_H_ */ diff --git a/src/Makefile b/src/Makefile index 09bde65..a2d1ae8 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,3 +1,3 @@ -DIR_SOURCES := document.c byte_class.c cos_parse.c cos_object.c pdf_doc.c meta.c page.c +DIR_SOURCES := document.c byte_class.c cos_parse.c cos_object.c pdf_doc.c meta.c page.c xref.c include $(NSBUILD)/Makefile.subdir diff --git a/src/cos_object.c b/src/cos_object.c index 494c7ff..80e4431 100644 --- a/src/cos_object.c +++ b/src/cos_object.c @@ -119,7 +119,7 @@ cos_get_dictionary_value(struct nspdf_doc *doc, nspdferror res; struct cos_dictionary_entry *entry; - res = xref_get_referenced(doc, &dict); + res = nspdf__xref_get_referenced(doc, &dict); if (res == NSPDFERROR_OK) { if (dict->type != COS_TYPE_DICTIONARY) { res = NSPDFERROR_TYPE; @@ -264,7 +264,7 @@ cos_get_int(struct nspdf_doc *doc, { nspdferror res; - res = xref_get_referenced(doc, &cobj); + res = 
nspdf__xref_get_referenced(doc, &cobj); if (res == NSPDFERROR_OK) { if (cobj->type != COS_TYPE_INT) { res = NSPDFERROR_TYPE; @@ -282,7 +282,7 @@ cos_get_name(struct nspdf_doc *doc, { nspdferror res; - res = xref_get_referenced(doc, &cobj); + res = nspdf__xref_get_referenced(doc, &cobj); if (res == NSPDFERROR_OK) { if (cobj->type != COS_TYPE_NAME) { res = NSPDFERROR_TYPE; @@ -301,7 +301,7 @@ cos_get_dictionary(struct nspdf_doc *doc, { nspdferror res; - res = xref_get_referenced(doc, &cobj); + res = nspdf__xref_get_referenced(doc, &cobj); if (res == NSPDFERROR_OK) { if (cobj->type != COS_TYPE_DICTIONARY) { res = NSPDFERROR_TYPE; @@ -320,7 +320,7 @@ cos_get_array(struct nspdf_doc *doc, { nspdferror res; - res = xref_get_referenced(doc, &cobj); + res = nspdf__xref_get_referenced(doc, &cobj); if (res == NSPDFERROR_OK) { if (cobj->type != COS_TYPE_ARRAY) { res = NSPDFERROR_TYPE; @@ -339,7 +339,7 @@ cos_get_string(struct nspdf_doc *doc, { nspdferror res; - res = xref_get_referenced(doc, &cobj); + res = nspdf__xref_get_referenced(doc, &cobj); if (res == NSPDFERROR_OK) { if (cobj->type != COS_TYPE_STRING) { res = NSPDFERROR_TYPE; @@ -351,6 +351,25 @@ cos_get_string(struct nspdf_doc *doc, } +nspdferror +cos_get_stream(struct nspdf_doc *doc, + struct cos_object *cobj, + struct cos_stream **stream_out) +{ + nspdferror res; + + res = nspdf__xref_get_referenced(doc, &cobj); + if (res == NSPDFERROR_OK) { + if (cobj->type != COS_TYPE_STREAM) { + res = NSPDFERROR_TYPE; + } else { + *stream_out = cobj->u.stream; + } + } + return res; +} + + /* * get a value for a key from a dictionary */ @@ -363,7 +382,7 @@ cos_get_array_value(struct nspdf_doc *doc, nspdferror res; struct cos_array_entry *entry; - res = xref_get_referenced(doc, &array); + res = nspdf__xref_get_referenced(doc, &array); if (res == NSPDFERROR_OK) { if (array->type != COS_TYPE_ARRAY) { res = NSPDFERROR_TYPE; @@ -411,7 +430,7 @@ cos_get_array_size(struct nspdf_doc *doc, unsigned int array_size = 0; struct cos_array_entry 
*array_entry; - res = xref_get_referenced(doc, &cobj); + res = nspdf__xref_get_referenced(doc, &cobj); if (res == NSPDFERROR_OK) { if (cobj->type != COS_TYPE_ARRAY) { res = NSPDFERROR_TYPE; diff --git a/src/cos_object.h b/src/cos_object.h index d0bd5ea..077be3b 100644 --- a/src/cos_object.h +++ b/src/cos_object.h @@ -50,6 +50,12 @@ struct cos_reference { uint64_t generation; }; +struct cos_stream { + uint8_t *data; + size_t length; +}; + + struct cos_object { int type; union { @@ -69,7 +75,7 @@ struct cos_object { struct cos_string *s; /** stream data */ - uint8_t *stream; + struct cos_stream *stream; /* dictionary */ struct cos_dictionary_entry *dictionary; @@ -221,3 +227,18 @@ nspdferror cos_get_dictionary(struct nspdf_doc *doc, struct cos_object *cobj, st * NSERROR_TYPE if the \p cobj is not a array */ nspdferror cos_get_array(struct nspdf_doc *doc, struct cos_object *cobj, struct cos_object **value_out); + +/** + * get the stream value of a cos object. + * + * Get the value from a cos object, if the object is an object reference it + * will be dereferenced first. The dereferencing will parse any previously + * unreferenced indirect objects as required. + * + * \param doc The document the cos object belongs to. + * \param cobj A cos object of stream type. + * \param stream_out The result value. 
+ * \return NSERROR_OK and \p stream_out updated, + * NSERROR_TYPE if the \p cobj is not a stream + */ +nspdferror cos_get_stream(struct nspdf_doc *doc, struct cos_object *cobj, struct cos_stream **stream_out); diff --git a/src/document.c b/src/document.c index 3dea95e..bbe948d 100644 --- a/src/document.c +++ b/src/document.c @@ -31,43 +31,6 @@ #define STARTXREF_SEARCH_SIZE 1024 -static nspdferror -doc_read_uint(struct nspdf_doc *doc, - uint64_t *offset_out, - uint64_t *result_out) -{ - uint8_t c; /* current byte from source data */ - unsigned int len; /* number of decimal places in number */ - uint8_t num[21]; /* temporary buffer for decimal values */ - uint64_t offset; /* current offset of source data */ - uint64_t result=0; /* parsed result */ - uint64_t tens; - - offset = *offset_out; - - for (len = 0; len < sizeof(num); len++) { - c = DOC_BYTE(doc, offset); - if ((bclass[c] & BC_DCML) != BC_DCML) { - if (len == 0) { - return -2; /* parse error no decimals in input */ - } - /* sum value from each place */ - for (tens = 1; len > 0; tens = tens * 10, len--) { - result += (num[len - 1] * tens); - } - - *offset_out = offset; - *result_out = result; - - return NSPDFERROR_OK; - } - num[len] = c - '0'; - offset++; - } - return -1; /* number too long */ -} - - /** * finds the startxref marker at the end of input */ @@ -224,95 +187,6 @@ decode_trailer(struct nspdf_doc *doc, } -static nspdferror -decode_xref(struct nspdf_doc *doc, uint64_t *offset_out) -{ - uint64_t offset; - nspdferror res; - uint64_t objnumber; /* current object number */ - uint64_t objcount; - - offset = *offset_out; - - /* xref object header */ - if ((DOC_BYTE(doc, offset ) != 'x') && - (DOC_BYTE(doc, offset + 1) != 'r') && - (DOC_BYTE(doc, offset + 2) != 'e') && - (DOC_BYTE(doc, offset + 3) != 'f')) { - return NSPDFERROR_SYNTAX; - } - offset += 4; - - res = doc_skip_ws(doc, &offset); - if (res != NSPDFERROR_OK) { - return res; - } - - /* subsections - * - */ - res = doc_read_uint(doc, &offset, 
&objnumber); - while (res == NSPDFERROR_OK) { - uint64_t lastobj; - res = doc_skip_ws(doc, &offset); - if (res != NSPDFERROR_OK) { - return res; - } - - res = doc_read_uint(doc, &offset, &objcount); - if (res != NSPDFERROR_OK) { - return res; - } - - res = doc_skip_ws(doc, &offset); - if (res != NSPDFERROR_OK) { - return res; - } - - //printf("decoding subsection %lld %lld\n", objnumber, objcount); - - lastobj = objnumber + objcount; - for (; objnumber < lastobj ; objnumber++) { - /* each entry is a fixed format */ - uint64_t objindex; - uint64_t objgeneration; - - /* object index */ - res = doc_read_uint(doc, &offset, &objindex); - if (res != NSPDFERROR_OK) { - return res; - } - offset++; /* skip space */ - - res = doc_read_uint(doc, &offset, &objgeneration); - if (res != NSPDFERROR_OK) { - return res; - } - offset++; /* skip space */ - - if ((DOC_BYTE(doc, offset++) == 'n')) { - if (objnumber < doc->xref_size) { - struct xref_table_entry *indobj; - indobj = doc->xref_table + objnumber; - - indobj->ref.id = objnumber; - indobj->ref.generation = objgeneration; - indobj->offset = objindex; - - //printf("xref %lld %lld -> %lld\n", objnumber, objgeneration, objindex); - } else { - printf("index out of bounds\n"); - } - } - - offset += 2; /* skip EOL */ - } - - res = doc_read_uint(doc, &offset, &objnumber); - } - - return NSPDFERROR_OK; -} /** @@ -367,12 +241,10 @@ decode_xref_trailer(struct nspdf_doc *doc, uint64_t xref_offset) goto decode_xref_trailer_failed; } - doc->xref_table = calloc(size, sizeof(struct xref_table_entry)); - if (doc->xref_table == NULL) { - res = NSPDFERROR_NOMEM; + res = nspdf__xref_allocate(doc, size); + if (res != NSPDFERROR_OK) { goto decode_xref_trailer_failed; } - doc->xref_size = size; res = cos_extract_dictionary_value(trailer, "Encrypt", &doc->encrypt); if ((res != NSPDFERROR_OK) && (res != NSPDFERROR_NOTFOUND)) { @@ -403,7 +275,7 @@ decode_xref_trailer(struct nspdf_doc *doc, uint64_t xref_offset) offset = xref_offset; /** @todo deal 
with XrefStm (number) in trailer */ - res = decode_xref(doc, &offset); + res = nspdf__xref_parse(doc, &offset); if (res != NSPDFERROR_OK) { printf("failed to decode xref table\n"); goto decode_xref_trailer_failed; diff --git a/src/page.c b/src/page.c index bca6dbc..5d2a117 100644 --- a/src/page.c +++ b/src/page.c @@ -17,13 +17,20 @@ #include "cos_object.h" #include "pdf_doc.h" +/** page entry */ +struct page_table_entry { + struct cos_object *resources; + struct cos_object *mediabox; + struct cos_object *contents; +}; + /** * recursively decodes a page tree */ nspdferror nspdf__decode_page_tree(struct nspdf_doc *doc, - struct cos_object *page_tree_node, - unsigned int *page_index) + struct cos_object *page_tree_node, + unsigned int *page_index) { nspdferror res; const char *type; @@ -131,8 +138,26 @@ nspdf__decode_page_tree(struct nspdf_doc *doc, /* exported interface documented in nspdf/page.h */ nspdferror -nspdf_count_pages(struct nspdf_doc *doc, unsigned int *pages_out) +nspdf_page_count(struct nspdf_doc *doc, unsigned int *pages_out) { *pages_out = doc->page_table_size; return NSPDFERROR_OK; } + +/* exported interface documented in nspdf/page.h */ +nspdferror +nspdf_page_render(struct nspdf_doc *doc, unsigned int page_number) +{ + struct page_table_entry *page_entry; + struct cos_stream *stream; + nspdferror res; + + page_entry = doc->page_table + page_number; + + res = cos_get_stream(doc, page_entry->contents, &stream); + if (res != NSPDFERROR_OK) { + return res; + } + + return res; +} diff --git a/src/pdf_doc.c b/src/pdf_doc.c index 281025c..997a3d7 100644 --- a/src/pdf_doc.c +++ b/src/pdf_doc.c @@ -57,51 +57,39 @@ nspdferror doc_skip_eol(struct nspdf_doc *doc, uint64_t *offset) return NSPDFERROR_OK; } -static struct cos_object cos_null_obj = { - .type = COS_TYPE_NULL, -}; nspdferror -xref_get_referenced(struct nspdf_doc *doc, struct cos_object **cobj_out) +doc_read_uint(struct nspdf_doc *doc, + uint64_t *offset_out, + uint64_t *result_out) { - nspdferror 
res; - struct cos_object *cobj; - struct cos_object *indirect; - uint64_t offset; - struct xref_table_entry *entry; + uint8_t c; /* current byte from source data */ + unsigned int len; /* number of decimal places in number */ + uint8_t num[21]; /* temporary buffer for decimal values */ + uint64_t offset; /* current offset of source data */ + uint64_t result=0; /* parsed result */ + uint64_t tens; - cobj = *cobj_out; + offset = *offset_out; - if (cobj->type != COS_TYPE_REFERENCE) { - /* not passed a reference object so just return what was passed */ - return NSPDFERROR_OK; - } - - entry = doc->xref_table + cobj->u.reference->id; + for (len = 0; len < sizeof(num); len++) { + c = DOC_BYTE(doc, offset); + if ((bclass[c] & BC_DCML) != BC_DCML) { + if (len == 0) { + return -2; /* parse error no decimals in input */ + } + /* sum value from each place */ + for (tens = 1; len > 0; tens = tens * 10, len--) { + result += (num[len - 1] * tens); + } - /* check if referenced object is in range and exists. 
return null object if - * not - */ - if ((cobj->u.reference->id >= doc->xref_size) || - (cobj->u.reference->id == 0) || - (entry->ref.id == 0)) { - *cobj_out = &cos_null_obj; - return NSPDFERROR_OK; - } + *offset_out = offset; + *result_out = result; - if (entry->object == NULL) { - /* indirect object has never been decoded */ - offset = entry->offset; - res = cos_parse_object(doc, &offset, &indirect); - if (res != NSPDFERROR_OK) { - printf("failed to decode indirect object\n"); - return res; + return NSPDFERROR_OK; } - - entry->object = indirect; + num[len] = c - '0'; + offset++; } - - *cobj_out = entry->object; - - return NSPDFERROR_OK; + return -1; /* number too long */ } diff --git a/src/pdf_doc.h b/src/pdf_doc.h index a75c90e..5c25878 100644 --- a/src/pdf_doc.h +++ b/src/pdf_doc.h @@ -1,21 +1,6 @@ -/** indirect object */ -struct xref_table_entry { - /* reference identifier */ - struct cos_reference ref; - /** offset of object */ - uint64_t offset; - - /* indirect object if already decoded */ - struct cos_object *object; -}; - -/** page entry */ -struct page_table_entry { - struct cos_object *resources; - struct cos_object *mediabox; - struct cos_object *contents; -}; +struct xref_table_entry; +struct page_table_entry; /** pdf document */ struct nspdf_doc { @@ -29,7 +14,7 @@ struct nspdf_doc { /** * Indirect object cross reference table */ - uint64_t xref_size; + uint64_t xref_table_size; struct xref_table_entry *xref_table; struct cos_object *root; @@ -45,9 +30,21 @@ struct nspdf_doc { /* byte data acessory, allows for more complex buffer handling in future */ #define DOC_BYTE(doc, offset) (doc->start[(offset)]) +/* helpers in pdf_doc.c */ nspdferror doc_skip_ws(struct nspdf_doc *doc, uint64_t *offset); nspdferror doc_skip_eol(struct nspdf_doc *doc, uint64_t *offset); +nspdferror doc_read_uint(struct nspdf_doc *doc, uint64_t *offset_out, uint64_t *result_out); + +/** + * parse xref from file + */ +nspdferror nspdf__xref_parse(struct nspdf_doc *doc, uint64_t 
*offset_out); + +/** + * get an object dereferencing through xref table if necessary + */ +nspdferror nspdf__xref_get_referenced(struct nspdf_doc *doc, struct cos_object **cobj_out); -nspdferror xref_get_referenced(struct nspdf_doc *doc, struct cos_object **cobj_out); +nspdferror nspdf__xref_allocate(struct nspdf_doc *doc, int64_t size); nspdferror nspdf__decode_page_tree(struct nspdf_doc *doc, struct cos_object *page_tree_node, unsigned int *page_index); diff --git a/src/xref.c b/src/xref.c new file mode 100644 index 0000000..298c750 --- /dev/null +++ b/src/xref.c @@ -0,0 +1,186 @@ +/* + * Copyright 2018 Vincent Sanders + * + * This file is part of libnspdf. + * + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + */ + +#include +#include +#include + +#include + +#include "cos_parse.h" +#include "cos_object.h" +#include "pdf_doc.h" + + +/** indirect object */ +struct xref_table_entry { + /* reference identifier */ + struct cos_reference ref; + + /** offset of object */ + uint64_t offset; + + /* indirect object if already decoded */ + struct cos_object *object; +}; + +static struct cos_object cos_null_obj = { + .type = COS_TYPE_NULL, +}; + +nspdferror nspdf__xref_allocate(struct nspdf_doc *doc, int64_t size) +{ + if (doc->xref_table != NULL) { + /** \todo handle freeing xref table */ + return NSPDFERROR_SYNTAX; + } + doc->xref_table_size = size; + + doc->xref_table = calloc(doc->xref_table_size, + sizeof(struct xref_table_entry)); + if (doc->xref_table == NULL) { + return NSPDFERROR_NOMEM; + } + return NSPDFERROR_OK; +} + +nspdferror nspdf__xref_parse(struct nspdf_doc *doc, uint64_t *offset_out) +{ + uint64_t offset; + nspdferror res; + uint64_t objnumber; /* current object number */ + uint64_t objcount; + + offset = *offset_out; + + /* xref object header */ + if ((DOC_BYTE(doc, offset ) != 'x') && + (DOC_BYTE(doc, offset + 1) != 'r') && + (DOC_BYTE(doc, offset + 2) != 'e') && + (DOC_BYTE(doc, offset + 3) != 'f')) { + return 
NSPDFERROR_SYNTAX; + } + offset += 4; + + res = doc_skip_ws(doc, &offset); + if (res != NSPDFERROR_OK) { + return res; + } + + /* subsections + * + */ + res = doc_read_uint(doc, &offset, &objnumber); + while (res == NSPDFERROR_OK) { + uint64_t lastobj; + res = doc_skip_ws(doc, &offset); + if (res != NSPDFERROR_OK) { + return res; + } + + res = doc_read_uint(doc, &offset, &objcount); + if (res != NSPDFERROR_OK) { + return res; + } + + res = doc_skip_ws(doc, &offset); + if (res != NSPDFERROR_OK) { + return res; + } + + //printf("decoding subsection %lld %lld\n", objnumber, objcount); + + lastobj = objnumber + objcount; + for (; objnumber < lastobj ; objnumber++) { + /* each entry is a fixed format */ + uint64_t objindex; + uint64_t objgeneration; + + /* object index */ + res = doc_read_uint(doc, &offset, &objindex); + if (res != NSPDFERROR_OK) { + return res; + } + offset++; /* skip space */ + + res = doc_read_uint(doc, &offset, &objgeneration); + if (res != NSPDFERROR_OK) { + return res; + } + offset++; /* skip space */ + + if ((DOC_BYTE(doc, offset++) == 'n')) { + if (objnumber < doc->xref_table_size) { + struct xref_table_entry *indobj; + indobj = doc->xref_table + objnumber; + + indobj->ref.id = objnumber; + indobj->ref.generation = objgeneration; + indobj->offset = objindex; + + //printf("xref %lld %lld -> %lld\n", objnumber, objgeneration, objindex); + } else { + //printf("index out of bounds\n"); + } + } + + offset += 2; /* skip EOL */ + } + + res = doc_read_uint(doc, &offset, &objnumber); + } + + return NSPDFERROR_OK; +} + + +nspdferror +nspdf__xref_get_referenced(struct nspdf_doc *doc, struct cos_object **cobj_out) +{ + nspdferror res; + struct cos_object *cobj; + struct cos_object *indirect; + uint64_t offset; + struct xref_table_entry *entry; + + cobj = *cobj_out; + + if (cobj->type != COS_TYPE_REFERENCE) { + /* not passed a reference object so just return what was passed */ + return NSPDFERROR_OK; + } + + entry = doc->xref_table + cobj->u.reference->id; + 
+ /* check if referenced object is in range and exists. return null object if + * not + */ + if ((cobj->u.reference->id >= doc->xref_table_size) || + (cobj->u.reference->id == 0) || + (entry->ref.id == 0)) { + *cobj_out = &cos_null_obj; + return NSPDFERROR_OK; + } + + if (entry->object == NULL) { + /* indirect object has never been decoded */ + offset = entry->offset; + res = cos_parse_object(doc, &offset, &indirect); + if (res != NSPDFERROR_OK) { + //printf("failed to decode indirect object\n"); + return res; + } + + entry->object = indirect; + } + + *cobj_out = entry->object; + + return NSPDFERROR_OK; +} diff --git a/test/parsepdf.c b/test/parsepdf.c index ad6c6c4..7a64f4b 100644 --- a/test/parsepdf.c +++ b/test/parsepdf.c @@ -93,13 +93,18 @@ int main(int argc, char **argv) printf("Title:%s\n", lwc_string_data(title)); } - res = nspdf_count_pages(doc, &page_count); + res = nspdf_page_count(doc, &page_count); if (res != NSPDFERROR_OK) { printf("page count failed (%d)\n", res); return res; } printf("Pages:%d\n", page_count); + res = nspdf_page_render(doc, 0); + if (res != NSPDFERROR_OK) { + printf("page render failed (%d)\n", res); + return res; + } res = nspdf_document_destroy(doc); if (res != NSPDFERROR_OK) { -- cgit v1.2.3