From 5422dd50a49fe1a282271f22cd324f815e592e07 Mon Sep 17 00:00:00 2001
From: Vincent Sanders
Date: Thu, 28 Dec 2017 17:18:10 +0000
Subject: decode page tree

---
 src/cos_object.c | 216 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 src/cos_object.h |  15 ++++
 src/pdf_doc.h    |   9 +++
 src/xref.c       | 163 +++++++++++++++++++++++++++++++++++++++--
 4 files changed, 398 insertions(+), 5 deletions(-)

diff --git a/src/cos_object.c b/src/cos_object.c
index a5bd738..5bfd423 100644
--- a/src/cos_object.c
+++ b/src/cos_object.c
@@ -187,6 +187,113 @@ cos_get_dictionary_dictionary(struct pdf_doc *doc,
     return cos_get_dictionary(doc, dict_value, value_out);
 }
 
+/** maximum number of Parent links followed to resolve a heritable key */
+#define COS_HERITABLE_MAX_DEPTH 256
+
+/**
+ * get the value for a key from a dictionary or its nearest ancestor
+ *
+ * When the key is missing the dictionary's "Parent" entry is followed and
+ * the lookup repeated there. The walk is bounded so Parent entries that
+ * form a loop cannot hang the decode.
+ *
+ * \param doc The document the dictionary is from.
+ * \param dict The dictionary to start searching from.
+ * \param key The key to search for.
+ * \param value_out Updated with the value found.
+ * \return NSPDFERROR_OK on success, NSPDFERROR_NOTFOUND if the key is in
+ *         none of the dictionaries searched else appropriate error code.
+ */
+static nspdferror
+cos_heritable_dictionary_value(struct pdf_doc *doc,
+                               struct cos_object *dict,
+                               const char *key,
+                               struct cos_object **value_out)
+{
+    nspdferror res;
+    struct cos_object *parent;
+    unsigned int depth;
+
+    for (depth = 0; depth < COS_HERITABLE_MAX_DEPTH; depth++) {
+        res = cos_get_dictionary_value(doc, dict, key, value_out);
+        if (res != NSPDFERROR_NOTFOUND) {
+            /* either found or failed for some other reason */
+            return res;
+        }
+
+        /* not here so move up a level; no Parent entry ends the search */
+        res = cos_get_dictionary_value(doc, dict, "Parent", &parent);
+        if (res != NSPDFERROR_OK) {
+            return res;
+        }
+
+        res = cos_get_dictionary(doc, parent, &dict);
+        if (res != NSPDFERROR_OK) {
+            return res;
+        }
+    }
+
+    /* gave up, the Parent entries probably form a loop */
+    return NSPDFERROR_NOTFOUND;
+}
+
+/**
+ * get a dictionary for a key from a dictionary or one of its ancestors
+ */
+nspdferror
+cos_heritable_dictionary_dictionary(struct pdf_doc *doc,
+                                    struct cos_object *dict,
+                                    const char *key,
+                                    struct cos_object **value_out)
+{
+    nspdferror res;
+    struct cos_object *dict_value;
+
+    res = cos_heritable_dictionary_value(doc, dict, key, &dict_value);
+    if (res != NSPDFERROR_OK) {
+        return res;
+    }
+    return cos_get_dictionary(doc, dict_value, value_out);
+}
+
+/**
+ * get an array for a key from a dictionary
+ */
+nspdferror
+cos_get_dictionary_array(struct pdf_doc *doc,
+                         struct cos_object *dict,
+                         const char *key,
+                         struct cos_object **value_out)
+{
+    nspdferror res;
+    struct cos_object *dict_value;
+
+    res = cos_get_dictionary_value(doc, dict, key, &dict_value);
+    if (res != NSPDFERROR_OK) {
+        return res;
+    }
+    return cos_get_array(doc, dict_value, value_out);
+}
+
+/**
+ * get an array for a key from a dictionary or one of its ancestors
+ */
+nspdferror
+cos_heritable_dictionary_array(struct pdf_doc *doc,
+                               struct cos_object *dict,
+                               const char *key,
+                               struct cos_object **value_out)
+{
+    nspdferror res;
+    struct cos_object *dict_value;
+
+    res = cos_heritable_dictionary_value(doc, dict, key, &dict_value);
+    if (res != NSPDFERROR_OK) {
+        return res;
+    }
+    return cos_get_array(doc, dict_value, value_out);
+}
+
 nspdferror
 cos_get_int(struct pdf_doc *doc,
             struct cos_object *cobj,
@@ -242,3 +349,112 @@ cos_get_dictionary(struct pdf_doc *doc,
     }
     return res;
 }
+
+/**
+ * get an array object
+ *
+ * The object is resolved with xref_get_referenced() and the result must be
+ * of array type or NSPDFERROR_TYPE is returned.
+ */
+nspdferror
+cos_get_array(struct pdf_doc *doc,
+              struct cos_object *cobj,
+              struct cos_object **value_out)
+{
+    nspdferror res;
+
+    res = xref_get_referenced(doc, &cobj);
+    if (res == NSPDFERROR_OK) {
+        if (cobj->type != COS_TYPE_ARRAY) {
+            res = NSPDFERROR_TYPE;
+        } else {
+            *value_out = cobj;
+        }
+    }
+    return res;
+}
+
+/**
+ * get the value at an index in an array
+ *
+ * Returns NSPDFERROR_RANGE if the array has no entry at the index.
+ */
+nspdferror
+cos_get_array_value(struct pdf_doc *doc,
+                    struct cos_object *array,
+                    unsigned int index,
+                    struct cos_object **value_out)
+{
+    nspdferror res;
+    struct cos_array_entry *entry;
+
+    res = xref_get_referenced(doc, &array);
+    if (res == NSPDFERROR_OK) {
+        if (array->type != COS_TYPE_ARRAY) {
+            res = NSPDFERROR_TYPE;
+        } else {
+            unsigned int cur_index = 0;
+            res = NSPDFERROR_RANGE;
+
+            entry = array->u.array;
+            while (entry != NULL) {
+                if (cur_index == index) {
+                    *value_out = entry->value;
+                    res = NSPDFERROR_OK;
+                    break;
+                }
+                cur_index++;
+                entry = entry->next;
+            }
+        }
+    }
+    return res;
+}
+
+/**
+ * get the dictionary at an index in an array
+ */
+nspdferror
+cos_get_array_dictionary(struct pdf_doc *doc,
+                         struct cos_object *array,
+                         unsigned int index,
+                         struct cos_object **value_out)
+{
+    nspdferror res;
+    struct cos_object *array_value;
+
+    res = cos_get_array_value(doc, array, index, &array_value);
+    if (res != NSPDFERROR_OK) {
+        return res;
+    }
+    return cos_get_dictionary(doc, array_value, value_out);
+}
+
+/**
+ * get the number of entries in an array
+ */
+nspdferror
+cos_get_array_size(struct pdf_doc *doc,
+                   struct cos_object *cobj,
+                   unsigned int *size_out)
+{
+    nspdferror res;
+    unsigned int array_size = 0;
+    struct cos_array_entry *array_entry;
+
+    res = xref_get_referenced(doc, &cobj);
+    if (res == NSPDFERROR_OK) {
+        if (cobj->type != COS_TYPE_ARRAY) {
+            res = NSPDFERROR_TYPE;
+        } else {
+            /* walk array list to enumerate entries */
+            array_entry = cobj->u.array;
+            while (array_entry != NULL) {
+                array_size++;
+                array_entry = array_entry->next;
+            }
+            *size_out = array_size;
+        }
+    }
+    return res;
+}
diff --git a/src/cos_object.h b/src/cos_object.h
index 2ded7ec..48241c6 100644
--- a/src/cos_object.h
+++ b/src/cos_object.h
@@ -127,6 +127,13 @@ nspdferror cos_get_dictionary_name(struct pdf_doc *doc, struct cos_object *dict,
 
 nspdferror cos_get_dictionary_dictionary(struct pdf_doc *doc, struct cos_object *dict, const char *key, struct cos_object **value_out);
 
+nspdferror cos_heritable_dictionary_dictionary(struct pdf_doc *doc, struct cos_object *dict, const char *key, struct cos_object **value_out);
+
+nspdferror cos_get_dictionary_array(struct pdf_doc *doc, struct cos_object *dict, const char *key, struct cos_object **value_out);
+
+nspdferror cos_heritable_dictionary_array(struct pdf_doc *doc, struct cos_object *dict, const char *key, struct cos_object **value_out);
+
+
 
 nspdferror cos_get_int(struct pdf_doc *doc, struct cos_object *cobj, int64_t *value_out);
 
@@ -134,3 +141,11 @@ nspdferror cos_get_name(struct pdf_doc *doc, struct cos_object *cobj, const char
 
 nspdferror cos_get_dictionary(struct pdf_doc *doc, struct cos_object *cobj, struct cos_object **value_out);
 
+
+nspdferror cos_get_array(struct pdf_doc *doc, struct cos_object *cobj, struct cos_object **value_out);
+
+nspdferror cos_get_array_size(struct pdf_doc *doc, struct cos_object *cobj, unsigned int *size_out);
+
+nspdferror cos_get_array_value(struct pdf_doc *doc, struct cos_object *array, unsigned int index, struct cos_object **value_out);
+
+nspdferror cos_get_array_dictionary(struct pdf_doc *doc, struct cos_object *array, unsigned int index, struct cos_object **value_out);
diff --git a/src/pdf_doc.h b/src/pdf_doc.h
index b37e3b2..986556f 100644
--- a/src/pdf_doc.h
+++ b/src/pdf_doc.h
@@ -10,6 +10,12 @@ struct xref_table_entry {
     struct cos_object *object;
 };
 
+/** page entry */
+struct page_table_entry {
+    struct cos_object *resources; /**< Resources dictionary of the page */
+    struct cos_object *mediabox; /**< MediaBox array of the page */
+    struct cos_object *contents; /**< Contents value of the page if present */
+};
 
 /** pdf document */
 struct pdf_doc {
@@ -33,6 +39,9 @@ struct pdf_doc {
     struct cos_object *info;
     struct cos_object *id;
 
+    /* page reference table */
+    uint64_t page_table_size; /**< number of usable entries in page_table */
+    struct page_table_entry *page_table; /**< one entry per decoded page */
 };
 
 /* byte data acessory, allows for more complex buffer handling in future */
diff --git a/src/xref.c b/src/xref.c
index 8239f45..452aa19 100644
--- a/src/xref.c
+++ b/src/xref.c
@@ -499,12 +499,169 @@ nspdferror decode_trailers(struct pdf_doc *doc)
     return decode_xref_trailer(doc, startxref);
 }
 
+/** maximum nesting of page tree nodes accepted before giving up */
+#define PAGE_TREE_MAX_DEPTH 256
+
+/**
+ * decode one page tree node and every node beneath it
+ *
+ * \param doc The document being decoded.
+ * \param page_tree_node The Pages or Page dictionary to decode.
+ * \param page_index Next unused page table slot, advanced for each page.
+ * \param depth Number of page tree nodes above this one.
+ * \return NSPDFERROR_OK on success else appropriate error code.
+ */
+static nspdferror
+decode_page_tree_node(struct pdf_doc *doc,
+                      struct cos_object *page_tree_node,
+                      unsigned int *page_index,
+                      unsigned int depth)
+{
+    nspdferror res;
+    const char *type;
+
+    if (depth >= PAGE_TREE_MAX_DEPTH) {
+        /* Kids that lead back to an ancestor would recurse forever */
+        return NSPDFERROR_FORMAT;
+    }
+
+    /* Type = Pages or Page */
+    res = cos_get_dictionary_name(doc, page_tree_node, "Type", &type);
+    if (res != NSPDFERROR_OK) {
+        return res;
+    }
+
+    if (strcmp(type, "Pages") == 0) {
+        struct cos_object *kids;
+        unsigned int kids_size;
+        unsigned int kids_index;
+
+        if (doc->page_table == NULL) {
+            /* allocate top level page table */
+            int64_t count;
+            size_t entries;
+
+            res = cos_get_dictionary_int(doc, page_tree_node, "Count", &count);
+            if (res != NSPDFERROR_OK) {
+                return res;
+            }
+
+            /* Count comes from the file so must not be trusted */
+            entries = (size_t)count;
+            if ((count < 0) || ((uint64_t)entries != (uint64_t)count)) {
+                return NSPDFERROR_RANGE;
+            }
+
+            /* always allocate so an empty tree still yields a table */
+            doc->page_table = calloc((entries > 0) ? entries : 1,
+                                     sizeof(*doc->page_table));
+            if (doc->page_table == NULL) {
+                return NSPDFERROR_NOMEM;
+            }
+            doc->page_table_size = entries;
+        }
+
+        res = cos_get_dictionary_array(doc, page_tree_node, "Kids", &kids);
+        if (res != NSPDFERROR_OK) {
+            return res;
+        }
+
+        res = cos_get_array_size(doc, kids, &kids_size);
+        if (res != NSPDFERROR_OK) {
+            return res;
+        }
+
+        for (kids_index = 0; kids_index < kids_size; kids_index++) {
+            struct cos_object *kid;
+
+            res = cos_get_array_dictionary(doc, kids, kids_index, &kid);
+            if (res != NSPDFERROR_OK) {
+                return res;
+            }
+
+            res = decode_page_tree_node(doc, kid, page_index, depth + 1);
+            if (res != NSPDFERROR_OK) {
+                return res;
+            }
+        }
+
+    } else if (strcmp(type, "Page") == 0) {
+        struct page_table_entry *page;
+
+        /* a bare page, or more pages than Count declared, has no slot */
+        if ((doc->page_table == NULL) ||
+            (*page_index >= doc->page_table_size)) {
+            return NSPDFERROR_RANGE;
+        }
+
+        page = doc->page_table + (*page_index);
+
+        /* required heritable resources */
+        res = cos_heritable_dictionary_dictionary(doc,
+                                                  page_tree_node,
+                                                  "Resources",
+                                                  &(page->resources));
+        if (res != NSPDFERROR_OK) {
+            return res;
+        }
+
+        /* required heritable mediabox */
+        res = cos_heritable_dictionary_array(doc,
+                                             page_tree_node,
+                                             "MediaBox",
+                                             &(page->mediabox));
+        if (res != NSPDFERROR_OK) {
+            return res;
+        }
+
+        /* optional page contents */
+        res = cos_get_dictionary_value(doc,
+                                       page_tree_node,
+                                       "Contents",
+                                       &(page->contents));
+        if ((res != NSPDFERROR_OK) &&
+            (res != NSPDFERROR_NOTFOUND)) {
+            return res;
+        }
+
+        printf("page index:%u page:%p resources:%p mediabox:%p contents:%p\n",
+               *page_index,
+               (void *)page,
+               (void *)page->resources,
+               (void *)page->mediabox,
+               (void *)page->contents);
+
+        (*page_index)++;
+        res = NSPDFERROR_OK;
+    } else {
+        res = NSPDFERROR_FORMAT;
+    }
+    return res;
+}
+
+/**
+ * recursively decodes a page tree
+ *
+ * \param doc The document being decoded.
+ * \param page_tree_node The root of the page tree to decode.
+ * \param page_index Next unused page table slot, advanced for each page.
+ * \return NSPDFERROR_OK on success else appropriate error code.
+ */
+nspdferror
+decode_page_tree(struct pdf_doc *doc,
+                 struct cos_object *page_tree_node,
+                 unsigned int *page_index)
+{
+    return decode_page_tree_node(doc, page_tree_node, page_index, 0);
+}
+
 nspdferror decode_catalog(struct pdf_doc *doc)
 {
     nspdferror res;
     struct cos_object *catalog;
     const char *type;
     struct cos_object *pages;
+    unsigned int page_index = 0;
 
     res = cos_get_dictionary(doc, doc->root, &catalog);
     if (res != NSPDFERROR_OK) {
@@ -526,14 +683,10 @@ nspdferror decode_catalog(struct pdf_doc *doc)
         return res;
     }
 
-    // Type = Pages
-    res = cos_get_dictionary_name(doc, pages, "Type", &type);
+    res = decode_page_tree(doc, pages, &page_index);
     if (res != NSPDFERROR_OK) {
         return res;
     }
-    if (strcmp(type, "Pages") != 0) {
-        return NSPDFERROR_FORMAT;
-    }
 
     return res;
 }
-- 
cgit v1.2.3