diff options
author | Vincent Sanders <vince@kyllikki.org> | 2017-12-19 23:52:28 +0000 |
---|---|---|
committer | Vincent Sanders <vince@kyllikki.org> | 2017-12-19 23:52:28 +0000 |
commit | d4b12bd7bf85a3fa26f5a65d4dc3a5aaf02cb572 (patch) | |
tree | d3c034fbe5ea939435f036ca57dfcb5aa39230d2 /src | |
parent | d26bc9f19191e5dd7d233302f73f226c89cb797f (diff) | |
download | libnspdf-d4b12bd7bf85a3fa26f5a65d4dc3a5aaf02cb572.tar.gz libnspdf-d4b12bd7bf85a3fa26f5a65d4dc3a5aaf02cb572.tar.bz2 |
fix hex string decode and trailer parse
Diffstat (limited to 'src')
-rw-r--r-- | src/byte_class.c | 236 | ||||
-rw-r--r-- | src/xref.c | 301 |
2 files changed, 384 insertions, 153 deletions
diff --git a/src/byte_class.c b/src/byte_class.c index 59c0206..e881cf5 100644 --- a/src/byte_class.c +++ b/src/byte_class.c @@ -17,54 +17,54 @@ * end of line - characters that signify an end of line */ const uint8_t byte_classification[] = { - BC_WSPC, /* 00 - NULL */ - BC_RGLR, /* 01 */ - BC_RGLR, /* 02 */ - BC_RGLR, /* 03 */ - BC_RGLR, /* 04 */ - BC_RGLR, /* 05 */ - BC_RGLR, /* 06 */ - BC_RGLR, /* 07 */ - BC_RGLR, /* 08 */ - BC_WSPC, /* 09 - HT */ - BC_WSPC | BC_EOLM, /* 0A - LF */ - BC_RGLR, /* 0B */ - BC_WSPC, /* 0C - FF */ - BC_WSPC | BC_EOLM, /* 0D - CR */ - BC_RGLR, /* 0E */ - BC_RGLR, /* 0F */ - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, /* 10 - 13 */ - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, /* 14 - 17 */ - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, /* 18 - 1B */ - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, /* 1C - 1F */ - BC_WSPC, /* 20 - SP */ - BC_RGLR, - BC_RGLR, - BC_RGLR, /* 20 - 23 */ - BC_RGLR, /* 24 - '$' */ - BC_DELM | BC_CMNT, /* 25 - '%' */ - BC_RGLR, - BC_RGLR, /* 26 - 27 */ - BC_DELM, - BC_DELM, /* '(' ')' */ - BC_RGLR, - BC_RGLR, /* 2A - 2B */ - BC_RGLR, - BC_RGLR, /* 2C - 2D */ - BC_RGLR, - BC_DELM, /* '.' '/' */ + BC_WSPC, /* 00 - NULL */ + BC_RGLR, /* 01 */ + BC_RGLR, /* 02 */ + BC_RGLR, /* 03 */ + BC_RGLR, /* 04 */ + BC_RGLR, /* 05 */ + BC_RGLR, /* 06 */ + BC_RGLR, /* 07 */ + BC_RGLR, /* 08 */ + BC_WSPC, /* 09 - HT */ + BC_WSPC | BC_EOLM, /* 0A - LF */ + BC_RGLR, /* 0B */ + BC_WSPC, /* 0C - FF */ + BC_WSPC | BC_EOLM, /* 0D - CR */ + BC_RGLR, /* 0E */ + BC_RGLR, /* 0F */ + BC_RGLR, /* 10 */ + BC_RGLR, /* 11 */ + BC_RGLR, /* 12 */ + BC_RGLR, /* 13 */ + BC_RGLR, /* 14 */ + BC_RGLR, /* 15 */ + BC_RGLR, /* 16 */ + BC_RGLR, /* 17 */ + BC_RGLR, /* 18 */ + BC_RGLR, /* 19 */ + BC_RGLR, /* 1A */ + BC_RGLR, /* 1B */ + BC_RGLR, /* 1C */ + BC_RGLR, /* 1D */ + BC_RGLR, /* 1E */ + BC_RGLR, /* 1F */ + BC_WSPC, /* 20 - SP */ + BC_RGLR, /* 21 */ + BC_RGLR, /* 22 */ + BC_RGLR, /* 23 */ + BC_RGLR, /* 24 - '$' */ + BC_DELM | BC_CMNT, /* 25 - '%' */ + BC_RGLR, /* 26 */ + BC_RGLR, /* 27 */ + BC_DELM, /* 28 - '(' */ + BC_DELM, /* 29 - ')' */ + BC_RGLR, /* 2A */ + BC_RGLR, /* 2B */ + BC_RGLR, /* 2C */ + BC_RGLR, /* 2D */ + BC_RGLR, /* 2E - '.' */ + BC_DELM, /* 2F - '/' */ BC_OCTL | BC_DCML | BC_HEXL, /* 30 - '0' */ BC_OCTL | BC_DCML | BC_HEXL, /* 31 - '1' */ BC_OCTL | BC_DCML | BC_HEXL, /* 32 - '2' */ @@ -73,76 +73,76 @@ const uint8_t byte_classification[] = { BC_OCTL | BC_DCML | BC_HEXL, /* 35 - '5' */ BC_OCTL | BC_DCML | BC_HEXL, /* 36 - '6' */ BC_OCTL | BC_DCML | BC_HEXL, /* 37 - '7' */ - BC_DCML | BC_HEXL, - BC_DCML | BC_HEXL, /* '8' '9' */ - BC_RGLR, - BC_RGLR, /* ':' ';' */ - BC_DELM, - BC_RGLR, /* '<' '=' */ - BC_DELM, - BC_RGLR, /* '>' '?' */ - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, /* 40 - 43 */ - BC_RGLR, - BC_HEXL, - BC_HEXL, - BC_HEXL, /* 44 - 47 */ - BC_HEXL, - BC_HEXL, - BC_HEXL, - BC_RGLR, /* 48 - 4B */ - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, /* 4C - 4F */ - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, /* 50 - 53 */ - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, /* 54 - 57 */ - BC_RGLR, - BC_RGLR, /* 58 - 59 */ - BC_RGLR, - BC_DELM, /* 'Z' '[' */ - BC_RGLR, - BC_DELM, /* '\' ']' */ - BC_RGLR, - BC_RGLR, /* 5E - 5F */ - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_HEXL, - BC_HEXL, - BC_HEXL, /* 60 - 67 */ - BC_HEXL, - BC_HEXL, - BC_HEXL, - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, /* 68 - 6F */ - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, - BC_RGLR, /* 70 - 77 */ - BC_RGLR, - BC_RGLR, /* 78 - 79 */ - BC_RGLR, - BC_DELM, /* 'z' '{' */ - BC_RGLR, - BC_DELM, /* '|' '}' */ + BC_DCML | BC_HEXL, /* 38 - '8' */ + BC_DCML | BC_HEXL, /* 39 - '9' */ + BC_RGLR, /* 3A - ':' */ + BC_RGLR, /* 3B - ';' */ + BC_DELM, /* 3C - '<' */ + BC_RGLR, /* 3D - '=' */ + BC_DELM, /* 3E - '>' */ + BC_RGLR, /* 3F - '?' */ + BC_RGLR, /* 40 - */ + BC_HEXL, /* 41 - A */ + BC_HEXL, /* 42 - B */ + BC_HEXL, /* 43 - C */ + BC_HEXL, /* 44 - D */ + BC_HEXL, /* 45 - E */ + BC_HEXL, /* 46 - F */ + BC_RGLR, /* 47 - G */ + BC_RGLR, /* 48 - H */ + BC_RGLR, /* 49 - I */ + BC_RGLR, /* 4A - J */ + BC_RGLR, /* 4B - K */ + BC_RGLR, /* 4C - L */ + BC_RGLR, /* 4D - M */ + BC_RGLR, /* 4E - N */ + BC_RGLR, /* 4F - O */ + BC_RGLR, /* 50 - P */ + BC_RGLR, /* 51 - Q */ + BC_RGLR, /* 52 - R */ + BC_RGLR, /* 53 - S */ + BC_RGLR, /* 54 - T */ + BC_RGLR, /* 55 - U */ + BC_RGLR, /* 56 - V */ + BC_RGLR, /* 57 - W */ + BC_RGLR, /* 58 - X */ + BC_RGLR, /* 59 - Y */ + BC_RGLR, /* 5A - 'Z' */ + BC_DELM, /* 5B - '[' */ + BC_RGLR, /* 5C - '\' */ + BC_DELM, /* 5D - ']' */ + BC_RGLR, /* 5E */ + BC_RGLR, /* 5F */ + BC_RGLR, /* 60 */ + BC_HEXL, /* 61 - a */ + BC_HEXL, /* 62 - b */ + BC_HEXL, /* 63 - c */ + BC_HEXL, /* 64 - d */ + BC_HEXL, /* 65 - e */ + BC_HEXL, /* 66 - f */ + BC_RGLR, /* 67 - g */ + BC_RGLR, /* 68 - h */ + BC_RGLR, /* 69 - i */ + BC_RGLR, /* 6A - j */ + BC_RGLR, /* 6B - k */ + BC_RGLR, /* 6C - l */ + BC_RGLR, /* 6D - m */ + BC_RGLR, /* 6E - n */ + BC_RGLR, /* 6F - o */ + BC_RGLR, /* 70 - p */ + BC_RGLR, /* 71 - q */ + BC_RGLR, /* 72 - r */ + BC_RGLR, /* 73 - s */ + BC_RGLR, /* 74 - t */ + BC_RGLR, /* 75 - u */ + BC_RGLR, /* 76 - v */ + BC_RGLR, /* 77 - w */ + BC_RGLR, /* 78 - x */ + BC_RGLR, /* 79 - y */ + BC_RGLR, /* 7A - 'z' */ + BC_DELM, /* 7B - '{' */ + BC_RGLR, /* 7C - '|' */ + BC_DELM, /* 7D - '}' */ BC_RGLR, BC_RGLR, /* 7E - 7F */ BC_RGLR, @@ -206,7 +206,23 @@ static int doc_skip_ws(struct pdf_doc *doc, uint64_t *offset) return 0; } -static int doc_read_uint(struct pdf_doc *doc, uint64_t *offset_out, uint64_t *result_out) +/** + * move offset to next non eol byte + */ +static int doc_skip_eol(struct pdf_doc *doc, uint64_t *offset) +{ + uint8_t c; + /* TODO sort out keeping offset in range */ + c = DOC_BYTE(doc, *offset); + while ((bclass[c] & BC_EOLM) != 0) { + (*offset)++; + c = DOC_BYTE(doc, *offset); + } + return 0; +} + +static nspdferror +doc_read_uint(struct pdf_doc *doc, uint64_t *offset_out, uint64_t *result_out) { uint8_t c; /* current byte from source data */ unsigned int len; /* number of decimal places in number */ @@ -231,7 +247,7 @@ static int doc_read_uint(struct pdf_doc *doc, uint64_t *offset_out, uint64_t *re *offset_out = offset; *result_out = result; - return 0; + return NSPDFERROR_OK; } num[len] = c - '0'; offset++; @@ -242,7 +258,7 @@ static int doc_read_uint(struct pdf_doc *doc, uint64_t *offset_out, uint64_t *re /** * finds the startxref marker at the end of input */ -int find_startxref(struct pdf_doc *doc, uint64_t *start_xref_out) +nspdferror find_startxref(struct pdf_doc *doc, uint64_t *offset_out) { uint64_t offset; /* offset of characters being considered for startxref */ uint64_t earliest; /* earliest offset to serch for startxref */ @@ -265,14 +281,89 @@ int find_startxref(struct pdf_doc *doc, uint64_t *start_xref_out) (DOC_BYTE(doc, offset + 6) == 'r') && (DOC_BYTE(doc, offset + 7) == 'e') && (DOC_BYTE(doc, offset + 8) == 'f')) { - offset += 9; - doc_skip_ws(doc, &offset); - return doc_read_uint(doc, &offset, start_xref_out); + *offset_out = offset; + return NSPDFERROR_OK; } } - return -1; + return NSPDFERROR_SYNTAX; } +/** + * decodes a startxref field + */ +nspdferror decode_startxref(struct pdf_doc *doc, uint64_t *offset_out, uint64_t *start_xref_out) +{ + uint64_t offset; /* offset of characters being considered for startxref */ + uint64_t start_xref; + nspdferror res; + + offset = *offset_out; + + if ((DOC_BYTE(doc, offset ) != 's') || + (DOC_BYTE(doc, offset + 1) != 't') || + (DOC_BYTE(doc, offset + 2) != 'a') || + (DOC_BYTE(doc, offset + 3) != 'r') || + (DOC_BYTE(doc, offset + 4) != 't') || + (DOC_BYTE(doc, offset + 5) != 'x') || + (DOC_BYTE(doc, offset + 6) != 'r') || + (DOC_BYTE(doc, offset + 7) != 'e') || + (DOC_BYTE(doc, offset + 8) != 'f')) { + return NSPDFERROR_SYNTAX; + } + offset += 9; + + res = doc_skip_ws(doc, &offset); + if (res != NSPDFERROR_OK) { + return res; + } + + res = doc_read_uint(doc, &offset, &start_xref); + if (res != NSPDFERROR_OK) { + return res; + } + + res = doc_skip_eol(doc, &offset); + if (res != NSPDFERROR_OK) { + return res; + } + + if ((DOC_BYTE(doc, offset ) != '%') || + (DOC_BYTE(doc, offset + 1) != '%') || + (DOC_BYTE(doc, offset + 2) != 'E') || + (DOC_BYTE(doc, offset + 3) != 'O') || + (DOC_BYTE(doc, offset + 4) != 'F')) { + printf("missing EOF marker\n"); + return NSPDFERROR_SYNTAX; + } + + *offset_out = offset; + *start_xref_out = start_xref; + + return NSPDFERROR_OK; +} + + +/** + * finds the next trailer + */ +nspdferror find_trailer(struct pdf_doc *doc, uint64_t *offset_out) +{ + uint64_t offset; /* offset of characters being considered for trailer */ + + for (offset = *offset_out;offset < doc->length; offset++) { + if ((DOC_BYTE(doc, offset ) == 't') && + (DOC_BYTE(doc, offset + 1) == 'r') && + (DOC_BYTE(doc, offset + 2) == 'a') && + (DOC_BYTE(doc, offset + 3) == 'i') && + (DOC_BYTE(doc, offset + 4) == 'l') && + (DOC_BYTE(doc, offset + 5) == 'e') && + (DOC_BYTE(doc, offset + 6) == 'r')) { + *offset_out = offset; + return NSPDFERROR_OK; + } + } + return NSPDFERROR_SYNTAX; +} /** * find the PDF comment marker to identify the start of the document @@ -333,6 +424,7 @@ nspdferror cos_free_object(struct cos_object *cos_obj) break; case COS_TYPE_STRING: + free(cos_obj->u.s->data); free(cos_obj->u.s); break; @@ -569,15 +661,29 @@ cos_decode_string(struct pdf_doc *doc, return NSPDFERROR_OK; } +uint8_t xtoi(uint8_t x) +{ + if (x >= '0' && x <= '9') { + x = x - '0'; + } else if (x >= 'a' && x <='f') { + x = x - 'a' + 10; + } else if (x >= 'A' && x <='F') { + x = x - 'A' + 10; + } + return x; +} + nspdferror cos_decode_hex_string(struct pdf_doc *doc, uint64_t *offset_out, struct cos_object **cosobj_out) { uint64_t offset; - // struct cos_object *cosobj; + struct cos_object *cosobj; uint8_t c; - // uint8_t byte; + uint8_t value = 0; + struct cos_string *cstring; + bool first = true; offset = *offset_out; @@ -585,14 +691,46 @@ cos_decode_hex_string(struct pdf_doc *doc, if (c != '<') { return NSPDFERROR_SYNTAX; } - doc_skip_ws(doc, &offset); - while (c != '>') { - c = DOC_BYTE(doc, offset++); - doc_skip_ws(doc, &offset); + cstring = calloc(1, sizeof(*cstring)); + if (cstring == NULL) { + return NSPDFERROR_NOMEM; } - return -1; + cosobj = calloc(1, sizeof(*cosobj)); + if (cosobj == NULL) { + return NSPDFERROR_NOMEM; + } + cosobj->type = COS_TYPE_STRING; + cosobj->u.s = cstring; + + for (; offset < doc->length; offset++) { + c = DOC_BYTE(doc, offset); + if (c == '>') { + if (first == false) { + cos_string_append(cstring, value); + } + offset++; + doc_skip_ws(doc, &offset); + + *cosobj_out = cosobj; + *offset_out = offset; + + return NSPDFERROR_OK; + } else if ((bclass[c] & BC_HEXL) != 0) { + if (first) { + value = xtoi(c) << 4; + first = false; + } else { + value |= xtoi(c); + first = true; + cos_string_append(cstring, value); + } + } else if ((bclass[c] & BC_WSPC) == 0) { + break; /* unknown byte value in string */ + } + } + return NSPDFERROR_SYNTAX; } @@ -640,8 +778,6 @@ int cos_decode_dictionary(struct pdf_doc *doc, } printf("key: %s\n", key->u.n); - printf("%c\n", DOC_BYTE(doc, offset)); - res = cos_decode_object(doc, &offset, &value); if (res != 0) { printf("Unable to decode value object in dictionary\n"); @@ -729,6 +865,7 @@ cos_decode_list(struct pdf_doc *doc, cosobj->u.array = entry; } offset++; /* skip closing ] */ + doc_skip_ws(doc, &offset); *cosobj_out = cosobj; @@ -762,12 +899,13 @@ int cos_decode_name(struct pdf_doc *doc, } printf("found a name\n"); - c = DOC_BYTE(doc, offset++); + c = DOC_BYTE(doc, offset); while ((idx <= NAME_MAX_LENGTH) && ((bclass[c] & (BC_WSPC | BC_DELM)) == 0)) { + offset++; //printf("%c", c); name[idx++] = c; - c = DOC_BYTE(doc, offset++); + c = DOC_BYTE(doc, offset); } //printf("\nidx: %d\n", idx); if (idx > NAME_MAX_LENGTH) { @@ -1114,10 +1252,16 @@ int cos_decode_object(struct pdf_doc *doc, -int decode_trailer(struct pdf_doc *doc, uint64_t offset) +nspdferror +decode_trailer(struct pdf_doc *doc, + uint64_t *offset_out, + struct cos_object **trailer_out) { struct cos_object *trailer; int res; + uint64_t offset; + + offset = *offset_out; /* trailer object header */ if ((DOC_BYTE(doc, offset ) != 't') && @@ -1142,7 +1286,10 @@ int decode_trailer(struct pdf_doc *doc, uint64_t offset) return -1; } - return 0; + *trailer_out = trailer; + *offset_out = offset; + + return NSPDFERROR_OK; } int decode_xref(struct pdf_doc *doc, uint64_t *offset_out) @@ -1213,39 +1360,123 @@ int decode_xref(struct pdf_doc *doc, uint64_t *offset_out) return 0; } -int main(int argc, char **argv) +/** + * recursively parse trailers and xref tables + */ +nspdferror decode_xref_trailer(struct pdf_doc *doc, uint64_t xref_offset) { - struct pdf_doc doc; - int res; - uint64_t startxref; + nspdferror res; + uint64_t offset; /* the current data offset */ + uint64_t startxref; /* the value of the startxref field */ + struct cos_object *trailer; /* the current trailer */ - res = read_whole_pdf(&doc, argv[1]); - if (res != 0) { - printf("failed to read file\n"); + offset = xref_offset; + + res = find_trailer(doc, &offset); + if (res != NSPDFERROR_OK) { + printf("failed to decode startxref\n"); return res; } - res = check_header(&doc); - if (res != 0) { - printf("header check failed\n"); + res = decode_trailer(doc, &offset, &trailer); + if (res != NSPDFERROR_OK) { + printf("failed to decode trailer\n"); return res; } - res = find_startxref(&doc, &startxref); - if (res != 0) { - printf("failed to find startxref\n"); + res = decode_startxref(doc, &offset, &startxref); + if (res != NSPDFERROR_OK) { + printf("failed to decode startxref\n"); return res; } + if (startxref != xref_offset) { + printf("startxref and Prev value disagree\n"); + } + + /* extract Size from trailer and create xref table large enough */ + + /* check for prev ID key in trailer and recurse call if present */ + + /* + + res = decode_xref(&doc, &startxref); if (res != 0) { printf("failed to decode xref table\n"); return res; } - res = decode_trailer(&doc, startxref); + */ + + return res; +} + +/** + * decode non-linear pdf trailer data + * + * PDF have a structure nominally defined as header, body, cross reference table + * and trailer. The body, cross reference table and trailer sections may be + * repeated in a scheme known as "incremental updates" + * + * The strategy used here is to locate the end of the last trailer block which + * contains a startxref token followed by a byte offset into the file of the + * beginning of the cross reference table followed by a literal '%%EOF' + * + * the initial offset is used to walk back down a chain of xref/trailers until + * the trailer does not contain a Prev entry and decode xref tables forwards to + * overwrite earlier object entries with later ones. + * + * It is necessary to search forwards from the xref table to find the trailer + * block because instead of the Prev entry pointing to the previous trailer + * (from which we could have extracted the startxref to find the associated + * xref table) it points to the previous xref block which we have to skip to + * find the subsequent trailer. + * + */ +nspdferror decode_trailers(struct pdf_doc *doc) +{ + nspdferror res; + uint64_t offset; /* the current data offset */ + uint64_t startxref; /* the value of the first startxref field */ + + res = find_startxref(doc, &offset); + if (res != NSPDFERROR_OK) { + printf("failed to find startxref\n"); + return res; + } + + res = decode_startxref(doc, &offset, &startxref); + if (res != NSPDFERROR_OK) { + printf("failed to decode startxref\n"); + return res; + } + + /* recurse down the xref and trailers */ + return decode_xref_trailer(doc, startxref); +} + + +int main(int argc, char **argv) +{ + struct pdf_doc doc; + int res; + + res = read_whole_pdf(&doc, argv[1]); if (res != 0) { - printf("failed to decode trailer\n"); + printf("failed to read file\n"); + return res; + } + + res = check_header(&doc); + if (res != 0) { + printf("header check failed\n"); + return res; + } + + res = decode_trailers(&doc); + if (res != NSPDFERROR_OK) { + printf("failed to decode trailers (%d)\n", res); return res; } |