fix hex string decode and trailer parse

author: Vincent Sanders <vince@kyllikki.org> 2017-12-19 23:52:28 +0000
committer: Vincent Sanders <vince@kyllikki.org> 2017-12-19 23:52:28 +0000
commit: d4b12bd7bf85a3fa26f5a65d4dc3a5aaf02cb572 (patch)
tree: d3c034fbe5ea939435f036ca57dfcb5aa39230d2 /src
parent: d26bc9f19191e5dd7d233302f73f226c89cb797f (diff)
download: libnspdf-d4b12bd7bf85a3fa26f5a65d4dc3a5aaf02cb572.tar.gz
libnspdf-d4b12bd7bf85a3fa26f5a65d4dc3a5aaf02cb572.tar.bz2
2 files changed, 384 insertions, 153 deletions
diff --git a/src/byte_class.c b/src/byte_class.c
index 59c0206..e881cf5 100644
--- a/src/byte_class.c
+++ b/src/byte_class.c
@@ -17,54 +17,54 @@
  *     end of line - characters that signify an end of line
  */
 const uint8_t byte_classification[] = {
-    BC_WSPC,           /* 00 - NULL */
-    BC_RGLR,           /* 01 */
-    BC_RGLR,           /* 02 */
-    BC_RGLR,           /* 03 */
-    BC_RGLR,           /* 04 */
-    BC_RGLR,           /* 05 */
-    BC_RGLR,           /* 06 */
-    BC_RGLR,           /* 07 */
-    BC_RGLR,           /* 08 */
-    BC_WSPC,           /* 09 - HT */
-    BC_WSPC | BC_EOLM, /* 0A - LF */
-    BC_RGLR,           /* 0B */
-    BC_WSPC,           /* 0C - FF */
-    BC_WSPC | BC_EOLM, /* 0D - CR */
-    BC_RGLR,           /* 0E */
-    BC_RGLR,           /* 0F */
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR, /* 10 - 13 */
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR, /* 14 - 17 */
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR, /* 18 - 1B */
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR, /* 1C - 1F */
-    BC_WSPC, /* 20 - SP */
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR, /* 20 - 23 */
-    BC_RGLR,           /* 24 - '$' */
-    BC_DELM | BC_CMNT, /* 25 - '%' */
-    BC_RGLR,
-    BC_RGLR,                     /* 26 - 27 */
-    BC_DELM,
-    BC_DELM,                     /* '(' ')' */
-    BC_RGLR,
-    BC_RGLR,                     /* 2A - 2B */
-    BC_RGLR,
-    BC_RGLR,                     /* 2C - 2D */
-    BC_RGLR,
-    BC_DELM,                     /* '.' '/' */
+    BC_WSPC,                     /* 00 - NULL */
+    BC_RGLR,                     /* 01 */
+    BC_RGLR,                     /* 02 */
+    BC_RGLR,                     /* 03 */
+    BC_RGLR,                     /* 04 */
+    BC_RGLR,                     /* 05 */
+    BC_RGLR,                     /* 06 */
+    BC_RGLR,                     /* 07 */
+    BC_RGLR,                     /* 08 */
+    BC_WSPC,                     /* 09 - HT */
+    BC_WSPC | BC_EOLM,           /* 0A - LF */
+    BC_RGLR,                     /* 0B */
+    BC_WSPC,                     /* 0C - FF */
+    BC_WSPC | BC_EOLM,           /* 0D - CR */
+    BC_RGLR,                     /* 0E */
+    BC_RGLR,                     /* 0F */
+    BC_RGLR,                     /* 10 */
+    BC_RGLR,                     /* 11 */
+    BC_RGLR,                     /* 12 */
+    BC_RGLR,                     /* 13 */
+    BC_RGLR,                     /* 14 */
+    BC_RGLR,                     /* 15 */
+    BC_RGLR,                     /* 16 */
+    BC_RGLR,                     /* 17 */
+    BC_RGLR,                     /* 18 */
+    BC_RGLR,                     /* 19 */
+    BC_RGLR,                     /* 1A */
+    BC_RGLR,                     /* 1B */
+    BC_RGLR,                     /* 1C */
+    BC_RGLR,                     /* 1D */
+    BC_RGLR,                     /* 1E */
+    BC_RGLR,                     /* 1F */
+    BC_WSPC,                     /* 20 - SP */
+    BC_RGLR,                     /* 21 */
+    BC_RGLR,                     /* 22 */
+    BC_RGLR,                     /* 23 */
+    BC_RGLR,                     /* 24 - '$' */
+    BC_DELM | BC_CMNT,           /* 25 - '%' */
+    BC_RGLR,                     /* 26 */
+    BC_RGLR,                     /* 27 */
+    BC_DELM,                     /* 28 - '(' */
+    BC_DELM,                     /* 29 - ')' */
+    BC_RGLR,                     /* 2A */
+    BC_RGLR,                     /* 2B */
+    BC_RGLR,                     /* 2C */
+    BC_RGLR,                     /* 2D */
+    BC_RGLR,                     /* 2E - '.' */
+    BC_DELM,                     /* 2F - '/' */
     BC_OCTL | BC_DCML | BC_HEXL, /* 30 - '0' */
     BC_OCTL | BC_DCML | BC_HEXL, /* 31 - '1' */
     BC_OCTL | BC_DCML | BC_HEXL, /* 32 - '2' */
@@ -73,76 +73,76 @@ const uint8_t byte_classification[] = {
     BC_OCTL | BC_DCML | BC_HEXL, /* 35 - '5' */
     BC_OCTL | BC_DCML | BC_HEXL, /* 36 - '6' */
     BC_OCTL | BC_DCML | BC_HEXL, /* 37 - '7' */
-    BC_DCML | BC_HEXL,
-    BC_DCML | BC_HEXL, /* '8' '9' */
-    BC_RGLR,
-    BC_RGLR,                     /* ':' ';' */
-    BC_DELM,
-    BC_RGLR,                     /* '<' '=' */
-    BC_DELM,
-    BC_RGLR,                     /* '>' '?' */
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR, /* 40 - 43 */
-    BC_RGLR,
-    BC_HEXL,
-    BC_HEXL,
-    BC_HEXL, /* 44 - 47 */
-    BC_HEXL,
-    BC_HEXL,
-    BC_HEXL,
-    BC_RGLR, /* 48 - 4B */
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR, /* 4C - 4F */
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR, /* 50 - 53 */
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR, /* 54 - 57 */
-    BC_RGLR,
-    BC_RGLR,                   /* 58 - 59 */
-    BC_RGLR,
-    BC_DELM,                   /* 'Z' '[' */
-    BC_RGLR,
-    BC_DELM,                   /* '\' ']' */
-    BC_RGLR,
-    BC_RGLR,                   /* 5E - 5F */
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_HEXL,
-    BC_HEXL,
-    BC_HEXL, /* 60 - 67 */
-    BC_HEXL,
-    BC_HEXL,
-    BC_HEXL,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR, /* 68 - 6F */
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR,
-    BC_RGLR, /* 70 - 77 */
-    BC_RGLR,
-    BC_RGLR,                   /* 78 - 79 */
-    BC_RGLR,
-    BC_DELM,                   /* 'z' '{' */
-    BC_RGLR,
-    BC_DELM,                   /* '|' '}' */
+    BC_DCML | BC_HEXL,           /* 38 - '8' */
+    BC_DCML | BC_HEXL,           /* 39 - '9' */
+    BC_RGLR,                     /* 3A - ':' */
+    BC_RGLR,                     /* 3B - ';' */
+    BC_DELM,                     /* 3C - '<' */
+    BC_RGLR,                     /* 3D - '=' */
+    BC_DELM,                     /* 3E - '>' */
+    BC_RGLR,                     /* 3F - '?' */
+    BC_RGLR,                     /* 40 - '@' */
+    BC_HEXL,                     /* 41 - A */
+    BC_HEXL,                     /* 42 - B */
+    BC_HEXL,                     /* 43 - C */
+    BC_HEXL,                     /* 44 - D */
+    BC_HEXL,                     /* 45 - E */
+    BC_HEXL,                     /* 46 - F */
+    BC_RGLR,                     /* 47 - G */
+    BC_RGLR,                     /* 48 - H */
+    BC_RGLR,                     /* 49 - I */
+    BC_RGLR,                     /* 4A - J */
+    BC_RGLR,                     /* 4B - K */
+    BC_RGLR,                     /* 4C - L */
+    BC_RGLR,                     /* 4D - M */
+    BC_RGLR,                     /* 4E - N */
+    BC_RGLR,                     /* 4F - O */
+    BC_RGLR,                     /* 50 - P */
+    BC_RGLR,                     /* 51 - Q */
+    BC_RGLR,                     /* 52 - R */
+    BC_RGLR,                     /* 53 - S */
+    BC_RGLR,                     /* 54 - T */
+    BC_RGLR,                     /* 55 - U */
+    BC_RGLR,                     /* 56 - V */
+    BC_RGLR,                     /* 57 - W */
+    BC_RGLR,                     /* 58 - X */
+    BC_RGLR,                     /* 59 - Y */
+    BC_RGLR,                     /* 5A - 'Z' */
+    BC_DELM,                     /* 5B - '[' */
+    BC_RGLR,                     /* 5C - '\' */
+    BC_DELM,                     /* 5D - ']' */
+    BC_RGLR,                     /* 5E */
+    BC_RGLR,                     /* 5F */
+    BC_RGLR,                     /* 60 */
+    BC_HEXL,                     /* 61 - a */
+    BC_HEXL,                     /* 62 - b */
+    BC_HEXL,                     /* 63 - c */
+    BC_HEXL,                     /* 64 - d */
+    BC_HEXL,                     /* 65 - e */
+    BC_HEXL,                     /* 66 - f */
+    BC_RGLR,                     /* 67 - g */
+    BC_RGLR,                     /* 68 - h */
+    BC_RGLR,                     /* 69 - i */
+    BC_RGLR,                     /* 6A - j */
+    BC_RGLR,                     /* 6B - k */
+    BC_RGLR,                     /* 6C - l */
+    BC_RGLR,                     /* 6D - m */
+    BC_RGLR,                     /* 6E - n */
+    BC_RGLR,                     /* 6F - o */
+    BC_RGLR,                     /* 70 - p */
+    BC_RGLR,                     /* 71 - q */
+    BC_RGLR,                     /* 72 - r */
+    BC_RGLR,                     /* 73 - s */
+    BC_RGLR,                     /* 74 - t */
+    BC_RGLR,                     /* 75 - u */
+    BC_RGLR,                     /* 76 - v */
+    BC_RGLR,                     /* 77 - w */
+    BC_RGLR,                     /* 78 - x */
+    BC_RGLR,                     /* 79 - y */
+    BC_RGLR,                     /* 7A - 'z' */
+    BC_DELM,                     /* 7B - '{' */
+    BC_RGLR,                     /* 7C - '|' */
+    BC_DELM,                     /* 7D - '}' */
     BC_RGLR,
     BC_RGLR,                   /* 7E - 7F */
     BC_RGLR,
diff --git a/src/xref.c b/src/xref.c
index 173eb9e..94b0ee0 100644
--- a/src/xref.c
+++ b/src/xref.c
@@ -206,7 +206,23 @@ static int doc_skip_ws(struct pdf_doc *doc, uint64_t *offset)
     return 0;
 }
 
-static int doc_read_uint(struct pdf_doc *doc, uint64_t *offset_out, uint64_t *result_out)
+/**
+ * move offset to next non eol byte, stopping at the end of the document
+ *
+ * *offset is advanced in place and is never moved beyond doc->length so the
+ * scan itself does not read bytes past the end of the data; always returns 0
+ */
+static int doc_skip_eol(struct pdf_doc *doc, uint64_t *offset)
+{
+    while ((*offset < doc->length) &&
+           ((bclass[DOC_BYTE(doc, *offset)] & BC_EOLM) != 0)) {
+        (*offset)++;
+    }
+    return 0;
+}
+
+static nspdferror
+doc_read_uint(struct pdf_doc *doc, uint64_t *offset_out, uint64_t *result_out)
 {
     uint8_t c; /* current byte from source data */
     unsigned int len; /* number of decimal places in number */
@@ -231,7 +247,7 @@ static int doc_read_uint(struct pdf_doc *doc, uint64_t *offset_out, uint64_t *re
             *offset_out = offset;
             *result_out = result;
 
-            return 0;
+            return NSPDFERROR_OK;
         }
         num[len] = c - '0';
         offset++;
@@ -242,7 +258,7 @@ static int doc_read_uint(struct pdf_doc *doc, uint64_t *offset_out, uint64_t *re
 /**
  * finds the startxref marker at the end of input
  */
-int find_startxref(struct pdf_doc *doc, uint64_t *start_xref_out)
+nspdferror find_startxref(struct pdf_doc *doc, uint64_t *offset_out)
 {
     uint64_t offset; /* offset of characters being considered for startxref */
     uint64_t earliest; /* earliest offset to serch for startxref */
@@ -265,14 +281,89 @@ int find_startxref(struct pdf_doc *doc, uint64_t *start_xref_out)
             (DOC_BYTE(doc, offset + 6) == 'r') &&
             (DOC_BYTE(doc, offset + 7) == 'e') &&
             (DOC_BYTE(doc, offset + 8) == 'f')) {
-            offset += 9;
-            doc_skip_ws(doc, &offset);
-            return doc_read_uint(doc, &offset, start_xref_out);
+            *offset_out = offset;
+            return NSPDFERROR_OK;
         }
     }
-    return -1;
+    return NSPDFERROR_SYNTAX;
 }
 
+/**
+ * decodes a startxref field: "startxref", whitespace, unsigned value, eol, "%%EOF"
+ */
+nspdferror decode_startxref(struct pdf_doc *doc, uint64_t *offset_out, uint64_t *start_xref_out)
+{
+    uint64_t offset; /* offset of characters being considered for startxref */
+    uint64_t start_xref; /* the unsigned value read after the keyword */
+    nspdferror res;
+
+    offset = *offset_out;
+
+    if ((DOC_BYTE(doc, offset    ) != 's') ||
+        (DOC_BYTE(doc, offset + 1) != 't') ||
+        (DOC_BYTE(doc, offset + 2) != 'a') ||
+        (DOC_BYTE(doc, offset + 3) != 'r') ||
+        (DOC_BYTE(doc, offset + 4) != 't') ||
+        (DOC_BYTE(doc, offset + 5) != 'x') ||
+        (DOC_BYTE(doc, offset + 6) != 'r') ||
+        (DOC_BYTE(doc, offset + 7) != 'e') ||
+        (DOC_BYTE(doc, offset + 8) != 'f')) {
+        return NSPDFERROR_SYNTAX;
+    }
+    offset += 9; /* step over the nine byte "startxref" keyword */
+
+    res = doc_skip_ws(doc, &offset);
+    if (res != NSPDFERROR_OK) {
+        return res;
+    }
+
+    res = doc_read_uint(doc, &offset, &start_xref);
+    if (res != NSPDFERROR_OK) {
+        return res;
+    }
+
+    res = doc_skip_eol(doc, &offset);
+    if (res != NSPDFERROR_OK) {
+        return res;
+    }
+
+    if ((DOC_BYTE(doc, offset    ) != '%') ||
+        (DOC_BYTE(doc, offset + 1) != '%') ||
+        (DOC_BYTE(doc, offset + 2) != 'E') ||
+        (DOC_BYTE(doc, offset + 3) != 'O') ||
+        (DOC_BYTE(doc, offset + 4) != 'F')) {
+        printf("missing EOF marker\n");
+        return NSPDFERROR_SYNTAX;
+    }
+
+    *offset_out = offset; /* left at the first byte of the %%EOF marker */
+    *start_xref_out = start_xref; /* outputs are only written on success */
+
+    return NSPDFERROR_OK;
+}
+
+
+/**
+ * finds the next complete "trailer" keyword at or after *offset_out
+ */
+nspdferror find_trailer(struct pdf_doc *doc, uint64_t *offset_out)
+{
+    uint64_t offset; /* offset of characters being considered for trailer */
+    uint64_t end = (doc->length < 7) ? 0 : doc->length - 6; /* keyword must fit */
+    for (offset = *offset_out; offset < end; offset++) {
+        if ((DOC_BYTE(doc, offset    ) == 't') &&
+            (DOC_BYTE(doc, offset + 1) == 'r') &&
+            (DOC_BYTE(doc, offset + 2) == 'a') &&
+            (DOC_BYTE(doc, offset + 3) == 'i') &&
+            (DOC_BYTE(doc, offset + 4) == 'l') &&
+            (DOC_BYTE(doc, offset + 5) == 'e') &&
+            (DOC_BYTE(doc, offset + 6) == 'r')) {
+            *offset_out = offset; /* report where the keyword starts */
+            return NSPDFERROR_OK;
+        }
+    }
+    return NSPDFERROR_SYNTAX; /* no keyword found, *offset_out is untouched */
+}
 
 /**
  * find the PDF comment marker to identify the start of the document
@@ -333,6 +424,7 @@ nspdferror cos_free_object(struct cos_object *cos_obj)
         break;
 
     case COS_TYPE_STRING:
+        free(cos_obj->u.s->data);
         free(cos_obj->u.s);
         break;
 
@@ -569,15 +661,29 @@ cos_decode_string(struct pdf_doc *doc,
     return NSPDFERROR_OK;
 }
 
+uint8_t xtoi(uint8_t x) /* value 0-15 of an ASCII hex digit */
+{
+    if (x >= '0' && x <= '9') {
+        x = x - '0';
+    } else if (x >= 'a' && x <='f') {
+        x = x - 'a' + 10;
+    } else if (x >= 'A' && x <='F') {
+        x = x - 'A' + 10;
+    }
+    return x; /* a byte that is not a hex digit comes back unchanged */
+}
+
 nspdferror
 cos_decode_hex_string(struct pdf_doc *doc,
                       uint64_t *offset_out,
                       struct cos_object **cosobj_out)
 {
     uint64_t offset;
-    //    struct cos_object *cosobj;
+    struct cos_object *cosobj;
     uint8_t c;
-    //    uint8_t byte;
+    uint8_t value = 0;
+    struct cos_string *cstring;
+    bool first = true;
 
     offset = *offset_out;
 
@@ -585,14 +691,46 @@ cos_decode_hex_string(struct pdf_doc *doc,
     if (c != '<') {
         return NSPDFERROR_SYNTAX;
     }
-    doc_skip_ws(doc, &offset);
 
-    while (c != '>') {
-        c = DOC_BYTE(doc, offset++);
-        doc_skip_ws(doc, &offset);
+    cstring = calloc(1, sizeof(*cstring));
+    if (cstring == NULL) {
+        return NSPDFERROR_NOMEM;
     }
 
-    return -1;
+    cosobj = calloc(1, sizeof(*cosobj));
+    if (cosobj == NULL) {
+        return NSPDFERROR_NOMEM;
+    }
+    cosobj->type = COS_TYPE_STRING;
+    cosobj->u.s = cstring;
+
+    for (; offset < doc->length; offset++) {
+        c = DOC_BYTE(doc, offset);
+        if (c == '>') {
+            if (first == false) {
+                cos_string_append(cstring, value);
+            }
+            offset++;
+            doc_skip_ws(doc, &offset);
+
+            *cosobj_out = cosobj;
+            *offset_out = offset;
+
+            return NSPDFERROR_OK;
+        } else if ((bclass[c] & BC_HEXL) != 0) {
+            if (first) {
+                value = xtoi(c) << 4;
+                first = false;
+            } else {
+                value |= xtoi(c);
+                first = true;
+                cos_string_append(cstring, value);
+            }
+        } else if ((bclass[c] & BC_WSPC) == 0) {
+            break; /* unknown byte value in string */
+        }
+    }
+    return NSPDFERROR_SYNTAX;
 }
 
 
@@ -640,8 +778,6 @@ int cos_decode_dictionary(struct pdf_doc *doc,
         }
         printf("key: %s\n", key->u.n);
 
-        printf("%c\n", DOC_BYTE(doc, offset));
-
         res = cos_decode_object(doc, &offset, &value);
         if (res != 0) {
             printf("Unable to decode value object in dictionary\n");
@@ -729,6 +865,7 @@ cos_decode_list(struct pdf_doc *doc,
         cosobj->u.array = entry;
     }
     offset++; /* skip closing ] */
+
     doc_skip_ws(doc, &offset);
 
     *cosobj_out = cosobj;
@@ -762,12 +899,13 @@ int cos_decode_name(struct pdf_doc *doc,
     }
     printf("found a name\n");
 
-    c = DOC_BYTE(doc, offset++);
+    c = DOC_BYTE(doc, offset);
     while ((idx <= NAME_MAX_LENGTH) &&
            ((bclass[c] & (BC_WSPC | BC_DELM)) == 0)) {
+        offset++;
         //printf("%c", c);
         name[idx++] = c;
-        c = DOC_BYTE(doc, offset++);
+        c = DOC_BYTE(doc, offset);
     }
     //printf("\nidx: %d\n", idx);
     if (idx > NAME_MAX_LENGTH) {
@@ -1114,10 +1252,16 @@ int cos_decode_object(struct pdf_doc *doc,
 
 
 
-int decode_trailer(struct pdf_doc *doc, uint64_t offset)
+nspdferror
+decode_trailer(struct pdf_doc *doc,
+               uint64_t *offset_out,
+               struct cos_object **trailer_out)
 {
     struct cos_object *trailer;
     int res;
+    uint64_t offset;
+
+    offset = *offset_out;
 
     /* trailer object header */
     if ((DOC_BYTE(doc, offset    ) != 't') &&
@@ -1142,7 +1286,10 @@ int decode_trailer(struct pdf_doc *doc, uint64_t offset)
         return -1;
     }
 
-    return 0;
+    *trailer_out = trailer;
+    *offset_out = offset;
+
+    return NSPDFERROR_OK;
 }
 
 int decode_xref(struct pdf_doc *doc, uint64_t *offset_out)
@@ -1213,39 +1360,123 @@ int decode_xref(struct pdf_doc *doc, uint64_t *offset_out)
     return 0;
 }
 
-int main(int argc, char **argv)
+/**
+ * parse the trailer that follows the xref table at xref_offset (Prev recursion is still TODO)
+ */
+nspdferror decode_xref_trailer(struct pdf_doc *doc, uint64_t xref_offset)
 {
-    struct pdf_doc doc;
-    int res;
-    uint64_t startxref;
+    nspdferror res;
+    uint64_t offset; /* the current data offset */
+    uint64_t startxref; /* the value of the startxref field */
+    struct cos_object *trailer; /* the current trailer, NOTE(review): never released here */
 
-    res = read_whole_pdf(&doc, argv[1]);
-    if (res != 0) {
-        printf("failed to read file\n");
+    offset = xref_offset;
+
+    res = find_trailer(doc, &offset);
+    if (res != NSPDFERROR_OK) {
+        printf("failed to find trailer\n");
         return res;
     }
 
-    res = check_header(&doc);
-    if (res != 0) {
-        printf("header check failed\n");
+    res = decode_trailer(doc, &offset, &trailer);
+    if (res != NSPDFERROR_OK) {
+        printf("failed to decode trailer\n");
         return res;
     }
 
-    res = find_startxref(&doc, &startxref);
-    if (res != 0) {
-        printf("failed to find startxref\n");
+    res = decode_startxref(doc, &offset, &startxref);
+    if (res != NSPDFERROR_OK) {
+        printf("failed to decode startxref\n");
         return res;
     }
 
+    if (startxref != xref_offset) { /* only reported, parsing carries on */
+        printf("startxref and Prev value disagree\n");
+    }
+
+    /* extract Size from trailer and create xref table large enough */
+
+    /* check for prev ID key in trailer and recurse call if present */
+
+    /*
+
+
     res = decode_xref(&doc, &startxref);
     if (res != 0) {
         printf("failed to decode xref table\n");
         return res;
     }
 
-    res = decode_trailer(&doc, startxref);
+    */
+
+    return res;
+}
+
+/**
+ * decode non-linear pdf trailer data
+ *
+ * PDF files have a structure nominally defined as header, body, cross
+ * reference table and trailer. The body, cross reference table and trailer
+ * sections may be repeated in a scheme known as "incremental updates".
+ *
+ * The strategy used here is to locate the end of the last trailer block which
+ * contains a startxref token followed by a byte offset into the file of the
+ * beginning of the cross reference table followed by a literal '%%EOF'.
+ *
+ * The initial offset is used to walk back down a chain of xref/trailers until
+ * the trailer does not contain a Prev entry and decode xref tables forwards to
+ * overwrite earlier object entries with later ones.
+ *
+ * It is necessary to search forwards from the xref table to find the trailer
+ * block because instead of the Prev entry pointing to the previous trailer
+ * (from which we could have extracted the startxref to find the associated
+ * xref table) it points to the previous xref block which we have to skip to
+ * find the subsequent trailer.
+ *
+ * Returns NSPDFERROR_OK or the error from the first step that failed.
+ */
+nspdferror decode_trailers(struct pdf_doc *doc)
+{
+    nspdferror res;
+    uint64_t offset; /* the current data offset */
+    uint64_t startxref; /* the value of the first startxref field */
+
+    res = find_startxref(doc, &offset);
+    if (res != NSPDFERROR_OK) {
+        printf("failed to find startxref\n");
+        return res;
+    }
+
+    res = decode_startxref(doc, &offset, &startxref);
+    if (res != NSPDFERROR_OK) {
+        printf("failed to decode startxref\n");
+        return res;
+    }
+
+    /* recurse down the xref and trailers */
+    return decode_xref_trailer(doc, startxref); /* its result is returned as is */
+}
+
+
+int main(int argc, char **argv)
+{
+    struct pdf_doc doc;
+    int res;
+
+    res = read_whole_pdf(&doc, argv[1]);
     if (res != 0) {
-        printf("failed to decode trailer\n");
+        printf("failed to read file\n");
+        return res;
+    }
+
+    res = check_header(&doc);
+    if (res != 0) {
+        printf("header check failed\n");
+        return res;
+    }
+
+    res = decode_trailers(&doc);
+    if (res != NSPDFERROR_OK) {
+        printf("failed to decode trailers (%d)\n", res);
         return res;
     }
author	Vincent Sanders <vince@kyllikki.org>	2017-12-19 23:52:28 +0000
committer	Vincent Sanders <vince@kyllikki.org>	2017-12-19 23:52:28 +0000
commit	d4b12bd7bf85a3fa26f5a65d4dc3a5aaf02cb572 (patch)
tree	d3c034fbe5ea939435f036ca57dfcb5aa39230d2 /src
parent	d26bc9f19191e5dd7d233302f73f226c89cb797f (diff)
download	libnspdf-d4b12bd7bf85a3fa26f5a65d4dc3a5aaf02cb572.tar.gz libnspdf-d4b12bd7bf85a3fa26f5a65d4dc3a5aaf02cb572.tar.bz2