implement parsing of literal strings

author: Vincent Sanders <vince@kyllikki.org> 2017-12-19 00:16:17 +0000
committer: Vincent Sanders <vince@kyllikki.org> 2017-12-19 00:16:17 +0000
commit: d26bc9f19191e5dd7d233302f73f226c89cb797f (patch)
tree: cefc93be2ea2dcc5b07073e70ebedd82a96aced0
parent: 5de69a618c7858f997e9944c06837d951fc129aa (diff)
download: libnspdf-d26bc9f19191e5dd7d233302f73f226c89cb797f.tar.gz
libnspdf-d26bc9f19191e5dd7d233302f73f226c89cb797f.tar.bz2
4 files changed, 238 insertions, 44 deletions
diff --git a/src/Makefile b/src/Makefile
index ac8c347..f9ca22c 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,6 +1,6 @@
 #
 
-CFLAGS+=-g -Wall
+CFLAGS+=-g -Wall -Wextra
 
 .PHONY:all clean
 
diff --git a/src/byte_class.c b/src/byte_class.c
index a8ab735..59c0206 100644
--- a/src/byte_class.c
+++ b/src/byte_class.c
@@ -11,6 +11,7 @@
  *     decimal - characters that appear in decimal values 0123456789
  *     hexidecimal - characters that appear in hex values 0123456789ABCDEF
  *   delimiter - The characters used to separate tokens ()[]{}<>/%
+ *     comment - the % character used to introduce a comment
  *   whitespace - separate syntactic constructs like names and numbers treated
  *                as a single character except in comments, strings and streams
  *     end of line - characters that signify an end of line
@@ -52,8 +53,8 @@ const uint8_t byte_classification[] = {
     BC_RGLR,
     BC_RGLR,
     BC_RGLR, /* 20 - 23 */
-    BC_RGLR,
-    BC_DELM,                     /* '$' '%' */
+    BC_RGLR,           /* 24 - '$' */
+    BC_DELM | BC_CMNT, /* 25 - '%' */
     BC_RGLR,
     BC_RGLR,                     /* 26 - 27 */
     BC_DELM,
@@ -64,14 +65,14 @@ const uint8_t byte_classification[] = {
     BC_RGLR,                     /* 2C - 2D */
     BC_RGLR,
     BC_DELM,                     /* '.' '/' */
-    BC_DCML | BC_HEXL,
-    BC_DCML | BC_HEXL, /* '0' '1' */
-    BC_DCML | BC_HEXL,
-    BC_DCML | BC_HEXL, /* '2' '3' */
-    BC_DCML | BC_HEXL,
-    BC_DCML | BC_HEXL, /* '4' '5' */
-    BC_DCML | BC_HEXL,
-    BC_DCML | BC_HEXL, /* '6' '7' */
+    BC_OCTL | BC_DCML | BC_HEXL, /* 30 - '0' */
+    BC_OCTL | BC_DCML | BC_HEXL, /* 31 - '1' */
+    BC_OCTL | BC_DCML | BC_HEXL, /* 32 - '2' */
+    BC_OCTL | BC_DCML | BC_HEXL, /* 33 - '3' */
+    BC_OCTL | BC_DCML | BC_HEXL, /* 34 - '4' */
+    BC_OCTL | BC_DCML | BC_HEXL, /* 35 - '5' */
+    BC_OCTL | BC_DCML | BC_HEXL, /* 36 - '6' */
+    BC_OCTL | BC_DCML | BC_HEXL, /* 37 - '7' */
     BC_DCML | BC_HEXL,
     BC_DCML | BC_HEXL, /* '8' '9' */
     BC_RGLR,
@@ -181,4 +182,4 @@ const uint8_t byte_classification[] = {
     BC_RGLR, BC_RGLR, BC_RGLR, BC_RGLR, /* F8 - FF */
 };
 
-const uint8_t *blcass = &byte_classification[0];
+const uint8_t *bclass = &byte_classification[0];
diff --git a/src/byte_class.h b/src/byte_class.h
index 011acda..0ccfbdf 100644
--- a/src/byte_class.h
+++ b/src/byte_class.h
@@ -1,8 +1,10 @@
 #define BC_RGLR 0 /* regular character */
 #define BC_WSPC 1 /* character is whitespace */
 #define BC_EOLM (1<<1) /* character signifies end of line */
-#define BC_DCML (1<<2) /* character is a decimal */
-#define BC_HEXL (1<<3) /* character is a hexadecimal */
-#define BC_DELM (1<<4) /* character is a delimiter */
+#define BC_OCTL (1<<2) /* character is octal */
+#define BC_DCML (1<<3) /* character is decimal */
+#define BC_HEXL (1<<4) /* character is hexadecimal */
+#define BC_DELM (1<<5) /* character is a delimiter */
+#define BC_CMNT (1<<6) /* character is a comment */
 
 const uint8_t *bclass;
diff --git a/src/xref.c b/src/xref.c
index afb223f..173eb9e 100644
--- a/src/xref.c
+++ b/src/xref.c
@@ -9,6 +9,14 @@
 
 #define SLEN(x) (sizeof((x)) - 1)
 
+typedef enum {
+    NSPDFERROR_OK,
+    NSPDFERROR_NOMEM,
+    NSPDFERROR_SYNTAX, /* syntax error in parse */
+    NSPDFERROR_SIZE, /* not enough input data */
+    NSPDFERROR_RANGE, /* value outside type range */
+} nspdferror;
+
 enum cos_type {
     COS_TYPE_NULL,
     COS_TYPE_BOOL,
@@ -45,6 +53,11 @@ struct cos_array_entry {
     struct cos_object *value;
 };
 
+struct cos_string {
+    uint8_t *data;
+    size_t length;
+    size_t alloc;
+};
 
 struct cos_reference {
     /** id of indirect object */
@@ -70,7 +83,7 @@ struct cos_object {
         char *n;
 
         /** string */
-        char *s;
+        struct cos_string *s;
 
         /** stream data */
         uint8_t *stream;
@@ -170,14 +183,24 @@ read_whole_pdf(struct pdf_doc *doc, const char *fname)
 /* byte data acessory, allows for more complex buffer handling in future */
 #define DOC_BYTE(doc, offset) (doc->start[(offset)])
 
-/* find next non whitespace byte */
+/**
+ * move offset to next non whitespace byte
+ */
 static int doc_skip_ws(struct pdf_doc *doc, uint64_t *offset)
 {
     uint8_t c;
-
+    /* TODO sort out keeping offset in range */
     c = DOC_BYTE(doc, *offset);
-    while ((bclass[c] & BC_WSPC) != 0) {
+    while ((bclass[c] & (BC_WSPC | BC_CMNT) ) != 0) {
         (*offset)++;
+        /* skip comments */
+        if ((bclass[c] & BC_CMNT) != 0) {
+            c = DOC_BYTE(doc, *offset);
+            while ((bclass[c] & BC_EOLM ) == 0) {
+                (*offset)++;
+                c = DOC_BYTE(doc, *offset);
+            }
+        }
         c = DOC_BYTE(doc, *offset);
     }
     return 0;
@@ -186,7 +209,7 @@ static int doc_skip_ws(struct pdf_doc *doc, uint64_t *offset)
 static int doc_read_uint(struct pdf_doc *doc, uint64_t *offset_out, uint64_t *result_out)
 {
     uint8_t c; /* current byte from source data */
-    int len; /* number of decimal places in number */
+    unsigned int len; /* number of decimal places in number */
     uint8_t num[21]; /* temporary buffer for decimal values */
     uint64_t offset; /* current offset of source data */
     uint64_t result=0; /* parsed result */
@@ -299,9 +322,10 @@ int cos_indirect_object_add(struct pdf_doc *doc,
     return 0;
 }
 
-int cos_free_object(struct cos_object *cos_obj)
+nspdferror cos_free_object(struct cos_object *cos_obj)
 {
     struct cos_dictionary_entry *dentry;
+    struct cos_array_entry *aentry;
 
     switch (cos_obj->type) {
     case COS_TYPE_NAME:
@@ -326,6 +350,18 @@ int cos_free_object(struct cos_object *cos_obj)
         }
         break;
 
+    case COS_TYPE_ARRAY:
+        aentry = cos_obj->u.array;
+        while (aentry != NULL) {
+            struct cos_array_entry *oaentry;
+
+            cos_free_object(aentry->value);
+
+            oaentry = aentry;
+            aentry = aentry->next;
+            free(oaentry);
+        }
+
     case COS_TYPE_STREAM:
         free(cos_obj->u.stream);
         break;
@@ -333,7 +369,7 @@ int cos_free_object(struct cos_object *cos_obj)
     }
     free(cos_obj);
 
-    return 0;
+    return NSPDFERROR_OK;
 }
 
 int cos_decode_number(struct pdf_doc *doc,
@@ -342,7 +378,7 @@ int cos_decode_number(struct pdf_doc *doc,
 {
     struct cos_object *cosobj;
     uint8_t c; /* current byte from source data */
-    int len; /* number of decimal places in number */
+    unsigned int len; /* number of decimal places in number */
     uint8_t num[21]; /* temporary buffer for decimal values */
     uint64_t offset; /* current offset of source data */
 
@@ -384,27 +420,170 @@ int cos_decode_number(struct pdf_doc *doc,
     return -1; /* number too long */
 }
 
-int cos_decode_string(struct pdf_doc *doc,
-                      uint64_t *offset_out,
-                      struct cos_object **cosobj_out)
+#define COS_STRING_ALLOC 32
+
+nspdferror
+cos_string_append(struct cos_string *s, uint8_t c)
 {
-    return -1;
+    //printf("appending 0x%x to %p len %d alloc %d\n", c, s->data, s->length, s->alloc);
+    if (s->length == s->alloc) {
+        uint8_t *ns;
+        ns = realloc(s->data, s->alloc + COS_STRING_ALLOC);
+        if (ns == NULL) {
+            return NSPDFERROR_NOMEM;
+        }
+        s->data = ns;
+        s->alloc += COS_STRING_ALLOC;
+    }
+    s->data[s->length++] = c;
+    return NSPDFERROR_OK;
+}
+
+/**
+ * literal string processing
+ *
+ */
+nspdferror
+cos_decode_string(struct pdf_doc *doc,
+                  uint64_t *offset_out,
+                  struct cos_object **cosobj_out)
+{
+    uint64_t offset;
+    struct cos_object *cosobj;
+    uint8_t c;
+    unsigned int pdepth = 1; /* depth of open parens */
+    struct cos_string *cstring;
+
+    offset = *offset_out;
+
+    c = DOC_BYTE(doc, offset++);
+    if (c != '(') {
+        return NSPDFERROR_SYNTAX;
+    }
+
+    cstring = calloc(1, sizeof(*cstring));
+    if (cstring == NULL) {
+        return NSPDFERROR_NOMEM;
+    }
+
+    cosobj = calloc(1, sizeof(*cosobj));
+    if (cosobj == NULL) {
+        return NSPDFERROR_NOMEM;
+    }
+    cosobj->type = COS_TYPE_STRING;
+    cosobj->u.s = cstring;
+
+    while (pdepth > 0) {
+        c = DOC_BYTE(doc, offset++);
+
+        if (c == ')') {
+            pdepth--;
+            if (pdepth == 0) {
+                break;
+            }
+        } else if (c == '(') {
+            pdepth++;
+        } else if ((bclass[c] & BC_EOLM ) != 0) {
+            /* unescaped end of line characters are translated to a single
+             * newline
+             */
+            c = DOC_BYTE(doc, offset);
+            while ((bclass[c] & BC_EOLM) != 0) {
+                offset++;
+                c = DOC_BYTE(doc, offset);
+            }
+            c = '\n';
+        } else if (c == '\\') {
+            /* escaped chars */
+            c = DOC_BYTE(doc, offset++);
+            switch (c) {
+            case 'n':
+                c = '\n';
+                break;
+
+            case 'r':
+                c = '\r';
+                break;
+
+            case 't':
+                c = '\t';
+                break;
+
+            case 'b':
+                c = '\b';
+                break;
+
+            case 'f':
+                c = '\f';
+                break;
+
+            case '(':
+                c = '(';
+                break;
+
+            case ')':
+                c = ')';
+                break;
+
+            case '\\':
+                c = '\\';
+                break;
+
+            default:
+
+                if ((bclass[c] & BC_EOLM) != 0) {
+                    /* escaped end of line, swallow it */
+                    c = DOC_BYTE(doc, offset++);
+                    while ((bclass[c] & BC_EOLM) != 0) {
+                        c = DOC_BYTE(doc, offset++);
+                    }
+                } else if ((bclass[c] & BC_OCTL) != 0) {
+                    /* octal value */
+                    uint8_t val;
+                    val = (c - '0');
+                    c = DOC_BYTE(doc, offset);
+                    if ((bclass[c] & BC_OCTL) != 0) {
+                        offset++;
+                        val = (val << 3) | (c - '0');
+                        c = DOC_BYTE(doc, offset);
+                        if ((bclass[c] & BC_OCTL) != 0) {
+                            offset++;
+                            val = (val << 3) | (c - '0');
+                            c = val;
+                        }
+                    }
+                } /* else invalid (skip backslash) */
+                break;
+            }
+        }
+
+        /* c contains the character to add to the string */
+        cos_string_append(cstring, c);
+    }
+
+    doc_skip_ws(doc, &offset);
+
+    *cosobj_out = cosobj;
+    *offset_out = offset;
+
+    return NSPDFERROR_OK;
 }
 
-int cos_decode_hex_string(struct pdf_doc *doc,
+nspdferror
+cos_decode_hex_string(struct pdf_doc *doc,
                       uint64_t *offset_out,
                       struct cos_object **cosobj_out)
 {
     uint64_t offset;
-    struct cos_object *cosobj;
+    //    struct cos_object *cosobj;
     uint8_t c;
-    uint8_t byte;
+    //    uint8_t byte;
 
     offset = *offset_out;
 
     c = DOC_BYTE(doc, offset++);
     if (c != '<') {
-        return -1; /* syntax error */
+        return NSPDFERROR_SYNTAX;
     }
     doc_skip_ws(doc, &offset);
 
@@ -461,8 +640,11 @@ int cos_decode_dictionary(struct pdf_doc *doc,
         }
         printf("key: %s\n", key->u.n);
 
+        printf("%c\n", DOC_BYTE(doc, offset));
+
         res = cos_decode_object(doc, &offset, &value);
         if (res != 0) {
+            printf("Unable to decode value object in dictionary\n");
             /* todo free up any dictionary entries already created */
             return res;
         }
@@ -490,52 +672,61 @@ int cos_decode_dictionary(struct pdf_doc *doc,
     return 0;
 }
 
-int cos_decode_list(struct pdf_doc *doc,
-                      uint64_t *offset_out,
-                      struct cos_object **cosobj_out)
+
+nspdferror
+cos_decode_list(struct pdf_doc *doc,
+                uint64_t *offset_out,
+                struct cos_object **cosobj_out)
 {
     uint64_t offset;
     struct cos_object *cosobj;
     struct cos_array_entry *entry;
     struct cos_object *value;
-    int res;
+    nspdferror res;
 
     offset = *offset_out;
 
+    /* sanity check first token is list open */
     if (DOC_BYTE(doc, offset) != '[') {
-        return -1; /* syntax error */
+        printf("not a [\n");
+        return NSPDFERROR_SYNTAX; /* syntax error */
     }
     offset++;
-    doc_skip_ws(doc, &offset);
+
+    /* advance offset to next token */
+    res = doc_skip_ws(doc, &offset);
+    if (res != NSPDFERROR_OK) {
+        return res;
+    }
 
     printf("found a list\n");
 
     cosobj = calloc(1, sizeof(struct cos_object));
     if (cosobj == NULL) {
-        return -1; /* memory error */
+        return NSPDFERROR_NOMEM;
     }
     cosobj->type = COS_TYPE_ARRAY;
 
     while (DOC_BYTE(doc, offset) != ']') {
 
         res = cos_decode_object(doc, &offset, &value);
-        if (res != 0) {
-            /* todo free up any array entries already created */
+        if (res != NSPDFERROR_OK) {
+            cos_free_object(cosobj);
+            printf("Unable to decode value object in list\n");
             return res;
         }
 
-        /* add array entry */
+        /* add entry to array */
         entry = calloc(1, sizeof(struct cos_array_entry));
         if (entry == NULL) {
-            /* todo free up any array entries already created */
-            return -1; /* memory error */
+            cos_free_object(cosobj);
+            return NSPDFERROR_NOMEM;
         }
 
         entry->value = value;
         entry->next = cosobj->u.array;
 
         cosobj->u.array = entry;
-
     }
     offset++; /* skip closing ] */
     doc_skip_ws(doc, &offset);
author	Vincent Sanders <vince@kyllikki.org>	2017-12-19 00:16:17 +0000
committer	Vincent Sanders <vince@kyllikki.org>	2017-12-19 00:16:17 +0000
commit	d26bc9f19191e5dd7d233302f73f226c89cb797f (patch)
tree	cefc93be2ea2dcc5b07073e70ebedd82a96aced0
parent	5de69a618c7858f997e9944c06837d951fc129aa (diff)
download	libnspdf-d26bc9f19191e5dd7d233302f73f226c89cb797f.tar.gz libnspdf-d26bc9f19191e5dd7d233302f73f226c89cb797f.tar.bz2