diff options
author | Vincent Sanders <vince@kyllikki.org> | 2017-12-19 00:16:17 +0000 |
---|---|---|
committer | Vincent Sanders <vince@kyllikki.org> | 2017-12-19 00:16:17 +0000 |
commit | d26bc9f19191e5dd7d233302f73f226c89cb797f (patch) | |
tree | cefc93be2ea2dcc5b07073e70ebedd82a96aced0 | |
parent | 5de69a618c7858f997e9944c06837d951fc129aa (diff) | |
download | libnspdf-d26bc9f19191e5dd7d233302f73f226c89cb797f.tar.gz libnspdf-d26bc9f19191e5dd7d233302f73f226c89cb797f.tar.bz2 |
write parse of strings
-rw-r--r-- | src/Makefile | 2 | ||||
-rw-r--r-- | src/byte_class.c | 23 | ||||
-rw-r--r-- | src/byte_class.h | 8 | ||||
-rw-r--r-- | src/xref.c | 249 |
4 files changed, 238 insertions, 44 deletions
diff --git a/src/Makefile b/src/Makefile index ac8c347..f9ca22c 100644 --- a/src/Makefile +++ b/src/Makefile @@ -1,6 +1,6 @@ # -CFLAGS+=-g -Wall +CFLAGS+=-g -Wall -Wextra .PHONY:all clean diff --git a/src/byte_class.c b/src/byte_class.c index a8ab735..59c0206 100644 --- a/src/byte_class.c +++ b/src/byte_class.c @@ -11,6 +11,7 @@ * decimal - characters that appear in decimal values 0123456789 * hexidecimal - characters that appear in hex values 0123456789ABCDEF * delimiter - The characters used to separate tokens ()[]{}<>/% + * comment - the % character used to introduce a comment * whitespace - separate syntactic constructs like names and numbers treated * as a single character except in comments, strings and streams * end of line - characters that signify an end of line @@ -52,8 +53,8 @@ const uint8_t byte_classification[] = { BC_RGLR, BC_RGLR, BC_RGLR, /* 20 - 23 */ - BC_RGLR, - BC_DELM, /* '$' '%' */ + BC_RGLR, /* 24 - '$' */ + BC_DELM | BC_CMNT, /* 25 - '%' */ BC_RGLR, BC_RGLR, /* 26 - 27 */ BC_DELM, @@ -64,14 +65,14 @@ const uint8_t byte_classification[] = { BC_RGLR, /* 2C - 2D */ BC_RGLR, BC_DELM, /* '.' '/' */ - BC_DCML | BC_HEXL, - BC_DCML | BC_HEXL, /* '0' '1' */ - BC_DCML | BC_HEXL, - BC_DCML | BC_HEXL, /* '2' '3' */ - BC_DCML | BC_HEXL, - BC_DCML | BC_HEXL, /* '4' '5' */ - BC_DCML | BC_HEXL, - BC_DCML | BC_HEXL, /* '6' '7' */ + BC_OCTL | BC_DCML | BC_HEXL, /* 30 - '0' */ + BC_OCTL | BC_DCML | BC_HEXL, /* 31 - '1' */ + BC_OCTL | BC_DCML | BC_HEXL, /* 32 - '2' */ + BC_OCTL | BC_DCML | BC_HEXL, /* 33 - '3' */ + BC_OCTL | BC_DCML | BC_HEXL, /* 34 - '4' */ + BC_OCTL | BC_DCML | BC_HEXL, /* 35 - '5' */ + BC_OCTL | BC_DCML | BC_HEXL, /* 36 - '6' */ + BC_OCTL | BC_DCML | BC_HEXL, /* 37 - '7' */ BC_DCML | BC_HEXL, BC_DCML | BC_HEXL, /* '8' '9' */ BC_RGLR, @@ -181,4 +182,4 @@ const uint8_t byte_classification[] = { BC_RGLR, BC_RGLR, BC_RGLR, BC_RGLR, /* F8 - FF */ }; -const uint8_t *blcass = &byte_classification[0]; +const uint8_t *bclass = &byte_classification[0]; diff --git a/src/byte_class.h b/src/byte_class.h index 011acda..0ccfbdf 100644 --- a/src/byte_class.h +++ b/src/byte_class.h @@ -1,8 +1,10 @@ #define BC_RGLR 0 /* regular character */ #define BC_WSPC 1 /* character is whitespace */ #define BC_EOLM (1<<1) /* character signifies end of line */ -#define BC_DCML (1<<2) /* character is a decimal */ -#define BC_HEXL (1<<3) /* character is a hexadecimal */ -#define BC_DELM (1<<4) /* character is a delimiter */ +#define BC_OCTL (1<<2) /* character is octal */ +#define BC_DCML (1<<3) /* character is decimal */ +#define BC_HEXL (1<<4) /* character is hexadecimal */ +#define BC_DELM (1<<5) /* character is a delimiter */ +#define BC_CMNT (1<<6) /* character is a comment */ const uint8_t *bclass; @@ -9,6 +9,14 @@ #define SLEN(x) (sizeof((x)) - 1) +typedef enum { + NSPDFERROR_OK, + NSPDFERROR_NOMEM, + NSPDFERROR_SYNTAX, /* syntax error in parse */ + NSPDFERROR_SIZE, /* not enough input data */ + NSPDFERROR_RANGE, /* value outside type range */ +} nspdferror; + enum cos_type { COS_TYPE_NULL, COS_TYPE_BOOL, @@ -45,6 +53,11 @@ struct cos_array_entry { struct cos_object *value; }; +struct cos_string { + uint8_t *data; + size_t length; + size_t alloc; +}; struct cos_reference { /** id of indirect object */ @@ -70,7 +83,7 @@ struct cos_object { char *n; /** string */ - char *s; + struct cos_string *s; /** stream data */ uint8_t *stream; @@ -170,14 +183,24 @@ read_whole_pdf(struct pdf_doc *doc, const char *fname) /* byte data acessory, allows for more complex buffer handling in future */ #define DOC_BYTE(doc, offset) (doc->start[(offset)]) -/* find next non whitespace byte */ +/** + * move offset to next non whitespace byte + */ static int doc_skip_ws(struct pdf_doc *doc, uint64_t *offset) { uint8_t c; - + /* TODO sort out keeping offset in range */ c = DOC_BYTE(doc, *offset); - while ((bclass[c] & BC_WSPC) != 0) { + while ((bclass[c] & (BC_WSPC | BC_CMNT) ) != 0) { (*offset)++; + /* skip comments */ + if ((bclass[c] & BC_CMNT) != 0) { + c = DOC_BYTE(doc, *offset); + while ((bclass[c] & BC_EOLM ) == 0) { + (*offset)++; + c = DOC_BYTE(doc, *offset); + } + } c = DOC_BYTE(doc, *offset); } return 0; @@ -186,7 +209,7 @@ static int doc_skip_ws(struct pdf_doc *doc, uint64_t *offset) static int doc_read_uint(struct pdf_doc *doc, uint64_t *offset_out, uint64_t *result_out) { uint8_t c; /* current byte from source data */ - int len; /* number of decimal places in number */ + unsigned int len; /* number of decimal places in number */ uint8_t num[21]; /* temporary buffer for decimal values */ uint64_t offset; /* current offset of source data */ uint64_t result=0; /* parsed result */ @@ -299,9 +322,10 @@ int cos_indirect_object_add(struct pdf_doc *doc, return 0; } -int cos_free_object(struct cos_object *cos_obj) +nspdferror cos_free_object(struct cos_object *cos_obj) { struct cos_dictionary_entry *dentry; + struct cos_array_entry *aentry; switch (cos_obj->type) { case COS_TYPE_NAME: @@ -326,6 +350,18 @@ int cos_free_object(struct cos_object *cos_obj) } break; + case COS_TYPE_ARRAY: + aentry = cos_obj->u.array; + while (aentry != NULL) { + struct cos_array_entry *oaentry; + + cos_free_object(aentry->value); + + oaentry = aentry; + aentry = aentry->next; + free(oaentry); + } + case COS_TYPE_STREAM: free(cos_obj->u.stream); break; @@ -333,7 +369,7 @@ int cos_free_object(struct cos_object *cos_obj) } free(cos_obj); - return 0; + return NSPDFERROR_OK; } int cos_decode_number(struct pdf_doc *doc, @@ -342,7 +378,7 @@ int cos_decode_number(struct pdf_doc *doc, { struct cos_object *cosobj; uint8_t c; /* current byte from source data */ - int len; /* number of decimal places in number */ + unsigned int len; /* number of decimal places in number */ uint8_t num[21]; /* temporary buffer for decimal values */ uint64_t offset; /* current offset of source data */ @@ -384,27 +420,170 @@ int cos_decode_number(struct pdf_doc *doc, return -1; /* number too long */ } -int cos_decode_string(struct pdf_doc *doc, - uint64_t *offset_out, - struct cos_object **cosobj_out) +#define COS_STRING_ALLOC 32 + +nspdferror +cos_string_append(struct cos_string *s, uint8_t c) { - return -1; + //printf("appending 0x%x to %p len %d alloc %d\n", c, s->data, s->length, s->alloc); + if (s->length == s->alloc) { + uint8_t *ns; + ns = realloc(s->data, s->alloc + COS_STRING_ALLOC); + if (ns == NULL) { + return NSPDFERROR_NOMEM; + } + s->data = ns; + s->alloc += COS_STRING_ALLOC; + } + s->data[s->length++] = c; + return NSPDFERROR_OK; +} + +/** + * literal string processing + * + */ +nspdferror +cos_decode_string(struct pdf_doc *doc, + uint64_t *offset_out, + struct cos_object **cosobj_out) +{ + uint64_t offset; + struct cos_object *cosobj; + uint8_t c; + unsigned int pdepth = 1; /* depth of open parens */ + struct cos_string *cstring; + + offset = *offset_out; + + c = DOC_BYTE(doc, offset++); + if (c != '(') { + return NSPDFERROR_SYNTAX; + } + + cstring = calloc(1, sizeof(*cstring)); + if (cstring == NULL) { + return NSPDFERROR_NOMEM; + } + + cosobj = calloc(1, sizeof(*cosobj)); + if (cosobj == NULL) { + return NSPDFERROR_NOMEM; + } + cosobj->type = COS_TYPE_STRING; + cosobj->u.s = cstring; + + while (pdepth > 0) { + c = DOC_BYTE(doc, offset++); + + if (c == ')') { + pdepth--; + if (pdepth == 0) { + break; + } + } else if (c == '(') { + pdepth++; + } else if ((bclass[c] & BC_EOLM ) != 0) { + /* unescaped end of line characters are translated to a single + * newline + */ + c = DOC_BYTE(doc, offset); + while ((bclass[c] & BC_EOLM) != 0) { + offset++; + c = DOC_BYTE(doc, offset); + } + c = '\n'; + } else if (c == '\\') { + /* escaped chars */ + c = DOC_BYTE(doc, offset++); + switch (c) { + case 'n': + c = '\n'; + break; + + case 'r': + c = '\r'; + break; + + case 't': + c = '\t'; + break; + + case 'b': + c = '\b'; + break; + + case 'f': + c = '\f'; + break; + + case '(': + c = '('; + break; + + case ')': + c = ')'; + break; + + case '\\': + c = '\\'; + break; + + default: + + if ((bclass[c] & BC_EOLM) != 0) { + /* escaped end of line, swallow it */ + c = DOC_BYTE(doc, offset++); + while ((bclass[c] & BC_EOLM) != 0) { + c = DOC_BYTE(doc, offset++); + } + } else if ((bclass[c] & BC_OCTL) != 0) { + /* octal value */ + uint8_t val; + val = (c - '0'); + c = DOC_BYTE(doc, offset); + if ((bclass[c] & BC_OCTL) != 0) { + offset++; + val = (val << 3) | (c - '0'); + c = DOC_BYTE(doc, offset); + if ((bclass[c] & BC_OCTL) != 0) { + offset++; + val = (val << 3) | (c - '0'); + c = val; + } + } + } /* else invalid (skip backslash) */ + break; + } + } + + /* c contains the character to add to the string */ + cos_string_append(cstring, c); + } + + doc_skip_ws(doc, &offset); + + *cosobj_out = cosobj; + *offset_out = offset; + + return NSPDFERROR_OK; } -int cos_decode_hex_string(struct pdf_doc *doc, +nspdferror +cos_decode_hex_string(struct pdf_doc *doc, uint64_t *offset_out, struct cos_object **cosobj_out) { uint64_t offset; - struct cos_object *cosobj; + // struct cos_object *cosobj; uint8_t c; - uint8_t byte; + // uint8_t byte; offset = *offset_out; c = DOC_BYTE(doc, offset++); if (c != '<') { - return -1; /* syntax error */ + return NSPDFERROR_SYNTAX; } doc_skip_ws(doc, &offset); @@ -461,8 +640,11 @@ int cos_decode_dictionary(struct pdf_doc *doc, } printf("key: %s\n", key->u.n); + printf("%c\n", DOC_BYTE(doc, offset)); + res = cos_decode_object(doc, &offset, &value); if (res != 0) { + printf("Unable to decode value object in dictionary\n"); /* todo free up any dictionary entries already created */ return res; } @@ -490,52 +672,61 @@ int cos_decode_dictionary(struct pdf_doc *doc, return 0; } -int cos_decode_list(struct pdf_doc *doc, - uint64_t *offset_out, - struct cos_object **cosobj_out) + +nspdferror +cos_decode_list(struct pdf_doc *doc, + uint64_t *offset_out, + struct cos_object **cosobj_out) { uint64_t offset; struct cos_object *cosobj; struct cos_array_entry *entry; struct cos_object *value; - int res; + nspdferror res; offset = *offset_out; + /* sanity check first token is list open */ if (DOC_BYTE(doc, offset) != '[') { - return -1; /* syntax error */ + printf("not a [\n"); + return NSPDFERROR_SYNTAX; /* syntax error */ } offset++; - doc_skip_ws(doc, &offset); + + /* advance offset to next token */ + res = doc_skip_ws(doc, &offset); + if (res != NSPDFERROR_OK) { + return res; + } printf("found a list\n"); cosobj = calloc(1, sizeof(struct cos_object)); if (cosobj == NULL) { - return -1; /* memory error */ + return NSPDFERROR_NOMEM; } cosobj->type = COS_TYPE_ARRAY; while (DOC_BYTE(doc, offset) != ']') { res = cos_decode_object(doc, &offset, &value); - if (res != 0) { - /* todo free up any array entries already created */ + if (res != NSPDFERROR_OK) { + cos_free_object(cosobj); + printf("Unable to decode value object in list\n"); return res; } - /* add array entry */ + /* add entry to array */ entry = calloc(1, sizeof(struct cos_array_entry)); if (entry == NULL) { - /* todo free up any array entries already created */ - return -1; /* memory error */ + cos_free_object(cosobj); + return NSPDFERROR_NOMEM; } entry->value = value; entry->next = cosobj->u.array; cosobj->u.array = entry; - } offset++; /* skip closing ] */ doc_skip_ws(doc, &offset); |