summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn Mark Bell <jmb@netsurf-browser.org>2008-11-16 15:18:41 +0000
committerJohn Mark Bell <jmb@netsurf-browser.org>2008-11-16 15:18:41 +0000
commit647121e093ff1f17007bd428e817a4d707c2a1bc (patch)
tree2e489d881018b72ffaa90f77396d069ca77fa86b
parentd322d42380a2008f1373525c306d99467aff87e7 (diff)
downloadiconv-647121e093ff1f17007bd428e817a4d707c2a1bc.tar.gz
iconv-647121e093ff1f17007bd428e817a4d707c2a1bc.tar.bz2
Ensure that we return the correct errors and, when we do, point to the correct place in the input sequence (namely the start of the erroneous sequence).
Unfortunately, UnicodeLib reads past the erroneous sequence so we previously returned a pointer to the middle/end of the sequence rather than the start. The only way I could think of doing this was to perform the conversion twice -- counting the number of successfully processed characters first, then to convert that number of characters again. We then play spot-the-difference with the output parameters to determine the correct return value. As encodings provided by UnicodeLib may be stateful, we need to be able to save the current state of the codecs and then restore them when we attempt the second conversion (otherwise, state-based encodings will do entirely the wrong thing). There's no API in UnicodeLib to do this and, given it's such an unlikely use case, I've not added any. Instead, we get to poke around in the UnicodeLib internals and do the save/restore ourselves. svn path=/trunk/iconv/; revision=5699
-rw-r--r--src/eightbit.c19
-rw-r--r--src/iconv.c195
-rw-r--r--src/internal.h5
3 files changed, 180 insertions, 39 deletions
diff --git a/src/eightbit.c b/src/eightbit.c
index c8ebcba..ae81735 100644
--- a/src/eightbit.c
+++ b/src/eightbit.c
@@ -133,23 +133,34 @@ unsigned iconv_eightbit_read(struct encoding_context *e,
if (c < 0x80) {
/* ASCII */
- if (callback(handle, c))
+ if (callback(handle, c)) {
+ /* Used character, so update pos */
+ pos++;
break;
+ }
}
else if (c < 0x100 && e->intab) {
LOG(("maps to: %x", e->intab[c - 0x80]));
/* Look up in mapping table */
if (e->intab[c - 0x80] != 0xffff) {
- if (callback(handle, e->intab[c - 0x80]))
+ if (callback(handle, e->intab[c - 0x80])) {
+ pos++;
break;
+ }
}
else {
/* character not defined in this encoding */
- return pos;
+ if (callback(handle, 0xfffd)) {
+ pos++;
+ break;
+ }
}
} else {
/* character not defined in this encoding */
- break;
+ if (callback(handle, 0xfffd)) {
+ pos++;
+ break;
+ }
}
}
diff --git a/src/iconv.c b/src/iconv.c
index 7ea9f4c..6cdfbb8 100644
--- a/src/iconv.c
+++ b/src/iconv.c
@@ -1,6 +1,7 @@
/* iconv implementation - see iconv.h for docs */
#include <errno.h>
+#include <limits.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
@@ -9,6 +10,10 @@
#include <unicode/charsets.h>
#include <unicode/encoding.h>
+/* Hacktastic */
+#define DEBUG 0
+#include <unicode/encpriv.h>
+#undef DEBUG
#include <iconv/iconv.h>
@@ -239,6 +244,34 @@ iconv_t iconv_open(const char *tocode, const char *fromcode)
return (iconv_t)(-1);
}
+ if (e->in) {
+ e->in_save = calloc(1, sizeof(EncodingPriv) +
+ ((EncodingPriv *) e->in)->ws_size);
+ if (!e->in_save) {
+ if (e->out)
+ encoding_delete(e->out);
+ encoding_delete(e->in);
+ iconv_eightbit_delete(e);
+ free(e);
+ errno = ENOMEM;
+ return (iconv_t)(-1);
+ }
+ }
+
+ if (e->out) {
+ e->out_save = calloc(1, sizeof(EncodingPriv) +
+ ((EncodingPriv *) e->out)->ws_size);
+ if (!e->out_save) {
+ encoding_delete(e->out);
+ if (e->in)
+ encoding_delete(e->in);
+ iconv_eightbit_delete(e);
+ free(e);
+ errno = ENOMEM;
+ return (iconv_t)(-1);
+ }
+ }
+
/* add to list */
e->prev = 0;
e->next = context_list;
@@ -253,7 +286,10 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
size_t *outbytesleft)
{
struct encoding_context *e;
- unsigned read;
+ unsigned int read, read2;
+ char *orig_outbuf;
+ size_t orig_outbytesleft;
+ int write_state;
/* search for cd in list */
for (e = context_list; e; e = e->next)
@@ -311,9 +347,41 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
return (size_t)(-1);
}
+ /* This is plain ugly. To be able to detect when each type of
+ * conversion error has occurred and maintain the correct pointer
+ * into the input on error, we have to attempt to perform the
+ * conversion then try it again and play spot the difference in
+ * return values. As some encodings are stateful, we also need to
+ * be able to preserve the current state of encoding contexts. This
+ * requires knowledge of UnicodeLib's internal data structures. To
+ * save pain later, I'm assuming that UnicodeLib's encpriv.h is
+ * available at compile time. The cleaner approach of adding API to
+ * UnicodeLib seems pointless, as I can envisage no other use case
+ * than API munging for wanting to save/restore the state of codec
+ * instances.
+ */
+
+ orig_outbuf = *outbuf;
+ orig_outbytesleft = *outbytesleft;
+
e->outbuf = outbuf;
e->outbytesleft = outbytesleft;
+ /* Try to convert all the input */
+ e->req_chars = INT_MAX;
+ e->chars_processed = 0;
+ e->write_state = WRITE_SUCCESS;
+
+ /* Save codec states */
+ if (e->in) {
+ memcpy(e->in_save, e->in, sizeof(EncodingPriv) +
+ ((EncodingPriv *) e->in)->ws_size);
+ }
+ if (e->out) {
+ memcpy(e->out_save, e->out, sizeof(EncodingPriv) +
+ ((EncodingPriv *) e->out)->ws_size);
+ }
+
LOG(("reading"));
if (e->in)
@@ -323,40 +391,74 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
read = iconv_eightbit_read(e, character_callback, *inbuf,
*inbytesleft, e);
+ /* Record write state of first attempt (determines most errors) */
+ write_state = e->write_state;
+
+ /* Reset the output buffer pointer/length */
+ *outbuf = orig_outbuf;
+ *outbytesleft = orig_outbytesleft;
+
+ /* Shortcut failure to process first character of input */
+ if (e->chars_processed == 0) {
+ errno = write_state == WRITE_SUCCESS
+ ? EINVAL
+ : write_state == WRITE_FAILED ? EILSEQ : E2BIG;
+ return (size_t) -1;
+ }
+
+ /* Now require the number of chars processed */
+ e->req_chars = e->chars_processed;
+ e->chars_processed = 0;
+ e->write_state = WRITE_SUCCESS;
+
+ /* Restore codec states */
+ if (e->in) {
+ memcpy(e->in, e->in_save, sizeof(EncodingPriv) +
+ ((EncodingPriv *) e->in)->ws_size);
+ }
+ if (e->out) {
+ memcpy(e->out, e->out_save, sizeof(EncodingPriv) +
+ ((EncodingPriv *) e->out)->ws_size);
+ }
+
+ /* And try again */
+ if (e->in)
+ read2 = encoding_read(e->in, character_callback, *inbuf,
+ *inbytesleft, e);
+ else
+ read2 = iconv_eightbit_read(e, character_callback, *inbuf,
+ *inbytesleft, e);
+
LOG(("done"));
LOG(("read: %d, ibl: %zd, obl: %zd",
- read, *inbytesleft, *outbytesleft));
+ read2, *inbytesleft, *outbytesleft));
- /* 2 */
- if (read == *inbytesleft) {
- *inbuf += read;
- *inbytesleft = 0;
- return 0;
+ /* 2 or 3 */
+ if (write_state == WRITE_SUCCESS) {
+ *inbuf += read2;
+ *inbytesleft -= read2;
+
+ if (*inbytesleft > 0) {
+ errno = EINVAL;
+ } else {
+ return 0;
+ }
}
/* 4 */
- else if ((int)*outbytesleft < 0) {
+ else if (write_state == WRITE_NOMEM) {
LOG(("e2big"));
- *outbytesleft = 0;
- *inbuf += read - 1;
- *inbytesleft -= read - 1;
+ *inbuf += read2;
+ *inbytesleft -= read2;
errno = E2BIG;
}
- /** \todo find a mechanism for distinguishing between 1 & 3 */
/* 1 */
- else if (read != *inbytesleft) {
- *inbuf += read;
- *inbytesleft -= read;
+ else if (write_state == WRITE_FAILED) {
+ *inbuf += read2;
+ *inbytesleft -= read2;
LOG(("eilseq"));
errno = EILSEQ;
}
- /* 3 */
- else if ((int)*outbytesleft >= 0) {
- *inbuf += read;
- *inbytesleft -= read;
- LOG(("einval"));
- errno = EINVAL;
- }
LOG(("errno: %d", errno));
@@ -376,10 +478,14 @@ int iconv_close(iconv_t cd)
if (!e)
return 0;
- if (e->in)
+ if (e->in) {
encoding_delete(e->in);
- if (e->out)
+ free(e->in_save);
+ }
+ if (e->out) {
encoding_delete(e->out);
+ free(e->out_save);
+ }
iconv_eightbit_delete(e);
/* remove from list */
@@ -409,8 +515,10 @@ int character_callback(void *handle, UCS4 c)
/* Stop on invalid characters if we're not transliterating */
/** \todo is this sane? -- we can't distinguish between illegal input
* or valid input which just happens to correspond with U+fffd. */
- if (c == 0xFFFD && !e->transliterate)
+ if (c == 0xFFFD && !e->transliterate) {
+ e->write_state = WRITE_FAILED;
return 1;
+ }
LOG(("outbuf: %p, free: %zd", *e->outbuf, *e->outbytesleft));
LOG(("writing: %d", c));
@@ -440,6 +548,9 @@ int character_callback(void *handle, UCS4 c)
(int*)e->outbytesleft);
}
+ e->write_state = ret == -1 ? WRITE_FAILED
+ : ret == 0 ? WRITE_NOMEM : WRITE_SUCCESS;
+
if (ret == -1) {
/* Transliterate, if we've been asked to.
* Assumes that output is 8bit/8bit multibyte with ASCII G0.
@@ -448,9 +559,10 @@ int character_callback(void *handle, UCS4 c)
* Also, afaiaa, all supported multibyte encodings are ASCII
* compatible. */
/** \todo Actually perform some kind of transliteration */
- if (e->transliterate && (int)*e->outbytesleft > 0) {
- if (e->out) {
- /* Reset encoding write state */
+ if (e->transliterate) {
+ if ((int)*e->outbytesleft > 0) {
+ if (e->out) {
+ /* Flush through any pending shift sequences */
/** \todo this is a bit dodgy, as we only
* really need to ensure that the ASCII set
* is mapped into G0 in ISO2022 encodings.
@@ -459,23 +571,36 @@ int character_callback(void *handle, UCS4 c)
* perform some dirty hackery which relies
* upon knowledge of UnicodeLib's internals
*/
- encoding_write(e->out, NULL_UCS4, e->outbuf,
+ encoding_write(e->out, NULL_UCS4,
+ e->outbuf,
(int*)e->outbytesleft);
- }
+ }
- if ((int)*e->outbytesleft > 0) {
- *(*e->outbuf)++ = '?';
- --*e->outbytesleft;
+ if ((int)*e->outbytesleft > 0) {
+ *(*e->outbuf)++ = '?';
+ --*e->outbytesleft;
+
+ e->write_state = WRITE_SUCCESS;
- ret = 1;
+ ret = 1;
+ } else {
+ e->write_state = WRITE_NOMEM;
+ ret = 0;
+ }
} else {
+ e->write_state = WRITE_NOMEM;
ret = 0;
}
} else {
- ret = 1;
+ e->write_state = WRITE_FAILED;
+ ret = 0;
}
}
+ if (e->write_state == WRITE_SUCCESS &&
+ ++e->chars_processed == e->req_chars)
+ ret = 0;
+
return (!ret);
}
diff --git a/src/internal.h b/src/internal.h
index 929d906..ce415ca 100644
--- a/src/internal.h
+++ b/src/internal.h
@@ -16,13 +16,18 @@
struct encoding_context {
Encoding *in;
+ void *in_save;
unsigned int inflags;
Encoding *out;
+ void *out_save;
unsigned int outflags;
unsigned short *intab, *outtab;
char **outbuf;
size_t *outbytesleft;
char transliterate;
+ enum { WRITE_SUCCESS, WRITE_FAILED, WRITE_NOMEM } write_state;
+ int chars_processed;
+ int req_chars;
struct encoding_context *prev, *next;
};