diff options
-rw-r--r-- | module/module.c | 14 | ||||
-rw-r--r-- | src/iconv.c | 193 | ||||
-rw-r--r-- | src/internal.h | 11 | ||||
-rw-r--r-- | test/GNU/ISO-2022-JP-2-snippet | 2 | ||||
-rw-r--r-- | test/GNU/ISO-2022-JP-2-snippet.UTF-8 | 2 |
5 files changed, 63 insertions, 159 deletions
diff --git a/module/module.c b/module/module.c index 2ef2326..0631551 100644 --- a/module/module.c +++ b/module/module.c @@ -201,7 +201,7 @@ _kernel_oserror *do_iconv(int argc, const char *args) { char from[64] = "", to[64] = ""; char *f, *t; - bool list = false; + bool list = false, verbose = false; char out[4096] = ""; char *o; const char *p = args; @@ -273,9 +273,13 @@ _kernel_oserror *do_iconv(int argc, const char *args) p++; argc--; break; + case 'v': + verbose = true; + p += 2; + argc--; + break; case 'c': case 's': - case 'v': default: snprintf(ErrorGeneric.errmess, sizeof(ErrorGeneric.errmess), @@ -358,7 +362,11 @@ _kernel_oserror *do_iconv(int argc, const char *args) fclose(inf); /* Convert text */ - iconv(cd, &in, &inlen, &out, &outlen); + size_t read = iconv(cd, &in, &inlen, &out, &outlen); + if (verbose && read == (size_t) -1) { + fprintf(stderr, "Conversion failed: %s\n", + strerror(errno)); + } fwrite(output, 1, input_length * 4 - outlen, ofp); diff --git a/src/iconv.c b/src/iconv.c index 6cdfbb8..817822c 100644 --- a/src/iconv.c +++ b/src/iconv.c @@ -10,10 +10,6 @@ #include <unicode/charsets.h> #include <unicode/encoding.h> -/* Hacktastic */ -#define DEBUG 0 -#include <unicode/encpriv.h> -#undef DEBUG #include <iconv/iconv.h> @@ -244,34 +240,6 @@ iconv_t iconv_open(const char *tocode, const char *fromcode) return (iconv_t)(-1); } - if (e->in) { - e->in_save = calloc(1, sizeof(EncodingPriv) + - ((EncodingPriv *) e->in)->ws_size); - if (!e->in_save) { - if (e->out) - encoding_delete(e->out); - encoding_delete(e->in); - iconv_eightbit_delete(e); - free(e); - errno = ENOMEM; - return (iconv_t)(-1); - } - } - - if (e->out) { - e->out_save = calloc(1, sizeof(EncodingPriv) + - ((EncodingPriv *) e->out)->ws_size); - if (!e->out_save) { - encoding_delete(e->out); - if (e->in) - encoding_delete(e->in); - iconv_eightbit_delete(e); - free(e); - errno = ENOMEM; - return (iconv_t)(-1); - } - } - /* add to list */ e->prev = 0; e->next = context_list; @@ -286,10 +254,7 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, size_t *outbytesleft) { struct encoding_context *e; - unsigned int read, read2; - char *orig_outbuf; - size_t orig_outbytesleft; - int write_state; + unsigned int read; /* search for cd in list */ for (e = context_list; e; e = e->next) @@ -347,117 +312,59 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, return (size_t)(-1); } - /* This is plain ugly. To be able to detect when each type of - * conversion error has occurred and maintain the correct pointer - * into the input on error, we have to attempt to perform the - * conversion then try it again and play spot the difference in - * return values. As some encodings are stateful, we also need to - * be able to preserve the current state of encoding contexts. This - * requires knowledge of UnicodeLib's internal data structures. To - * save pain later, I'm assuming that UnicodeLib's encpriv.h is - * available at compile time. The cleaner approach of adding API to - * UnicodeLib seems pointless, as I can envisage no other use case - * than API munging for wanting to save/restore the state of codec - * instances. - */ - - orig_outbuf = *outbuf; - orig_outbytesleft = *outbytesleft; - e->outbuf = outbuf; e->outbytesleft = outbytesleft; - /* Try to convert all the input */ - e->req_chars = INT_MAX; - e->chars_processed = 0; - e->write_state = WRITE_SUCCESS; - - /* Save codec states */ - if (e->in) { - memcpy(e->in_save, e->in, sizeof(EncodingPriv) + - ((EncodingPriv *) e->in)->ws_size); - } - if (e->out) { - memcpy(e->out_save, e->out, sizeof(EncodingPriv) + - ((EncodingPriv *) e->out)->ws_size); - } - LOG(("reading")); - if (e->in) - read = encoding_read(e->in, character_callback, *inbuf, - *inbytesleft, e); - else - read = iconv_eightbit_read(e, character_callback, *inbuf, - *inbytesleft, e); - - /* Record write state of first attempt (determines most errors) */ - write_state = e->write_state; - - /* Reset the output buffer pointer/length */ - *outbuf = orig_outbuf; - *outbytesleft = orig_outbytesleft; - - /* Shortcut failure to process first character of input */ - if (e->chars_processed == 0) { - errno = write_state == WRITE_SUCCESS - ? EINVAL - : write_state == WRITE_FAILED ? EILSEQ : E2BIG; - return (size_t) -1; - } + /* Perform the conversion. + * + * To ensure that we detect the correct error conditions + * and point to the _start_ of erroneous input on error, we + * have to convert each character independently. Then we + * inspect for errors and only continue if there were none. + */ + while (*inbytesleft > 0) { + /* Clear current write state */ + e->write_state = WRITE_NONE; - /* Now require the number of chars processed */ - e->req_chars = e->chars_processed; - e->chars_processed = 0; - e->write_state = WRITE_SUCCESS; + if (e->in) + read = encoding_read(e->in, character_callback, *inbuf, + *inbytesleft, e); + else + read = iconv_eightbit_read(e, character_callback, + *inbuf, *inbytesleft, e); + + /* Stop on error */ + if (e->write_state != WRITE_SUCCESS) + break; - /* Restore codec states */ - if (e->in) { - memcpy(e->in, e->in_save, sizeof(EncodingPriv) + - ((EncodingPriv *) e->in)->ws_size); - } - if (e->out) { - memcpy(e->out, e->out_save, sizeof(EncodingPriv) + - ((EncodingPriv *) e->out)->ws_size); + /* Advance input */ + *inbuf += read; + *inbytesleft -= read; } - /* And try again */ - if (e->in) - read2 = encoding_read(e->in, character_callback, *inbuf, - *inbytesleft, e); - else - read2 = iconv_eightbit_read(e, character_callback, *inbuf, - *inbytesleft, e); - LOG(("done")); LOG(("read: %d, ibl: %zd, obl: %zd", - read2, *inbytesleft, *outbytesleft)); - - /* 2 or 3 */ - if (write_state == WRITE_SUCCESS) { - *inbuf += read2; - *inbytesleft -= read2; - - if (*inbytesleft > 0) { - errno = EINVAL; - } else { - return 0; - } - } - /* 4 */ - else if (write_state == WRITE_NOMEM) { - LOG(("e2big")); - *inbuf += read2; - *inbytesleft -= read2; + read, *inbytesleft, *outbytesleft)); + + /* Determine correct return value/error code */ + switch (e->write_state) { + case WRITE_SUCCESS: /* 2 */ + /** \todo We really should calculate the correct number of + * irreversible conversions that have been performed. For now, + * assume everything's reversible. */ + return 0; + case WRITE_NONE: /* 3 */ + errno = EINVAL; + break; + case WRITE_NOMEM: /* 4 */ errno = E2BIG; - } - /* 1 */ - else if (write_state == WRITE_FAILED) { - *inbuf += read2; - *inbytesleft -= read2; - LOG(("eilseq")); + break; + case WRITE_FAILED: /* 1 */ errno = EILSEQ; + break; } LOG(("errno: %d", errno)); @@ -478,14 +385,10 @@ int iconv_close(iconv_t cd) if (!e) return 0; - if (e->in) { + if (e->in) encoding_delete(e->in); - free(e->in_save); - } - if (e->out) { + if (e->out) encoding_delete(e->out); - free(e->out_save); - } iconv_eightbit_delete(e); /* remove from list */ @@ -581,27 +484,19 @@ int character_callback(void *handle, UCS4 c) --*e->outbytesleft; e->write_state = WRITE_SUCCESS; - - ret = 1; } else { e->write_state = WRITE_NOMEM; - ret = 0; } } else { e->write_state = WRITE_NOMEM; - ret = 0; } } else { e->write_state = WRITE_FAILED; - ret = 0; } } - if (e->write_state == WRITE_SUCCESS && - ++e->chars_processed == e->req_chars) - ret = 0; - - return (!ret); + /* Always stop after processing each character */ + return 1; } void parse_parameters(struct encoding_context *e, const char *params, diff --git a/src/internal.h b/src/internal.h index ce415ca..9150efc 100644 --- a/src/internal.h +++ b/src/internal.h @@ -16,18 +16,19 @@ struct encoding_context { Encoding *in; - void *in_save; unsigned int inflags; Encoding *out; - void *out_save; unsigned int outflags; unsigned short *intab, *outtab; char **outbuf; size_t *outbytesleft; char transliterate; - enum { WRITE_SUCCESS, WRITE_FAILED, WRITE_NOMEM } write_state; - int chars_processed; - int req_chars; + enum { + WRITE_SUCCESS, + WRITE_FAILED, + WRITE_NOMEM, + WRITE_NONE + } write_state; struct encoding_context *prev, *next; }; diff --git a/test/GNU/ISO-2022-JP-2-snippet b/test/GNU/ISO-2022-JP-2-snippet index 3e297b8..40fae83 100644 --- a/test/GNU/ISO-2022-JP-2-snippet +++ b/test/GNU/ISO-2022-JP-2-snippet @@ -1,4 +1,4 @@ -Japanese ($BF|K\8l(B) $B$3$s$K$A$O(B, (I:]FAJ(B +Japanese ($BF|K\8l(B) $B$3$s$K$A$O(B JIS -- $B855$(B $B3+H/(B Just for a test of JISX0212: $BqV$(DiQ(B (the second character is of JISX0212) Chinese ($BCfJ8(B,$BIaDL$A;0(B,$A::So(B) $(D0_$B9%(B diff --git a/test/GNU/ISO-2022-JP-2-snippet.UTF-8 b/test/GNU/ISO-2022-JP-2-snippet.UTF-8 index 6c63925..99d453b 100644 --- a/test/GNU/ISO-2022-JP-2-snippet.UTF-8 +++ b/test/GNU/ISO-2022-JP-2-snippet.UTF-8 @@ -1,4 +1,4 @@ -Japanese (日本語) こんにちは, コンニチハ +Japanese (日本語) こんにちは JIS -- 元気 開発 Just for a test of JISX0212: 騏驎 (the second character is of JISX0212) Chinese (中文,普通话,汉语) 你好 |