From 9c8a4ff7e117ba052b2957c7e3f2e8751e8f8970 Mon Sep 17 00:00:00 2001 From: John-Mark Bell Date: Sun, 13 Jan 2013 02:05:46 +0000 Subject: Transliteration fixes: * Clear any substitution if codec reset has been requested. * Don't report memory exhaustion when failing to allocate space for the test conversion in translit_try_sequence: there's nothing the caller can do, so treat it as if the substitution cannot be converted to the target character set. * Correctly report success if we run out of input immediately following a flush of a substitution. Additional tests for transliteration. --- build/tools/gentranstab.pl | 16 +++++++++++--- src/iconv.c | 27 +++++++++++++++++------ test/translit.c | 53 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+), 9 deletions(-) diff --git a/build/tools/gentranstab.pl b/build/tools/gentranstab.pl index 0e9205a..1b1ccad 100644 --- a/build/tools/gentranstab.pl +++ b/build/tools/gentranstab.pl @@ -48,12 +48,18 @@ static int translit_try_sequence(struct encoding_context *e, size_t orig_tmplen, tmplen, index; int ret = 1; - /* First, determine if sequence can be written to target encoding */ + /* Determine if sequence can be written to target encoding */ /* Worst case: conversion to UTF-8 (needing 6 bytes per character) */ orig_tmplen = tmplen = (seqlen + 1) * 6; ptmpbuf = tmpbuf = malloc(tmplen); - if (tmpbuf == NULL) - return 0; + if (tmpbuf == NULL) { + /* Consider lack of memory an inability to write the output. + * We cannot report memory exhaustion from here, as it will + * result in the caller thinking that the output buffer is + * too small, which isn't actually the case. As + * transliteration is best-effort anyway, this should be ok. */ + return -1; + } /* Reset the transout codec */ if (e->transout != NULL) { @@ -102,6 +108,8 @@ int translit_flush_replacement(struct encoding_context *e) size_t substlen = e->substlen; int ret = 1; + LOG(("Flushing %zd characters", substlen)); + while (substlen > 0) { UCS4 c = substitution[0]; @@ -118,6 +126,8 @@ int translit_flush_replacement(struct encoding_context *e) e->substitution = substitution; e->substlen = substlen; + LOG(("%zd characters remaining", substlen)); + return ret; } diff --git a/src/iconv.c b/src/iconv.c index c81a0b2..21bf665 100644 --- a/src/iconv.c +++ b/src/iconv.c @@ -292,6 +292,10 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, /* Clear skip */ e->skip = 0; + /* Reset transliteration state */ + e->substitution = NULL; + e->substlen = 0; + /* Reset read codec */ if (e->in) { encoding_reset(e->in); @@ -342,13 +346,20 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, e->outbytesleft = outbytesleft; /* Flush through any remaining transliteration */ - ret = translit_flush_replacement(e); - if (ret <= 0) { - errno = E2BIG; - return (size_t)-1; - } + if (e->substlen > 0) { + ret = translit_flush_replacement(e); + if (ret <= 0) { + errno = E2BIG; + return (size_t)-1; + } - LOG(("reading")); + /* Force write state to success, so if there's no more input + * (i.e. we were transliterating the last character of input) + * we'll report success, rather than whatever caused us to + * stop writing the transliterated sequence last time round. + */ + e->write_state = WRITE_SUCCESS; + } /* If, on the previous attempt to convert data, we reached the end * of the input buffer mid-sequence, then we retain the number of @@ -359,12 +370,16 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, * start. */ if (e->skip != 0) { + LOG(("Skipping %d bytes of input", e->skip)); + *inbuf += e->skip; *inbytesleft -= e->skip; e->skip = 0; } + LOG(("Reading %zd bytes of input", *inbytesleft)); + /* Perform the conversion. * * To ensure that we detect the correct error conditions diff --git a/test/translit.c b/test/translit.c index 8f17889..240f5e2 100644 --- a/test/translit.c +++ b/test/translit.c @@ -20,7 +20,15 @@ typedef struct translit_testcase { } translit_testcase; static const translit_testcase tests[] = { + /* Trivial */ { "iso-8859-1//TRANSLIT", "\xe2\x80\x93", "-" }, + /* Multi-character replacements */ + { "iso-8859-2//TRANSLIT", "\xc2\xa9", "(c)" }, + { "iso-8859-3//TRANSLIT", "\xc2\xab", "<<" }, + /* Multiple choices */ + { "iso-8859-4//TRANSLIT", "\xef\xac\x85", "st" }, + /* Default fallback */ + { "iso-8859-1//TRANSLIT", "\xef\xac\x87", "?" }, { NULL, NULL, NULL } }; @@ -53,6 +61,50 @@ static void run_tests(void) } } +static void test_translit_buffer_boundary(void) +{ + iconv_t cd; + char out[128]; + char *inp = (char *) "\xc2\xa9", *outp = out; + size_t inlen = strlen(inp), outlen; + size_t read; + + cd = iconv_open("iso-8859-2//TRANSLIT", "utf-8"); + assert(cd != (iconv_t) -1); + + outlen = 1; + read = iconv(cd, &inp, &inlen, &outp, &outlen); + assert(read == (size_t) -1); + assert(errno == E2BIG); + + /* Expect ( to appear in output */ + assert(outlen == 0); + assert(out[0] == '('); + + /* Try to write next output character */ + outlen = 1; + read = iconv(cd, &inp, &inlen, &outp, &outlen); + assert(read == (size_t) -1); + assert(errno == E2BIG); + + /* Expect "(c" in output */ + assert(outlen == 0); + assert(out[0] == '('); + assert(out[1] == 'c'); + + /* Flush through last character */ + outlen = 1; + read = iconv(cd, &inp, &inlen, &outp, &outlen); + assert(read == 0); + + /* Expect "(c)" in output, and all input read */ + assert(outlen == 0); + assert(memcmp(out, "(c)", 3) == 0); + assert(inlen == 0); + + iconv_close(cd); +} + int main(int argc, char **argv) { const char *ucpath; @@ -84,6 +136,7 @@ int main(int argc, char **argv) assert(iconv_initialise(aliases) == 1); run_tests(); + test_translit_buffer_boundary(); iconv_finalise(); -- cgit v1.2.3