From 9c8a4ff7e117ba052b2957c7e3f2e8751e8f8970 Mon Sep 17 00:00:00 2001
From: John-Mark Bell <jmb@netsurf-browser.org>
Date: Sun, 13 Jan 2013 02:05:46 +0000
Subject: Transliteration fixes:

  * Clear any pending substitution when a codec reset is requested.
  * Don't report memory exhaustion when failing to allocate space
    for the test conversion in translit_try_sequence: there's
    nothing the caller can do, so treat it as if the substitution
    cannot be converted to the target character set.
  * Correctly report success if we run out of input immediately
    following a flush of a substitution.

Additional tests for transliteration.
---
 build/tools/gentranstab.pl | 16 +++++++++++---
 src/iconv.c                | 27 +++++++++++++++++------
 test/translit.c            | 53 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 87 insertions(+), 9 deletions(-)

diff --git a/build/tools/gentranstab.pl b/build/tools/gentranstab.pl
index 0e9205a..1b1ccad 100644
--- a/build/tools/gentranstab.pl
+++ b/build/tools/gentranstab.pl
@@ -48,12 +48,18 @@ static int translit_try_sequence(struct encoding_context *e,
 	size_t orig_tmplen, tmplen, index;
 	int ret = 1;
 
-	/* First, determine if sequence can be written to target encoding */
+	/* Determine if sequence can be written to target encoding */
 	/* Worst case: conversion to UTF-8 (needing 6 bytes per character) */
 	orig_tmplen = tmplen = (seqlen + 1) * 6;
 	ptmpbuf = tmpbuf = malloc(tmplen);
-	if (tmpbuf == NULL)
-		return 0;
+	if (tmpbuf == NULL) {
+		/* Consider lack of memory an inability to write the output.
+		 * We cannot report memory exhaustion from here, as it will
+		 * result in the caller thinking that the output buffer is
+		 * too small, which isn't actually the case. As
+		 * transliteration is best-effort anyway, this should be ok. */
+		return -1;
+	}
 
 	/* Reset the transout codec */
 	if (e->transout != NULL) {
@@ -102,6 +108,8 @@ int translit_flush_replacement(struct encoding_context *e)
 	size_t substlen = e->substlen;
 	int ret = 1;
 
+	LOG(("Flushing %zd characters", substlen));
+
 	while (substlen > 0) {
 		UCS4 c = substitution[0];
 		
@@ -118,6 +126,8 @@ int translit_flush_replacement(struct encoding_context *e)
 	e->substitution = substitution;
 	e->substlen = substlen;
 
+	LOG(("%zd characters remaining", substlen));
+
 	return ret;
 }
 
diff --git a/src/iconv.c b/src/iconv.c
index c81a0b2..21bf665 100644
--- a/src/iconv.c
+++ b/src/iconv.c
@@ -292,6 +292,10 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
 		/* Clear skip */
 		e->skip = 0;
 
+		/* Reset transliteration state */
+		e->substitution = NULL;
+		e->substlen = 0;
+
 		/* Reset read codec */
 		if (e->in) {
 			encoding_reset(e->in);
@@ -342,13 +346,20 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
 	e->outbytesleft = outbytesleft;
 
 	/* Flush through any remaining transliteration */
-	ret = translit_flush_replacement(e);
-	if (ret <= 0) {
-		errno = E2BIG;
-		return (size_t)-1;
-	}
+	if (e->substlen > 0) {
+		ret = translit_flush_replacement(e);
+		if (ret <= 0) {
+			errno = E2BIG;
+			return (size_t)-1;
+		}
 
-	LOG(("reading"));
+		/* Force write state to success, so if there's no more input
+		 * (i.e. we were transliterating the last character of input)
+		 * we'll report success, rather than whatever caused us to
+		 * stop writing the transliterated sequence last time round.
+		 */
+		e->write_state = WRITE_SUCCESS;
+	}
 
 	/* If, on the previous attempt to convert data, we reached the end
 	 * of the input buffer mid-sequence, then we retain the number of
@@ -359,12 +370,16 @@ size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
 	 * start.
 	 */
 	if (e->skip != 0) {
+		LOG(("Skipping %d bytes of input", e->skip));
+
 		*inbuf += e->skip;
 		*inbytesleft -= e->skip;
 
 		e->skip = 0;
 	}
 
+	LOG(("Reading %zd bytes of input", *inbytesleft));
+
 	/* Perform the conversion.
 	 *
 	 * To ensure that we detect the correct error conditions
diff --git a/test/translit.c b/test/translit.c
index 8f17889..240f5e2 100644
--- a/test/translit.c
+++ b/test/translit.c
@@ -20,7 +20,15 @@ typedef struct translit_testcase {
 } translit_testcase;
 
 static const translit_testcase tests[] = {
+	/* Trivial */
 	{ "iso-8859-1//TRANSLIT", "\xe2\x80\x93", "-" },
+	/* Multi-character replacements */
+	{ "iso-8859-2//TRANSLIT", "\xc2\xa9", "(c)" },
+	{ "iso-8859-3//TRANSLIT", "\xc2\xab", "<<" },
+	/* Multiple choices */
+	{ "iso-8859-4//TRANSLIT", "\xef\xac\x85", "st" },
+	/* Default fallback */
+	{ "iso-8859-1//TRANSLIT", "\xef\xac\x87", "?" },
 	{ NULL, NULL, NULL }
 };
 
@@ -53,6 +61,50 @@ static void run_tests(void)
 	}
 }
 
+static void test_translit_buffer_boundary(void)
+{
+	iconv_t cd;
+	char out[128];
+	char *inp = (char *) "\xc2\xa9", *outp = out;
+	size_t inlen = strlen(inp), outlen;
+	size_t read;
+
+	cd = iconv_open("iso-8859-2//TRANSLIT", "utf-8");
+	assert(cd != (iconv_t) -1);
+
+	outlen = 1;
+	read = iconv(cd, &inp, &inlen, &outp, &outlen);
+	assert(read == (size_t) -1);
+	assert(errno == E2BIG);
+
+	/* Expect ( to appear in output */
+	assert(outlen == 0);
+	assert(out[0] == '(');
+
+	/* Try to write next output character */
+	outlen = 1;
+	read = iconv(cd, &inp, &inlen, &outp, &outlen);
+	assert(read == (size_t) -1);
+	assert(errno == E2BIG);
+
+	/* Expect "(c" in output */
+	assert(outlen == 0);
+	assert(out[0] == '(');
+	assert(out[1] == 'c');
+
+	/* Flush through last character */
+	outlen = 1;
+	read = iconv(cd, &inp, &inlen, &outp, &outlen);
+	assert(read == 0);
+
+	/* Expect "(c)" in output, and all input read */
+	assert(outlen == 0);
+	assert(memcmp(out, "(c)", 3) == 0);
+	assert(inlen == 0);
+
+	iconv_close(cd);
+}
+
 int main(int argc, char **argv)
 {
 	const char *ucpath;
@@ -84,6 +136,7 @@ int main(int argc, char **argv)
 	assert(iconv_initialise(aliases) == 1);
 
 	run_tests();
+	test_translit_buffer_boundary();
 
 	iconv_finalise();
 
-- 
cgit v1.2.3