1 files changed, 123 insertions, 96 deletions
diff --git a/frontends/riscos/ucstables.c b/frontends/riscos/ucstables.c
index 3e31c992e..a94e4348c 100644
--- a/frontends/riscos/ucstables.c
+++ b/frontends/riscos/ucstables.c
@@ -398,49 +398,47 @@ static const char *localencodings[] = {
 	"ISO-8859-10//TRANSLIT",
 	"ISO-8859-13//TRANSLIT",
 	"ISO-8859-14//TRANSLIT",
-	"ISO-8859-16//TRANSLIT",
-#define CONT_ENC_END 116	/* RISC OS alphabet numbers lie in a
-				 * contiguous range [100,CONT_ENC_END]
-				 * _except_ for Cyrillic2, which doesn't.
-				 */
-	"CP866//TRANSLIT"	/* Cyrillic2 - 120 */
+	"ISO-8859-16//TRANSLIT", /* Latin10 - 116 */
+	NULL,			 /* UTF-16, if you believe HdrSrc (Unused) */
+	NULL,			 /* Unused */
+	NULL,			 /* Unused */
+	"CP866//TRANSLIT"	 /* Cyrillic2 - 120 */
 };
 
-static const struct special {
-	char local;		/**< Local 8bit representation */
-	char len;		/**< Length (in bytes) of UTF-8 character */
-	const char *utf;	/**< UTF-8 representation */
-} special_chars[] = {
-	{ 0x80, 3, "\xE2\x82\xAC" },	/* EURO SIGN */
-	{ 0x81, 2, "\xC5\xB4" },	/* LATIN CAPITAL LETTER W WITH CIRCUMFLEX */
-	{ 0x82, 2, "\xC5\xB5" },	/* LATIN SMALL LETTER W WITH CIRCUMFLEX */
-	{ 0x84, 3, "\xE2\x9C\x98" },	/* HEAVY BALLOT X */
-	{ 0x85, 2, "\xC5\xB6" },	/* LATIN CAPITAL LETTER Y WITH CIRCUMFLEX */
-	{ 0x86, 2, "\xC5\xB7" },	/* LATIN SMALL LETTER Y WITH CIRCUMFLEX */
-	{ 0x88, 3, "\xE2\x87\x90" },	/* LEFTWARDS DOUBLE ARROW */
-	{ 0x89, 3, "\xE2\x87\x92" },	/* RIGHTWARDS DOUBLE ARROW */
-	{ 0x8a, 3, "\xE2\x87\x93" },	/* DOWNWARDS DOUBLE ARROW */
-	{ 0x8b, 3, "\xE2\x87\x91" },	/* UPWARDS DOUBLE ARROW */
-	{ 0x8c, 3, "\xE2\x80\xA6" },	/* HORIZONTAL ELLIPSIS */
-	{ 0x8d, 3, "\xE2\x84\xA2" },	/* TRADE MARK SIGN */
-	{ 0x8e, 3, "\xE2\x80\xB0" },	/* PER MILLE SIGN */
-	{ 0x8f, 3, "\xE2\x80\xA2" },	/* BULLET */
-	{ 0x90, 3, "\xE2\x80\x98" },	/* LEFT SINGLE QUOTATION MARK */
-	{ 0x91, 3, "\xE2\x80\x99" },	/* RIGHT SINGLE QUOTATION MARK */
-	{ 0x92, 3, "\xE2\x80\xB9" },	/* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
-	{ 0x93, 3, "\xE2\x80\xBA" },	/* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
-	{ 0x94, 3, "\xE2\x80\x9C" },	/* LEFT DOUBLE QUOTATION MARK */
-	{ 0x95, 3, "\xE2\x80\x9D" },	/* RIGHT DOUBLE QUOTATION MARK */
-	{ 0x96, 3, "\xE2\x80\x9E" },	/* DOUBLE LOW-9 QUOTATION MARK */
-	{ 0x97, 3, "\xE2\x80\x93" },	/* EN DASH */
-	{ 0x98, 3, "\xE2\x80\x94" },	/* EM DASH */
-	{ 0x99, 3, "\xE2\x88\x92" },	/* MINUS SIGN */
-	{ 0x9a, 2, "\xC5\x92" },	/* LATIN CAPITAL LIGATURE OE */
-	{ 0x9b, 2, "\xC5\x93" },	/* LATIN SMALL LIGATURE OE */
-	{ 0x9c, 3, "\xE2\x80\xA0" },	/* DAGGER */
-	{ 0x9d, 3, "\xE2\x80\xA1" },	/* DOUBLE DAGGER */
-	{ 0x9e, 3, "\xEF\xAC\x81" },	/* LATIN SMALL LIGATURE FI */
-	{ 0x9f, 3, "\xEF\xAC\x82" } 	/* LATIN SMALL LIGATURE FL */
+/* UTF-8 for the Acorn Latin1 C1 block [0x80,0x9f], indexed by byte - 0x80 */
+static const char *special_chars[] = {
+	"\xE2\x82\xAC",	/* EURO SIGN */
+	"\xC5\xB4",	/* LATIN CAPITAL LETTER W WITH CIRCUMFLEX */
+	"\xC5\xB5",	/* LATIN SMALL LETTER W WITH CIRCUMFLEX */
+	NULL,		/* unused */
+	"\xE2\x9C\x98",	/* HEAVY BALLOT X */
+	"\xC5\xB6",	/* LATIN CAPITAL LETTER Y WITH CIRCUMFLEX */
+	"\xC5\xB7",	/* LATIN SMALL LETTER Y WITH CIRCUMFLEX */
+	NULL,		/* unused */
+	"\xE2\x87\x90",	/* LEFTWARDS DOUBLE ARROW */
+	"\xE2\x87\x92",	/* RIGHTWARDS DOUBLE ARROW */
+	"\xE2\x87\x93",	/* DOWNWARDS DOUBLE ARROW */
+	"\xE2\x87\x91",	/* UPWARDS DOUBLE ARROW */
+	"\xE2\x80\xA6",	/* HORIZONTAL ELLIPSIS */
+	"\xE2\x84\xA2",	/* TRADE MARK SIGN */
+	"\xE2\x80\xB0",	/* PER MILLE SIGN */
+	"\xE2\x80\xA2",	/* BULLET */
+	"\xE2\x80\x98",	/* LEFT SINGLE QUOTATION MARK */
+	"\xE2\x80\x99",	/* RIGHT SINGLE QUOTATION MARK */
+	"\xE2\x80\xB9",	/* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */
+	"\xE2\x80\xBA",	/* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */
+	"\xE2\x80\x9C",	/* LEFT DOUBLE QUOTATION MARK */
+	"\xE2\x80\x9D",	/* RIGHT DOUBLE QUOTATION MARK */
+	"\xE2\x80\x9E",	/* DOUBLE LOW-9 QUOTATION MARK */
+	"\xE2\x80\x93",	/* EN DASH */
+	"\xE2\x80\x94",	/* EM DASH */
+	"\xE2\x88\x92",	/* MINUS SIGN */
+	"\xC5\x92",	/* LATIN CAPITAL LIGATURE OE */
+	"\xC5\x93",	/* LATIN SMALL LIGATURE OE */
+	"\xE2\x80\xA0",	/* DAGGER */
+	"\xE2\x80\xA1",	/* DOUBLE DAGGER */
+	"\xEF\xAC\x81",	/* LATIN SMALL LIGATURE FI */
+	"\xEF\xAC\x82" 	/* LATIN SMALL LIGATURE FL */
 };
 
 
@@ -470,20 +468,22 @@ nserror utf8_to_local_encoding(const char *string, size_t len, char **result)
 
 	/* read system alphabet */
 	error = xosbyte1(osbyte_ALPHABET_NUMBER, 127, 0, &alphabet);
-	if (error)
+	/* Assume Latin1 for anything we know nothing about */
+	if (error || alphabet < territory_ALPHABET_BFONT ||
+			alphabet > territory_ALPHABET_CYRILLIC2)
 		alphabet = territory_ALPHABET_LATIN1;
 
 	/* UTF-8 -> simply copy string */
-	if (alphabet == 111 /* UTF-8 */) {
+	if (alphabet == territory_ALPHABET_UTF8) {
 		*result = strndup(string, len);
 		return NSERROR_OK;
 	}
 
 	/* get encoding name */
-	enc = (alphabet <= CONT_ENC_END ? localencodings[alphabet - 100]
-			      : (alphabet == 120 ?
-					localencodings[CONT_ENC_END - 100 + 1]
-						 : localencodings[0]));
+	enc = localencodings[alphabet - territory_ALPHABET_BFONT];
+	/* Assume Latin1 for any that are unused */
+	if (enc == NULL)
+		enc = localencodings[0];
 
 	/* create output buffer */
 	*(result) = malloc(len + 1);
@@ -498,13 +498,32 @@ nserror utf8_to_local_encoding(const char *string, size_t len, char **result)
 	 * characters and inserting appropriate output for characters
 	 * that iconv can't handle. */
 	for (off = 0; off < len; off = utf8_next(string, len, off)) {
-		if (string[off] != 0xE2 &&
-				string[off] != 0xC5 && string[off] != 0xEF)
+		/* Specials only start with C5/E2/EF */
+		if (string[off] != 0xC5 &&
+				string[off] != 0xE2 && string[off] != 0xEF)
 			continue;
 
+		/* Ignore truncated input */
+		if (off + 2 + (string[off] == 0xC5 ? 0 : 1) >= len)
+			continue;
+
+		/* Search to see if this character is special */
 		for (i = 0; i != NOF_ELEMENTS(special_chars); i++) {
-			if (strncmp(string + off, special_chars[i].utf,
-					special_chars[i].len) != 0)
+			/* Skip unused special char */
+			if (special_chars[i] == NULL)
+				continue;
+
+			/* Skip 2-byte non-match */
+			if (string[off] == 0xC5 &&
+				    (string[off] != special_chars[i][0] ||
+				    string[off+1] != special_chars[i][1]))
+				continue;
+
+			/* Skip 3-byte non-match */
+			if (string[off] != 0xC5 &&
+				    (string[off] != special_chars[i][0] ||
+				    string[off+1] != special_chars[i][1] ||
+				    string[off+2] != special_chars[i][2]))
 				continue;
 
 			/* 0 length has a special meaning to utf8_to_enc */
@@ -524,9 +543,13 @@ nserror utf8_to_local_encoding(const char *string, size_t len, char **result)
 				free(temp);
 			}
 
-			*cur_pos = special_chars[i].local;
+			/* Emit conversion for this special character */
+			*cur_pos = 0x80 + i;
 			*(++cur_pos) = '\0';
-			prev_off = off + special_chars[i].len;
+			prev_off = off + 2 + (string[off] == 0xC5 ? 0 : 1);
+
+			/* Return to outer loop to process remaining input */
+			break;
 		}
 	}
 
@@ -561,7 +584,7 @@ nserror utf8_to_local_encoding(const char *string, size_t len, char **result)
 nserror utf8_from_local_encoding(const char *string, size_t len, char **result)
 {
 	os_error *error;
-	int alphabet, i, num_specials = 0, result_alloc;
+	int alphabet, num_specials = 0, result_alloc;
 #define SPECIAL_CHUNK_SIZE 255
 	size_t off, prev_off, cur_off;
 	char *temp;
@@ -576,11 +599,13 @@ nserror utf8_from_local_encoding(const char *string, size_t len, char **result)
 
 	/* read system alphabet */
 	error = xosbyte1(osbyte_ALPHABET_NUMBER, 127, 0, &alphabet);
-	if (error)
+	/* Assume Latin1 for anything we know nothing about */
+	if (error || alphabet < territory_ALPHABET_BFONT ||
+			alphabet > territory_ALPHABET_CYRILLIC2)
 		alphabet = territory_ALPHABET_LATIN1;
 
 	/* UTF-8 -> simply copy string */
-	if (alphabet == 111 /* UTF-8 */) {
+	if (alphabet == territory_ALPHABET_UTF8) {
 		temp = strndup(string, len);
 		if (!temp)
 			return NSERROR_NOMEM;
@@ -590,10 +615,10 @@ nserror utf8_from_local_encoding(const char *string, size_t len, char **result)
 	}
 
 	/* get encoding name */
-	enc = (alphabet <= CONT_ENC_END ? localencodings[alphabet - 100]
-			      : (alphabet == 120 ?
-					localencodings[CONT_ENC_END - 100 + 1]
-						 : localencodings[0]));
+	enc = localencodings[alphabet - territory_ALPHABET_BFONT];
+	/* Assume Latin1 for any that are unused */
+	if (enc == NULL)
+		enc = localencodings[0];
 
 	/* create output buffer (oversized) */
 	result_alloc = (len * 4) + (3 * SPECIAL_CHUNK_SIZE) + 1;
@@ -610,52 +635,54 @@ nserror utf8_from_local_encoding(const char *string, size_t len, char **result)
 	 * characters and inserting appropriate output for characters
 	 * that iconv can't handle. */
 	for (off = 0; off < len; off++) {
+		/* Skip non-special characters */
 		if (string[off] < 0x80 || string[off] > 0x9f)
 			continue;
 
-		for (i = 0; i != NOF_ELEMENTS(special_chars); i++) {
-			if (string[off] != special_chars[i].local)
-				continue;
-
-			/* 0 length has a special meaning to utf8_from_enc */
-			if (off - prev_off > 0) {
-				err = utf8_from_enc(string + prev_off, enc,
-						    off - prev_off, &temp, NULL);
-				if (err != NSERROR_OK) {
-					assert(err != NSERROR_BAD_ENCODING);
-					NSLOG(netsurf, INFO,
-					      "utf8_from_enc failed");
-					free(*result);
-					return NSERROR_NOMEM;
-				}
-
-				strcat((*result) + cur_off, temp);
-
-				cur_off += strlen(temp);
-
-				free(temp);
+		/* 0 length has a special meaning to utf8_from_enc */
+		if (off - prev_off > 0) {
+			err = utf8_from_enc(string + prev_off, enc,
+					off - prev_off, &temp, NULL);
+			if (err != NSERROR_OK) {
+				assert(err != NSERROR_BAD_ENCODING);
+				NSLOG(netsurf, INFO, "utf8_from_enc failed");
+				free(*result);
+				return NSERROR_NOMEM;
 			}
 
-			strcat((*result) + cur_off, special_chars[i].utf);
+			strcat((*result) + cur_off, temp);
 
-			cur_off += special_chars[i].len;
+			cur_off += strlen(temp);
 
-			prev_off = off + 1;
+			free(temp);
+		}
 
-			num_specials++;
-			if (num_specials % SPECIAL_CHUNK_SIZE ==
-					SPECIAL_CHUNK_SIZE - 1) {
-				char *temp = realloc((*result),
-						result_alloc +
-						(3 * SPECIAL_CHUNK_SIZE));
-				if (!temp) {
-					free(*result);
-					return NSERROR_NOMEM;
-				}
+		/* Append UTF-8 encoded special character or U+FFFD if none */
+		if (special_chars[string[off]-0x80] != NULL) {
+			const char *special = special_chars[string[off]-0x80];
+			strcat((*result) + cur_off, special);
+			cur_off += 2 + (special[0] == 0xC5 ? 0 : 1);
+		} else {
+			strcat((*result) + cur_off, "\xef\xbf\xbd");
+			cur_off += 3;
+		}
 
-				*result = temp;
-				result_alloc += (3 * SPECIAL_CHUNK_SIZE);
+		prev_off = off + 1;
+
+		/* Grow output buffer once per SPECIAL_CHUNK_SIZE specials */
+		num_specials++;
+		if (num_specials % SPECIAL_CHUNK_SIZE ==
+				SPECIAL_CHUNK_SIZE - 1) {
+			char *temp = realloc((*result),
+					result_alloc +
+					(3 * SPECIAL_CHUNK_SIZE));
+			if (!temp) {
+				free(*result);
+				return NSERROR_NOMEM;
 			}
+
+			*result = temp;
+			result_alloc += (3 * SPECIAL_CHUNK_SIZE);
 		}
 	}