diff options
Diffstat (limited to 'frontends/riscos/ucstables.c')
-rw-r--r-- | frontends/riscos/ucstables.c | 219 |
1 files changed, 123 insertions, 96 deletions
diff --git a/frontends/riscos/ucstables.c b/frontends/riscos/ucstables.c index 3e31c992e..a94e4348c 100644 --- a/frontends/riscos/ucstables.c +++ b/frontends/riscos/ucstables.c @@ -398,49 +398,47 @@ static const char *localencodings[] = { "ISO-8859-10//TRANSLIT", "ISO-8859-13//TRANSLIT", "ISO-8859-14//TRANSLIT", - "ISO-8859-16//TRANSLIT", -#define CONT_ENC_END 116 /* RISC OS alphabet numbers lie in a - * contiguous range [100,CONT_ENC_END] - * _except_ for Cyrillic2, which doesn't. - */ - "CP866//TRANSLIT" /* Cyrillic2 - 120 */ + "ISO-8859-16//TRANSLIT", /* Latin10 - 116 */ + NULL, /* UTF-16, if you believe HdrSrc (Unused) */ + NULL, /* Unused */ + NULL, /* Unused */ + "CP866//TRANSLIT" /* Cyrillic2 - 120 */ }; -static const struct special { - char local; /**< Local 8bit representation */ - char len; /**< Length (in bytes) of UTF-8 character */ - const char *utf; /**< UTF-8 representation */ -} special_chars[] = { - { 0x80, 3, "\xE2\x82\xAC" }, /* EURO SIGN */ - { 0x81, 2, "\xC5\xB4" }, /* LATIN CAPITAL LETTER W WITH CIRCUMFLEX */ - { 0x82, 2, "\xC5\xB5" }, /* LATIN SMALL LETTER W WITH CIRCUMFLEX */ - { 0x84, 3, "\xE2\x9C\x98" }, /* HEAVY BALLOT X */ - { 0x85, 2, "\xC5\xB6" }, /* LATIN CAPITAL LETTER Y WITH CIRCUMFLEX */ - { 0x86, 2, "\xC5\xB7" }, /* LATIN SMALL LETTER Y WITH CIRCUMFLEX */ - { 0x88, 3, "\xE2\x87\x90" }, /* LEFTWARDS DOUBLE ARROW */ - { 0x89, 3, "\xE2\x87\x92" }, /* RIGHTWARDS DOUBLE ARROW */ - { 0x8a, 3, "\xE2\x87\x93" }, /* DOWNWARDS DOUBLE ARROW */ - { 0x8b, 3, "\xE2\x87\x91" }, /* UPWARDS DOUBLE ARROW */ - { 0x8c, 3, "\xE2\x80\xA6" }, /* HORIZONTAL ELLIPSIS */ - { 0x8d, 3, "\xE2\x84\xA2" }, /* TRADE MARK SIGN */ - { 0x8e, 3, "\xE2\x80\xB0" }, /* PER MILLE SIGN */ - { 0x8f, 3, "\xE2\x80\xA2" }, /* BULLET */ - { 0x90, 3, "\xE2\x80\x98" }, /* LEFT SINGLE QUOTATION MARK */ - { 0x91, 3, "\xE2\x80\x99" }, /* RIGHT SINGLE QUOTATION MARK */ - { 0x92, 3, "\xE2\x80\xB9" }, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */ - { 0x93, 3, "\xE2\x80\xBA" }, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */ - { 0x94, 3, "\xE2\x80\x9C" }, /* LEFT DOUBLE QUOTATION MARK */ - { 0x95, 3, "\xE2\x80\x9D" }, /* RIGHT DOUBLE QUOTATION MARK */ - { 0x96, 3, "\xE2\x80\x9E" }, /* DOUBLE LOW-9 QUOTATION MARK */ - { 0x97, 3, "\xE2\x80\x93" }, /* EN DASH */ - { 0x98, 3, "\xE2\x80\x94" }, /* EM DASH */ - { 0x99, 3, "\xE2\x88\x92" }, /* MINUS SIGN */ - { 0x9a, 2, "\xC5\x92" }, /* LATIN CAPITAL LIGATURE OE */ - { 0x9b, 2, "\xC5\x93" }, /* LATIN SMALL LIGATURE OE */ - { 0x9c, 3, "\xE2\x80\xA0" }, /* DAGGER */ - { 0x9d, 3, "\xE2\x80\xA1" }, /* DOUBLE DAGGER */ - { 0x9e, 3, "\xEF\xAC\x81" }, /* LATIN SMALL LIGATURE FI */ - { 0x9f, 3, "\xEF\xAC\x82" } /* LATIN SMALL LIGATURE FL */ +/* These are the Acorn Latin1 C1 block between [0x80,0x9f] */ +static const char *special_chars[] = { + "\xE2\x82\xAC", /* EURO SIGN */ + "\xC5\xB4", /* LATIN CAPITAL LETTER W WITH CIRCUMFLEX */ + "\xC5\xB5", /* LATIN SMALL LETTER W WITH CIRCUMFLEX */ + NULL, /* unused */ + "\xE2\x9C\x98", /* HEAVY BALLOT X */ + "\xC5\xB6", /* LATIN CAPITAL LETTER Y WITH CIRCUMFLEX */ + "\xC5\xB7", /* LATIN SMALL LETTER Y WITH CIRCUMFLEX */ + NULL, /* unused */ + "\xE2\x87\x90", /* LEFTWARDS DOUBLE ARROW */ + "\xE2\x87\x92", /* RIGHTWARDS DOUBLE ARROW */ + "\xE2\x87\x93", /* DOWNWARDS DOUBLE ARROW */ + "\xE2\x87\x91", /* UPWARDS DOUBLE ARROW */ + "\xE2\x80\xA6", /* HORIZONTAL ELLIPSIS */ + "\xE2\x84\xA2", /* TRADE MARK SIGN */ + "\xE2\x80\xB0", /* PER MILLE SIGN */ + "\xE2\x80\xA2", /* BULLET */ + "\xE2\x80\x98", /* LEFT SINGLE QUOTATION MARK */ + "\xE2\x80\x99", /* RIGHT SINGLE QUOTATION MARK */ + "\xE2\x80\xB9", /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */ + "\xE2\x80\xBA", /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */ + "\xE2\x80\x9C", /* LEFT DOUBLE QUOTATION MARK */ + "\xE2\x80\x9D", /* RIGHT DOUBLE QUOTATION MARK */ + "\xE2\x80\x9E", /* DOUBLE LOW-9 QUOTATION MARK */ + "\xE2\x80\x93", /* EN DASH */ + "\xE2\x80\x94", /* EM DASH */ + "\xE2\x88\x92", /* MINUS SIGN */ + "\xC5\x92", /* LATIN CAPITAL LIGATURE OE */ + "\xC5\x93", /* LATIN SMALL LIGATURE OE */ + "\xE2\x80\xA0", /* DAGGER */ + "\xE2\x80\xA1", /* DOUBLE DAGGER */ + "\xEF\xAC\x81", /* LATIN SMALL LIGATURE FI */ + "\xEF\xAC\x82" /* LATIN SMALL LIGATURE FL */ }; @@ -470,20 +468,22 @@ nserror utf8_to_local_encoding(const char *string, size_t len, char **result) /* read system alphabet */ error = xosbyte1(osbyte_ALPHABET_NUMBER, 127, 0, &alphabet); - if (error) + /* Assume Latin1 for anything we know nothing about */ + if (error || alphabet < territory_ALPHABET_BFONT || + alphabet > territory_ALPHABET_CYRILLIC2) alphabet = territory_ALPHABET_LATIN1; /* UTF-8 -> simply copy string */ - if (alphabet == 111 /* UTF-8 */) { + if (alphabet == territory_ALPHABET_UTF8) { *result = strndup(string, len); return NSERROR_OK; } /* get encoding name */ - enc = (alphabet <= CONT_ENC_END ? localencodings[alphabet - 100] - : (alphabet == 120 ? - localencodings[CONT_ENC_END - 100 + 1] - : localencodings[0])); + enc = localencodings[alphabet - territory_ALPHABET_BFONT]; + /* Assume Latin1 for any that are unused */ + if (enc == NULL) + enc = localencodings[0]; /* create output buffer */ *(result) = malloc(len + 1); @@ -498,13 +498,32 @@ nserror utf8_to_local_encoding(const char *string, size_t len, char **result) * characters and inserting appropriate output for characters * that iconv can't handle. */ for (off = 0; off < len; off = utf8_next(string, len, off)) { - if (string[off] != 0xE2 && - string[off] != 0xC5 && string[off] != 0xEF) + /* Specials only start with C5/E2/EF */ + if (string[off] != 0xC5 && + string[off] != 0xE2 && string[off] != 0xEF) continue; + /* Ignore truncated input */ + if (off + 2 + (string[off] == 0xC5 ? 0 : 1) >= len) + continue; + + /* Search to see if this character is special */ for (i = 0; i != NOF_ELEMENTS(special_chars); i++) { - if (strncmp(string + off, special_chars[i].utf, - special_chars[i].len) != 0) + /* Skip unused special char */ + if (special_chars[i] == NULL) + continue; + + /* Skip 2-byte non-match */ + if (string[off] == 0xC5 && + (string[off] != special_chars[i][0] || + string[off+1] != special_chars[i][1])) + continue; + + /* Skip 3-byte non-match */ + if (string[off] != 0xC5 && + (string[off] != special_chars[i][0] || + string[off+1] != special_chars[i][1] || + string[off+2] != special_chars[i][2])) continue; /* 0 length has a special meaning to utf8_to_enc */ @@ -524,9 +543,13 @@ nserror utf8_to_local_encoding(const char *string, size_t len, char **result) free(temp); } - *cur_pos = special_chars[i].local; + /* Emit conversion for this special character */ + *cur_pos = 0x80 + i; *(++cur_pos) = '\0'; - prev_off = off + special_chars[i].len; + prev_off = off + 2 + (string[off] == 0xC5 ? 0 : 1); + + /* Return to outer loop to process remaining input */ + break; } } @@ -561,7 +584,7 @@ nserror utf8_to_local_encoding(const char *string, size_t len, char **result) nserror utf8_from_local_encoding(const char *string, size_t len, char **result) { os_error *error; - int alphabet, i, num_specials = 0, result_alloc; + int alphabet, num_specials = 0, result_alloc; #define SPECIAL_CHUNK_SIZE 255 size_t off, prev_off, cur_off; char *temp; @@ -576,11 +599,13 @@ nserror utf8_from_local_encoding(const char *string, size_t len, char **result) /* read system alphabet */ error = xosbyte1(osbyte_ALPHABET_NUMBER, 127, 0, &alphabet); - if (error) + /* Assume Latin1 for anything we know nothing about */ + if (error || alphabet < territory_ALPHABET_BFONT || + alphabet > territory_ALPHABET_CYRILLIC2) alphabet = territory_ALPHABET_LATIN1; /* UTF-8 -> simply copy string */ - if (alphabet == 111 /* UTF-8 */) { + if (alphabet == territory_ALPHABET_UTF8) { temp = strndup(string, len); if (!temp) return NSERROR_NOMEM; @@ -590,10 +615,10 @@ nserror utf8_from_local_encoding(const char *string, size_t len, char **result) } /* get encoding name */ - enc = (alphabet <= CONT_ENC_END ? localencodings[alphabet - 100] - : (alphabet == 120 ? - localencodings[CONT_ENC_END - 100 + 1] - : localencodings[0])); + enc = localencodings[alphabet - territory_ALPHABET_BFONT]; + /* Assume Latin1 for any that are unused */ + if (enc == NULL) + enc = localencodings[0]; /* create output buffer (oversized) */ result_alloc = (len * 4) + (3 * SPECIAL_CHUNK_SIZE) + 1; @@ -610,52 +635,54 @@ nserror utf8_from_local_encoding(const char *string, size_t len, char **result) * characters and inserting appropriate output for characters * that iconv can't handle. */ for (off = 0; off < len; off++) { + /* Skip non-special characters */ if (string[off] < 0x80 || string[off] > 0x9f) continue; - for (i = 0; i != NOF_ELEMENTS(special_chars); i++) { - if (string[off] != special_chars[i].local) - continue; - - /* 0 length has a special meaning to utf8_from_enc */ - if (off - prev_off > 0) { - err = utf8_from_enc(string + prev_off, enc, - off - prev_off, &temp, NULL); - if (err != NSERROR_OK) { - assert(err != NSERROR_BAD_ENCODING); - NSLOG(netsurf, INFO, - "utf8_from_enc failed"); - free(*result); - return NSERROR_NOMEM; - } - - strcat((*result) + cur_off, temp); - - cur_off += strlen(temp); - - free(temp); + /* 0 length has a special meaning to utf8_from_enc */ + if (off - prev_off > 0) { + err = utf8_from_enc(string + prev_off, enc, + off - prev_off, &temp, NULL); + if (err != NSERROR_OK) { + assert(err != NSERROR_BAD_ENCODING); + NSLOG(netsurf, INFO, "utf8_from_enc failed"); + free(*result); + return NSERROR_NOMEM; } - strcat((*result) + cur_off, special_chars[i].utf); + strcat((*result) + cur_off, temp); - cur_off += special_chars[i].len; + cur_off += strlen(temp); - prev_off = off + 1; + free(temp); + } - num_specials++; - if (num_specials % SPECIAL_CHUNK_SIZE == - SPECIAL_CHUNK_SIZE - 1) { - char *temp = realloc((*result), - result_alloc + - (3 * SPECIAL_CHUNK_SIZE)); - if (!temp) { - free(*result); - return NSERROR_NOMEM; - } + /* Append UTF-8 encoded special character or U+FFFD if none */ + if (special_chars[string[off]-0x80] != NULL) { + const char *special = special_chars[string[off]-0x80]; + strcat((*result) + cur_off, special); + cur_off += 2 + (special[0] == 0xC5 ? 0 : 1); + } else { + strcat((*result) + cur_off, "\xef\xbf\xbd"); + cur_off += 3; + } - *result = temp; - result_alloc += (3 * SPECIAL_CHUNK_SIZE); + prev_off = off + 1; + + /* Resize output buffer if necessary */ + num_specials++; + if (num_specials % SPECIAL_CHUNK_SIZE == + SPECIAL_CHUNK_SIZE - 1) { + char *temp = realloc((*result), + result_alloc + + (3 * SPECIAL_CHUNK_SIZE)); + if (!temp) { + free(*result); + return NSERROR_NOMEM; } + + *result = temp; + result_alloc += (3 * SPECIAL_CHUNK_SIZE); } } |