diff options
Diffstat (limited to 'riscos/ucstables.c')
-rw-r--r-- | riscos/ucstables.c | 281 |
1 files changed, 269 insertions, 12 deletions
diff --git a/riscos/ucstables.c b/riscos/ucstables.c index b744e9c6a..ef103f367 100644 --- a/riscos/ucstables.c +++ b/riscos/ucstables.c @@ -6,12 +6,17 @@ */ /** \file - * UCS conversion tables + * UCS conversion tables and RISC OS-specific UTF-8 text handling */ +#include <assert.h> +#include <limits.h> +#include <string.h> #include "oslib/osbyte.h" #include "oslib/territory.h" + #include "netsurf/riscos/ucstables.h" +#include "netsurf/utils/utf8.h" #include "netsurf/utils/utils.h" /* Common values (ASCII) */ @@ -334,15 +339,16 @@ int *ucstable_from_alphabet(int alphabet) return ucstable; } + static const char *localencodings[] = { "ISO-8859-1", /* BFont - 100 - just use Latin1, instead */ - "ISO-8859-1", /* do we want to use Acorn Latin1, instead? */ + "ISO-8859-1", "ISO-8859-2", "ISO-8859-3", "ISO-8859-4", "ISO-8859-5", "ISO-8859-6", - "ISO-8869-7", + "ISO-8859-7", "ISO-8859-8", "ISO-8859-9", "ISO-IR-182", @@ -354,21 +360,272 @@ static const char *localencodings[] = { "CP866" /* Cyrillic2 - 120 */ }; +static const struct special { + char local; /**< Local 8bit representation */ + char len; /**< Length (in bytes) of UTF-8 character */ + const char *utf; /**< UTF-8 representation */ +} special_chars[] = { + { 0x80, 3, "\xE2\x82\xAC" }, /* EURO SIGN */ + { 0x81, 2, "\xC5\xB4" }, /* LATIN CAPITAL LETTER W WITH CIRCUMFLEX */ + { 0x82, 2, "\xC5\xB5" }, /* LATIN SMALL LETTER W WITH CIRCUMFLEX */ + { 0x84, 3, "\xE2\x9C\x98" }, /* HEAVY BALLOT X */ + { 0x85, 2, "\xC5\xB6" }, /* LATIN CAPITAL LETTER Y WITH CIRCUMFLEX */ + { 0x86, 2, "\xC5\xB7" }, /* LATIN SMALL LETTER Y WITH CIRCUMFLEX */ + { 0x88, 3, "\xE2\x87\x90" }, /* LEFTWARDS DOUBLE ARROW */ + { 0x89, 3, "\xE2\x87\x92" }, /* RIGHTWARDS DOUBLE ARROW */ + { 0x8a, 3, "\xE2\x87\x93" }, /* DOWNWARDS DOUBLE ARROW */ + { 0x8b, 3, "\xE2\x87\x91" }, /* UPWARDS DOUBLE ARROW */ + { 0x8c, 3, "\xE2\x80\xA6" }, /* HORIZONTAL ELLIPSIS */ + { 0x8d, 3, "\xE2\x84\xA2" }, /* TRADE MARK SIGN */ + { 0x8e, 3, "\xE2\x80\xB0" }, /* PER MILLE SIGN */ + { 0x8f, 3, "\xE2\x80\xA2" }, /* BULLET */ + { 0x90, 3, "\xE2\x80\x98" }, /* LEFT SINGLE QUOTATION MARK */ + { 0x91, 3, "\xE2\x80\x99" }, /* RIGHT SINGLE QUOTATION MARK */ + { 0x92, 3, "\xE2\x80\xB9" }, /* SINGLE LEFT-POINTING ANGLE QUOTATION MARK */ + { 0x93, 3, "\xE2\x80\xBA" }, /* SINGLE RIGHT-POINTING ANGLE QUOTATION MARK */ + { 0x94, 3, "\xE2\x80\x9C" }, /* LEFT DOUBLE QUOTATION MARK */ + { 0x95, 3, "\xE2\x80\x9D" }, /* RIGHT DOUBLE QUOTATION MARK */ + { 0x96, 3, "\xE2\x80\x9E" }, /* DOUBLE LOW-9 QUOTATION MARK */ + { 0x97, 3, "\xE2\x80\x93" }, /* EN DASH */ + { 0x98, 3, "\xE2\x80\x94" }, /* EM DASH */ + { 0x99, 3, "\xE2\x88\x92" }, /* MINUS SIGN */ + { 0x9a, 2, "\xC5\x92" }, /* LATIN CAPITAL LIGATURE OE */ + { 0x9b, 2, "\xC5\x93" }, /* LATIN SMALL LIGATURE OE */ + { 0x9c, 3, "\xE2\x80\xA0" }, /* DAGGER */ + { 0x9d, 3, "\xE2\x80\xA1" }, /* DOUBLE DAGGER */ + { 0x9e, 3, "\xEF\xAC\x81" }, /* LATIN SMALL LIGATURE FI */ + { 0x9f, 3, "\xEF\xAC\x82" } /* LATIN SMALL LIGATURE FL */ +}; + + /** - * Retrieve local encoding name, suitable for passing to iconv + * Convert a UTF-8 encoded string into the system local encoding + * + * \param string The string to convert + * \param len The length (in bytes) of the string, or 0 + * \param result Pointer to location in which to store result + * \return The appropriate utf8_convert_ret value */ -const char *local_encoding_name(void) +utf8_convert_ret utf8_to_local_encoding(const char *string, size_t len, + char **result) { os_error *error; - int alphabet; + int alphabet, i, offset_count = 0; + struct { + const struct special *local; /* local character */ + size_t offset; /* byte offset into string */ + } offsets[CHAR_MAX]; + size_t off; + char *temp; + const char *enc; + utf8_convert_ret err; + + assert(string && result); + + /* get length, if necessary */ + if (len == 0) + len = strlen(string); + /* read system alphabet */ error = xosbyte1(osbyte_ALPHABET_NUMBER, 127, 0, &alphabet); - if (!error) { - if (alphabet < 116) - return localencodings[alphabet - 100]; - else if (alphabet == 120) - return localencodings[16]; + if (error) + alphabet = territory_ALPHABET_LATIN1; + + /* UTF-8 -> simply copy string */ + if (alphabet == 111 /* UTF-8 */) { + *result = strndup(string, len); + return UTF8_CONVERT_OK; + } + + /* get encoding name */ + enc = (alphabet < 116 ? localencodings[alphabet - 100] + : (alphabet == 120 ? localencodings[16] + : localencodings[0])); + + /* populate offsets array with details of characters that + * will be stripped by iconv */ + for (off = 0; off < len; off = utf8_next(string, len, off)) { + if (string[off] != 0xE2 && + string[off] != 0xC5 && string[off] != 0xEF) + continue; + + for (i = 0; i != NOF_ELEMENTS(special_chars); i++) { + if (strncmp(string + off, special_chars[i].utf, + special_chars[i].len) == 0) { + /* ensure we don't overflow our buffer */ + assert(offset_count < CHAR_MAX - 1); + offsets[offset_count].local = + &special_chars[i]; + offsets[offset_count].offset = off; + offset_count++; + break; + } + } + } + + if (offset_count == 0) { + /* No substitutions are required, so exit here */ + return utf8_to_enc(string, enc, len, result); + } + + /* create output buffer */ + *(result) = malloc(len + 1); + if (!(*result)) + return UTF8_CONVERT_NOMEM; + *(*result) = '\0'; + + /* convert the chunks between offsets, then copy stripped + * character into output string */ + for (i = 0; i != offset_count; i++) { + off = (i > 0 ? offsets[i-1].offset + offsets[i-1].local->len + : 0); + + err = utf8_to_enc(string + off, enc, + offsets[i].offset - off, &temp); + if (err != UTF8_CONVERT_OK) { + assert(err != UTF8_CONVERT_BADENC); + free(*result); + return UTF8_CONVERT_NOMEM; + } + + strcat((*result), temp); + off = strlen(*result); + (*result)[off] = offsets[i].local->local; + (*result)[off+1] = '\0'; + + free(temp); + } + + /* handle last chunk */ + if (offsets[offset_count - 1].offset < len) { + off = offsets[offset_count - 1].offset + + offsets[offset_count - 1].local->len; + + err = utf8_to_enc(string + off, enc, len - off, &temp); + if (err != UTF8_CONVERT_OK) { + assert(err != UTF8_CONVERT_BADENC); + free(*result); + return UTF8_CONVERT_NOMEM; + } + + strcat((*result), temp); + + free(temp); + } + + return UTF8_CONVERT_OK; +} + +/** + * Convert a string encoded in the system local encoding to UTF-8 + * + * \param string The string to convert + * \param len The length (in bytes) of the string, or 0 + * \param result Pointer to location in which to store result + * \return The appropriate utf8_convert_ret value + */ +utf8_convert_ret utf8_from_local_encoding(const char *string, size_t len, + char **result) +{ + os_error *error; + int alphabet, i, offset_count = 0; + struct { + const struct special *local; /* utf character */ + size_t offset; /* byte offset into string */ + } offsets[CHAR_MAX]; + size_t off; + char *temp; + const char *enc; + utf8_convert_ret err; + + assert(string && result); + + /* get length, if necessary */ + if (len == 0) + len = strlen(string); + + /* read system alphabet */ + error = xosbyte1(osbyte_ALPHABET_NUMBER, 127, 0, &alphabet); + if (error) + alphabet = territory_ALPHABET_LATIN1; + + /* UTF-8 -> simply copy string */ + if (alphabet == 111 /* UTF-8 */) { + *result = strndup(string, len); + return UTF8_CONVERT_OK; + } + + /* get encoding name */ + enc = (alphabet < 116 ? localencodings[alphabet - 100] + : (alphabet == 120 ? localencodings[16] + : localencodings[0])); + + /* populate offsets array with details of characters that + * will be stripped by iconv */ + for (off = 0; off < len; off++) { + if (string[off] < 0x80 || string[off] > 0x9f) + continue; + + for (i = 0; i != NOF_ELEMENTS(special_chars); i++) { + if (string[off] == special_chars[i].local) { + /* ensure we don't overflow our buffer */ + assert(offset_count < CHAR_MAX - 1); + offsets[offset_count].local = + &special_chars[i]; + offsets[offset_count].offset = off; + offset_count++; + break; + } + } + } + + if (offset_count == 0) { + /* No substitutions are required, so exit here */ + return utf8_from_enc(string, enc, len, result); + } + + /* create output buffer (oversized, but not by much) */ + *(result) = malloc(len + (3 * offset_count) + 1); + if (!(*result)) + return UTF8_CONVERT_NOMEM; + *(*result) = '\0'; + + /* convert the chunks between offsets, then copy stripped + * UTF-8 character into output string */ + for (i = 0; i != offset_count; i++) { + off = (i > 0 ? offsets[i-1].offset + offsets[i-1].local->len + : 0); + + err = utf8_from_enc(string + off, enc, + offsets[i].offset - off, &temp); + if (err != UTF8_CONVERT_OK) { + assert(err != UTF8_CONVERT_BADENC); + free(*result); + return UTF8_CONVERT_NOMEM; + } + + strcat((*result), temp); + strcat((*result), offsets[i].local->utf); + + free(temp); + } + + /* handle last chunk */ + if (offsets[offset_count - 1].offset < len) { + off = offsets[offset_count - 1].offset + + offsets[offset_count - 1].local->len; + + err = utf8_from_enc(string + off, enc, len - off, &temp); + if (err != UTF8_CONVERT_OK) { + assert(err != UTF8_CONVERT_BADENC); + free(*result); + return UTF8_CONVERT_NOMEM; + } + + strcat((*result), temp); + + free(temp); } - return localencodings[0]; + return UTF8_CONVERT_OK; } |