diff options
author | John Mark Bell <jmb@netsurf-browser.org> | 2008-11-10 18:43:09 +0000 |
---|---|---|
committer | John Mark Bell <jmb@netsurf-browser.org> | 2008-11-10 18:43:09 +0000 |
commit | f8d8287cdbd7da9cd9392bcddf04860a10fa598e (patch) | |
tree | 668b4cc601fdfd050a51095d4f9bbebef9eaffec /src | |
download | iconv-f8d8287cdbd7da9cd9392bcddf04860a10fa598e.tar.gz iconv-f8d8287cdbd7da9cd9392bcddf04860a10fa598e.tar.bz2 |
Import Iconv sources
svn path=/trunk/iconv/; revision=5677
Diffstat (limited to 'src')
-rw-r--r-- | src/Makefile | 49 | ||||
-rw-r--r-- | src/alias.c | 89 | ||||
-rw-r--r-- | src/aliases.c | 364 | ||||
-rw-r--r-- | src/eightbit.c | 280 | ||||
-rw-r--r-- | src/iconv.c | 457 | ||||
-rw-r--r-- | src/internal.h | 58 | ||||
-rw-r--r-- | src/utils.c | 53 |
7 files changed, 1350 insertions, 0 deletions
diff --git a/src/Makefile b/src/Makefile new file mode 100644 index 0000000..f9d136b --- /dev/null +++ b/src/Makefile @@ -0,0 +1,49 @@ +# Child makefile fragment +# +# Toolchain is provided by top-level makefile +# +# Variables provided by top-level makefile +# +# COMPONENT The name of the component +# EXPORT The location of the export directory +# TOP The location of the source tree root +# RELEASEDIR The place to put release objects +# DEBUGDIR The place to put debug objects +# +# do_include Canned command sequence to include a child makefile +# +# Variables provided by parent makefile: +# +# DIR The name of the directory we're in, relative to $(TOP) +# +# Variables we can manipulate: +# +# ITEMS_CLEAN The list of items to remove for "make clean" +# ITEMS_DISTCLEAN The list of items to remove for "make distclean" +# TARGET_TESTS The list of target names to run for "make test" +# +# SOURCES The list of sources to build for $(COMPONENT) +# +# Plus anything from the toolchain + +# Push parent directory onto the directory stack +sp := $(sp).x +dirstack_$(sp) := $(d) +d := $(DIR) + +# Manipulate include paths +CFLAGS := $(CFLAGS) -I$(d) + +# Sources +SRCS_$(d) := alias.c aliases.c eightbit.c iconv.c utils.c + +# Append to sources for component +SOURCES += $(addprefix $(d), $(SRCS_$(d))) + +# Now include any children we may have +MAKE_INCLUDES := $(wildcard $(d)*/Makefile) +$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC)))) + +# Finally, pop off the directory stack +d := $(dirstack_$(sp)) +sp := $(basename $(sp)) diff --git a/src/alias.c b/src/alias.c new file mode 100644 index 0000000..ebc1b78 --- /dev/null +++ b/src/alias.c @@ -0,0 +1,89 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "unicode/charsets.h" +#include "unicode/encoding.h" + +#include "internal.h" + +struct table_entry { + const char *alias; + const char *encname; +}; + +/* This table contains special cases to allow us to use UnicodeLib sensibly. 
*/ +static const struct table_entry mapping_table[] = { + {"/UTF-7/UNICODE-1-1-UTF-7/UNICODE-2-0-UTF-7/", "UTF-7" }, + {"/ISO-10646-UCS-4/UCS-4/UTF-32/", "ISO-10646-UCS-4" }, + {"/UTF-16/UCS-2/ISO-10646-UCS-2/UNICODE-1-1/UNICODE-2-0/", "UTF-16" }, + {"/ISO-2022/", "ISO-2022" }, +}; + +#define TABLE_SIZE (sizeof(mapping_table) / sizeof(mapping_table[0])) + +/** + * Look up an encoding number, based on its name + * + * \param name The encoding name + * \return The encoding number, or 0 if not found. + */ +int iconv_encoding_number_from_name(const char *name) +{ + unsigned int i; + char buf[256]; + struct canon *c; + + if (!name) + return 0; + + snprintf(buf, sizeof buf, "/%s/", name); + + /* convert to upper case */ + for (i = 0; i != strlen(buf); i++) { + if (buf[i] >= 'a' && buf[i] <= 'z') + buf[i] = buf[i] - 32; + } + + for (i = 0; i != TABLE_SIZE; i++) + if (strstr(mapping_table[i].alias, buf) != NULL) + return encoding_number_from_name(mapping_table[i].encname); + + c = alias_canonicalise(name); + if (!c) + return 0; + + return encoding_number_from_name(c->name); +} + +/** + * Look up an encoding name, based on its MIB number + * + * \param number The encoding MIB number + * \return Pointer to encoding name, or NULL if not found + */ +const char *iconv_encoding_name_from_number(int number) +{ + const char *ret = NULL; + /* This is a PITA - UnicodeLib doesn't have a call to do this, + * so implement it ourselves. 
*/ + switch (number) { + case csUnicode11UTF7: + ret = mapping_table[0].alias; + break; + case csUCS4: + ret = mapping_table[1].alias; + break; + case csUnicode11: + ret = mapping_table[2].alias; + break; + case csVenturaMath: + ret = mapping_table[3].alias; + break; + default: + ret = mibenum_to_name(number); + break; + } + + return ret; +} diff --git a/src/aliases.c b/src/aliases.c new file mode 100644 index 0000000..1292685 --- /dev/null +++ b/src/aliases.c @@ -0,0 +1,364 @@ +#include <ctype.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "internal.h" + +struct alias { + struct alias *next; + struct canon *canon; + unsigned short name_len; + char name[1]; +}; + +#define HASH_SIZE (43) +static struct canon *canon_tab[HASH_SIZE]; +static struct alias *alias_tab[HASH_SIZE]; + +static bool create_alias(const char *alias, struct canon *c); +static struct canon *create_canon(const char *canon, short mibenum); +static int hash_val(const char *alias); + +#ifdef TEST +static void dump_alias_data(void); + +int main (void) +{ + struct canon *c; + + create_alias_data("Unicode:Files.Aliases"); + + dump_alias_data(); + + c = alias_canonicalise("moose"); + if (c) + printf("!!!\n"); + + c = alias_canonicalise("csinvariant"); + if (c) + printf("%s %d\n", c->name, c->mib_enum); + + c = alias_canonicalise("nats-sefi-add"); + if (c) + printf("%s %d\n", c->name, c->mib_enum); + + printf("%d\n", mibenum_from_name(c->name)); + + printf("%s\n", mibenum_to_name(c->mib_enum)); + + free_alias_data(); + + return 0; +} +#endif + +/** + * Create an alias + * + * \param alias The alias name + * \param c The canonical form + * \return true on success, false otherwise + */ +bool create_alias(const char *alias, struct canon *c) +{ + struct alias *a; + int hash; + + if (!alias || !c) + return false; + + a = malloc(sizeof(struct alias) + strlen(alias) + 1); + if (!a) + return false; + + a->canon = c; + a->name_len = 
strlen(alias); + strcpy(a->name, alias); + a->name[a->name_len] = '\0'; + + hash = hash_val(alias); + + a->next = alias_tab[hash]; + alias_tab[hash] = a; + + return true; +} + +/** + * Create a canonical form + * + * \param canon The canonical name + * \param mibenum The MIB enum value + * \return Pointer to struct canon or NULL on error + */ +struct canon *create_canon(const char *canon, short mibenum) +{ + struct canon *c; + int hash, len; + + if (!canon) + return NULL; + + len = strlen(canon); + + c = malloc(sizeof(struct canon) + len + 1); + if (!c) + return NULL; + + c->mib_enum = mibenum; + c->name_len = len; + strcpy(c->name, canon); + c->name[len] = '\0'; + + hash = hash_val(canon); + + c->next = canon_tab[hash]; + canon_tab[hash] = c; + + return c; +} + +/** + * Hash function + * + * \param alias String to hash + * \return The hashed value + */ +int hash_val(const char *alias) +{ + const char *s = alias; + unsigned int h = 5381; + + if (!alias) + return 0; + + while (*s) + h = (h * 33) ^ (*s++ & ~0x20); /* case insensitive */ + + return h % HASH_SIZE; +} + +/** + * Free all alias data + */ +void free_alias_data(void) +{ + struct canon *c, *d; + struct alias *a, *b; + int i; + + for (i = 0; i != HASH_SIZE; i++) { + for (c = canon_tab[i]; c; c = d) { + d = c->next; + free(c); + } + canon_tab[i] = NULL; + + for (a = alias_tab[i]; a; a = b) { + b = a->next; + free(a); + } + alias_tab[i] = NULL; + } +} + +#ifdef TEST +/** + * Dump all alias data to stdout + */ +void dump_alias_data(void) +{ + struct canon *c; + struct alias *a; + int i; + size_t size = 0; + + for (i = 0; i != HASH_SIZE; i++) { + for (c = canon_tab[i]; c; c = c->next) { + printf("%d %s\n", i, c->name); + size += offsetof(struct canon, name) + c->name_len; + } + + for (a = alias_tab[i]; a; a = a->next) { + printf("%d %s\n", i, a->name); + size += offsetof(struct alias, name) + a->name_len; + } + } + + size += (sizeof(canon_tab) / sizeof(canon_tab[0])); + size += (sizeof(alias_tab) / 
/**
 * Split the next field off a line, in place
 *
 * Fields are runs of characters that are neither whitespace nor control
 * characters. Any separators in front of the field are skipped, the field
 * is NUL-terminated, and the cursor is left just after it.
 *
 * \param cursor  Indirect pointer to the current position in the line
 * \return Pointer to the field, or NULL if the line is exhausted
 */
static char *next_field(char **cursor)
{
	char *p = *cursor;
	char *start;

	/* <ctype.h> functions need an unsigned char value */
	while (*p != '\0' && (isspace((unsigned char) *p) ||
			iscntrl((unsigned char) *p)))
		p++;

	if (*p == '\0') {
		*cursor = p;
		return NULL;
	}

	start = p;
	while (*p != '\0' && !isspace((unsigned char) *p) &&
			!iscntrl((unsigned char) *p))
		p++;

	/* only step over a separator, never over the string terminator */
	if (*p != '\0')
		*p++ = '\0';

	*cursor = p;

	return start;
}

/**
 * Create alias data from Aliases file
 *
 * Each non-comment line has the form
 *   <canonical name> <MIB enum> [<alias> ...]
 * Lines starting with '#', blank lines and lines without a MIB enum field
 * are ignored. A line longer than the read buffer is not treated
 * specially: the overflow is parsed as if it were a further line.
 *
 * \param filename  The path to the Aliases file
 * \return 1 on success, 0 on failure.
 */
int create_alias_data(const char *filename)
{
	char buf[300];
	FILE *fp;

	if (!filename)
		return 0;

	fp = fopen(filename, "r");
	if (!fp)
		return 0;

	while (fgets(buf, sizeof buf, fp)) {
		char *p = buf;
		char *name, *mib, *alias;
		struct canon *cf;

		/* skip comments */
		if (buf[0] == '#')
			continue;

		/* The line terminator is an ordinary separator here, so it
		 * needs no stripping and a final line without one is read
		 * in full. */
		name = next_field(&p);
		mib = next_field(&p);
		if (!name || !mib)
			continue; /* blank or incomplete line */

		cf = create_canon(name, (short) strtol(mib, NULL, 10));
		if (!cf)
			continue;

		/* everything else on the line is an alias of this canon */
		while ((alias = next_field(&p)) != NULL) {
			if (!create_alias(alias, cf))
				break;
		}
	}

	fclose(fp);

	return 1;
}
alias name + * \return Pointer to struct canon or NULL if not found + */ +struct canon *alias_canonicalise(const char *alias) +{ + int hash, len; + struct canon *c; + struct alias *a; + + if (!alias) + return NULL; + + hash = hash_val(alias); + len = strlen(alias); + + for (c = canon_tab[hash]; c; c = c->next) + if (c->name_len == len && strcasecmp(c->name, alias) == 0) + break; + if (c) + return c; + + for (a = alias_tab[hash]; a; a = a->next) + if (a->name_len == len && strcasecmp(a->name, alias) == 0) + break; + if (a) + return a->canon; + + return NULL; +} + +/** + * Retrieve the MIB enum value assigned to an encoding name + * + * \param alias The alias to lookup + * \return The MIB enum value, or 0 if not found + */ +short mibenum_from_name(const char *alias) +{ + struct canon *c; + + if (!alias) + return 0; + + c = alias_canonicalise(alias); + if (!c) + return 0; + + return c->mib_enum; +} + +/** + * Retrieve the canonical name of an encoding from the MIB enum + * + * \param mibenum The MIB enum value + * \return Pointer to canonical name, or NULL if not found + */ +const char *mibenum_to_name(short mibenum) +{ + int i; + struct canon *c; + + for (i = 0; i != HASH_SIZE; i++) + for (c = canon_tab[i]; c; c = c->next) + if (c->mib_enum == mibenum) + return c->name; + + return NULL; +} diff --git a/src/eightbit.c b/src/eightbit.c new file mode 100644 index 0000000..3ff3470 --- /dev/null +++ b/src/eightbit.c @@ -0,0 +1,280 @@ +/* stateless 8bit encoding support => no support for CP1255, 1258 or TCVN + * functions in this file have an identical API to the encoding functions + * in UnicodeLib. see unicode/encoding.h for documentation. 
*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "internal.h" + +struct table_entry { + const char *canon; + const char *filename; +}; + +/* Table should be ordered by enc_num */ +static const struct table_entry mapping_table[] = { + { "US-ASCII", 0 }, + { "HP-ROMAN8", "HPR8" }, + { "MACINTOSH", "Apple.Roman"}, + { "IBM437", "Microsoft.CP437" }, + { "IBM775", "Microsoft.CP775" }, + { "IBM850", "Microsoft.CP850" }, + { "IBM852", "Microsoft.CP852" }, + { "IBM855", "Microsoft.CP855" }, + { "IBM857", "Microsoft.CP857" }, + { "IBM860", "Microsoft.CP860" }, + { "IBM861", "Microsoft.CP861" }, + { "IBM862", "Microsoft.CP862" }, + { "IBM863", "Microsoft.CP863" }, + { "IBM864", "Microsoft.CP864" }, + { "IBM865", "Microsoft.CP865" }, + { "IBM866", "Microsoft.CP866" }, + { "IBM869", "Microsoft.CP869" }, + { "KOI8-R", "KOI8-R" }, + { "KOI8-U", "KOI8-U" }, + { "IBM00858", "Microsoft.CP858" }, + { "WINDOWS-1250", "Microsoft.CP1250" }, + { "WINDOWS-1251", "Microsoft.CP1251" }, + { "WINDOWS-1252", "Microsoft.CP1252" }, + { "WINDOWS-1253", "Microsoft.CP1253" }, + { "WINDOWS-1254", "Microsoft.CP1254" }, + { "WINDOWS-1256", "Microsoft.CP1256" }, + { "WINDOWS-1257", "Microsoft.CP1257" }, + { "CP737", "Microsoft.CP737" }, + { "CP853", "Microsoft.CP853" }, + { "CP856", "Microsoft.CP856" }, + { "CP874", "Microsoft.CP874" }, + { "CP922", "Microsoft.CP922" }, + { "CP1046", "Microsoft.CP1046" }, + { "CP1124", "Microsoft.CP1124" }, + { "CP1125", "Microsoft.CP1125" }, + { "CP1129", "Microsoft.CP1129" }, + { "CP1133", "Microsoft.CP1133" }, + { "CP1161", "Microsoft.CP1161" }, + { "CP1162", "Microsoft.CP1162" }, + { "CP1163", "Microsoft.CP1163" }, + { "GEORGIAN-ACADEMY", "GeorgA" }, + { "GEORGIAN-PS", "GeorgPS" }, + { "KOI8-RU", "KOI8-RU" }, + { "KOI8-T", "KOI8-T" }, + { "MACARABIC", "Apple.Arabic" }, + { "MACCROATIAN", "Apple.Croatian" }, + { "MACGREEK", "Apple.Greek" }, + { "MACHEBREW", "Apple.Hebrew" }, + { "MACICELAND", "Apple.Iceland" }, + { "MACROMANIA", 
"Apple.Romania" }, + { "MACTHAI", "Apple.Thai" }, + { "MACTURKISH", "Apple.Turkish" }, + { "MULELAO-1", "Mulelao" }, + { "MACCYRILLIC", "Apple.Cyrillic" }, + { "MACUKRAINE", "Apple.Ukrainian" }, + { "MACCENTRALEUROPE", "Apple.CentEuro" }, +}; + +#define TABLE_SIZE (sizeof(mapping_table) / sizeof(mapping_table[0])) + +/** + * Look up an encoding number, based on its name + * + * \param name The encoding name + * \return The encoding number, or 0 if not found + */ +int iconv_eightbit_number_from_name(const char *name) +{ + struct canon *c; + int i; + + if (!name) + return 0; + + c = alias_canonicalise(name); + if (!c) + return 0; + + LOG(("searching for: %s", name)); + + for (i = 0; i != TABLE_SIZE; i++) { + if (strcasecmp(mapping_table[i].canon, c->name) == 0) { + LOG(("found: %d", c->mib_enum | (1<<30))); + return c->mib_enum | (1<<30); + } + } + + return 0; +} + +/** + * Read an 8bit encoded string + * + * \param e The encoding context + * \param callback Callback function to handle generated UCS characters + * \param s The input string + * \param n The length (in bytes) of the input + * \param handle Callback private data pointer + * \return The number of characters processed + */ +unsigned iconv_eightbit_read(struct encoding_context *e, + int (*callback)(void *handle, UCS4 c), const char *s, + unsigned int n, void *handle) +{ + UCS4 c; + unsigned int pos; + + if (!e || !callback || !s) + return 0; + + for (pos = 0; pos != n; pos++) { + + c = s[pos]; + + LOG(("read: %d (%d)", c, pos)); + + if (c < 0x80) { + /* ASCII */ + if (callback(handle, c)) + break; + } + else if (c < 0x100 && e->intab) { + LOG(("maps to: %x", e->intab[c - 0x80])); + /* Look up in mapping table */ + if (e->intab[c - 0x80] != 0xffff) { + if (callback(handle, e->intab[c - 0x80])) + break; + } + else { + /* character not defined in this encoding */ + return pos; + } + } + } + + return pos; +} + +/** + * Write a UCS character in an 8bit encoding + * + * \param e The encoding context + * \param c 
The UCS4 character + * \param buf Indirect pointer to output buffer + * \param bufsize Pointer to size of output buffer + * \return 1 on success, 0 if bufsize is too small, -1 if unrepresentable. + */ +int iconv_eightbit_write(struct encoding_context *e, UCS4 c, + char **buf, int *bufsize) +{ + int i; + + /* sanity check input */ + if (!e || !bufsize || !buf || !*buf) + return 0; + + /* buffer full */ + if (--*bufsize < 0) + return 0; + + if (c < 0x0080) + /* ASCII */ + *(*buf)++ = (char)c; + else { + /* Perform reverse table lookup */ + for (i = 0; i != 0x80; i++) { + if (e->outtab && e->outtab[i] == c) { + *(*buf)++ = (char)(i+0x80); + break; + } + } + if (i == 0x80) { + /* Nothing was written => fixup bufsize */ + ++*bufsize; + return -1; + } + } + + LOG(("written: %d", *(*buf-1))); + + return 1; +} + +/** + * Load an 8bit encoding + * + * \param enc_num The encoding number to load + * \return Pointer to lookup table for encoding, or NULL on error + */ +unsigned short *iconv_eightbit_new(int enc_num) +{ + char filename[64]; + const char *name; + FILE *fp; + unsigned int len; + int i; + unsigned short *ret; + + name = mibenum_to_name(enc_num); + if (!name) + return NULL; + + /* Lookup filename in table */ + for (i = 0; i != TABLE_SIZE; i++) + if (strcasecmp(mapping_table[i].canon, name) == 0) { + if (mapping_table[i].filename == 0) + return NULL; + + snprintf(filename, sizeof filename, + "Unicode:Encodings.%s", + mapping_table[i].filename); + + break; + } + + LOG(("opening: %s", filename)); + + /* Open */ + fp = fopen(filename, "rb"); + if (!fp) { + return NULL; + } + + /* Get extent */ + fseek(fp, 0, SEEK_END); + len = (unsigned int)ftell(fp); + fseek(fp, 0, SEEK_SET); + + /* Unexpected length => give up */ + if (len != 256) { + fclose(fp); + return NULL; + } + + /* Create buffer */ + ret = calloc(128, sizeof(short)); + if (!ret) { + fclose(fp); + return NULL; + } + + fread(ret, 128, sizeof(short), fp); + + fclose(fp); + + return ret; +} + +/** + * Delete any 
8bit encodings used by a context + * + * \param e The encoding context + */ +void iconv_eightbit_delete(struct encoding_context *e) +{ + if (!e) + return; + + if (e->intab) + free(e->intab); + if (e->outtab) + free(e->outtab); +} diff --git a/src/iconv.c b/src/iconv.c new file mode 100644 index 0000000..aa18fa5 --- /dev/null +++ b/src/iconv.c @@ -0,0 +1,457 @@ +/* iconv implementation - see iconv.h for docs */ + +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <sys/errno.h> + +#include <unicode/charsets.h> +#include <unicode/encoding.h> + +#include <iconv/iconv.h> + +#include "internal.h" + +static struct encoding_context *context_list; + +static int character_callback(void *handle, UCS4 c); +static void parse_parameters(struct encoding_context *e, const char *params, + bool destination); +static void parse_parameter(struct encoding_context *e, const char *param, + int length, bool destination); + +int iconv_initialise(const char *aliases_file) +{ + if (aliases_file == NULL) + return false; + + if (create_alias_data(aliases_file) == false) + return false; + + encoding_initialise(); + + return true; +} + +void iconv_finalise(void) +{ + struct encoding_context *a, *b; + + /* clients may quit / die without cleaning up. 
*/ + for (a = context_list; a; a = b) { + b = a->next; + if (a->in) + encoding_delete(a->in); + if (a->out) + encoding_delete(a->out); + iconv_eightbit_delete(a); + free(a); + } + + free_alias_data(); + + /* finalise the unicode library */ + encoding_tidyup(); +} + +iconv_t iconv_open(const char *tocode, const char *fromcode) +{ + int to = 0, from = 0; + struct encoding_context *e; + struct canon *c; + bool to_force_le = false, from_force_le = false; + char totemp[128], fromtemp[128]; + const char *slash; + unsigned int len; + + /* can't do anything without these */ + if (!tocode || !fromcode) { + errno = EINVAL; + return (iconv_t)(-1); + } + + e = calloc(1, sizeof(*e)); + if (!e) { + LOG(("malloc failed")); + errno = ENOMEM; + return (iconv_t)(-1); + } + + /* strip any parameters off the end of the tocode string */ + slash = strchr(tocode, '/'); + len = slash ? (unsigned) (slash - tocode) : strlen(tocode); + snprintf(totemp, sizeof totemp, "%.*s", len, tocode); + + /* parse parameters */ + if (slash && *(slash + 1) == '/' && *(slash + 2) != '\0') + parse_parameters(e, slash + 2, true); + + /* strip any parameters off the end of the fromcode string */ + slash = strchr(fromcode, '/'); + len = slash ? (unsigned) (slash - fromcode) : strlen(fromcode); + snprintf(fromtemp, sizeof fromtemp, "%.*s", len, fromcode); + + /* parse parameters */ + if (slash && *(slash + 1) == '/' && *(slash + 2) != '\0') + parse_parameters(e, slash + 2, false); + + /* try our own 8bit charset code first */ + to = iconv_eightbit_number_from_name(totemp); + from = iconv_eightbit_number_from_name(fromtemp); + + /* if that failed, try the UnicodeLib functionality */ + if (!to) + to = iconv_encoding_number_from_name(totemp); + + if (!from) + from = iconv_encoding_number_from_name(fromtemp); + + /* if that failed, perhaps it was an endian-specific variant of + * something UnicodeLib can handle? 
*/ + if (!to) { + c = alias_canonicalise(totemp); + if (c) { + switch(c->mib_enum) { + case 1013: /* UTF-16BE */ + to = csUnicode11; + break; + case 1014: /* UTF-16LE */ + to = csUnicode11; + to_force_le = true; + break; + case 1018: /* UTF-32BE */ + to = csUCS4; + break; + case 1019: /* UTF-32LE */ + to = csUCS4; + to_force_le = true; + break; + } + } + } + + if (!from) { + c = alias_canonicalise(fromtemp); + if (c) { + switch(c->mib_enum) { + case 1013: /* UTF-16BE */ + from = csUnicode11; + break; + case 1014: /* UTF-16LE */ + from = csUnicode11; + from_force_le = true; + break; + case 1018: /* UTF-32BE */ + from = csUCS4; + break; + case 1019: /* UTF-32LE */ + from = csUCS4; + from_force_le = true; + break; + } + } + } + + LOG(("to: %d(%s) from: %d(%s)", to, totemp, from, fromtemp)); + + /* ensure both encodings are recognised */ + if (to == 0 || from == 0) { + free(e); + errno = EINVAL; + return (iconv_t)(-1); + } + + /* bit 30 set indicates that this is an 8bit encoding */ + if (from & (1<<30)) + e->intab = iconv_eightbit_new(from & ~(1<<30)); + else { + e->in = encoding_new(from, encoding_READ); + if (e->in) { + /* Set encoding flags */ + unsigned int flags = 0; + if (from_force_le) + flags |= encoding_FLAG_LITTLE_ENDIAN; + + c = alias_canonicalise(fromtemp); + if (c && (c->mib_enum == csUCS4 || + c->mib_enum == csUnicode)) + flags |= encoding_FLAG_NO_HEADER; + + encoding_set_flags(e->in, flags, flags); + } + } + + /* neither created => memory error or somesuch. 
assume ENOMEM */ + /* no table is ever generated for ASCII */ + if (!e->in && !e->intab && (from & ~(1<<30)) != csASCII) { + free(e); + errno = ENOMEM; + return (iconv_t)(-1); + } + + if (to & (1<<30)) + e->outtab = iconv_eightbit_new(to & ~(1<<30)); + else { + e->out = encoding_new(to, encoding_WRITE_STRICT); + if (e->out) { + /* Set encoding flags */ + unsigned int flags = 0; + if (to_force_le) + flags |= encoding_FLAG_LITTLE_ENDIAN; + + c = alias_canonicalise(totemp); + if (c && (c->mib_enum == csUCS4 || + c->mib_enum == csUnicode)) + flags |= encoding_FLAG_NO_HEADER; + + encoding_set_flags(e->out, flags, flags); + } + } + + /* neither created => ENOMEM */ + if (!e->out && !e->outtab && (to & ~(1<<30)) != csASCII) { + if (e->in) + encoding_delete(e->in); + iconv_eightbit_delete(e); + free(e); + errno = ENOMEM; + return (iconv_t)(-1); + } + + /* add to list */ + e->prev = 0; + e->next = context_list; + if (context_list) + context_list->prev = e; + context_list = e; + + return (iconv_t)e; +} + +size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf, + size_t *outbytesleft) +{ + struct encoding_context *e; + unsigned read; + + /* search for cd in list */ + for (e = context_list; e; e = e->next) + if (e == (struct encoding_context *)cd) + break; + + /* not found => invalid */ + if (!e) { + errno = EINVAL; + return (size_t)(-1); + } + + if (inbuf == NULL || *inbuf == NULL) { + if (e->in) + encoding_reset(e->in); + return 0; + } + + /* Is there any point doing anything? 
*/ + if (!outbuf || !(*outbuf) || !outbytesleft) { + errno = EINVAL; + return (size_t)(-1); + } + + e->outbuf = outbuf; + e->outbytesleft = outbytesleft; + + LOG(("reading")); + + if (e->in) + read = encoding_read(e->in, character_callback, *inbuf, + *inbytesleft, e); + else + read = iconv_eightbit_read(e, character_callback, *inbuf, + *inbytesleft, e); + + LOG(("done")); + + LOG(("read: %d, ibl: %d, obl: %d", read, *inbytesleft, *outbytesleft)); + + /* 2 */ + if (read == *inbytesleft) { + *inbuf += read; + *inbytesleft = 0; + return 0; + } + /* 4 */ + else if ((int)*outbytesleft < 0) { + LOG(("e2big")); + *outbytesleft = 0; + *inbuf += read - 1; + *inbytesleft -= read - 1; + errno = E2BIG; + } + /** \todo find a mechanism for distinguishing between 1 & 3 */ + /* 1 */ + else if (read != *inbytesleft) { + *inbuf += read; + *inbytesleft -= read; + LOG(("eilseq")); + errno = EILSEQ; + } + /* 3 */ + else if ((int)*outbytesleft >= 0) { + *inbuf += read; + *inbytesleft -= read; + LOG(("einval")); + errno = EINVAL; + } + + LOG(("errno: %d", errno)); + + return (size_t)(-1); +} + +int iconv_close(iconv_t cd) +{ + struct encoding_context *e; + + /* search for cd in list */ + for (e = context_list; e; e = e->next) + if (e == (struct encoding_context *)cd) + break; + + /* not found => invalid */ + if (!e) + return 0; + + if (e->in) + encoding_delete(e->in); + if (e->out) + encoding_delete(e->out); + iconv_eightbit_delete(e); + + /* remove from list */ + if (e->next) + e->next->prev = e->prev; + if (e->prev) + e->prev->next = e->next; + else + context_list = e->next; + + free(e); + + /* reduce our memory usage somewhat */ + encoding_table_remove_unused(8 /* recommended value */); + + return 0; +} + +/* this is called for each converted character */ +int character_callback(void *handle, UCS4 c) +{ + struct encoding_context *e; + int ret; + + e = (struct encoding_context*)handle; + + LOG(("outbuf: %p, free: %d", *e->outbuf, *e->outbytesleft)); + LOG(("writing: %d", c)); + + if 
(e->out) { + char *prev_outbuf = *e->outbuf; + size_t prev_outbytesleft = *e->outbytesleft; + + ret = encoding_write(e->out, c, e->outbuf, + (int*)e->outbytesleft); + + LOG(("ret: %d", ret)); + + /* Why the need for this nonsense? UnicodeLib appears to + * decrease the count of free space in the buffer even + * if it doesn't write into it. This is a bug, as the + * documentation says that the buffer pointer AND free + * space count are left unmodified if nothing is written. + * Therefore, we have this hack until UnicodeLib gets fixed. + */ + if (ret == -1) { + *e->outbytesleft = prev_outbytesleft - + (*e->outbuf - prev_outbuf); + } + } else { + ret = iconv_eightbit_write(e, c, e->outbuf, + (int*)e->outbytesleft); + } + + if (ret == -1) { + /* Transliterate, if we've been asked to. + * Assumes that output is 8bit/8bit multibyte with ASCII G0. + * This should be fine as the only <>8bit encodings are + * UCS{2,4}, UTF-{16,32}, neither of which return -1. + * Also, afaiaa, all supported multibyte encodings are ASCII + * compatible. */ + /** \todo Actually perform some kind of transliteration */ + if (e->transliterate && (int)*e->outbytesleft > 0) { + if (e->out) { + /* Reset encoding write state */ + /** \todo this is a bit dodgy, as we only + * really need to ensure that the ASCII set + * is mapped into G0 in ISO2022 encodings. + * This will reset G1->G3, too, which may + * break things. 
If so, we may have to + * perform some dirty hackery which relies + * upon knowledge of UnicodeLib's internals + */ + encoding_write(e->out, NULL_UCS4, e->outbuf, + (int*)e->outbytesleft); + } + + if ((int)*e->outbytesleft > 0) { + *(*e->outbuf)++ = '?'; + --*e->outbytesleft; + + ret = 1; + } else { + ret = 0; + } + } else { + ret = 1; + } + } + + return (!ret); +} + +void parse_parameters(struct encoding_context *e, const char *params, + bool destination) +{ + char *slash = NULL, *prev = NULL; + int len; + + len = strlen(params); + + while (slash - params < len && + (slash = strchr(params, '/')) != NULL) { + parse_parameter(e, prev == NULL ? params : prev, + slash - (prev == NULL ? params : prev), + destination); + + prev = slash + 2; + slash += 2; + } + + if (slash == NULL) + parse_parameter(e, prev == NULL ? params : prev, + (params + len) - + (prev == NULL ? params : prev), + destination); +} + +void parse_parameter(struct encoding_context *e, const char *param, + int length, bool destination) +{ + if (length == 8 && strncasecmp(param, "TRANSLIT", 8) == 0) { + if (destination) + e->transliterate = 1; + } +} + diff --git a/src/internal.h b/src/internal.h new file mode 100644 index 0000000..d19bd09 --- /dev/null +++ b/src/internal.h @@ -0,0 +1,58 @@ +#ifndef _ICONV_INTERNAL_H_ +#define _ICONV_INTERNAL_H_ + +#ifndef unicode_encoding_h +#include <unicode/encoding.h> +#endif + +#ifndef DEBUG +#define LOG(x) +#else +#define LOG(x) (printf(__FILE__ " %s %i: ", __func__, __LINE__), printf x, fputc('\n', stdout)) +#endif + +#define UNUSED(x) ((x) = (x)) + +struct encoding_context { + Encoding *in; + Encoding *out; + unsigned short *intab, *outtab; + char **outbuf; + size_t *outbytesleft; + char transliterate; + struct encoding_context *prev, *next; +}; + +/* in eightbit.c */ +int iconv_eightbit_number_from_name(const char *name); +unsigned iconv_eightbit_read(struct encoding_context *e, + int (*callback)(void *handle, UCS4 c), const char *s, + unsigned int n, void 
/**
 * Case insensitive string comparison
 *
 * \param s1  Pointer to string
 * \param s2  Pointer to string
 * \return 0 if strings match, <> 0 if no match
 */
int strcasecmp(const char *s1, const char *s2)
{
	/* tolower() is only defined for unsigned char values (and EOF) */
	const unsigned char *a = (const unsigned char *) s1;
	const unsigned char *b = (const unsigned char *) s2;

	if (!s1 || !s2)
		return 1; /* this is arbitrary */

	if (s1 == s2)
		return 0;

	while (*a != '\0' && tolower(*a) == tolower(*b))
		a++, b++;

	return tolower(*a) - tolower(*b);
}

/**
 * Length-limited case insensitive string comparison
 *
 * At most len characters are examined; characters beyond that limit never
 * influence the result.
 *
 * \param s1   Pointer to string
 * \param s2   Pointer to string
 * \param len  Length to compare
 * \return 0 if strings match, <> 0 if no match
 */
int strncasecmp(const char *s1, const char *s2, size_t len)
{
	const unsigned char *a = (const unsigned char *) s1;
	const unsigned char *b = (const unsigned char *) s2;

	if (!s1 || !s2)
		return 1; /* this is arbitrary */

	if (len == 0)
		return 0;

	if (s1 == s2)
		return 0;

	for (; len > 0; len--, a++, b++) {
		int ca = tolower(*a);
		int cb = tolower(*b);

		if (ca != cb)
			return ca - cb;

		/* both strings ended together, inside the limit */
		if (ca == '\0')
			break;
	}

	/* the first len characters matched */
	return 0;
}