summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: John Mark Bell <jmb@netsurf-browser.org> 2008-11-10 18:43:09 +0000
committer: John Mark Bell <jmb@netsurf-browser.org> 2008-11-10 18:43:09 +0000
commitf8d8287cdbd7da9cd9392bcddf04860a10fa598e (patch)
tree668b4cc601fdfd050a51095d4f9bbebef9eaffec /src
downloadiconv-f8d8287cdbd7da9cd9392bcddf04860a10fa598e.tar.gz
iconv-f8d8287cdbd7da9cd9392bcddf04860a10fa598e.tar.bz2
Import Iconv sources
svn path=/trunk/iconv/; revision=5677
Diffstat (limited to 'src')
-rw-r--r-- src/Makefile 49
-rw-r--r-- src/alias.c 89
-rw-r--r-- src/aliases.c 364
-rw-r--r-- src/eightbit.c 280
-rw-r--r-- src/iconv.c 457
-rw-r--r-- src/internal.h 58
-rw-r--r-- src/utils.c 53
7 files changed, 1350 insertions, 0 deletions
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..f9d136b
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,49 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT The name of the component
+# EXPORT The location of the export directory
+# TOP The location of the source tree root
+# RELEASEDIR The place to put release objects
+# DEBUGDIR The place to put debug objects
+#
+# do_include Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN The list of items to remove for "make distclean"
+# TARGET_TESTS The list of target names to run for "make test"
+#
+# SOURCES The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack.
+# $(sp) grows by one ".x" suffix per nesting level, so dirstack_$(sp)
+# is a distinct variable for each level and remembers the parent's $(d).
+sp := $(sp).x
+dirstack_$(sp) := $(d)
+d := $(DIR)
+
+# Manipulate include paths: let sources include headers from this directory
+CFLAGS := $(CFLAGS) -I$(d)
+
+# Sources (names are relative to this directory)
+SRCS_$(d) := alias.c aliases.c eightbit.c iconv.c utils.c
+
+# Append to sources for component.
+# NOTE(review): $(d) is prepended verbatim, so it presumably ends in a
+# directory separator - confirm against the parent makefile.
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have (every immediate subdirectory
+# that contains a Makefile)
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack: restore the parent's $(d), then
+# drop the trailing ".x" that was appended to $(sp) above
+d := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/alias.c b/src/alias.c
new file mode 100644
index 0000000..ebc1b78
--- /dev/null
+++ b/src/alias.c
@@ -0,0 +1,89 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "unicode/charsets.h"
+#include "unicode/encoding.h"
+
+#include "internal.h"
+
+struct table_entry {
+ const char *alias; /* '/'-delimited list of accepted names, e.g. "/A/B/" */
+ const char *encname; /* name handed to encoding_number_from_name() */
+};
+
+/* This table contains special cases to allow us to use UnicodeLib sensibly. */
+static const struct table_entry mapping_table[] = {
+ {"/UTF-7/UNICODE-1-1-UTF-7/UNICODE-2-0-UTF-7/", "UTF-7" },
+ {"/ISO-10646-UCS-4/UCS-4/UTF-32/", "ISO-10646-UCS-4" },
+ {"/UTF-16/UCS-2/ISO-10646-UCS-2/UNICODE-1-1/UNICODE-2-0/", "UTF-16" },
+ {"/ISO-2022/", "ISO-2022" },
+};
+
+#define TABLE_SIZE (sizeof(mapping_table) / sizeof(mapping_table[0]))
+
+/**
+ * Resolve an encoding name to an encoding number
+ *
+ * The name is wrapped in '/' characters, upper-cased (ASCII only) and
+ * searched for in the special-case table first; failing that, it is
+ * canonicalised through the alias data and the canonical name is looked up.
+ *
+ * \param name  The encoding name
+ * \return The encoding number, or 0 if not found.
+ */
+int iconv_encoding_number_from_name(const char *name)
+{
+	char key[256];
+	char *p;
+	unsigned int idx;
+	struct canon *canon;
+
+	if (name == NULL)
+		return 0;
+
+	/* Build "/NAME/" so a substring search only matches whole entries */
+	snprintf(key, sizeof key, "/%s/", name);
+
+	for (p = key; *p != '\0'; p++) {
+		if (*p >= 'a' && *p <= 'z')
+			*p = *p - 32;
+	}
+
+	for (idx = 0; idx != TABLE_SIZE; idx++) {
+		if (strstr(mapping_table[idx].alias, key) != NULL)
+			return encoding_number_from_name(
+					mapping_table[idx].encname);
+	}
+
+	/* Not a special case: go via the alias data */
+	canon = alias_canonicalise(name);
+	if (canon == NULL)
+		return 0;
+
+	return encoding_number_from_name(canon->name);
+}
+
+/**
+ * Look up an encoding name, based on its MIB number
+ *
+ * The encodings special-cased in mapping_table are answered from that
+ * table; everything else is resolved through the alias data.
+ *
+ * \param number  The encoding MIB number
+ * \return Pointer to encoding name, or NULL if not found
+ */
+const char *iconv_encoding_name_from_number(int number)
+{
+	const char *ret = NULL;
+	/* This is a PITA - UnicodeLib doesn't have a call to do this,
+	 * so implement it ourselves. */
+	switch (number) {
+	/* Return the encoding name, not the '/'-delimited alias list: the
+	 * list is only a search key and is not usable as a name (feeding it
+	 * back into iconv_encoding_number_from_name() would never match). */
+	case csUnicode11UTF7:
+		ret = mapping_table[0].encname;
+		break;
+	case csUCS4:
+		ret = mapping_table[1].encname;
+		break;
+	case csUnicode11:
+		ret = mapping_table[2].encname;
+		break;
+	case csVenturaMath:
+		ret = mapping_table[3].encname;
+		break;
+	default:
+		ret = mibenum_to_name(number);
+		break;
+	}
+
+	return ret;
+}
diff --git a/src/aliases.c b/src/aliases.c
new file mode 100644
index 0000000..1292685
--- /dev/null
+++ b/src/aliases.c
@@ -0,0 +1,364 @@
+#include <ctype.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "internal.h"
+
+struct alias {
+ struct alias *next; /* next alias in the same hash bucket */
+ struct canon *canon; /* canonical form this alias resolves to */
+ unsigned short name_len; /* length of name, excluding the terminator */
+ char name[1]; /* alias text; storage is over-allocated by create_alias() */
+};
+
+#define HASH_SIZE (43)
+static struct canon *canon_tab[HASH_SIZE];
+static struct alias *alias_tab[HASH_SIZE];
+
+static bool create_alias(const char *alias, struct canon *c);
+static struct canon *create_canon(const char *canon, short mibenum);
+static int hash_val(const char *alias);
+
+#ifdef TEST
+static void dump_alias_data(void);
+
+/**
+ * Test harness: load the Aliases file, dump it, and exercise the lookups
+ *
+ * \return 0 on success, 1 if the Aliases file could not be loaded
+ */
+int main (void)
+{
+	struct canon *c;
+
+	if (!create_alias_data("Unicode:Files.Aliases")) {
+		printf("failed to load alias data\n");
+		return 1;
+	}
+
+	dump_alias_data();
+
+	/* A name that should not exist */
+	c = alias_canonicalise("moose");
+	if (c)
+		printf("!!!\n");
+
+	c = alias_canonicalise("csinvariant");
+	if (c)
+		printf("%s %d\n", c->name, c->mib_enum);
+
+	c = alias_canonicalise("nats-sefi-add");
+	if (c) {
+		printf("%s %d\n", c->name, c->mib_enum);
+
+		/* Only round-trip when the lookup succeeded; c is NULL
+		 * if the alias is missing from the Aliases file. */
+		printf("%d\n", mibenum_from_name(c->name));
+
+		printf("%s\n", mibenum_to_name(c->mib_enum));
+	}
+
+	free_alias_data();
+
+	return 0;
+}
+#endif
+
+/**
+ * Register an alias for a canonical form
+ *
+ * The new entry is pushed onto the front of its hash bucket in alias_tab.
+ *
+ * \param alias  The alias name
+ * \param c      The canonical form the alias refers to
+ * \return true on success, false on bad arguments or memory exhaustion
+ */
+bool create_alias(const char *alias, struct canon *c)
+{
+	struct alias *entry;
+	int bucket;
+
+	if (alias == NULL || c == NULL)
+		return false;
+
+	/* The struct already holds one byte of name; the rest follows it */
+	entry = malloc(sizeof(struct alias) + strlen(alias) + 1);
+	if (entry == NULL)
+		return false;
+
+	entry->name_len = strlen(alias);
+	strcpy(entry->name, alias);
+	entry->name[entry->name_len] = '\0';
+	entry->canon = c;
+
+	bucket = hash_val(alias);
+
+	entry->next = alias_tab[bucket];
+	alias_tab[bucket] = entry;
+
+	return true;
+}
+
+/**
+ * Register a canonical encoding name
+ *
+ * The new entry is pushed onto the front of its hash bucket in canon_tab.
+ *
+ * \param canon    The canonical name
+ * \param mibenum  The MIB enum value
+ * \return Pointer to the new struct canon, or NULL on error
+ */
+struct canon *create_canon(const char *canon, short mibenum)
+{
+	struct canon *entry;
+	int bucket, length;
+
+	if (canon == NULL)
+		return NULL;
+
+	length = strlen(canon);
+
+	/* The struct already holds one byte of name; the rest follows it */
+	entry = malloc(sizeof(struct canon) + length + 1);
+	if (entry == NULL)
+		return NULL;
+
+	strcpy(entry->name, canon);
+	entry->name[length] = '\0';
+	entry->name_len = length;
+	entry->mib_enum = mibenum;
+
+	bucket = hash_val(canon);
+
+	entry->next = canon_tab[bucket];
+	canon_tab[bucket] = entry;
+
+	return entry;
+}
+
+/**
+ * Hash a name into a bucket index
+ *
+ * Bit 0x20 is cleared from every character before mixing, so names that
+ * differ only in letter case land in the same bucket.
+ *
+ * \param alias  String to hash
+ * \return Bucket index in the range [0, HASH_SIZE), or 0 if alias is NULL
+ */
+int hash_val(const char *alias)
+{
+	unsigned int hash = 5381;
+	const char *p;
+
+	if (alias == NULL)
+		return 0;
+
+	for (p = alias; *p != '\0'; p++)
+		hash = (hash * 33) ^ (*p & ~0x20); /* case insensitive */
+
+	return hash % HASH_SIZE;
+}
+
+/**
+ * Release every canonical form and alias, leaving both tables empty
+ */
+void free_alias_data(void)
+{
+	int bucket;
+
+	for (bucket = 0; bucket != HASH_SIZE; bucket++) {
+		struct canon *canon = canon_tab[bucket];
+		struct alias *entry;
+
+		while (canon != NULL) {
+			struct canon *next = canon->next;
+			free(canon);
+			canon = next;
+		}
+		canon_tab[bucket] = NULL;
+
+		entry = alias_tab[bucket];
+		while (entry != NULL) {
+			struct alias *next = entry->next;
+			free(entry);
+			entry = next;
+		}
+		alias_tab[bucket] = NULL;
+	}
+}
+
+#ifdef TEST
+/**
+ * Dump all alias data to stdout
+ *
+ * Prints one "<bucket> <name>" line per canonical form and alias, then
+ * an estimate (in bytes) of the memory used by the entries and tables.
+ */
+void dump_alias_data(void)
+{
+	struct canon *c;
+	struct alias *a;
+	int i;
+	size_t size = 0;
+
+	for (i = 0; i != HASH_SIZE; i++) {
+		for (c = canon_tab[i]; c; c = c->next) {
+			printf("%d %s\n", i, c->name);
+			size += offsetof(struct canon, name) + c->name_len;
+		}
+
+		for (a = alias_tab[i]; a; a = a->next) {
+			printf("%d %s\n", i, a->name);
+			size += offsetof(struct alias, name) + a->name_len;
+		}
+	}
+
+	/* Count the tables in bytes, like everything else in the total
+	 * (dividing by the element size would add the slot count). */
+	size += sizeof(canon_tab);
+	size += sizeof(alias_tab);
+
+	/* size is a size_t, so it must not be printed with %d */
+	printf("%lu\n", (unsigned long) size);
+}
+#endif
+
+/**
+ * Create alias data from Aliases file
+ *
+ * Each non-comment line has the form
+ *     <canonical name> <mibenum> [<alias> ...]
+ * with fields separated by whitespace. Lines that are empty, start with
+ * '#', or lack a mibenum field are skipped. A failure to create a
+ * canonical form or alias abandons the rest of that line only.
+ *
+ * \param filename  The path to the Aliases file
+ * \return 1 on success, 0 if filename is NULL or the file cannot be opened.
+ */
+int create_alias_data(const char *filename)
+{
+	char buf[300];
+	FILE *fp;
+
+	if (!filename)
+		return 0;
+
+	fp = fopen(filename, "r");
+	if (!fp)
+		return 0;
+
+	while (fgets(buf, sizeof buf, fp)) {
+		char *p, *aliases, *mib, *end;
+		struct canon *cf;
+
+		if (buf[0] == 0 || buf[0] == '#')
+			/* skip blank lines or comments */
+			continue;
+
+		/* Lose the terminating newline, if there is one. The final
+		 * line of a file (or an over-long line) may not have one,
+		 * in which case the last character must be preserved. */
+		buf[strcspn(buf, "\n")] = '\0';
+		end = buf + strlen(buf);
+
+		/* find end of canonical form
+		 * (ctype functions require unsigned char values) */
+		for (p = buf; *p && !isspace((unsigned char) *p) &&
+				!iscntrl((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		*p++ = '\0'; /* terminate canonical form */
+
+		/* skip whitespace */
+		for (; *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		mib = p;
+
+		/* find end of mibenum */
+		for (; *p && !isspace((unsigned char) *p) &&
+				!iscntrl((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p < end)
+			*p++ = '\0'; /* terminate mibenum */
+
+		cf = create_canon(buf, atoi(mib));
+		if (!cf)
+			continue;
+
+		/* skip whitespace */
+		for (; p < end && *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		aliases = p;
+
+		while (p < end) {
+			/* find end of alias; the scan stops at the string
+			 * terminator, so p never moves beyond end here */
+			for (; *p && !isspace((unsigned char) *p) &&
+					!iscntrl((unsigned char) *p); p++)
+				; /* do nothing */
+
+			/* terminate current alias */
+			*p++ = '\0';
+
+			if (!create_alias(aliases, cf))
+				break;
+
+			/* in terminating, we may have advanced
+			 * past the end - check this here */
+			if (p >= end)
+				break;
+
+			/* skip whitespace */
+			for (; *p && isspace((unsigned char) *p); p++)
+				; /* do nothing */
+
+			if (p >= end)
+				/* gone past end => stop */
+				break;
+
+			/* update pointer to current alias */
+			aliases = p;
+		}
+	}
+
+	fclose(fp);
+
+	return 1;
+}
+
+/**
+ * Resolve a name to its canonical form
+ *
+ * Canonical names are searched before aliases; both comparisons ignore
+ * letter case.
+ *
+ * \param alias  The alias (or canonical) name
+ * \return Pointer to struct canon or NULL if not found
+ */
+struct canon *alias_canonicalise(const char *alias)
+{
+	struct canon *canon;
+	struct alias *entry;
+	int bucket, length;
+
+	if (alias == NULL)
+		return NULL;
+
+	length = strlen(alias);
+	bucket = hash_val(alias);
+
+	/* Is it already a canonical name? */
+	for (canon = canon_tab[bucket]; canon != NULL; canon = canon->next) {
+		if (canon->name_len == length &&
+				strcasecmp(canon->name, alias) == 0)
+			return canon;
+	}
+
+	/* Otherwise, is it a known alias? */
+	for (entry = alias_tab[bucket]; entry != NULL; entry = entry->next) {
+		if (entry->name_len == length &&
+				strcasecmp(entry->name, alias) == 0)
+			return entry->canon;
+	}
+
+	return NULL;
+}
+
+/**
+ * Find the MIB enum value belonging to an encoding name
+ *
+ * \param alias  The alias (or canonical name) to look up
+ * \return The MIB enum value, or 0 if the name is NULL or unknown
+ */
+short mibenum_from_name(const char *alias)
+{
+	struct canon *canon = NULL;
+
+	if (alias != NULL)
+		canon = alias_canonicalise(alias);
+
+	return canon != NULL ? canon->mib_enum : 0;
+}
+
+/**
+ * Find the canonical encoding name for a MIB enum value
+ *
+ * Every bucket of the canonical table is scanned in order; the first
+ * entry carrying the requested value wins.
+ *
+ * \param mibenum  The MIB enum value
+ * \return Pointer to canonical name, or NULL if not found
+ */
+const char *mibenum_to_name(short mibenum)
+{
+	int bucket;
+
+	for (bucket = 0; bucket != HASH_SIZE; bucket++) {
+		struct canon *canon = canon_tab[bucket];
+
+		while (canon != NULL) {
+			if (canon->mib_enum == mibenum)
+				return canon->name;
+			canon = canon->next;
+		}
+	}
+
+	return NULL;
+}
diff --git a/src/eightbit.c b/src/eightbit.c
new file mode 100644
index 0000000..3ff3470
--- /dev/null
+++ b/src/eightbit.c
@@ -0,0 +1,280 @@
+/* stateless 8bit encoding support => no support for CP1255, 1258 or TCVN
+ * functions in this file have an identical API to the encoding functions
+ * in UnicodeLib. see unicode/encoding.h for documentation. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "internal.h"
+
+struct table_entry {
+ const char *canon; /* canonical encoding name, compared case-insensitively */
+ const char *filename; /* leaf appended to "Unicode:Encodings."; 0 => no table file */
+};
+
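+/* Maps canonical encoding names to their mapping-table files. A null
+ * filename means the encoding has no table to load (US-ASCII).
+ * Table should be ordered by enc_num. */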
+static const struct table_entry mapping_table[] = {
+ { "US-ASCII", 0 },
+ { "HP-ROMAN8", "HPR8" },
+ { "MACINTOSH", "Apple.Roman"},
+ { "IBM437", "Microsoft.CP437" },
+ { "IBM775", "Microsoft.CP775" },
+ { "IBM850", "Microsoft.CP850" },
+ { "IBM852", "Microsoft.CP852" },
+ { "IBM855", "Microsoft.CP855" },
+ { "IBM857", "Microsoft.CP857" },
+ { "IBM860", "Microsoft.CP860" },
+ { "IBM861", "Microsoft.CP861" },
+ { "IBM862", "Microsoft.CP862" },
+ { "IBM863", "Microsoft.CP863" },
+ { "IBM864", "Microsoft.CP864" },
+ { "IBM865", "Microsoft.CP865" },
+ { "IBM866", "Microsoft.CP866" },
+ { "IBM869", "Microsoft.CP869" },
+ { "KOI8-R", "KOI8-R" },
+ { "KOI8-U", "KOI8-U" },
+ { "IBM00858", "Microsoft.CP858" },
+ { "WINDOWS-1250", "Microsoft.CP1250" },
+ { "WINDOWS-1251", "Microsoft.CP1251" },
+ { "WINDOWS-1252", "Microsoft.CP1252" },
+ { "WINDOWS-1253", "Microsoft.CP1253" },
+ { "WINDOWS-1254", "Microsoft.CP1254" },
+ { "WINDOWS-1256", "Microsoft.CP1256" },
+ { "WINDOWS-1257", "Microsoft.CP1257" },
+ { "CP737", "Microsoft.CP737" },
+ { "CP853", "Microsoft.CP853" },
+ { "CP856", "Microsoft.CP856" },
+ { "CP874", "Microsoft.CP874" },
+ { "CP922", "Microsoft.CP922" },
+ { "CP1046", "Microsoft.CP1046" },
+ { "CP1124", "Microsoft.CP1124" },
+ { "CP1125", "Microsoft.CP1125" },
+ { "CP1129", "Microsoft.CP1129" },
+ { "CP1133", "Microsoft.CP1133" },
+ { "CP1161", "Microsoft.CP1161" },
+ { "CP1162", "Microsoft.CP1162" },
+ { "CP1163", "Microsoft.CP1163" },
+ { "GEORGIAN-ACADEMY", "GeorgA" },
+ { "GEORGIAN-PS", "GeorgPS" },
+ { "KOI8-RU", "KOI8-RU" },
+ { "KOI8-T", "KOI8-T" },
+ { "MACARABIC", "Apple.Arabic" },
+ { "MACCROATIAN", "Apple.Croatian" },
+ { "MACGREEK", "Apple.Greek" },
+ { "MACHEBREW", "Apple.Hebrew" },
+ { "MACICELAND", "Apple.Iceland" },
+ { "MACROMANIA", "Apple.Romania" },
+ { "MACTHAI", "Apple.Thai" },
+ { "MACTURKISH", "Apple.Turkish" },
+ { "MULELAO-1", "Mulelao" },
+ { "MACCYRILLIC", "Apple.Cyrillic" },
+ { "MACUKRAINE", "Apple.Ukrainian" },
+ { "MACCENTRALEUROPE", "Apple.CentEuro" },
+};
+
+#define TABLE_SIZE (sizeof(mapping_table) / sizeof(mapping_table[0]))
+
+/**
+ * Look up the number of an 8bit encoding, based on its name
+ *
+ * The name is canonicalised and then searched for in mapping_table. A
+ * match yields the encoding's MIB enum with bit 30 set, which marks the
+ * number as belonging to the 8bit code in this file.
+ *
+ * \param name  The encoding name
+ * \return The encoding number, or 0 if not found
+ */
+int iconv_eightbit_number_from_name(const char *name)
+{
+	struct canon *canon;
+	int idx;
+
+	if (name == NULL)
+		return 0;
+
+	canon = alias_canonicalise(name);
+	if (canon == NULL)
+		return 0;
+
+	LOG(("searching for: %s", name));
+
+	for (idx = 0; idx != TABLE_SIZE; idx++) {
+		if (strcasecmp(mapping_table[idx].canon, canon->name) != 0)
+			continue;
+
+		LOG(("found: %d", canon->mib_enum | (1<<30)));
+		return canon->mib_enum | (1<<30);
+	}
+
+	return 0;
+}
+
+/**
+ * Read an 8bit encoded string
+ *
+ * Bytes below 0x80 are passed through as ASCII. Bytes from 0x80 upwards
+ * are translated through e->intab; reading stops at the first byte that
+ * has no mapping (table entry 0xffff, or no table loaded at all), or as
+ * soon as the callback returns non-zero.
+ *
+ * \param e         The encoding context
+ * \param callback  Callback function to handle generated UCS characters
+ * \param s         The input string
+ * \param n         The length (in bytes) of the input
+ * \param handle    Callback private data pointer
+ * \return The number of characters processed
+ */
+unsigned iconv_eightbit_read(struct encoding_context *e,
+		int (*callback)(void *handle, UCS4 c), const char *s,
+		unsigned int n, void *handle)
+{
+	UCS4 c;
+	unsigned int pos;
+
+	if (!e || !callback || !s)
+		return 0;
+
+	for (pos = 0; pos != n; pos++) {
+		/* Read the byte as unsigned: where plain char is signed,
+		 * bytes >= 0x80 would otherwise turn into huge values that
+		 * match neither range below and be dropped silently. */
+		c = (unsigned char) s[pos];
+
+		LOG(("read: %d (%d)", c, pos));
+
+		if (c >= 0x80) {
+			/* No table (plain ASCII source) or no mapping:
+			 * character not defined in this encoding */
+			if (!e->intab || e->intab[c - 0x80] == 0xffff)
+				return pos;
+
+			LOG(("maps to: %x", e->intab[c - 0x80]));
+			/* Look up in mapping table */
+			c = e->intab[c - 0x80];
+		}
+
+		if (callback(handle, c))
+			break;
+	}
+
+	return pos;
+}
+
+/**
+ * Write a UCS character in an 8bit encoding
+ *
+ * One byte of buffer space is claimed before anything else happens. If
+ * that drives *bufsize negative, it is left negative and 0 is returned.
+ * If the character turns out to be unrepresentable, the claimed byte is
+ * handed back.
+ *
+ * \param e        The encoding context
+ * \param c        The UCS4 character
+ * \param buf      Indirect pointer to output buffer
+ * \param bufsize  Pointer to size of output buffer
+ * \return 1 on success, 0 if bufsize is too small, -1 if unrepresentable.
+ */
+int iconv_eightbit_write(struct encoding_context *e, UCS4 c,
+		char **buf, int *bufsize)
+{
+	int idx;
+
+	/* sanity check input */
+	if (e == NULL || bufsize == NULL || buf == NULL || *buf == NULL)
+		return 0;
+
+	/* buffer full */
+	*bufsize -= 1;
+	if (*bufsize < 0)
+		return 0;
+
+	if (c < 0x0080) {
+		/* ASCII */
+		*(*buf)++ = (char)c;
+	} else {
+		/* Perform reverse table lookup */
+		idx = 0;
+		while (idx != 0x80 && !(e->outtab && e->outtab[idx] == c))
+			idx++;
+
+		if (idx == 0x80) {
+			/* Nothing was written => fixup bufsize */
+			++*bufsize;
+			return -1;
+		}
+
+		*(*buf)++ = (char)(idx+0x80);
+	}
+
+	LOG(("written: %d", *(*buf-1)));
+
+	return 1;
+}
+
+/**
+ * Load an 8bit encoding
+ *
+ * The encoding's canonical name is mapped to a file beneath
+ * "Unicode:Encodings." which must be exactly 256 bytes long; its
+ * contents are read verbatim into a table of 128 shorts.
+ *
+ * \param enc_num  The encoding number to load
+ * \return Pointer to lookup table for encoding (to be released with
+ *         free()), or NULL on error or if the encoding has no table
+ */
+unsigned short *iconv_eightbit_new(int enc_num)
+{
+	char filename[64];
+	const char *name;
+	FILE *fp;
+	long len;
+	unsigned int i;
+	unsigned short *ret;
+
+	name = mibenum_to_name(enc_num);
+	if (!name)
+		return NULL;
+
+	/* Lookup filename in table */
+	for (i = 0; i != TABLE_SIZE; i++)
+		if (strcasecmp(mapping_table[i].canon, name) == 0)
+			break;
+
+	/* Unknown encoding, or one with no table: without this check an
+	 * unlisted name would leave filename uninitialised below. */
+	if (i == TABLE_SIZE || mapping_table[i].filename == NULL)
+		return NULL;
+
+	snprintf(filename, sizeof filename, "Unicode:Encodings.%s",
+			mapping_table[i].filename);
+
+	LOG(("opening: %s", filename));
+
+	/* Open */
+	fp = fopen(filename, "rb");
+	if (!fp) {
+		return NULL;
+	}
+
+	/* Get extent; unexpected length (or seek failure) => give up */
+	if (fseek(fp, 0, SEEK_END) != 0 ||
+			(len = ftell(fp)) != 256 ||
+			fseek(fp, 0, SEEK_SET) != 0) {
+		fclose(fp);
+		return NULL;
+	}
+
+	/* Create buffer */
+	ret = calloc(128, sizeof(short));
+	if (!ret) {
+		fclose(fp);
+		return NULL;
+	}
+
+	/* A short read would leave a partially filled table behind */
+	if (fread(ret, sizeof(short), 128, fp) != 128) {
+		free(ret);
+		ret = NULL;
+	}
+
+	fclose(fp);
+
+	return ret;
+}
+
+/**
+ * Delete any 8bit encodings used by a context
+ *
+ * Frees the input and output lookup tables and clears the pointers, so
+ * calling this again on the same context is harmless.
+ *
+ * \param e  The encoding context
+ */
+void iconv_eightbit_delete(struct encoding_context *e)
+{
+	if (!e)
+		return;
+
+	/* free(NULL) is a no-op, so no guards are needed */
+	free(e->intab);
+	e->intab = NULL;
+
+	free(e->outtab);
+	e->outtab = NULL;
+}
diff --git a/src/iconv.c b/src/iconv.c
new file mode 100644
index 0000000..aa18fa5
--- /dev/null
+++ b/src/iconv.c
@@ -0,0 +1,457 @@
+/* iconv implementation - see iconv.h for docs */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/errno.h>
+
+#include <unicode/charsets.h>
+#include <unicode/encoding.h>
+
+#include <iconv/iconv.h>
+
+#include "internal.h"
+
+static struct encoding_context *context_list;
+
+static int character_callback(void *handle, UCS4 c);
+static void parse_parameters(struct encoding_context *e, const char *params,
+ bool destination);
+static void parse_parameter(struct encoding_context *e, const char *param,
+ int length, bool destination);
+
+/**
+ * Initialise the library: load the alias data, then start UnicodeLib
+ *
+ * \param aliases_file  Path to the Aliases file
+ * \return true on success, false if no path was given or loading failed
+ */
+int iconv_initialise(const char *aliases_file)
+{
+	if (aliases_file != NULL &&
+			create_alias_data(aliases_file) != false) {
+		encoding_initialise();
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * Finalise the library
+ *
+ * Destroys every conversion context that is still open, releases the
+ * alias data and shuts UnicodeLib down.
+ */
+void iconv_finalise(void)
+{
+	struct encoding_context *a, *b;
+
+	/* clients may quit / die without cleaning up. */
+	for (a = context_list; a; a = b) {
+		b = a->next;
+		if (a->in)
+			encoding_delete(a->in);
+		if (a->out)
+			encoding_delete(a->out);
+		iconv_eightbit_delete(a);
+		free(a);
+	}
+
+	/* Every context is gone: empty the list so that a stale descriptor
+	 * passed to iconv() or iconv_close() afterwards is rejected rather
+	 * than matched against freed memory. */
+	context_list = NULL;
+
+	free_alias_data();
+
+	/* finalise the unicode library */
+	encoding_tidyup();
+}
+
+/**
+ * Split an iconv code string into its encoding name and parameters
+ *
+ * \param code  The code string, e.g. "ISO-8859-1//TRANSLIT"
+ * \param name  Buffer to receive the part before the first '/'
+ *              (truncated to fit)
+ * \param size  Size of name buffer, in bytes (must be non-zero)
+ * \return Pointer to the text following "//", or NULL if there is none
+ */
+static const char *split_code(const char *code, char *name, size_t size)
+{
+	const char *slash = strchr(code, '/');
+	size_t len = slash ? (size_t) (slash - code) : strlen(code);
+
+	if (len >= size)
+		len = size - 1;
+
+	memcpy(name, code, len);
+	name[len] = '\0';
+
+	if (slash && *(slash + 1) == '/' && *(slash + 2) != '\0')
+		return slash + 2;
+
+	return NULL;
+}
+
+/**
+ * Map an endian-specific UTF-16/UTF-32 name onto a UnicodeLib encoding
+ *
+ * \param name      The encoding name
+ * \param force_le  Set to true if the name selects little endian output;
+ *                  left untouched otherwise
+ * \return The UnicodeLib encoding number, or 0 if name is not such a variant
+ */
+static int endian_variant_from_name(const char *name, bool *force_le)
+{
+	struct canon *c = alias_canonicalise(name);
+
+	if (!c)
+		return 0;
+
+	switch (c->mib_enum) {
+	case 1013: /* UTF-16BE */
+		return csUnicode11;
+	case 1014: /* UTF-16LE */
+		*force_le = true;
+		return csUnicode11;
+	case 1018: /* UTF-32BE */
+		return csUCS4;
+	case 1019: /* UTF-32LE */
+		*force_le = true;
+		return csUCS4;
+	default:
+		return 0;
+	}
+}
+
+/**
+ * Set the endianness / header flags on a freshly created encoding
+ *
+ * \param enc       The UnicodeLib encoding
+ * \param name      The encoding name it was created from
+ * \param force_le  Whether little endian was requested
+ */
+static void apply_encoding_flags(Encoding *enc, const char *name,
+		bool force_le)
+{
+	unsigned int flags = 0;
+	struct canon *c;
+
+	if (force_le)
+		flags |= encoding_FLAG_LITTLE_ENDIAN;
+
+	c = alias_canonicalise(name);
+	if (c && (c->mib_enum == csUCS4 || c->mib_enum == csUnicode))
+		flags |= encoding_FLAG_NO_HEADER;
+
+	encoding_set_flags(enc, flags, flags);
+}
+
+/**
+ * Create a conversion descriptor
+ *
+ * Each code string may carry "//"-introduced parameters after the
+ * encoding name. Names are resolved first through the 8bit tables in
+ * this library, then through UnicodeLib, and finally as endian-specific
+ * UTF-16/UTF-32 variants.
+ *
+ * \param tocode    Destination encoding
+ * \param fromcode  Source encoding
+ * \return The descriptor, or (iconv_t)(-1) with errno set to EINVAL
+ *         (missing or unrecognised encoding) or ENOMEM
+ */
+iconv_t iconv_open(const char *tocode, const char *fromcode)
+{
+	int to = 0, from = 0;
+	struct encoding_context *e;
+	bool to_force_le = false, from_force_le = false;
+	char totemp[128], fromtemp[128];
+	const char *params;
+
+	/* can't do anything without these */
+	if (!tocode || !fromcode) {
+		errno = EINVAL;
+		return (iconv_t)(-1);
+	}
+
+	e = calloc(1, sizeof(*e));
+	if (!e) {
+		LOG(("malloc failed"));
+		errno = ENOMEM;
+		return (iconv_t)(-1);
+	}
+
+	/* strip any parameters off the end of the tocode string,
+	 * and parse them */
+	params = split_code(tocode, totemp, sizeof totemp);
+	if (params)
+		parse_parameters(e, params, true);
+
+	/* likewise for the fromcode string */
+	params = split_code(fromcode, fromtemp, sizeof fromtemp);
+	if (params)
+		parse_parameters(e, params, false);
+
+	/* try our own 8bit charset code first */
+	to = iconv_eightbit_number_from_name(totemp);
+	from = iconv_eightbit_number_from_name(fromtemp);
+
+	/* if that failed, try the UnicodeLib functionality */
+	if (!to)
+		to = iconv_encoding_number_from_name(totemp);
+
+	if (!from)
+		from = iconv_encoding_number_from_name(fromtemp);
+
+	/* if that failed, perhaps it was an endian-specific variant of
+	 * something UnicodeLib can handle? */
+	if (!to)
+		to = endian_variant_from_name(totemp, &to_force_le);
+
+	if (!from)
+		from = endian_variant_from_name(fromtemp, &from_force_le);
+
+	LOG(("to: %d(%s) from: %d(%s)", to, totemp, from, fromtemp));
+
+	/* ensure both encodings are recognised */
+	if (to == 0 || from == 0) {
+		free(e);
+		errno = EINVAL;
+		return (iconv_t)(-1);
+	}
+
+	/* bit 30 set indicates that this is an 8bit encoding */
+	if (from & (1<<30))
+		e->intab = iconv_eightbit_new(from & ~(1<<30));
+	else {
+		e->in = encoding_new(from, encoding_READ);
+		if (e->in)
+			apply_encoding_flags(e->in, fromtemp, from_force_le);
+	}
+
+	/* neither created => memory error or somesuch. assume ENOMEM */
+	/* no table is ever generated for ASCII */
+	if (!e->in && !e->intab && (from & ~(1<<30)) != csASCII) {
+		free(e);
+		errno = ENOMEM;
+		return (iconv_t)(-1);
+	}
+
+	if (to & (1<<30))
+		e->outtab = iconv_eightbit_new(to & ~(1<<30));
+	else {
+		e->out = encoding_new(to, encoding_WRITE_STRICT);
+		if (e->out)
+			apply_encoding_flags(e->out, totemp, to_force_le);
+	}
+
+	/* neither created => ENOMEM */
+	if (!e->out && !e->outtab && (to & ~(1<<30)) != csASCII) {
+		if (e->in)
+			encoding_delete(e->in);
+		iconv_eightbit_delete(e);
+		free(e);
+		errno = ENOMEM;
+		return (iconv_t)(-1);
+	}
+
+	/* add to list */
+	e->prev = 0;
+	e->next = context_list;
+	if (context_list)
+		context_list->prev = e;
+	context_list = e;
+
+	return (iconv_t)e;
+}
+
+/**
+ * Convert a buffer of text
+ *
+ * \param cd            Descriptor returned by iconv_open()
+ * \param inbuf         Indirect pointer to input; if it (or *inbuf) is
+ *                      NULL, the input encoding is reset instead
+ * \param inbytesleft   Pointer to number of input bytes remaining
+ * \param outbuf        Indirect pointer to output buffer
+ * \param outbytesleft  Pointer to space remaining in output buffer
+ * \return 0 if all input was consumed, otherwise (size_t)(-1) with errno
+ *         set to EINVAL, E2BIG or EILSEQ
+ */
+size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
+		size_t *outbytesleft)
+{
+	struct encoding_context *e;
+	unsigned read;
+
+	/* search for cd in list */
+	for (e = context_list; e; e = e->next)
+		if (e == (struct encoding_context *)cd)
+			break;
+
+	/* not found => invalid */
+	if (!e) {
+		errno = EINVAL;
+		return (size_t)(-1);
+	}
+
+	if (inbuf == NULL || *inbuf == NULL) {
+		if (e->in)
+			encoding_reset(e->in);
+		return 0;
+	}
+
+	/* Is there any point doing anything?
+	 * (inbytesleft is dereferenced below, so it must be valid too) */
+	if (!inbytesleft || !outbuf || !(*outbuf) || !outbytesleft) {
+		errno = EINVAL;
+		return (size_t)(-1);
+	}
+
+	e->outbuf = outbuf;
+	e->outbytesleft = outbytesleft;
+
+	LOG(("reading"));
+
+	if (e->in)
+		read = encoding_read(e->in, character_callback, *inbuf,
+				*inbytesleft, e);
+	else
+		read = iconv_eightbit_read(e, character_callback, *inbuf,
+				*inbytesleft, e);
+
+	LOG(("done"));
+
+	/* size_t values are cast so they match the format specifiers */
+	LOG(("read: %u, ibl: %d, obl: %d", read, (int) *inbytesleft,
+			(int) *outbytesleft));
+
+	/* 2 */
+	if (read == *inbytesleft) {
+		*inbuf += read;
+		*inbytesleft = 0;
+		return 0;
+	}
+	/* 4: the write functions leave the free space count negative
+	 * when they run out of room */
+	else if ((int)*outbytesleft < 0) {
+		LOG(("e2big"));
+		*outbytesleft = 0;
+		*inbuf += read - 1;
+		*inbytesleft -= read - 1;
+		errno = E2BIG;
+	}
+	/** \todo find a mechanism for distinguishing between 1 & 3 */
+	/* 1 */
+	else if (read != *inbytesleft) {
+		*inbuf += read;
+		*inbytesleft -= read;
+		LOG(("eilseq"));
+		errno = EILSEQ;
+	}
+	/* 3: currently unreachable - case 1 catches everything that
+	 * gets this far (see todo above) */
+	else if ((int)*outbytesleft >= 0) {
+		*inbuf += read;
+		*inbytesleft -= read;
+		LOG(("einval"));
+		errno = EINVAL;
+	}
+
+	LOG(("errno: %d", errno));
+
+	return (size_t)(-1);
+}
+
+/**
+ * Destroy a conversion descriptor
+ *
+ * \param cd  Descriptor returned by iconv_open()
+ * \return 0 on success, -1 with errno set to EBADF if cd is not a
+ *         currently open descriptor
+ */
+int iconv_close(iconv_t cd)
+{
+	struct encoding_context *e;
+
+	/* search for cd in list */
+	for (e = context_list; e; e = e->next)
+		if (e == (struct encoding_context *)cd)
+			break;
+
+	/* not found => invalid; report it rather than claiming success */
+	if (!e) {
+		errno = EBADF;
+		return -1;
+	}
+
+	if (e->in)
+		encoding_delete(e->in);
+	if (e->out)
+		encoding_delete(e->out);
+	iconv_eightbit_delete(e);
+
+	/* remove from list */
+	if (e->next)
+		e->next->prev = e->prev;
+	if (e->prev)
+		e->prev->next = e->next;
+	else
+		context_list = e->next;
+
+	free(e);
+
+	/* reduce our memory usage somewhat */
+	encoding_table_remove_unused(8 /* recommended value */);
+
+	return 0;
+}
+
+/**
+ * Write one converted character to the output buffer
+ *
+ * This is called for each converted character. The character is written
+ * with UnicodeLib if the context has an output Encoding, and with the
+ * 8bit writer otherwise.
+ *
+ * If the writer reports the character as unrepresentable (-1), then:
+ *  - with transliteration enabled and output space remaining, a '?' is
+ *    written in its place (after resetting the UnicodeLib write state);
+ *  - otherwise the character is dropped and treated as written.
+ *
+ * \param handle  The struct encoding_context for this conversion
+ * \param c       The UCS4 character to write
+ * \return 0 to continue reading, non-zero to stop (the write returned 0)
+ */
+int character_callback(void *handle, UCS4 c)
+{
+ struct encoding_context *e;
+ int ret;
+
+ e = (struct encoding_context*)handle;
+
+ LOG(("outbuf: %p, free: %d", *e->outbuf, *e->outbytesleft));
+ LOG(("writing: %d", c));
+
+ if (e->out) {
+ char *prev_outbuf = *e->outbuf;
+ size_t prev_outbytesleft = *e->outbytesleft;
+
+ ret = encoding_write(e->out, c, e->outbuf,
+ (int*)e->outbytesleft);
+
+ LOG(("ret: %d", ret));
+
+ /* Why the need for this nonsense? UnicodeLib appears to
+ * decrease the count of free space in the buffer even
+ * if it doesn't write into it. This is a bug, as the
+ * documentation says that the buffer pointer AND free
+ * space count are left unmodified if nothing is written.
+ * Therefore, we have this hack until UnicodeLib gets fixed.
+ */
+ if (ret == -1) {
+ *e->outbytesleft = prev_outbytesleft -
+ (*e->outbuf - prev_outbuf);
+ }
+ } else {
+ ret = iconv_eightbit_write(e, c, e->outbuf,
+ (int*)e->outbytesleft);
+ }
+
+ if (ret == -1) {
+ /* Transliterate, if we've been asked to.
+ * Assumes that output is 8bit/8bit multibyte with ASCII G0.
+ * This should be fine as the only <>8bit encodings are
+ * UCS{2,4}, UTF-{16,32}, neither of which return -1.
+ * Also, afaiaa, all supported multibyte encodings are ASCII
+ * compatible. */
+ /** \todo Actually perform some kind of transliteration */
+ if (e->transliterate && (int)*e->outbytesleft > 0) {
+ if (e->out) {
+ /* Reset encoding write state */
+ /** \todo this is a bit dodgy, as we only
+ * really need to ensure that the ASCII set
+ * is mapped into G0 in ISO2022 encodings.
+ * This will reset G1->G3, too, which may
+ * break things. If so, we may have to
+ * perform some dirty hackery which relies
+ * upon knowledge of UnicodeLib's internals
+ */
+ encoding_write(e->out, NULL_UCS4, e->outbuf,
+ (int*)e->outbytesleft);
+ }
+
+ if ((int)*e->outbytesleft > 0) {
+ *(*e->outbuf)++ = '?';
+ --*e->outbytesleft;
+
+ ret = 1;
+ } else {
+ ret = 0;
+ }
+ } else {
+ ret = 1;
+ }
+ }
+
+ return (!ret);
+}
+
+/**
+ * Parse the parameter list that follows an encoding name
+ *
+ * The list is a sequence of parameters separated by runs of '/', e.g.
+ * "TRANSLIT//IGNORE". Each non-empty parameter is handed to
+ * parse_parameter().
+ *
+ * \param e            The encoding context to configure
+ * \param params       The parameter list (text after the first "//")
+ * \param destination  true if the list came from the destination code
+ */
+void parse_parameters(struct encoding_context *e, const char *params,
+		bool destination)
+{
+	const char *start = params;
+
+	if (!e || !params)
+		return;
+
+	while (*start != '\0') {
+		/* Search from the current parameter onwards; restarting
+		 * from the head of the list would find the same separator
+		 * over and over and never terminate. */
+		const char *slash = strchr(start, '/');
+		size_t len = slash ? (size_t) (slash - start) : strlen(start);
+
+		if (len > 0)
+			parse_parameter(e, start, (int) len, destination);
+
+		if (!slash)
+			break;
+
+		/* step over the separator */
+		start = slash;
+		while (*start == '/')
+			start++;
+	}
+}
+
+/**
+ * Act upon a single parameter
+ *
+ * The only parameter recognised is "TRANSLIT" (in any letter case), and
+ * then only on the destination encoding; anything else is ignored.
+ *
+ * \param e            The encoding context to configure
+ * \param param        Start of the parameter text (not NUL terminated)
+ * \param length       Length of the parameter, in bytes
+ * \param destination  true if the parameter came from the destination code
+ */
+void parse_parameter(struct encoding_context *e, const char *param,
+		int length, bool destination)
+{
+	if (length != 8)
+		return;
+
+	if (strncasecmp(param, "TRANSLIT", 8) != 0)
+		return;
+
+	if (destination)
+		e->transliterate = 1;
+}
+
diff --git a/src/internal.h b/src/internal.h
new file mode 100644
index 0000000..d19bd09
--- /dev/null
+++ b/src/internal.h
@@ -0,0 +1,58 @@
+#ifndef _ICONV_INTERNAL_H_
+#define _ICONV_INTERNAL_H_
+
+#ifndef unicode_encoding_h
+#include <unicode/encoding.h>
+#endif
+
+#ifndef DEBUG
+#define LOG(x)
+#else
+#define LOG(x) (printf(__FILE__ " %s %i: ", __func__, __LINE__), printf x, fputc('\n', stdout))
+#endif
+
+#define UNUSED(x) ((x) = (x))
+
+struct encoding_context {
+ Encoding *in; /* UnicodeLib reader; NULL when the source is an 8bit encoding */
+ Encoding *out; /* UnicodeLib writer; NULL when the destination is an 8bit encoding */
+ unsigned short *intab, *outtab; /* 8bit lookup tables from iconv_eightbit_new(), or NULL */
+ char **outbuf; /* caller's output pointer, set on each iconv() call */
+ size_t *outbytesleft; /* caller's output space count, set on each iconv() call */
+ char transliterate; /* non-zero if TRANSLIT was requested for the destination */
+ struct encoding_context *prev, *next; /* links in the list of open contexts */
+};
+
+/* in eightbit.c */
+int iconv_eightbit_number_from_name(const char *name);
+unsigned iconv_eightbit_read(struct encoding_context *e,
+ int (*callback)(void *handle, UCS4 c), const char *s,
+ unsigned int n, void *handle);
+int iconv_eightbit_write(struct encoding_context *e, UCS4 c,
+ char **buf, int *bufsize);
+unsigned short *iconv_eightbit_new(int enc_num);
+void iconv_eightbit_delete(struct encoding_context *e);
+
+/* in alias.c */
+int iconv_encoding_number_from_name(const char *name);
+const char *iconv_encoding_name_from_number(int number);
+
+struct canon {
+ struct canon *next; /* next canonical form in the same hash bucket */
+ short mib_enum; /* MIB enum value read from the Aliases file */
+ unsigned short name_len; /* length of name, excluding the terminator */
+ char name[1]; /* canonical name; storage is over-allocated by create_canon() */
+};
+
+/* in aliases.c */
+int create_alias_data(const char *filename);
+void free_alias_data(void);
+struct canon *alias_canonicalise(const char *alias);
+short mibenum_from_name(const char *alias);
+const char *mibenum_to_name(short mibenum);
+
+/* in utils.c */
+int strcasecmp(const char *s1, const char *s2);
+int strncasecmp(const char *s1, const char *s2, size_t len);
+
+#endif
diff --git a/src/utils.c b/src/utils.c
new file mode 100644
index 0000000..5403816
--- /dev/null
+++ b/src/utils.c
@@ -0,0 +1,53 @@
+#include <ctype.h>
+
+#include "internal.h"
+
+/**
+ * Case insensitive string comparison
+ *
+ * \param s1  Pointer to string
+ * \param s2  Pointer to string
+ * \return 0 if strings match, <> 0 if no match (1 if either is NULL)
+ */
+int strcasecmp(const char *s1, const char *s2)
+{
+	int c1, c2;
+
+	if (!s1 || !s2)
+		return 1; /* this is arbitrary */
+
+	if (s1 == s2)
+		return 0;
+
+	do {
+		/* tolower() is only defined for unsigned char values and
+		 * EOF, so plain (possibly negative) chars must be
+		 * converted first */
+		c1 = tolower((unsigned char) *s1++);
+		c2 = tolower((unsigned char) *s2++);
+	} while (c1 != '\0' && c1 == c2);
+
+	return c1 - c2;
+}
+
+/**
+ * Length-limited case insensitive string comparison
+ *
+ * At most len characters are compared; comparison also stops at the end
+ * of either string.
+ *
+ * \param s1   Pointer to string
+ * \param s2   Pointer to string
+ * \param len  Length to compare
+ * \return 0 if strings match, <> 0 if no match (1 if either is NULL)
+ */
+int strncasecmp(const char *s1, const char *s2, size_t len)
+{
+	if (!s1 || !s2)
+		return 1; /* this is arbitrary */
+
+	if (s1 == s2)
+		return 0;
+
+	/* Compare strictly within the first len characters: looking at
+	 * the character after a fully matching prefix would report e.g.
+	 * ("TRANSLIT//IGNORE", "TRANSLIT", 8) as a mismatch. */
+	for (; len > 0; len--, s1++, s2++) {
+		/* tolower() requires unsigned char values */
+		int c1 = tolower((unsigned char) *s1);
+		int c2 = tolower((unsigned char) *s2);
+
+		if (c1 != c2)
+			return c1 - c2;
+
+		if (c1 == '\0')
+			break;
+	}
+
+	return 0;
+}