Import Iconv sources

svn path=/trunk/iconv/; revision=5677
author: John Mark Bell <jmb@netsurf-browser.org> 2008-11-10 18:43:09 +0000
committer: John Mark Bell <jmb@netsurf-browser.org> 2008-11-10 18:43:09 +0000
commit: f8d8287cdbd7da9cd9392bcddf04860a10fa598e (patch)
tree: 668b4cc601fdfd050a51095d4f9bbebef9eaffec /src
download: iconv-f8d8287cdbd7da9cd9392bcddf04860a10fa598e.tar.gz
iconv-f8d8287cdbd7da9cd9392bcddf04860a10fa598e.tar.bz2
7 files changed, 1350 insertions, 0 deletions
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..f9d136b
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,49 @@
+# Child makefile fragment
+#
+# Toolchain is provided by top-level makefile
+#
+# Variables provided by top-level makefile
+#
+# COMPONENT		The name of the component
+# EXPORT		The location of the export directory
+# TOP			The location of the source tree root
+# RELEASEDIR		The place to put release objects
+# DEBUGDIR		The place to put debug objects
+#
+# do_include		Canned command sequence to include a child makefile
+#
+# Variables provided by parent makefile:
+#
+# DIR			The name of the directory we're in, relative to $(TOP)
+#
+# Variables we can manipulate:
+#
+# ITEMS_CLEAN		The list of items to remove for "make clean"
+# ITEMS_DISTCLEAN	The list of items to remove for "make distclean"
+# TARGET_TESTS		The list of target names to run for "make test"
+#
+# SOURCES		The list of sources to build for $(COMPONENT)
+#
+# Plus anything from the toolchain
+
+# Push parent directory onto the directory stack.
+# Each nesting level appends ".x" to sp, so dirstack_$(sp) names a distinct
+# variable per depth, in which the parent's value of d is saved.
+sp             := $(sp).x
+dirstack_$(sp) := $(d)
+d              := $(DIR)
+
+# Manipulate include paths: add this directory to the header search path
+CFLAGS := $(CFLAGS) -I$(d)
+
+# Sources (named relative to this directory)
+SRCS_$(d) := alias.c aliases.c eightbit.c iconv.c utils.c
+
+# Append to sources for component.
+# $(d) is prepended with no separator, so DIR presumably ends in "/" --
+# confirm against the top-level makefile.
+SOURCES += $(addprefix $(d), $(SRCS_$(d)))
+
+# Now include any children we may have: every Makefile found one directory
+# level below this one is passed to the do_include sequence
+MAKE_INCLUDES := $(wildcard $(d)*/Makefile)
+$(eval $(foreach INC, $(MAKE_INCLUDES), $(call do_include,$(INC))))
+
+# Finally, pop off the directory stack: restore the saved d, then strip the
+# ".x" suffix that was appended to sp above
+d  := $(dirstack_$(sp))
+sp := $(basename $(sp))
diff --git a/src/alias.c b/src/alias.c
new file mode 100644
index 0000000..ebc1b78
--- /dev/null
+++ b/src/alias.c
@@ -0,0 +1,89 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "unicode/charsets.h"
+#include "unicode/encoding.h"
+
+#include "internal.h"
+
+/* One special-case mapping from a set of accepted names to the single name
+ * that is handed to encoding_number_from_name(). */
+struct table_entry {
+	/* accepted names, each wrapped in '/' so that "/NAME/" can be located
+	 * with strstr() */
+	const char *alias;
+	/* name passed to encoding_number_from_name() on a match */
+	const char *encname;
+};
+
+/* This table contains special cases to allow us to use UnicodeLib sensibly.
+ * Entries 0-3 are referenced by index in iconv_encoding_name_from_number(),
+ * so their order must not change. */
+static const struct table_entry mapping_table[] = {
+	{"/UTF-7/UNICODE-1-1-UTF-7/UNICODE-2-0-UTF-7/", "UTF-7" },
+	{"/ISO-10646-UCS-4/UCS-4/UTF-32/", "ISO-10646-UCS-4" },
+	{"/UTF-16/UCS-2/ISO-10646-UCS-2/UNICODE-1-1/UNICODE-2-0/", "UTF-16" },
+	{"/ISO-2022/", "ISO-2022" },
+};
+
+/* Number of entries in mapping_table */
+#define TABLE_SIZE (sizeof(mapping_table) / sizeof(mapping_table[0]))
+
+/**
+ * Look up an encoding number, based on its name
+ *
+ * The name is first tried against the special cases in mapping_table
+ * (case-insensitively, for ASCII letters only). If none matches, the name
+ * is canonicalised through the aliases data and the canonical name is
+ * handed to UnicodeLib.
+ *
+ * \param name The encoding name
+ * \return The encoding number, or 0 if not found.
+ */
+int iconv_encoding_number_from_name(const char *name)
+{
+	unsigned int i;
+	char buf[256];
+	struct canon *c;
+	int written;
+
+	if (!name)
+		return 0;
+
+	/* The table is searched for "/NAME/". A name that itself contains
+	 * '/' could match across two neighbouring aliases, so such names
+	 * skip the table. */
+	if (strchr(name, '/') == NULL) {
+		written = snprintf(buf, sizeof buf, "/%s/", name);
+
+		/* a truncated key has lost its closing '/', so it cannot
+		 * match anything: don't bother searching */
+		if (written > 0 && (size_t) written < sizeof buf) {
+			/* convert to upper case */
+			for (i = 0; buf[i] != '\0'; i++) {
+				if (buf[i] >= 'a' && buf[i] <= 'z')
+					buf[i] = buf[i] - 32;
+			}
+
+			for (i = 0; i != TABLE_SIZE; i++)
+				if (strstr(mapping_table[i].alias, buf) != NULL)
+					return encoding_number_from_name(
+						mapping_table[i].encname);
+		}
+	}
+
+	c = alias_canonicalise(name);
+	if (!c)
+		return 0;
+
+	return encoding_number_from_name(c->name);
+}
+
+/**
+ * Map an encoding number back to a name
+ *
+ * UnicodeLib offers no call for this, so the special cases are answered
+ * from mapping_table and everything else is delegated to mibenum_to_name().
+ *
+ * NOTE(review): the special cases return the '/'-delimited alias list, not
+ * the encname field -- confirm that callers expect this.
+ *
+ * \param number  The encoding MIB number
+ * \return Pointer to encoding name, or NULL if not found
+ */
+const char *iconv_encoding_name_from_number(int number)
+{
+	switch (number) {
+	case csUnicode11UTF7:
+		return mapping_table[0].alias;
+	case csUCS4:
+		return mapping_table[1].alias;
+	case csUnicode11:
+		return mapping_table[2].alias;
+	case csVenturaMath:
+		return mapping_table[3].alias;
+	default:
+		break;
+	}
+
+	/* not a special case: consult the aliases data */
+	return mibenum_to_name(number);
+}
diff --git a/src/aliases.c b/src/aliases.c
new file mode 100644
index 0000000..1292685
--- /dev/null
+++ b/src/aliases.c
@@ -0,0 +1,364 @@
+#include <ctype.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "internal.h"
+
+/* An alternative name for an encoding, chained into a bucket of alias_tab */
+struct alias {
+	struct alias *next;		/* next alias in the same bucket */
+	struct canon *canon;		/* canonical form this alias names */
+	unsigned short name_len;	/* length of name, excluding the NUL */
+	char name[1];			/* allocated oversize to hold the name */
+};
+
+/* Number of buckets in each table; entries are placed by hash_val() */
+#define HASH_SIZE (43)
+/* Canonical names, hashed by name */
+static struct canon *canon_tab[HASH_SIZE];
+/* Aliases, hashed by name */
+static struct alias *alias_tab[HASH_SIZE];
+
+static bool create_alias(const char *alias, struct canon *c);
+static struct canon *create_canon(const char *canon, short mibenum);
+static int hash_val(const char *alias);
+
+#ifdef TEST
+static void dump_alias_data(void);
+
+/**
+ * Test driver: load the aliases file, dump it, and run a few lookups
+ *
+ * \return 0 always
+ */
+int main (void)
+{
+	struct canon *c;
+
+	create_alias_data("Unicode:Files.Aliases");
+
+	dump_alias_data();
+
+	/* not expected to exist */
+	c = alias_canonicalise("moose");
+	if (c)
+		printf("!!!\n");
+
+	c = alias_canonicalise("csinvariant");
+	if (c)
+		printf("%s %d\n", c->name, c->mib_enum);
+
+	c = alias_canonicalise("nats-sefi-add");
+	if (c) {
+		printf("%s %d\n", c->name, c->mib_enum);
+
+		/* the round trips need a valid canon: c is NULL when the
+		 * aliases file was missing or lacks this entry */
+		printf("%d\n", mibenum_from_name(c->name));
+
+		printf("%s\n", mibenum_to_name(c->mib_enum));
+	}
+
+	free_alias_data();
+
+	return 0;
+}
+#endif
+
+/**
+ * Register an alias for a canonical form
+ *
+ * The new entry is pushed onto the front of its alias_tab bucket. The
+ * canonical form is referenced, not copied.
+ *
+ * \param alias The alias name
+ * \param c The canonical form
+ * \return true on success, false otherwise
+ */
+bool create_alias(const char *alias, struct canon *c)
+{
+	struct alias *entry;
+	size_t n;
+	int bucket;
+
+	if (alias == NULL || c == NULL)
+		return false;
+
+	n = strlen(alias);
+
+	entry = malloc(sizeof(struct alias) + n + 1);
+	if (entry == NULL)
+		return false;
+
+	/* copy the name, terminator included */
+	memcpy(entry->name, alias, n + 1);
+	entry->name_len = n;
+	entry->name[entry->name_len] = '\0';
+	entry->canon = c;
+
+	/* link in at the head of the bucket */
+	bucket = hash_val(alias);
+	entry->next = alias_tab[bucket];
+	alias_tab[bucket] = entry;
+
+	return true;
+}
+
+/**
+ * Register a canonical encoding name
+ *
+ * The new entry is pushed onto the front of its canon_tab bucket.
+ *
+ * \param canon The canonical name
+ * \param mibenum The MIB enum value
+ * \return Pointer to struct canon or NULL on error
+ */
+struct canon *create_canon(const char *canon, short mibenum)
+{
+	struct canon *entry;
+	int bucket, len;
+
+	if (canon == NULL)
+		return NULL;
+
+	len = strlen(canon);
+
+	entry = malloc(sizeof(struct canon) + len + 1);
+	if (entry == NULL)
+		return NULL;
+
+	/* copy the name, terminator included */
+	memcpy(entry->name, canon, (size_t) len + 1);
+	entry->name[len] = '\0';
+	entry->name_len = len;
+	entry->mib_enum = mibenum;
+
+	/* link in at the head of the bucket */
+	bucket = hash_val(canon);
+	entry->next = canon_tab[bucket];
+	canon_tab[bucket] = entry;
+
+	return entry;
+}
+
+/**
+ * Hash a name into a bucket index
+ *
+ * Bit 0x20 of every character is cleared before mixing, so names that
+ * differ only in the case of ASCII letters land in the same bucket.
+ *
+ * \param alias String to hash
+ * \return The hashed value, in the range [0, HASH_SIZE); 0 for NULL
+ */
+int hash_val(const char *alias)
+{
+	unsigned int hash = 5381;
+	size_t pos;
+
+	if (alias == NULL)
+		return 0;
+
+	for (pos = 0; alias[pos] != '\0'; pos++) {
+		/* case insensitive */
+		hash = (hash * 33) ^ (alias[pos] & ~0x20);
+	}
+
+	return hash % HASH_SIZE;
+}
+
+/**
+ * Release every canonical form and alias, leaving both tables empty
+ */
+void free_alias_data(void)
+{
+	int bucket;
+
+	for (bucket = 0; bucket != HASH_SIZE; bucket++) {
+		struct canon *cn = canon_tab[bucket];
+		struct alias *al = alias_tab[bucket];
+
+		while (cn != NULL) {
+			struct canon *following = cn->next;
+			free(cn);
+			cn = following;
+		}
+		canon_tab[bucket] = NULL;
+
+		while (al != NULL) {
+			struct alias *following = al->next;
+			free(al);
+			al = following;
+		}
+		alias_tab[bucket] = NULL;
+	}
+}
+
+#ifdef TEST
+/**
+ * Dump all alias data to stdout
+ *
+ * Prints one "bucket name" line per canonical form and per alias, followed
+ * by a rough size total.
+ */
+void dump_alias_data(void)
+{
+	struct canon *c;
+	struct alias *a;
+	int i;
+	size_t size = 0;
+
+	for (i = 0; i != HASH_SIZE; i++) {
+		for (c = canon_tab[i]; c; c = c->next) {
+			printf("%d %s\n", i, c->name);
+			size += offsetof(struct canon, name) + c->name_len;
+		}
+
+		for (a = alias_tab[i]; a; a = a->next) {
+			printf("%d %s\n", i, a->name);
+			size += offsetof(struct alias, name) + a->name_len;
+		}
+	}
+
+	/* NOTE(review): these add the number of buckets, not the size of
+	 * the tables in bytes -- confirm which was intended */
+	size += (sizeof(canon_tab) / sizeof(canon_tab[0]));
+	size += (sizeof(alias_tab) / sizeof(alias_tab[0]));
+
+	/* size_t must not be printed with %d */
+	printf("%lu\n", (unsigned long) size);
+}
+#endif
+
+/* True if ch ends a token: whitespace or a control character.
+ * The cast keeps the <ctype.h> argument in range where char is signed. */
+static int alias_token_end(char ch)
+{
+	return isspace((unsigned char) ch) || iscntrl((unsigned char) ch);
+}
+
+/**
+ * Create alias data from Aliases file
+ *
+ * Each line has the form "canonical-name mibenum [alias ...]". Lines that
+ * start with '#' are comments. Lines with no mibenum field are ignored.
+ * A line for which the canonical form cannot be created is skipped; the
+ * remaining aliases of a line are dropped if one of them cannot be created.
+ *
+ * \param filename  The path to the Aliases file
+ * \return 1 on success, 0 on failure (no filename, or file not opened).
+ */
+int create_alias_data(const char *filename)
+{
+	char buf[300];
+	FILE *fp;
+
+	if (!filename)
+		return 0;
+
+	fp = fopen(filename, "r");
+	if (!fp)
+		return 0;
+
+	while (fgets(buf, sizeof buf, fp)) {
+		char *p, *aliases = 0, *mib, *end;
+		struct canon *cf;
+		size_t len;
+
+		if (buf[0] == 0 || buf[0] == '#')
+			/* skip blank lines or comments */
+			continue;
+
+		/* lose the terminating newline, if there is one: the last
+		 * line of a file (or an over-long line) may lack it, and
+		 * its final character must then be kept */
+		len = strlen(buf);
+		if (buf[len - 1] == '\n')
+			buf[--len] = 0;
+		end = buf + len;
+
+		/* find end of canonical form */
+		for (p = buf; *p && !alias_token_end(*p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		*p++ = '\0'; /* terminate canonical form */
+
+		/* skip whitespace */
+		for (; *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		mib = p;
+
+		/* find end of mibenum */
+		for (; *p && !alias_token_end(*p); p++)
+			; /* do nothing */
+		if (p < end)
+			*p++ = '\0'; /* terminate mibenum */
+
+		cf = create_canon(buf, atoi(mib));
+		if (!cf)
+			continue;
+
+		/* skip whitespace */
+		for (; p < end && *p && isspace((unsigned char) *p); p++)
+			; /* do nothing */
+		if (p >= end)
+			continue;
+		aliases = p;
+
+		while (p < end) {
+			/* find end of alias */
+			for (; *p && !alias_token_end(*p); p++)
+				; /* do nothing */
+			if (p > end)
+				/* stop if we've gone past the end */
+				break;
+			/* terminate current alias */
+			*p++ = '\0';
+
+			if (!create_alias(aliases, cf))
+				break;
+
+			/* in terminating, we may have advanced
+			 * past the end - check this here */
+			if (p >= end)
+				break;
+
+			/* skip whitespace */
+			for (; *p && isspace((unsigned char) *p); p++)
+				; /* do nothing */
+
+			if (p >= end)
+				/* gone past end => stop */
+				break;
+
+			/* update pointer to current alias */
+			aliases = p;
+		}
+	}
+
+	fclose(fp);
+
+	return 1;
+}
+
+/**
+ * Retrieve the canonical form of an alias name
+ *
+ * Canonical names are searched first, then aliases. Matching is
+ * case-insensitive.
+ *
+ * \param alias The alias name
+ * \return Pointer to struct canon or NULL if not found
+ */
+struct canon *alias_canonicalise(const char *alias)
+{
+	struct canon *cn;
+	struct alias *al;
+	int bucket, wanted;
+
+	if (alias == NULL)
+		return NULL;
+
+	bucket = hash_val(alias);
+	wanted = strlen(alias);
+
+	/* the name may already be a canonical one */
+	for (cn = canon_tab[bucket]; cn != NULL; cn = cn->next) {
+		if (cn->name_len == wanted &&
+				strcasecmp(cn->name, alias) == 0)
+			return cn;
+	}
+
+	/* otherwise, look for it amongst the aliases */
+	for (al = alias_tab[bucket]; al != NULL; al = al->next) {
+		if (al->name_len == wanted &&
+				strcasecmp(al->name, alias) == 0)
+			return al->canon;
+	}
+
+	return NULL;
+}
+
+/**
+ * Retrieve the MIB enum value assigned to an encoding name
+ *
+ * \param alias The alias (or canonical name) to look up
+ * \return The MIB enum value, or 0 if not found
+ */
+short mibenum_from_name(const char *alias)
+{
+	struct canon *found = alias ? alias_canonicalise(alias) : NULL;
+
+	return found ? found->mib_enum : 0;
+}
+
+/**
+ * Retrieve the canonical name of an encoding from the MIB enum
+ *
+ * Every bucket of canon_tab is scanned; the first entry with a matching
+ * MIB enum wins.
+ *
+ * \param mibenum The MIB enum value
+ * \return Pointer to canonical name, or NULL if not found
+ */
+const char *mibenum_to_name(short mibenum)
+{
+	int bucket = 0;
+
+	while (bucket != HASH_SIZE) {
+		struct canon *cn = canon_tab[bucket];
+
+		while (cn != NULL) {
+			if (cn->mib_enum == mibenum)
+				return cn->name;
+			cn = cn->next;
+		}
+
+		bucket++;
+	}
+
+	return NULL;
+}
diff --git a/src/eightbit.c b/src/eightbit.c
new file mode 100644
index 0000000..3ff3470
--- /dev/null
+++ b/src/eightbit.c
@@ -0,0 +1,280 @@
+/* stateless 8bit encoding support => no support for CP1255, 1258 or TCVN
+ * functions in this file have an identical API to the encoding functions
+ * in UnicodeLib. see unicode/encoding.h for documentation. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "internal.h"
+
+/* Associates the canonical name of an 8bit encoding with the file that
+ * holds its mapping table. */
+struct table_entry {
+	/* canonical encoding name, compared with strcasecmp() */
+	const char *canon;
+	/* leafname appended to "Unicode:Encodings.", or 0 if there is no
+	 * table to load */
+	const char *filename;
+};
+
+/* Table should be ordered by enc_num */
+/* NOTE(review): the lookups in this file scan the table linearly, so they
+ * do not rely on that ordering. */
+static const struct table_entry mapping_table[] = {
+	{ "US-ASCII", 0 },
+	{ "HP-ROMAN8", "HPR8" },
+	{ "MACINTOSH", "Apple.Roman"},
+	{ "IBM437", "Microsoft.CP437" },
+	{ "IBM775", "Microsoft.CP775" },
+	{ "IBM850", "Microsoft.CP850" },
+	{ "IBM852", "Microsoft.CP852" },
+	{ "IBM855", "Microsoft.CP855" },
+	{ "IBM857", "Microsoft.CP857" },
+	{ "IBM860", "Microsoft.CP860" },
+	{ "IBM861", "Microsoft.CP861" },
+	{ "IBM862", "Microsoft.CP862" },
+	{ "IBM863", "Microsoft.CP863" },
+	{ "IBM864", "Microsoft.CP864" },
+	{ "IBM865", "Microsoft.CP865" },
+	{ "IBM866", "Microsoft.CP866" },
+	{ "IBM869", "Microsoft.CP869" },
+	{ "KOI8-R", "KOI8-R" },
+	{ "KOI8-U", "KOI8-U" },
+	{ "IBM00858", "Microsoft.CP858" },
+	{ "WINDOWS-1250", "Microsoft.CP1250" },
+	{ "WINDOWS-1251", "Microsoft.CP1251" },
+	{ "WINDOWS-1252", "Microsoft.CP1252" },
+	{ "WINDOWS-1253", "Microsoft.CP1253" },
+	{ "WINDOWS-1254", "Microsoft.CP1254" },
+	{ "WINDOWS-1256", "Microsoft.CP1256" },
+	{ "WINDOWS-1257", "Microsoft.CP1257" },
+	{ "CP737", "Microsoft.CP737" },
+	{ "CP853", "Microsoft.CP853" },
+	{ "CP856", "Microsoft.CP856" },
+	{ "CP874", "Microsoft.CP874" },
+	{ "CP922", "Microsoft.CP922" },
+	{ "CP1046", "Microsoft.CP1046" },
+	{ "CP1124", "Microsoft.CP1124" },
+	{ "CP1125", "Microsoft.CP1125" },
+	{ "CP1129", "Microsoft.CP1129" },
+	{ "CP1133", "Microsoft.CP1133" },
+	{ "CP1161", "Microsoft.CP1161" },
+	{ "CP1162", "Microsoft.CP1162" },
+	{ "CP1163", "Microsoft.CP1163" },
+	{ "GEORGIAN-ACADEMY", "GeorgA" },
+	{ "GEORGIAN-PS", "GeorgPS" },
+	{ "KOI8-RU", "KOI8-RU" },
+	{ "KOI8-T", "KOI8-T" },
+	{ "MACARABIC", "Apple.Arabic" },
+	{ "MACCROATIAN", "Apple.Croatian" },
+	{ "MACGREEK", "Apple.Greek" },
+	{ "MACHEBREW", "Apple.Hebrew" },
+	{ "MACICELAND", "Apple.Iceland" },
+	{ "MACROMANIA", "Apple.Romania" },
+	{ "MACTHAI", "Apple.Thai" },
+	{ "MACTURKISH", "Apple.Turkish" },
+	{ "MULELAO-1", "Mulelao" },
+	{ "MACCYRILLIC", "Apple.Cyrillic" },
+	{ "MACUKRAINE", "Apple.Ukrainian" },
+	{ "MACCENTRALEUROPE", "Apple.CentEuro" },
+};
+
+/* Number of entries in mapping_table */
+#define TABLE_SIZE (sizeof(mapping_table) / sizeof(mapping_table[0]))
+
+/**
+ * Look up the number of an 8bit encoding, based on its name
+ *
+ * The name is canonicalised and then searched for in mapping_table. The
+ * result is the encoding's MIB enum with bit 30 set.
+ *
+ * \param name  The encoding name
+ * \return The encoding number, or 0 if not found
+ */
+int iconv_eightbit_number_from_name(const char *name)
+{
+	struct canon *canon;
+	int idx = 0;
+
+	if (name == NULL)
+		return 0;
+
+	canon = alias_canonicalise(name);
+	if (canon == NULL)
+		return 0;
+
+	LOG(("searching for: %s", name));
+
+	while (idx != TABLE_SIZE) {
+		if (strcasecmp(mapping_table[idx].canon, canon->name) == 0) {
+			LOG(("found: %d", canon->mib_enum | (1<<30)));
+			return canon->mib_enum | (1<<30);
+		}
+		idx++;
+	}
+
+	return 0;
+}
+
+/**
+ * Read an 8bit encoded string
+ *
+ * Bytes below 0x80 are passed through as ASCII. Other bytes are looked up
+ * in e->intab. Conversion stops at the first byte with no mapping (an
+ * undefined table entry, or no table at all), or when the callback returns
+ * non-zero.
+ *
+ * \param e  The encoding context
+ * \param callback  Callback function to handle generated UCS characters
+ * \param s  The input string
+ * \param n  The length (in bytes) of the input
+ * \param handle  Callback private data pointer
+ * \return The number of characters processed; the byte at which conversion
+ *         stopped is not counted
+ */
+unsigned iconv_eightbit_read(struct encoding_context *e,
+		int (*callback)(void *handle, UCS4 c), const char *s,
+		unsigned int n, void *handle)
+{
+	UCS4 c;
+	unsigned int pos;
+
+	if (!e || !callback || !s)
+		return 0;
+
+	for (pos = 0; pos != n; pos++) {
+		/* read as unsigned: where plain char is signed, bytes
+		 * >= 0x80 would otherwise turn into huge values and match
+		 * neither case below */
+		c = (unsigned char) s[pos];
+
+		LOG(("read: %d (%d)", c, pos));
+
+		if (c < 0x80) {
+			/* ASCII */
+			if (callback(handle, c))
+				break;
+			continue;
+		}
+
+		if (!e->intab)
+			/* no table => nothing above ASCII is defined */
+			return pos;
+
+		LOG(("maps to: %x", e->intab[c - 0x80]));
+
+		/* Look up in mapping table */
+		if (e->intab[c - 0x80] == 0xffff)
+			/* character not defined in this encoding */
+			return pos;
+
+		if (callback(handle, e->intab[c - 0x80]))
+			break;
+	}
+
+	return pos;
+}
+
+/**
+ * Write a UCS character in an 8bit encoding
+ *
+ * One unit of *bufsize is claimed before anything else. It is returned if
+ * the character turns out to be unrepresentable, but not if the buffer is
+ * full, so a full buffer leaves *bufsize negative.
+ *
+ * \param e  The encoding context
+ * \param c  The UCS4 character
+ * \param buf  Indirect pointer to output buffer
+ * \param bufsize  Pointer to size of output buffer
+ * \return 1 on success, 0 if bufsize is too small, -1 if unrepresentable.
+ */
+int iconv_eightbit_write(struct encoding_context *e, UCS4 c,
+		char **buf, int *bufsize)
+{
+	int i;
+
+	/* sanity check input */
+	if (!e || !bufsize || !buf || !*buf)
+		return 0;
+
+	/* buffer full */
+	if (--*bufsize < 0)
+		return 0;
+
+	if (c < 0x0080) {
+		/* ASCII */
+		*(*buf)++ = (char)c;
+	} else {
+		/* Perform reverse table lookup. 0xffff marks an undefined
+		 * slot, so U+FFFF itself must never be searched for. */
+		i = 0x80;
+		if (e->outtab && c != 0xffff) {
+			for (i = 0; i != 0x80; i++) {
+				if (e->outtab[i] == c)
+					break;
+			}
+		}
+
+		if (i == 0x80) {
+			/* Nothing was written => fixup bufsize */
+			++*bufsize;
+			return -1;
+		}
+
+		*(*buf)++ = (char)(i+0x80);
+	}
+
+	LOG(("written: %d", *(*buf-1)));
+
+	return 1;
+}
+
+/**
+ * Load an 8bit encoding
+ *
+ * The table is read from "Unicode:Encodings.<leafname>", which must be
+ * exactly 256 bytes long. The entries are read in the host's native
+ * representation of unsigned short.
+ * NOTE(review): this presumably relies on the files matching the host's
+ * byte order -- confirm.
+ *
+ * \param enc_num  The encoding number to load
+ * \return Pointer to lookup table for encoding (to be released with
+ *         free()), or NULL on error or if the encoding has no table
+ */
+unsigned short *iconv_eightbit_new(int enc_num)
+{
+	char filename[64];
+	const char *name;
+	const char *leaf = NULL;
+	FILE *fp;
+	long len;
+	size_t i;
+	unsigned short *ret;
+
+	name = mibenum_to_name(enc_num);
+	if (!name)
+		return NULL;
+
+	/* Lookup filename in table */
+	for (i = 0; i != TABLE_SIZE; i++) {
+		if (strcasecmp(mapping_table[i].canon, name) == 0) {
+			leaf = mapping_table[i].filename;
+			break;
+		}
+	}
+
+	/* Either the encoding is not in the table, or it has no table to
+	 * load. There is no filename to open in either case. */
+	if (!leaf)
+		return NULL;
+
+	snprintf(filename, sizeof filename, "Unicode:Encodings.%s", leaf);
+
+	LOG(("opening: %s", filename));
+
+	/* Open */
+	fp = fopen(filename, "rb");
+	if (!fp) {
+		return NULL;
+	}
+
+	/* Get extent */
+	fseek(fp, 0, SEEK_END);
+	len = ftell(fp);
+	fseek(fp, 0, SEEK_SET);
+
+	/* Unexpected length (or ftell failure) => give up */
+	if (len != 256) {
+		fclose(fp);
+		return NULL;
+	}
+
+	/* Create buffer */
+	ret = calloc(128, sizeof(*ret));
+	if (!ret) {
+		fclose(fp);
+		return NULL;
+	}
+
+	/* A short read would leave part of the table zeroed: treat it as
+	 * a failure rather than return a bogus mapping */
+	if (fread(ret, sizeof(*ret), 128, fp) != 128) {
+		free(ret);
+		ret = NULL;
+	}
+
+	fclose(fp);
+
+	return ret;
+}
+
+/**
+ * Delete any 8bit encodings used by a context
+ *
+ * Both tables are released and their pointers cleared, so calling this
+ * twice on the same context is harmless.
+ *
+ * \param e  The encoding context
+ */
+void iconv_eightbit_delete(struct encoding_context *e)
+{
+	if (!e)
+		return;
+
+	/* free(NULL) is a no-op, so no need to test first */
+	free(e->intab);
+	e->intab = NULL;
+
+	free(e->outtab);
+	e->outtab = NULL;
+}
diff --git a/src/iconv.c b/src/iconv.c
new file mode 100644
index 0000000..aa18fa5
--- /dev/null
+++ b/src/iconv.c
@@ -0,0 +1,457 @@
+/* iconv implementation - see iconv.h for docs */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <sys/errno.h>
+
+#include <unicode/charsets.h>
+#include <unicode/encoding.h>
+
+#include <iconv/iconv.h>
+
+#include "internal.h"
+
+static struct encoding_context *context_list;
+
+static int character_callback(void *handle, UCS4 c);
+static void parse_parameters(struct encoding_context *e, const char *params,
+		bool destination);
+static void parse_parameter(struct encoding_context *e, const char *param,
+		int length, bool destination);
+
+/**
+ * Initialise the library: load the aliases data, then UnicodeLib
+ *
+ * \param aliases_file  Path to the Aliases file
+ * \return true on success, false if no file was given or it failed to load
+ */
+int iconv_initialise(const char *aliases_file)
+{
+	if (aliases_file != NULL &&
+			create_alias_data(aliases_file) != false) {
+		encoding_initialise();
+		return true;
+	}
+
+	return false;
+}
+
+/**
+ * Finalise the library
+ *
+ * Destroys every conversion context that is still open, then releases the
+ * aliases data and tidies up UnicodeLib.
+ */
+void iconv_finalise(void)
+{
+	struct encoding_context *a, *b;
+
+	/* clients may quit / die without cleaning up. */
+	for (a = context_list; a; a = b) {
+		b = a->next;
+		if (a->in)
+			encoding_delete(a->in);
+		if (a->out)
+			encoding_delete(a->out);
+		iconv_eightbit_delete(a);
+		free(a);
+	}
+
+	/* every context is gone: don't leave the list head dangling, or a
+	 * later iconv() / iconv_close() would walk freed memory */
+	context_list = NULL;
+
+	free_alias_data();
+
+	/* finalise the unicode library */
+	encoding_tidyup();
+}
+
+/* Copy the encoding name that precedes any '/' in code into buf.
+ * Returns the parameter list following a "//" separator, or NULL if there
+ * is none. */
+static const char *iconv_open_split(const char *code, char *buf,
+		size_t bufsize)
+{
+	const char *slash = strchr(code, '/');
+	size_t len = slash ? (size_t) (slash - code) : strlen(code);
+
+	/* "%.*s" takes an int precision; the name is cut to fit anyway */
+	if (len >= bufsize)
+		len = bufsize - 1;
+	snprintf(buf, bufsize, "%.*s", (int) len, code);
+
+	if (slash && *(slash + 1) == '/' && *(slash + 2) != '\0')
+		return slash + 2;
+
+	return NULL;
+}
+
+/* Resolve an encoding name to a number, or 0 if it is not recognised.
+ * *force_le is set if the name was a little-endian variant. */
+static int iconv_open_resolve(const char *name, bool *force_le)
+{
+	struct canon *c;
+	int enc;
+
+	*force_le = false;
+
+	/* try our own 8bit charset code first */
+	enc = iconv_eightbit_number_from_name(name);
+
+	/* if that failed, try the UnicodeLib functionality */
+	if (!enc)
+		enc = iconv_encoding_number_from_name(name);
+
+	if (enc)
+		return enc;
+
+	/* if that failed, perhaps it was an endian-specific variant of
+	 * something UnicodeLib can handle? */
+	c = alias_canonicalise(name);
+	if (!c)
+		return 0;
+
+	switch (c->mib_enum) {
+	case 1013: /* UTF-16BE */
+		return csUnicode11;
+	case 1014: /* UTF-16LE */
+		*force_le = true;
+		return csUnicode11;
+	case 1018: /* UTF-32BE */
+		return csUCS4;
+	case 1019: /* UTF-32LE */
+		*force_le = true;
+		return csUCS4;
+	default:
+		return 0;
+	}
+}
+
+/* Set the endianness / header flags on a freshly created encoding */
+static void iconv_open_set_flags(Encoding *enc, bool force_le,
+		const char *name)
+{
+	unsigned int flags = 0;
+	struct canon *c;
+
+	if (force_le)
+		flags |= encoding_FLAG_LITTLE_ENDIAN;
+
+	c = alias_canonicalise(name);
+	if (c && (c->mib_enum == csUCS4 || c->mib_enum == csUnicode))
+		flags |= encoding_FLAG_NO_HEADER;
+
+	encoding_set_flags(enc, flags, flags);
+}
+
+/**
+ * Create a conversion context
+ *
+ * Either code may be followed by "//PARAM[//PARAM...]". Anything from the
+ * first '/' onwards is not part of the encoding name.
+ *
+ * \param tocode    Name of the destination encoding
+ * \param fromcode  Name of the source encoding
+ * \return Conversion descriptor, or (iconv_t)(-1) with errno set to EINVAL
+ *         (missing or unrecognised encoding) or ENOMEM
+ */
+iconv_t iconv_open(const char *tocode, const char *fromcode)
+{
+	int to, from;
+	struct encoding_context *e;
+	bool to_force_le, from_force_le;
+	char totemp[128], fromtemp[128];
+	const char *params;
+
+	/* can't do anything without these */
+	if (!tocode || !fromcode) {
+		errno = EINVAL;
+		return (iconv_t)(-1);
+	}
+
+	e = calloc(1, sizeof(*e));
+	if (!e) {
+		LOG(("malloc failed"));
+		errno = ENOMEM;
+		return (iconv_t)(-1);
+	}
+
+	/* strip any parameters off the end of the tocode string */
+	params = iconv_open_split(tocode, totemp, sizeof totemp);
+	if (params)
+		parse_parameters(e, params, true);
+
+	/* strip any parameters off the end of the fromcode string */
+	params = iconv_open_split(fromcode, fromtemp, sizeof fromtemp);
+	if (params)
+		parse_parameters(e, params, false);
+
+	to = iconv_open_resolve(totemp, &to_force_le);
+	from = iconv_open_resolve(fromtemp, &from_force_le);
+
+	LOG(("to: %d(%s) from: %d(%s)", to, totemp, from, fromtemp));
+
+	/* ensure both encodings are recognised */
+	if (to == 0 || from == 0) {
+		free(e);
+		errno = EINVAL;
+		return (iconv_t)(-1);
+	}
+
+	/* bit 30 set indicates that this is an 8bit encoding */
+	if (from & (1<<30)) {
+		e->intab = iconv_eightbit_new(from & ~(1<<30));
+	} else {
+		e->in = encoding_new(from, encoding_READ);
+		if (e->in)
+			iconv_open_set_flags(e->in, from_force_le, fromtemp);
+	}
+
+	/* neither created => memory error or somesuch. assume ENOMEM */
+	/* no table is ever generated for ASCII */
+	if (!e->in && !e->intab && (from & ~(1<<30)) != csASCII) {
+		free(e);
+		errno = ENOMEM;
+		return (iconv_t)(-1);
+	}
+
+	if (to & (1<<30)) {
+		e->outtab = iconv_eightbit_new(to & ~(1<<30));
+	} else {
+		e->out = encoding_new(to, encoding_WRITE_STRICT);
+		if (e->out)
+			iconv_open_set_flags(e->out, to_force_le, totemp);
+	}
+
+	/* neither created => ENOMEM */
+	if (!e->out && !e->outtab && (to & ~(1<<30)) != csASCII) {
+		if (e->in)
+			encoding_delete(e->in);
+		iconv_eightbit_delete(e);
+		free(e);
+		errno = ENOMEM;
+		return (iconv_t)(-1);
+	}
+
+	/* add to list */
+	e->prev = 0;
+	e->next = context_list;
+	if (context_list)
+		context_list->prev = e;
+	context_list = e;
+
+	return (iconv_t)e;
+}
+
+/**
+ * Convert a buffer of text
+ *
+ * \param cd            Conversion descriptor from iconv_open()
+ * \param inbuf         Indirect pointer to input; advanced past the bytes
+ *                      consumed. If NULL (or pointing at NULL), the read
+ *                      state is reset instead.
+ * \param inbytesleft   Pointer to input length; reduced by the bytes consumed
+ * \param outbuf        Indirect pointer to output buffer
+ * \param outbytesleft  Pointer to free space in the output buffer
+ * \return 0 if all input was consumed, (size_t)(-1) otherwise, with errno
+ *         set to EINVAL (bad arguments), E2BIG (output full) or EILSEQ
+ */
+size_t iconv(iconv_t cd, char **inbuf, size_t *inbytesleft, char **outbuf,
+		size_t *outbytesleft)
+{
+	struct encoding_context *e;
+	unsigned read;
+
+	/* search for cd in list */
+	for (e = context_list; e; e = e->next)
+		if (e == (struct encoding_context *)cd)
+			break;
+
+	/* not found => invalid */
+	if (!e) {
+		errno = EINVAL;
+		return (size_t)(-1);
+	}
+
+	if (inbuf == NULL || *inbuf == NULL) {
+		if (e->in)
+			encoding_reset(e->in);
+		return 0;
+	}
+
+	/* Is there any point doing anything? */
+	if (!inbytesleft || !outbuf || !(*outbuf) || !outbytesleft) {
+		errno = EINVAL;
+		return (size_t)(-1);
+	}
+
+	e->outbuf = outbuf;
+	e->outbytesleft = outbytesleft;
+
+	LOG(("reading"));
+
+	if (e->in)
+		read = encoding_read(e->in, character_callback, *inbuf,
+				*inbytesleft, e);
+	else
+		read = iconv_eightbit_read(e, character_callback, *inbuf,
+				*inbytesleft, e);
+
+	LOG(("done"));
+
+	LOG(("read: %d, ibl: %d, obl: %d", read, *inbytesleft, *outbytesleft));
+
+	/* everything was consumed */
+	if (read == *inbytesleft) {
+		*inbuf += read;
+		*inbytesleft = 0;
+		return 0;
+	}
+
+	if ((int)*outbytesleft < 0) {
+		/* output buffer full */
+		unsigned consumed = read;
+
+		/* The UnicodeLib count is backed off by one, as before,
+		 * but never below zero. iconv_eightbit_read() does not
+		 * count the byte it stopped at, so its result is used
+		 * unchanged. */
+		if (e->in && consumed > 0)
+			consumed--;
+
+		LOG(("e2big"));
+		*outbytesleft = 0;
+		*inbuf += consumed;
+		*inbytesleft -= consumed;
+		errno = E2BIG;
+	} else {
+		/** \todo find a mechanism for distinguishing between an
+		 * invalid sequence (EILSEQ) and an incomplete one (EINVAL);
+		 * for now everything is reported as EILSEQ */
+		*inbuf += read;
+		*inbytesleft -= read;
+		LOG(("eilseq"));
+		errno = EILSEQ;
+	}
+
+	LOG(("errno: %d", errno));
+
+	return (size_t)(-1);
+}
+
+/**
+ * Destroy a conversion context
+ *
+ * \param cd  Conversion descriptor from iconv_open()
+ * \return 0, whether or not cd was a known descriptor
+ */
+int iconv_close(iconv_t cd)
+{
+	struct encoding_context *target = (struct encoding_context *)cd;
+	struct encoding_context *node = context_list;
+
+	/* only act upon descriptors we actually handed out */
+	while (node != NULL && node != target)
+		node = node->next;
+
+	if (node == NULL)
+		return 0;
+
+	if (node->in)
+		encoding_delete(node->in);
+	if (node->out)
+		encoding_delete(node->out);
+	iconv_eightbit_delete(node);
+
+	/* unlink from the list */
+	if (node->prev == NULL)
+		context_list = node->next;
+	else
+		node->prev->next = node->next;
+	if (node->next != NULL)
+		node->next->prev = node->prev;
+
+	free(node);
+
+	/* reduce our memory usage somewhat */
+	encoding_table_remove_unused(8 /* recommended value */);
+
+	return 0;
+}
+
+/**
+ * Write one converted character to the output buffer
+ *
+ * This is called for each converted character. The free space count is
+ * kept in a local int while the writers run and is stored back at the end:
+ * the writers take an int *, and the caller's size_t must not be accessed
+ * through a pointer of that type.
+ *
+ * \param handle  The encoding context
+ * \param c       The UCS4 character to write
+ * \return 0 to continue converting, non-zero to stop
+ */
+int character_callback(void *handle, UCS4 c)
+{
+	struct encoding_context *e = (struct encoding_context*)handle;
+	const size_t avail = *e->outbytesleft;
+	const int start = (int)avail;
+	int space = start;
+	int ret;
+
+	LOG(("outbuf: %p, free: %d", *e->outbuf, *e->outbytesleft));
+	LOG(("writing: %d", c));
+
+	if (e->out) {
+		char *prev_outbuf = *e->outbuf;
+
+		ret = encoding_write(e->out, c, e->outbuf, &space);
+
+		LOG(("ret: %d", ret));
+
+		/* Why the need for this nonsense? UnicodeLib appears to
+		 * decrease the count of free space in the buffer even
+		 * if it doesn't write into it. This is a bug, as the
+		 * documentation says that the buffer pointer AND free
+		 * space count are left unmodified if nothing is written.
+		 * Therefore, we have this hack until UnicodeLib gets fixed.
+		 */
+		if (ret == -1)
+			space = start - (int)(*e->outbuf - prev_outbuf);
+	} else {
+		ret = iconv_eightbit_write(e, c, e->outbuf, &space);
+	}
+
+	if (ret == -1) {
+		/* Transliterate, if we've been asked to.
+		 * Assumes that output is 8bit/8bit multibyte with ASCII G0.
+		 * This should be fine as the only <>8bit encodings are
+		 * UCS{2,4}, UTF-{16,32}, neither of which return -1.
+		 * Also, afaiaa, all supported multibyte encodings are ASCII
+		 * compatible. */
+		/** \todo Actually perform some kind of transliteration */
+		if (e->transliterate && space > 0) {
+			if (e->out) {
+				/* Reset encoding write state */
+				/** \todo this is a bit dodgy, as we only
+				 * really need to ensure that the ASCII set
+				 * is mapped into G0 in ISO2022 encodings.
+				 * This will reset G1->G3, too, which may
+				 * break things. If so, we may have to
+				 * perform some dirty hackery which relies
+				 * upon knowledge of UnicodeLib's internals
+				 */
+				encoding_write(e->out, NULL_UCS4, e->outbuf,
+						&space);
+			}
+
+			if (space > 0) {
+				*(*e->outbuf)++ = '?';
+				--space;
+
+				ret = 1;
+			} else {
+				ret = 0;
+			}
+		} else {
+			/* the character is skipped and conversion continues */
+			ret = 1;
+		}
+	}
+
+	/* Store the count back as a delta. A writer that found the buffer
+	 * full leaves space at -1, which wraps the size_t so that the
+	 * "(int) < 0" test in iconv() still sees it. */
+	*e->outbytesleft = avail - (size_t)(start - space);
+
+	return (!ret);
+}
+
+/**
+ * Parse a '/'-separated parameter list, e.g. "TRANSLIT//IGNORE"
+ *
+ * Each parameter is handed to parse_parameter() in turn. Empty parameters
+ * are passed on with a length of zero.
+ *
+ * \param e            The encoding context to configure
+ * \param params       The parameter list, without its leading "//"
+ * \param destination  True if the list came from the destination encoding
+ */
+void parse_parameters(struct encoding_context *e, const char *params,
+		bool destination)
+{
+	const char *cur = params;
+	const char *end = params + strlen(params);
+
+	while (cur < end) {
+		/* search onwards from the current parameter: restarting
+		 * from the beginning of the list would find the same
+		 * separator over and over */
+		const char *slash = strchr(cur, '/');
+
+		if (slash == NULL) {
+			/* last parameter: runs to the end of the string */
+			parse_parameter(e, cur, (int) (end - cur),
+					destination);
+			break;
+		}
+
+		parse_parameter(e, cur, (int) (slash - cur), destination);
+
+		/* step over the separator ("//", or a lone '/') */
+		cur = slash + 1;
+		if (*cur == '/')
+			cur++;
+	}
+}
+
+/**
+ * Act upon a single parameter
+ *
+ * Only "TRANSLIT" (in any case) is recognised, and only when it was given
+ * for the destination encoding.
+ *
+ * \param e            The encoding context to configure
+ * \param param        Start of the parameter (need not be NUL terminated)
+ * \param length       Length of the parameter
+ * \param destination  True if the parameter applies to the destination
+ */
+void parse_parameter(struct encoding_context *e, const char *param,
+		int length, bool destination)
+{
+	if (length != 8)
+		return;
+
+	if (strncasecmp(param, "TRANSLIT", 8) != 0)
+		return;
+
+	if (destination)
+		e->transliterate = 1;
+}
+
diff --git a/src/internal.h b/src/internal.h
new file mode 100644
index 0000000..d19bd09
--- /dev/null
+++ b/src/internal.h
@@ -0,0 +1,58 @@
+/* Internal declarations shared between the iconv source files */
+#ifndef _ICONV_INTERNAL_H_
+#define _ICONV_INTERNAL_H_
+
+/* size_t is used below */
+#include <stddef.h>
+
+#ifndef unicode_encoding_h
+#include <unicode/encoding.h>
+#endif
+
+/* LOG((fmt, ...)) prints a debug line to stdout, prefixed with file,
+ * function and line. It expands to nothing unless DEBUG is defined. Note
+ * the double parentheses: the argument is a complete printf argument list. */
+#ifndef DEBUG
+#define LOG(x)
+#else
+/* printf, fputc and stdout are used by the expansion */
+#include <stdio.h>
+#define LOG(x) (printf(__FILE__ " %s %i: ", __func__, __LINE__), printf x, fputc('\n', stdout))
+#endif
+
+/* Mark a variable as deliberately unused */
+#define UNUSED(x) ((x) = (x))
+
+/* State of one conversion descriptor */
+struct encoding_context {
+	Encoding *in;		/* UnicodeLib reader, or NULL for 8bit input */
+	Encoding *out;		/* UnicodeLib writer, or NULL for 8bit output */
+	/* 8bit mapping tables for bytes 0x80-0xff; NULL if unused */
+	unsigned short *intab, *outtab;
+	/* the caller's output buffer, valid during a call to iconv() */
+	char **outbuf;
+	size_t *outbytesleft;
+	char transliterate;	/* non-zero if TRANSLIT was requested */
+	/* links in the list of open contexts */
+	struct encoding_context *prev, *next;
+};
+
+/* in eightbit.c */
+int iconv_eightbit_number_from_name(const char *name);
+unsigned iconv_eightbit_read(struct encoding_context *e,
+		int (*callback)(void *handle, UCS4 c), const char *s,
+		unsigned int n, void *handle);
+int iconv_eightbit_write(struct encoding_context *e, UCS4 c,
+		char **buf, int *bufsize);
+unsigned short *iconv_eightbit_new(int enc_num);
+void iconv_eightbit_delete(struct encoding_context *e);
+
+/* in alias.c */
+int iconv_encoding_number_from_name(const char *name);
+const char *iconv_encoding_name_from_number(int number);
+
+/* A canonical encoding name, as read from the Aliases file */
+struct canon {
+	struct canon *next;		/* next entry in the same hash bucket */
+	short mib_enum;			/* MIB enum value of the encoding */
+	unsigned short name_len;	/* length of name, excluding the NUL */
+	char name[1];			/* allocated oversize to hold the name */
+};
+
+/* in aliases.c */
+int create_alias_data(const char *filename);
+void free_alias_data(void);
+struct canon *alias_canonicalise(const char *alias);
+short mibenum_from_name(const char *alias);
+const char *mibenum_to_name(short mibenum);
+
+/* in utils.c */
+int strcasecmp(const char *s1, const char *s2);
+int strncasecmp(const char *s1, const char *s2, size_t len);
+
+#endif
diff --git a/src/utils.c b/src/utils.c
new file mode 100644
index 0000000..5403816
--- /dev/null
+++ b/src/utils.c
@@ -0,0 +1,53 @@
+#include <ctype.h>
+
+#include "internal.h"
+
+/**
+ * Case insensitive string comparison
+ *
+ * Characters are compared as unsigned char values after tolower().
+ *
+ * \param s1 Pointer to string
+ * \param s2 Pointer to string
+ * \return 0 if strings match, <> 0 if no match (1 if either is NULL)
+ */
+int strcasecmp(const char *s1, const char *s2)
+{
+	/* tolower() requires an unsigned char value (or EOF): reading the
+	 * strings as unsigned char keeps bytes >= 0x80 in range */
+	const unsigned char *a = (const unsigned char *) s1;
+	const unsigned char *b = (const unsigned char *) s2;
+	int ca, cb;
+
+	if (!s1 || !s2)
+		return 1; /* this is arbitrary */
+
+	if (s1 == s2)
+		return 0;
+
+	do {
+		ca = tolower(*a++);
+		cb = tolower(*b++);
+	} while (ca != '\0' && ca == cb);
+
+	return ca - cb;
+}
+
+/**
+ * Length-limited case insensitive string comparison
+ *
+ * At most len characters are examined. Comparison stops early at the end
+ * of either string.
+ *
+ * \param s1 Pointer to string
+ * \param s2 Pointer to string
+ * \param len Length to compare
+ * \return 0 if strings match, <> 0 if no match (1 if either is NULL)
+ */
+int strncasecmp(const char *s1, const char *s2, size_t len)
+{
+	/* tolower() requires an unsigned char value (or EOF) */
+	const unsigned char *a = (const unsigned char *) s1;
+	const unsigned char *b = (const unsigned char *) s2;
+
+	if (!s1 || !s2)
+		return 1; /* this is arbitrary */
+
+	if (len == 0 || s1 == s2)
+		return 0;
+
+	for (; len > 0; len--, a++, b++) {
+		int ca = tolower(*a);
+		int cb = tolower(*b);
+
+		if (ca != cb)
+			return ca - cb;
+
+		if (ca == '\0')
+			/* both strings ended together */
+			break;
+	}
+
+	/* the first len characters matched: whatever follows them must
+	 * not be looked at */
+	return 0;
+}
author	John Mark Bell <jmb@netsurf-browser.org>	2008-11-10 18:43:09 +0000
committer	John Mark Bell <jmb@netsurf-browser.org>	2008-11-10 18:43:09 +0000
commit	f8d8287cdbd7da9cd9392bcddf04860a10fa598e (patch)
tree	668b4cc601fdfd050a51095d4f9bbebef9eaffec /src
download	iconv-f8d8287cdbd7da9cd9392bcddf04860a10fa598e.tar.gz iconv-f8d8287cdbd7da9cd9392bcddf04860a10fa598e.tar.bz2