1 files changed, 344 insertions, 0 deletions
diff --git a/build/tools/gentranstab.pl b/build/tools/gentranstab.pl
new file mode 100644
index 0000000..0e9205a
--- /dev/null
+++ b/build/tools/gentranstab.pl
@@ -0,0 +1,344 @@
+#!/usr/bin/perl
+
+use warnings;
+use strict;
+
+# Usage: gentranstab.pl <path to transtab>
+
+usage() if (@ARGV != 1);
+
+my $transtab = shift @ARGV;
+
+open TRANSTAB,"<$transtab" or die "Failed opening $transtab: $!\n";
+
+print <<EOF;
+/* This file is autogenerated. Manual changes will be lost */
+
+#include <assert.h>
+#include <inttypes.h>
+#include <stdbool.h>
+
+#include "internal.h"
+
+static int translit_write_character(struct encoding_context *e,
+		UCS4 c, char **buffer, size_t *buflen, bool use_transout)
+{
+	Encoding *out = use_transout ? e->transout : e->out;
+	int ret;
+
+	if (out != NULL) {
+		char *prev_buf = *buffer;
+		size_t prev_len = *buflen;
+
+		ret = encoding_write(out, c, buffer, (int *) buflen);
+
+		if (ret <= 0)
+			*buflen = prev_len - (*buffer - prev_buf);
+	} else {
+		ret = iconv_eightbit_write(e, c, buffer, (int *) buflen);
+	}
+
+	return ret;
+}
+
+static int translit_try_sequence(struct encoding_context *e,
+		const size_t seqlen, const UCS2 *replacement)
+{
+	char *tmpbuf, *ptmpbuf;
+	size_t orig_tmplen, tmplen, index;
+	int ret = 1;
+
+	/* First, determine if sequence can be written to target encoding */
+	/* Worst case: conversion to UTF-8 (needing 6 bytes per character) */
+	orig_tmplen = tmplen = (seqlen + 1) * 6;
+	ptmpbuf = tmpbuf = malloc(tmplen);
+	if (tmpbuf == NULL)
+		return 0;
+
+	/* Reset the transout codec */
+	if (e->transout != NULL) {
+		encoding_reset(e->transout);
+		encoding_set_flags(e->transout, e->outflags, e->outflags);
+	}
+
+	for (index = 0; index < seqlen; index++) {
+		UCS4 c = replacement[index];
+		do {
+			ret = translit_write_character(e, c, &ptmpbuf,
+					&tmplen, true);
+			if (ret == 0) {
+				char *tmp = realloc(tmpbuf, orig_tmplen * 2);
+				if (tmp == NULL)
+					break;
+
+				ptmpbuf = tmp + (ptmpbuf - tmpbuf);
+				tmpbuf = tmp;
+				tmplen += orig_tmplen;
+				orig_tmplen *= 2;
+			}
+		} while (ret == 0);
+
+		if (ret <= 0)
+			break;
+	}
+
+	free(tmpbuf);
+
+	if (ret <= 0) {
+		/* Consider lack of memory an inability to write the output */
+		return -1;
+	}
+
+	e->substitution = replacement;
+	e->substlen = seqlen;
+
+	/* Emit replacement for real */
+	return translit_flush_replacement(e);
+}
+
+int translit_flush_replacement(struct encoding_context *e)
+{
+	const UCS2 *substitution = e->substitution;
+	size_t substlen = e->substlen;
+	int ret = 1;
+
+	while (substlen > 0) {
+		UCS4 c = substitution[0];
+		
+		ret = translit_write_character(e, c,
+				e->outbuf, e->outbytesleft, false);
+		assert(ret != -1);
+		if (ret <= 0)
+			break;
+
+		substitution++;
+		substlen--;
+	}
+
+	e->substitution = substitution;
+	e->substlen = substlen;
+
+	return ret;
+}
+
+EOF
+
+# Map from codepoint -> ttvals ref
+# ttvals is a list of chars ref
+my %transmap = ();
+# Length, in characters, of longest substitution string seen so far
+my $maxsubst = 0;
+# Total number of substitution strings encountered
+my $numsubsts = 0;
+# Map from substitution string -> start index in charbin
+my %substs = ();
+# Accumulated list of substitution character sequences
+my @charbin = ();
+
+# Read in transtab data
+while (my $line = <TRANSTAB>) {
+	# Skip comments and blank lines
+	next if ($line =~ /^%/);
+	next if ($line =~ /^\s*$/);
+
+	# Format: <codepoint> <data>
+	my ($codepoint, $data) = split(' ', $line);
+
+	# Strip '<U' from start, and '>' from end of input codepoint
+	$codepoint =~ s/^<U([^>]+)>/$1/;
+
+	# Data is a list of semi-colon-separated substitutions
+	my @substitutions = split(';', $data);
+
+	my @ttvals = ();
+
+	foreach my $sub (@substitutions) {
+		# Strip quotes around substitution sequence
+		$sub =~ s/"([^"]*)"/$1/;
+
+		$numsubsts++;
+
+		if ($sub eq "") {
+			# Special-case empty substitutions
+			my @empty = ();
+			push(@ttvals, \@empty);
+			next;
+		}
+
+		# Split characters in sequence
+		my @chars = split('<', $sub);
+		shift @chars;
+		my $num_chars = scalar(@chars);
+
+		# Strip leading 'U' and trailing '>'
+		map { $_ =~ s/U([^>]+)>/$1/; } @chars;
+
+		$maxsubst = $num_chars if ($num_chars > $maxsubst);
+
+		# Stringify chars to produce hash key
+		my $hkey = "@chars";
+
+		# Find/insert in bin, if new substitution
+		if (!defined($substs{$hkey})) {
+			my $pos = find_in_bin(\@chars, $num_chars);
+
+			$substs{$hkey} = $pos;
+		}
+
+		# Append to list of substitutions for codepoint
+		push(@ttvals, \@chars);
+	}
+
+	# Insert into transmap
+	$transmap{$codepoint} = \@ttvals;
+}
+
+close TRANSTAB;
+
+# Ensure transtab is representable
+die "Charbin length exceeds 2^13!" if $#charbin >= 2**13;
+die "Maxsubst exceeds 8!" if $maxsubst >= 2**3;
+
+print <<EOF;
+struct translit_entry {
+	uint32_t codepoint : 16,
+	         offset    : 13,
+		 length    :  3;
+};
+
+EOF
+
+# Emit substitution data
+my $cblen = @charbin;
+print "static const UCS2 substdata[$cblen] = {\n";
+foreach my $c (@charbin) {
+	print "\t0x$c,\n";
+}
+print "};\n\n";
+
+# Emit transliteration LUT
+my $ttlen = $numsubsts + 1; # + 1 for sentinel
+print "static const struct translit_entry transtab[$ttlen] = {\n";
+foreach my $codepoint (sort(keys %transmap)) {
+	my $ttvals = $transmap{$codepoint};
+
+	for my $subst (@$ttvals) {
+		my $hkey = "@$subst";
+
+		if ($hkey ne "") {
+			my $slen = @$subst;
+			print "\t{ 0x$codepoint, $substs{$hkey}, $slen },\n";
+		} else {
+			print "\t{ 0x$codepoint, 0, 0 },\n";
+		}
+	}
+}
+# Place sentinel at the end
+print "\t{ 0, 0, 0 }\n";
+print "};\n\n";
+
+print <<EOF;
+static int translit_tab_cmp(const void *a, const void *b)
+{
+	const struct translit_entry *aa = (const struct translit_entry *) a;
+	const struct translit_entry *bb = (const struct translit_entry *) b;
+
+	return (int) aa->codepoint - (int) bb->codepoint;
+}
+
+int translit_substitute(struct encoding_context *e, UCS4 c)
+{
+	static const UCS2 default_subst[1] = { '?' };
+	int ret = 1;
+
+	if (c <= 0xFFFF) {
+		struct translit_entry key = { c, 0, 0 };
+		const struct translit_entry *res;
+
+		res = bsearch(&key, transtab, $numsubsts,
+				sizeof(struct translit_entry),
+				translit_tab_cmp);
+		if (res != NULL) {
+			/* Reverse until we find the first entry for c */
+			while (res > transtab) {
+				if (res[-1].codepoint != c)
+					break;
+				res--;
+			}
+
+			/* Try substitutions in turn, until we run out */
+			while (res->codepoint == c) {			
+				ret = translit_try_sequence(e, res->length,
+						substdata + res->offset);
+				if (ret >= 0)
+					return ret;
+
+				res++;
+			}
+		}
+	}
+
+	/* Last-ditch replacement: must succeed */
+	return translit_try_sequence(e, 1, default_subst);
+}
+EOF
+
+# Search bin for existing sequence, or append if not found.
+#
+# The intent here is to minimise duplication of substitution
+# sequences. This implementation is decidedly trivial, and
+# makes no attempt to discover the optimal insertion order.
+#
+# Inspection of the output indicates that we use approximately
+# 5.5 bytes of storage for each substitution sequence 
+# encountered (4 of these are the translit_entry, so there
+# doesn't seem much point in trying to optimise the layout of
+# the charbin any further.)
+sub find_in_bin
+{
+	my $pchars = shift;
+	my $pcharslen = shift;
+	my $binlen = scalar(@charbin);
+	my $offset = 0;
+
+	# Search bin for pchars
+	while ($offset <= $binlen - $pcharslen) {
+		my @slice = @charbin[$offset .. $offset + $pcharslen - 1];
+
+		last if aeq(\@slice, $pchars);
+
+		$offset++;
+	}
+
+	if ($offset <= $binlen - $pcharslen) {
+		# Found in bin
+		return $offset;
+	} else {
+		# Not found, so append
+		push(@charbin, @$pchars);
+		return $binlen;
+	}
+}
+
+# Compare two arrays for equality
+sub aeq
+{
+	my ($aref, $bref) = @_;
+	return 0 unless @$aref == @$bref;
+
+	my $idx = 0;
+	for my $item (@$aref) {
+		return 0 unless $item eq $bref->[$idx++];
+	}
+
+	return 1;
+}
+
+sub usage
+{
+	print STDERR <<EOF;
+Usage: gentranstab.pl <path to transtab>
+EOF
+
+	exit 1;
+}