diff options
Diffstat (limited to 'build/tools/gentranstab.pl')
-rw-r--r-- | build/tools/gentranstab.pl | 344 |
1 files changed, 344 insertions, 0 deletions
diff --git a/build/tools/gentranstab.pl b/build/tools/gentranstab.pl new file mode 100644 index 0000000..0e9205a --- /dev/null +++ b/build/tools/gentranstab.pl @@ -0,0 +1,344 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +# Usage: gentranstab.pl <path to transtab> + +usage() if (@ARGV != 1); + +my $transtab = shift @ARGV; + +open TRANSTAB,"<$transtab" or die "Failed opening $transtab: $!\n"; + +print <<EOF; +/* This file is autogenerated. Manual changes will be lost */ + +#include <assert.h> +#include <inttypes.h> +#include <stdbool.h> + +#include "internal.h" + +static int translit_write_character(struct encoding_context *e, + UCS4 c, char **buffer, size_t *buflen, bool use_transout) +{ + Encoding *out = use_transout ? e->transout : e->out; + int ret; + + if (out != NULL) { + char *prev_buf = *buffer; + size_t prev_len = *buflen; + + ret = encoding_write(out, c, buffer, (int *) buflen); + + if (ret <= 0) + *buflen = prev_len - (*buffer - prev_buf); + } else { + ret = iconv_eightbit_write(e, c, buffer, (int *) buflen); + } + + return ret; +} + +static int translit_try_sequence(struct encoding_context *e, + const size_t seqlen, const UCS2 *replacement) +{ + char *tmpbuf, *ptmpbuf; + size_t orig_tmplen, tmplen, index; + int ret = 1; + + /* First, determine if sequence can be written to target encoding */ + /* Worst case: conversion to UTF-8 (needing 6 bytes per character) */ + orig_tmplen = tmplen = (seqlen + 1) * 6; + ptmpbuf = tmpbuf = malloc(tmplen); + if (tmpbuf == NULL) + return 0; + + /* Reset the transout codec */ + if (e->transout != NULL) { + encoding_reset(e->transout); + encoding_set_flags(e->transout, e->outflags, e->outflags); + } + + for (index = 0; index < seqlen; index++) { + UCS4 c = replacement[index]; + do { + ret = translit_write_character(e, c, &ptmpbuf, + &tmplen, true); + if (ret == 0) { + char *tmp = realloc(tmpbuf, orig_tmplen * 2); + if (tmp == NULL) + break; + + ptmpbuf = tmp + (ptmpbuf - tmpbuf); + tmpbuf = tmp; + tmplen += orig_tmplen; + orig_tmplen *= 2; + } + } while (ret == 0); + + if (ret <= 0) + break; + } + + free(tmpbuf); + + if (ret <= 0) { + /* Consider lack of memory an inability to write the output */ + return -1; + } + + e->substitution = replacement; + e->substlen = seqlen; + + /* Emit replacement for real */ + return translit_flush_replacement(e); +} + +int translit_flush_replacement(struct encoding_context *e) +{ + const UCS2 *substitution = e->substitution; + size_t substlen = e->substlen; + int ret = 1; + + while (substlen > 0) { + UCS4 c = substitution[0]; + + ret = translit_write_character(e, c, + e->outbuf, e->outbytesleft, false); + assert(ret != -1); + if (ret <= 0) + break; + + substitution++; + substlen--; + } + + e->substitution = substitution; + e->substlen = substlen; + + return ret; +} + +EOF + +# Map from codepoint -> ttvals ref +# ttvals is a list of chars ref +my %transmap = (); +# Length, in characters, of longest substitution string seen so far +my $maxsubst = 0; +# Total number of substitution strings encountered +my $numsubsts = 0; +# Map from substitution string -> start index in charbin +my %substs = (); +# Accumulated list of substitution character sequences +my @charbin = (); + +# Read in transtab data +while (my $line = <TRANSTAB>) { + # Skip comments and blank lines + next if ($line =~ /^%/); + next if ($line =~ /^\s*$/); + + # Format: <codepoint> <data> + my ($codepoint, $data) = split(' ', $line); + + # Strip '<U' from start, and '>' from end of input codepoint + $codepoint =~ s/^<U([^>]+)>/$1/; + + # Data is a list of semi-colon-separated substitutions + my @substitutions = split(';', $data); + + my @ttvals = (); + + foreach my $sub (@substitutions) { + # Strip quotes around substitution sequence + $sub =~ s/"([^"]*)"/$1/; + + $numsubsts++; + + if ($sub eq "") { + # Special-case empty substitutions + my @empty = (); + push(@ttvals, \@empty); + next; + } + + # Split characters in sequence + my @chars = split('<', $sub); + shift @chars; + my $num_chars = scalar(@chars); + + # Strip leading 'U' and trailing '>' + map { $_ =~ s/U([^>]+)>/$1/; } @chars; + + $maxsubst = $num_chars if ($num_chars > $maxsubst); + + # Stringify chars to produce hash key + my $hkey = "@chars"; + + # Find/insert in bin, if new substitution + if (!defined($substs{$hkey})) { + my $pos = find_in_bin(\@chars, $num_chars); + + $substs{$hkey} = $pos; + } + + # Append to list of substitutions for codepoint + push(@ttvals, \@chars); + } + + # Insert into transmap + $transmap{$codepoint} = \@ttvals; +} + +close TRANSTAB; + +# Ensure transtab is representable +die "Charbin length exceeds 2^13!" if $#charbin >= 2**13; +die "Maxsubst exceeds 8!" if $maxsubst >= 2**3; + +print <<EOF; +struct translit_entry { + uint32_t codepoint : 16, + offset : 13, + length : 3; +}; + +EOF + +# Emit substitution data +my $cblen = @charbin; +print "static const UCS2 substdata[$cblen] = {\n"; +foreach my $c (@charbin) { + print "\t0x$c,\n"; +} +print "};\n\n"; + +# Emit transliteration LUT +my $ttlen = $numsubsts + 1; # + 1 for sentinel +print "static const struct translit_entry transtab[$ttlen] = {\n"; +foreach my $codepoint (sort(keys %transmap)) { + my $ttvals = $transmap{$codepoint}; + + for my $subst (@$ttvals) { + my $hkey = "@$subst"; + + if ($hkey ne "") { + my $slen = @$subst; + print "\t{ 0x$codepoint, $substs{$hkey}, $slen },\n"; + } else { + print "\t{ 0x$codepoint, 0, 0 },\n"; + } + } +} +# Place sentinel at the end +print "\t{ 0, 0, 0 }\n"; +print "};\n\n"; + +print <<EOF; +static int translit_tab_cmp(const void *a, const void *b) +{ + const struct translit_entry *aa = (const struct translit_entry *) a; + const struct translit_entry *bb = (const struct translit_entry *) b; + + return (int) aa->codepoint - (int) bb->codepoint; +} + +int translit_substitute(struct encoding_context *e, UCS4 c) +{ + static const UCS2 default_subst[1] = { '?' }; + int ret = 1; + + if (c <= 0xFFFF) { + struct translit_entry key = { c, 0, 0 }; + const struct translit_entry *res; + + res = bsearch(&key, transtab, $numsubsts, + sizeof(struct translit_entry), + translit_tab_cmp); + if (res != NULL) { + /* Reverse until we find the first entry for c */ + while (res > transtab) { + if (res[-1].codepoint != c) + break; + res--; + } + + /* Try substitutions in turn, until we run out */ + while (res->codepoint == c) { + ret = translit_try_sequence(e, res->length, + substdata + res->offset); + if (ret >= 0) + return ret; + + res++; + } + } + } + + /* Last-ditch replacement: must succeed */ + return translit_try_sequence(e, 1, default_subst); +} +EOF + +# Search bin for existing sequence, or append if not found. +# +# The intent here is to minimise duplication of substitution +# sequences. This implementation is decidedly trivial, and +# makes no attempt to discover the optimal insertion order. +# +# Inspection of the output indicates that we use approximately +# 5.5 bytes of storage for each substitution sequence +# encountered (4 of these are the translit_entry, so there +# doesn't seem much point in trying to optimise the layout of +# the charbin any further.) +sub find_in_bin +{ + my $pchars = shift; + my $pcharslen = shift; + my $binlen = scalar(@charbin); + my $offset = 0; + + # Search bin for pchars + while ($offset <= $binlen - $pcharslen) { + my @slice = @charbin[$offset .. $offset + $pcharslen - 1]; + + last if aeq(\@slice, $pchars); + + $offset++; + } + + if ($offset <= $binlen - $pcharslen) { + # Found in bin + return $offset; + } else { + # Not found, so append + push(@charbin, @$pchars); + return $binlen; + } +} + +# Compare two arrays for equality +sub aeq +{ + my ($aref, $bref) = @_; + return 0 unless @$aref == @$bref; + + my $idx = 0; + for my $item (@$aref) { + return 0 unless $item eq $bref->[$idx++]; + } + + return 1; +} + +sub usage +{ + print STDERR <<EOF; +Usage: gentranstab.pl <path to transtab> +EOF + + exit 1; +} |