From 2cb8c60b8c7263ec0cadcdd5c157d95f781c449c Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Wed, 19 Nov 2008 04:56:30 +0000 Subject: Tools tree. Currently comprises something approximating a parser for the Unihan database. svn path=/trunk/iconv/; revision=5730 --- build/tools/ParseUnihan.pl | 127 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100755 build/tools/ParseUnihan.pl diff --git a/build/tools/ParseUnihan.pl b/build/tools/ParseUnihan.pl new file mode 100755 index 0000000..4486088 --- /dev/null +++ b/build/tools/ParseUnihan.pl @@ -0,0 +1,127 @@ +#!/usr/bin/perl + +use warnings; +use strict; + +# Usage: ParseUnihan.pl +# +# is the path of the Unihan database, or - to read from stdin +# +# is one of the items in MAP + +use constant MAP => { + "GB2312-80" => { tag => "kIRG_GSource", prefix => "0" }, + "JIS X 0208:1990" => { tag => "kIRG_JSource", prefix => "0" }, + "JIS X 0212:1990" => { tag => "kIRG_JSource", prefix => "1" }, + "JIS X 0213:2000-1" => { tag => "kIRG_JSource", prefix => "3" }, + "JIS X 0213:2000-2" => { tag => "kIRG_JSource", prefix => "4" }, + "JIS X 0213:2004" => { tag => "kIRG_JSource", prefix => "3A" }, + "KS C 5601-1987" => { tag => "kIRG_KSource", prefix => "0" }, + "CNS 11643-1992-1" => { tag => "kIRG_TSource", prefix => "1" }, + "CNS 11643-1992-2" => { tag => "kIRG_TSource", prefix => "2" }, + "CNS 11643-1992-3" => { tag => "kIRG_TSource", prefix => "3" }, + "CNS 11643-1992-4" => { tag => "kIRG_TSource", prefix => "4" }, + "CNS 11643-1992-5" => { tag => "kIRG_TSource", prefix => "5" }, + "CNS 11643-1992-6" => { tag => "kIRG_TSource", prefix => "6" }, + "CNS 11643-1992-7" => { tag => "kIRG_TSource", prefix => "7" } +}; + +usage() if (@ARGV != 2); + +my $unihan = shift @ARGV; +my $charset = shift @ARGV; + +my $umap = MAP->{$charset}; +die "Failed finding map for $charset\n" if (!defined($umap)); + +my %mbtab = (); + +open UNIHAN,"<$unihan" or die "Failed opening $unihan: $!\n"; + +while (my $line = ) { + # Skip comments + next if ($line =~ /^#/); + chomp $line; + + # Format: \t\t + my ($uni, $tag, $data) = split('\t', $line); + + # Ignore if this isn't a tag we're interested in + next if ($tag ne $umap->{tag}); + + # Strip 'U+' from unicode codepoint + $uni =~ s/^U\+//; + + # Data is a list of space-separated fields + my @sources = split('\s', $data); + + foreach my $source (@sources) { + chomp $source; + + # Format: - + my ($prefix, $codepoint) = split('-', $source); + + # Skip prefixes we're not interested in + next if ($prefix ne $umap->{prefix}); + + # Insert into multibyte table + my $b1 = hex($codepoint) >> 8; + my $b2 = hex($codepoint) & 0xFF; + + if (!defined($mbtab{$b1})) { + $mbtab{$b1} = (); + } + + $mbtab{$b1}{$b2} = hex($uni); + + # Drop out early if we've got the mapping for this codepoint + last; + } +} + +close UNIHAN; + +#my $offset = 0; +for (my $i = 1; $i <= 94; $i++) { + print "Row $i:"; + + for (my $j = 1; $j <= 94; $j++) { +# if ($offset % 16 == 0) { +# printf("%07x:", $offset); +# } + + if (defined($mbtab{$i}) && defined($mbtab{$i}{$j})) { + printf(" %04x", $mbtab{$i}{$j}); + } else { + print " ffff"; + } + +# $offset += 2; +# +# if ($offset % 16 == 0) { +# print "\n"; +# } + } + + print "\n"; +} + +sub usage +{ + my $map = MAP; + + print STDERR < + + is the path of the Unihan database, or - to read from stdin + + is one of: +EOF + + foreach my $cset (sort(keys(%$map))) { + print STDERR "\t$cset\n"; + } + + exit 1; +} + -- cgit v1.2.3