1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
|
#!/usr/bin/perl
use warnings;
use strict;
# Usage: ParseUnihan.pl <unihan> <charset>
#
# <unihan> is the path of the Unihan database, or - to read from stdin
#
# <charset> is one of the items in MAP
# Table of supported charsets, keyed by the charset name given on the
# command line. Each entry describes where that charset's codes are found
# in the Unihan database:
#
#   tag    - Unihan field name; an input line is only considered when its
#            second tab-separated column is equal to this value
#   prefix - the text preceding the '-' in a source entry; only entries
#            whose prefix is equal to this value are used
use constant MAP => {
    'GB2312-80'         => { tag => 'kIRG_GSource', prefix => '0'  },
    'JIS X 0208:1990'   => { tag => 'kIRG_JSource', prefix => '0'  },
    'JIS X 0212:1990'   => { tag => 'kIRG_JSource', prefix => '1'  },
    'JIS X 0213:2000-1' => { tag => 'kIRG_JSource', prefix => '3'  },
    'JIS X 0213:2000-2' => { tag => 'kIRG_JSource', prefix => '4'  },
    'JIS X 0213:2004'   => { tag => 'kIRG_JSource', prefix => '3A' },
    'KS C 5601-1987'    => { tag => 'kIRG_KSource', prefix => '0'  },
    'CNS 11643-1992-1'  => { tag => 'kIRG_TSource', prefix => '1'  },
    'CNS 11643-1992-2'  => { tag => 'kIRG_TSource', prefix => '2'  },
    'CNS 11643-1992-3'  => { tag => 'kIRG_TSource', prefix => '3'  },
    'CNS 11643-1992-4'  => { tag => 'kIRG_TSource', prefix => '4'  },
    'CNS 11643-1992-5'  => { tag => 'kIRG_TSource', prefix => '5'  },
    'CNS 11643-1992-6'  => { tag => 'kIRG_TSource', prefix => '6'  },
    'CNS 11643-1992-7'  => { tag => 'kIRG_TSource', prefix => '7'  },
};
# ---------------------------------------------------------------------------
# Main program.
#
# Reads the Unihan database named by the first argument ("-" means stdin),
# collects the mappings for the charset named by the second argument, and
# prints a 94 x 94 table to stdout: one "Row <n>:" line per row, with each
# cell shown as a four-digit hex Unicode code point, or "ffff" when no
# mapping was found for that cell.
#
# Dies if the charset is not present in MAP or the input cannot be opened.
# ---------------------------------------------------------------------------
usage() if (@ARGV != 2);

my $unihan  = shift @ARGV;
my $charset = shift @ARGV;

my $umap = MAP->{$charset};
die "Failed finding map for $charset\n" if (!defined($umap));

# Two-level lookup: $mbtab{<high byte>}{<low byte>} = <Unicode code point>
my %mbtab = ();

# Open the input. "-" is handled explicitly so that the safe three-argument
# form of open can be used for real paths (the filename is never interpreted
# as a mode or command).
my $unihan_fh;
if ($unihan eq '-') {
    $unihan_fh = \*STDIN;
}
else {
    open($unihan_fh, '<', $unihan)
        or die "Failed opening $unihan: $!\n";
}

while (my $line = <$unihan_fh>) {
    # Skip comments
    next if ($line =~ /^#/);

    chomp $line;

    # Format: <unicode>\t<tag>\t<data>
    my ($uni, $tag, $data) = split(/\t/, $line);

    # Skip blank or malformed lines that lack a tag or data column
    next if (!defined($tag) || !defined($data));

    # Ignore if this isn't a tag we're interested in
    next if ($tag ne $umap->{tag});

    # Strip 'U+' from unicode codepoint
    $uni =~ s/^U\+//;

    # Data is a list of whitespace-separated fields. split(' ') splits on
    # runs of whitespace and discards leading whitespace, so no empty
    # fields are produced.
    foreach my $source (split(' ', $data)) {
        # Format: <prefix>-<codepoint>
        my ($prefix, $codepoint) = split(/-/, $source);

        # Skip entries that are not in <prefix>-<codepoint> form
        next if (!defined($prefix) || !defined($codepoint));

        # Skip prefixes we're not interested in
        next if ($prefix ne $umap->{prefix});

        # Insert into multibyte table, keyed by the high and low byte of
        # the (hexadecimal) charset code
        my $code = hex($codepoint);
        $mbtab{$code >> 8}{$code & 0xFF} = hex($uni);

        # Drop out early if we've got the mapping for this codepoint
        last;
    }
}

# Leave STDIN alone; only close a handle we opened ourselves
close($unihan_fh) if ($unihan ne '-');

# NOTE(review): rows and cells are looked up by the raw high/low byte values
# 1..94. If the charset codes in the input are in 0x21..0x7E form, the table
# keys would be 33..126 and an offset would be needed here -- TODO confirm
# against the Unihan data this script is run on.
#
# A hexdump-style layout (offset prefix, 16 bytes per output line) was
# sketched here previously and is disabled.
for my $i (1 .. 94) {
    print "Row $i:";

    for my $j (1 .. 94) {
        # Test the row first so that missing rows are not autovivified
        if (defined($mbtab{$i}) && defined($mbtab{$i}{$j})) {
            printf(" %04x", $mbtab{$i}{$j});
        }
        else {
            print " ffff";
        }
    }

    print "\n";
}
# Print the usage message to STDERR, followed by the names of all charsets
# present in MAP (sorted, one per line, each preceded by a tab), then exit
# with status 1. This sub does not return.
sub usage
{
    my @charsets = sort(keys(%{ MAP() }));

    print STDERR <<"USAGE";
Usage: ParseUnihan.pl <unihan> <charset>
<unihan> is the path of the Unihan database, or - to read from stdin
<charset> is one of:
USAGE

    print STDERR "\t$_\n" for @charsets;

    exit 1;
}
|