# # Public suffix C include generator # # Copyright 2016 Vincent Sanders # # Permission to use, copy, modify, and/or distribute this software for # any purpose with or without fee is hereby granted, provided that the # above copyright notice and this permission notice appear in all # copies. # # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL # WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE # AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL # DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA # OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER # TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR # PERFORMANCE OF THIS SOFTWARE. # This program converts the public suffix list data [1] into a C # program with static data representation and acessor function. # # The actual data list [2] should be placed in a file effective_tld_names.dat # # The C program is written to stdout, the typical 160K input file # generates 500K of program and compiles down to a 100K object file # # There is a single exported function # # const char *getpublicsuffix(const char *hostname) # # This returns the public suffix of the passed hostname or NULL if # there was an error processing the hostname. The returned pointer is # within the passed hostname so if the returned pointer is the same as # hostname the whole hostname is a public suffix otherwise the passed # hostname has a private part. # # The resulting C file is mearly a conversion of the input data (the # added c code is from this source and licenced under the same terms) # and imposes no additional copyright above that of the source data # file. # # Note: The pnode structure is built assuming there will never be more # label nodes than can fit in an unsigned 16 bit value (65535) but as # there are currently around 8000 nodes there is space for another # 58,000 before this becomes an issue. # # [1] https://publicsuffix.org/ # [2] https://publicsuffix.org/list/effective_tld_names.dat # debian package for ordered hashes: libtie-ixhash-perl # debian package for punycode encode: libidna-punycode-perl use strict; use warnings; use utf8; use File::Basename; use Tie::IxHash; use IDNA::Punycode; sub treesubdom { my ($tldtree_ref, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref) = @_; my $domelem = pop @{$parts_ref}; # Doamin element my $isexception = 0; tie my %node, 'Tie::IxHash'; # this nodes hash # deal with explicit domain exceptions $isexception = ($domelem =~ s/\A!//); if ($isexception != 0) { $node{"!"} = {}; $$nodeidx_ref += 1; } my $domelem_puny = encode_punycode($domelem); # Update string table if (! exists $strtab_ref->{$domelem_puny}) { # add to string table $strtab_ref->{$domelem_puny} = $$stridx_ref; { use bytes; # update the character count index $$stridx_ref += length($domelem_puny); } } # link new node list into tree if (! exists $tldtree_ref->{$domelem_puny}) { $tldtree_ref->{$domelem_puny} = \%node; $$nodeidx_ref += 1; } # recurse down if there are more parts to the domain if (($isexception == 0) && (scalar(@{$parts_ref}) > 0)) { treesubdom($tldtree_ref->{$domelem_puny}, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref); } } # output an array of bytes in binary sub phexbits { use bytes; my %btoh = ( "0000" => "0", "1000" => "1", "0100" => "2", "1100" => "3", "0010" => "4", "1010" => "5", "0110" => "6", "1110" => "7", "0001" => "8", "1001" => "9", "0101" => "A", "1101" => "B", "0011" => "C", "1011" => "D", "0111" => "E", "1111" => "F", ); my ($str) = @_; my @nyb = unpack "a4" x ((length($str)/4)-1) . "a*", $str; my $ret = " "; my $count = 0; do { my $a = shift @nyb; my $b = shift @nyb; my $c = shift @nyb; my $d = shift @nyb; my $e = shift @nyb; my $f = shift @nyb; my $g = shift @nyb; my $h = shift @nyb; $ret = $ret . " 0x" . $btoh{$h} . $btoh{$g}. $btoh{$f} . $btoh{$e}. $btoh{$d} . $btoh{$c}. $btoh{$b} . $btoh{$a} . ","; $count += 1; if ($count == 6) { $ret = $ret . "\n "; $count = 0; } } while (@nyb > 0); if ($count != 0) { $ret = $ret . "\n"; } return $ret; } # produce encode and decode dictionary from a tree sub walk_huffman_tree { my ($node, $code, $h, $rev_h) = @_; my $c = $node->[0]; if (ref $c) { walk_huffman_tree($c->[$_], $code.$_, $h, $rev_h) for 0,1; } else { $h->{$c} = $code; $rev_h->{$code} = $c; } $h, $rev_h; } # use huffman encoding to generate a dictionary from a string # # returns a pair of hashes for encode and decode sub mk_huffman_tree { my ($domelem_array_ref) = @_; # generate string table to get source of huffman frequencies my $stringtable = "*!"; # table being generated for my $domelem (@$domelem_array_ref) { my $substridx = index($stringtable, $domelem); if ($substridx == -1) { # no existing string match to put complete label into table #use bytes; $stringtable .= $domelem; } } my (%freq, @nodes); for my $c (split //, $stringtable) { $freq{$c}++; } @nodes = map([$_, $freq{$_}], keys %freq); # build priority queue do { @nodes = sort {$a->[1] <=> $b->[1]} @nodes; my ($x, $y) = splice @nodes, 0, 2; push @nodes, [[$x, $y], $x->[1] + $y->[1]] } while (@nodes > 1); return @nodes; } sub encode { my ($str, $dict) = @_; join '', map $dict->{$_}//die("bad char $_"), split '', $str } # recursively walk the tree and generate nodes # # each huffman graph element is made up from three nodes # [ # [ # [ l ], # [ r ] # ], # count # ] # # the l and r nodes may either be additional graph elements or terminal elements # sub generate_huffman_node { my ($dict, $node, $htableidx) = @_; if (! ref $node->[0]) { die("bad huffman tree"); } my $htable = ""; my $l = $node->[0]->[0]; my $r = $node->[0]->[1]; my $lhtable = ""; my $rhtable = ""; my $lhtableidx = 0; my $rhtableidx = 0; $htableidx += 2; if (ref $l->[0]) { ($lhtable, $lhtableidx) = generate_huffman_node($dict, $l, $htableidx); $htable .= " { 0, " . $htableidx. " },\n"; } else { $htable .= " { 1, '" . $l->[0] . "' }, /* '" . $l->[0] . "': " . $dict->{$l->[0]} . " */\n"; $lhtableidx = $htableidx; } if (ref $r->[0]) { ($rhtable, $rhtableidx) = generate_huffman_node($dict, $r, $lhtableidx); $htable .= " { 0, " . $lhtableidx . " },\n" . $lhtable . $rhtable; } else { $htable .= " { 1, '" . $r->[0] . "' }, /* '" . $r->[0] . "': " . $dict->{$r->[0]} . " */\n" . $lhtable; $rhtableidx = $lhtableidx; } $htable, $rhtableidx; } # generate c code to represent encoding used sub generate_huffman_table { my ($hnodes_ref, $dict) = @_; my ($htable, $htablesize) = generate_huffman_node($dict, $hnodes_ref->[0], 0); my $ret; $ret .= "/**\n"; $ret .= " * Huffman coding node\n"; $ret .= " */\n"; $ret .= "struct hnode {\n"; $ret .= " uint8_t term:1; /**< non zero if the node terminates a code */\n"; $ret .= " uint8_t value:7; /**< value in node */\n"; $ret .= "};\n\n"; $ret .= "/**\n"; $ret .= " * Huffman decoding table\n"; $ret .= " *\n"; $ret .= " * nodes are in pairs even nodes are for 0bit odd for 1\n"; $ret .= " */\n"; $ret .= "static const struct hnode htable[" . $htablesize . "] = {\n"; $ret .= $htable; $ret .= "};\n\n"; return $ret; } # output string table # # array of characters the node table below directly indexes entries. sub generate_string_table { my ($tldtree_ref, $nodeidx_ref, $strtab_ref, $stridx_ref) = @_; # obtain sorted array of domain label strings my $labcount = 0; # total number of labels my $labsize = 0; # total size of labels my @tmp_array; foreach my $key (keys %$strtab_ref) { use bytes; push(@tmp_array, $key); $labcount += 1; $labsize += length($key); } my @domelem_array = sort { length($b) <=> length($a) } @tmp_array; # build huffman tree my @hnodes = mk_huffman_tree(\@domelem_array); # generate huffman encoding dictionary my ($h, $rev_h) = walk_huffman_tree($hnodes[0], '', {}, {}); # generate bit table using huffman encoding my $bittable = encode("*", $h); # table being generated my $exceptidx = length($bittable); $bittable .= encode("!", $h); my $bittablesize = length($bittable); my $labfullcount = 2; # labels inserted into the table in full for my $domelem (@domelem_array) { my $domelembits = encode($domelem, $h); my $substridx = index($bittable, $domelembits); if ($substridx != -1) { # found existing string match so use it $strtab_ref->{$domelem} = $substridx; } else { # no existing string match to put complete label into table use bytes; $strtab_ref->{$domelem} = $bittablesize; $bittable .= $domelembits; $bittablesize += length($domelembits); $labfullcount += 1; } } my $srounding = $bittablesize % 32; if ($srounding > 0) { # ensure bittable is a multiple of 32 bits long for array generation $bittable .= '0' x (32 - $srounding); $bittablesize += (32 - $srounding); } print "enum stab_entities {\n"; print " STAB_WILDCARD = 0,\n"; print " STAB_EXCEPTION = " . $exceptidx . "\n"; print "};\n\n"; # output the node table to allow decoding print generate_huffman_table(\@hnodes, $h); print "/**\n"; print " * Domain label string table huffman encoded.\n"; print " * " . $labcount . " labels(" . $labsize * 8 . " bits) reduced to " . $labfullcount . " labels(" . $bittablesize . " bits)\n"; print " */\n"; print "static const uint32_t stab[" . ($bittablesize / 32) . "] = {\n"; print phexbits($bittable); print "};\n\n"; } # Output the length of the string sub pstr_len { use bytes; my ($str) = @_; my $ret; my @bytes = unpack('C*', $str); $ret = $ret . sprintf("%d", scalar(@bytes)); return $ret; } # generate all the children of a parent node and recurse into each of # those updating optidx to point to the next free node sub calc_pnode { my ($parent_ref, $strtab_ref, $opidx_ref, $nodecount_ref) = @_; my $our_dat; my $child_dat = ""; my $startidx = $$opidx_ref; my $lineidx = -1; # update the output index to after this node # need to allow for an additional node for each entry with children # iterate over each child element domain/ref pair while ( my ($cdom, $cref) = each(%$parent_ref) ) { if (scalar keys (%$cref) != 0) { $$opidx_ref += 2; } else { $$opidx_ref += 1; } } # entry block if ($startidx == ($$opidx_ref - 1)) { $our_dat = "\n /* entry " . $startidx . " */\n"; } else { $our_dat = "\n /* entries " . $startidx . " to " . ($$opidx_ref - 1) . " */\n"; } # iterate over each child element domain/ref pair while ( my ($cdom, $cref) = each(%$parent_ref) ) { my $child_count = scalar keys (%$cref); $$nodecount_ref += 1; # keep count of number of nodes in tree $our_dat .= " { .label = {" . sprintf("% 7d", $strtab_ref->{$cdom}) . ",". sprintf("% 3d", pstr_len($cdom)); if ($child_count == 0) { # complete label for no children $our_dat .= ", 0 } },"; } else { # complete label with children $our_dat .= ", 1 } }, "; $our_dat .= "{ .child = { " . $$opidx_ref . ", " . $child_count . " } },"; $child_dat .= calc_pnode($cref, $strtab_ref, $opidx_ref, $nodecount_ref); } $our_dat .= " /* " . $cdom . " */\n"; } return $our_dat . $child_dat; } # main binmode(STDOUT, ":utf8"); my ($filename) = @ARGV; if (not defined $filename) { die "need filename\n"; } open(my $fh, '<:encoding(UTF-8)', $filename) or die "Could not open file '$filename' $!"; tie my %tldtree, 'Tie::IxHash'; # node tree my $nodeidx = 1; # count of nodes allowing for the root node tie my %strtab, 'Tie::IxHash'; # string table my $stridx = 0; # put the wildcard match at 0 in the string table $strtab{'*'} = $stridx; $stridx += 1; # put the invert match at 1 in the string table $strtab{'!'} = $stridx; $stridx += 1; # read each line from prefix data and inject into hash tree while (my $line = <$fh>) { chomp $line; if (($line ne "") && ($line !~ /\/\/.*$/)) { # print "$line\n"; my @parts=split("\\.", $line); # recusrsive call to build tree from root treesubdom(\%tldtree, \$nodeidx, \%strtab, \$stridx, \@parts); } } # C program header print "/*\n"; print " * Generated with the genpubsuffix tool.\n"; print " * From file " . basename($filename) . "\n"; print " * Converted on " . localtime() . "\n"; print " */\n\n"; print "/**\n"; print " * Public suffix list graph node\n"; print " */\n"; print "union pnode {\n"; print " struct {\n"; print " unsigned int idx:24; /**< index of domain element in string table */\n"; print " unsigned int len:6; /**< length of domain element in string table */\n"; print " unsigned int children:1; /**< has children */\n"; print " } label;\n"; print " struct {\n"; print " uint16_t index; /**< index of first child node */\n"; print " uint16_t count; /**< number of children of this node */\n"; print " } child;\n"; print "};\n\n"; generate_string_table(\%tldtree, \$nodeidx, \%strtab, \$stridx); # output static node array # # The constructed array of nodes has all siblings sequentialy and an # index/count to its children. This yeilds a very compact data # structure easily traversable. # # Additional flags for * (match all) and ! (exception) are omitted as # they can be infered by having a node with a label of 0 (*) or 1 (!) # as the string table has those values explicitly created. # # As labels cannot be more than 63 characters a byte length is more # than sufficient. my $opidx = 2; # output index of node my $opnodes = ""; # output pnode initialisers my $opnodecount = 1; # output domain label nodes # root node initialiser $opnodes .= " /* root entry */\n"; $opnodes .= " { .label = { 0, 0, 1 } }, { .child = { " . $opidx . ", " . scalar keys(%tldtree) . " } },"; # generate node initialiser $opnodes .= calc_pnode(\%tldtree, \%strtab, \$opidx, \$opnodecount); print "/**\n"; print " * PSL represented as a directed acyclic graph\n"; print " * There are " . $opnodecount . " labels in " . $opidx . " nodes\n"; print " */\n"; print "static const union pnode pnodes[" . $opidx . "] = {\n"; print $opnodes; # output node initialisors print "\n};\n\n";