diff options
author | Vincent Sanders <vince@kyllikki.org> | 2015-05-06 23:45:48 +0100 |
---|---|---|
committer | Vincent Sanders <vince@kyllikki.org> | 2015-05-06 23:45:48 +0100 |
commit | 2092ab252f8526453930e1b5b4d9cb3a169af9e0 (patch) | |
tree | bb506a1b92f0582bb1f8f9fff206c6f033dba253 /utils | |
parent | 35751f3faab45f6cedc9249822dbcf1ad508ba8e (diff) | |
download | netsurf-2092ab252f8526453930e1b5b4d9cb3a169af9e0.tar.gz netsurf-2092ab252f8526453930e1b5b4d9cb3a169af9e0.tar.bz2 |
utility to translate public suffix data into a c code
Diffstat (limited to 'utils')
-rw-r--r-- | utils/genpubsuffix.pl | 378 |
1 files changed, 378 insertions, 0 deletions
diff --git a/utils/genpubsuffix.pl b/utils/genpubsuffix.pl new file mode 100644 index 000000000..37a95e96e --- /dev/null +++ b/utils/genpubsuffix.pl @@ -0,0 +1,378 @@ +# +# Public suffix C code generator +# +# Copyright 2015 Vincent Sanders <vince@kyllikki.og> +# +# Permission to use, copy, modify, and/or distribute this software for +# any purpose with or without fee is hereby granted, provided that the +# above copyright notice and this permission notice appear in all +# copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL +# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE +# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL +# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA +# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER +# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR +# PERFORMANCE OF THIS SOFTWARE. + + +# This program converts the public suffix list data [1] into a C +# program with static data representation and acessor function. +# +# The actual data list [2] should be placed in a file effective_tld_names.dat +# +# The C program is written to stdout, the typical 160K input file +# generates 500K of program and compiles down to a 100K object file +# +# There is a single exported function +# +# const char *getpublicsuffix(const char *hostname) +# +# This returns the public suffix of the passed hostname or NULL if +# there was an error processing the hostname. The returned pointer is +# within the passed hostname so if the returned pointer is the same as +# hostname the whole hostname is a public suffix otherwise the passed +# hostname has a private part. +# +# The resulting C file is mearly a conversion of the input data (the +# added c code is from this source and licenced under the same terms) +# and imposes no additional copyright above that of the source data +# file. +# +# Note: The pnode structure is built assuming there will never be more +# label nodes than can fit in an unsigned 16 bit value (65535) but as +# there are currently around 7500 nodes there is space for another +# 58,000 before this becomes an issue. +# +# [1] https://publicsuffix.org/ +# [2] https://publicsuffix.org/list/effective_tld_names.dat + + +# debian package for ordered hashes: libtie-ixhash-perl + +use strict; +use warnings; +use utf8; +use Tie::IxHash; + + +sub treesubdom +{ + my ($tldtree_ref, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref) = @_; + + my $domelem = pop @{$parts_ref}; + my $isexception = 0; + tie my %node, 'Tie::IxHash'; # this nodes hash + + # deal with explicit domain exceptions + $isexception = ($domelem =~ s/\A!//); + if ($isexception != 0) { + $node{"!"} = {}; + $$nodeidx_ref += 1; + } + + # Update string table + if (! exists $strtab_ref->{$domelem}) { + # add to string table + $strtab_ref->{$domelem} = $$stridx_ref; + { + use bytes; + # update the character count index + $$stridx_ref += length($domelem); + $$stridx_ref += 1; # terminator + } + + } + + # link new node list into tree + if (! exists $tldtree_ref->{$domelem}) { + $tldtree_ref->{$domelem} = \%node; + $$nodeidx_ref += 1; + } + + # recurse down if there are more parts to the domain + if (($isexception == 0) && (scalar(@{$parts_ref}) > 0)) { + treesubdom($tldtree_ref->{$domelem}, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref); + } +} + +sub phexstr +{ + use bytes; + + my ($str) = @_; + my $ret; + + my @bytes = unpack('C*', $str); + + $ret = $ret . sprintf("0x%02x, ", scalar(@bytes)); + + foreach (@bytes) { + $ret = $ret . sprintf("0x%02x, ", $_); + } + + return $ret; +} + +# generate all the children of a parent node and recurse into each of +# those updating optidx to point to the next free node +sub calc_pnode +{ + my ($parent_ref, $strtab_ref, $opidx_ref) = @_; + my $our_dat; + my $child_dat = ""; + my $startidx = $$opidx_ref; + my $lineidx = -1; + + # update the output index to after this node + $$opidx_ref += scalar keys %$parent_ref; + + # entry block + if ($startidx == ($$opidx_ref - 1)) { + $our_dat = "\n /* entry " . $startidx . " */\n "; + } else { + $our_dat = "\n /* entries " . $startidx . " to " . ($$opidx_ref - 1) . " */\n "; + } + + # iterate over each child element domain/ref pair + while ( my ($cdom, $cref) = each(%$parent_ref) ) { + # make array look pretty by limiting entries per line + if ($lineidx == 3) { + $our_dat .= "\n "; + $lineidx = 0; + } elsif ($lineidx == -1) { + $lineidx = 1; + } else { + $our_dat .= " "; + $lineidx += 1; + } + + $our_dat .= "{ "; + $our_dat .= $strtab_ref->{$cdom} . ", "; + my $child_count = scalar keys (%$cref); + $our_dat .= $child_count . ", "; + if ($child_count != 0) { + $our_dat .= $$opidx_ref; + $child_dat .= calc_pnode($cref, $strtab_ref, $opidx_ref); + } else { + $our_dat .= 0; + } + $our_dat .= " },"; + + } + + return $our_dat . $child_dat; +} + +# main +binmode(STDOUT, ":utf8"); + +my $filename = "effective_tld_names.dat"; + +open(my $fh, '<:encoding(UTF-8)', $filename) + or die "Could not open file '$filename' $!"; + +tie my %tldtree, 'Tie::IxHash'; # node tree +my $nodeidx = 1; # count of nodes allowing for the root node +tie my %strtab, 'Tie::IxHash'; # string table +my $stridx = 0; + +# put the wildcard match at 0 in the string table +$strtab{'*'} = $stridx; +$stridx += 2; + +# put the invert match at 2 in the string table +$strtab{'!'} = $stridx; +$stridx += 2; + +# read each line from prefix data and inject into hash tree +while (my $line = <$fh>) { + chomp $line; + + if (($line ne "") && ($line !~ /\/\/.*$/)) { + + # print "$line\n"; + my @parts=split("\\.", $line); + + # recusrsive call to build tree from root + + treesubdom(\%tldtree, \$nodeidx, \%strtab, \$stridx, \@parts); + } +} + +# C program header +print <<EOF; +/* + * Generated with the genpubsuffix tool from effective_tld_names.dat + */ + +#include <stdint.h> +#include <string.h> + +EOF + +# output string table +# +# array of characters each string is prefixed with its length and the +# node table below directly indexes emtries. As labels cannot be more +# than 63 characters a byte length is more than sufficient. + +print "static const char stab[" . $stridx . "] = {\n"; +while ( my ($key, $value) = each(%strtab) ) { + print " " . phexstr($key) . "/* " . $key . " " . $value . " */\n"; +} +print "};\n\n"; + +print "enum stab_entities {\n"; +print " STAB_WILDCARD = 0,\n"; +print " STAB_EXCEPTION = 2\n"; +print "};\n\n"; + + +# output static node array +# +# The constructed array of nodes has all siblings sequentialy and an +# index/count to its children. This yeilds a very compact data +# structure easily traversable. +# +# Additional flags for * (match all) and ! (exception) are omitted as +# they can be infered by having a node with a label of 0 (*) or 2 (!) +# as the string table has those values explicitly created. + +print "struct pnode {\n"; +print " uint32_t label; /* index of domain element in string table */\n"; +print " uint16_t child_count; /* number of children of this node */\n"; +print " uint16_t child_index; /* index of first child node */\n"; +print "};\n\n"; + +my $opidx = 1; # output index of node + +print "static const struct pnode pnodes[" . $nodeidx . "] = {\n"; + +# root node +print " /* root entry */\n { 0," . scalar keys(%tldtree) . ", " . $opidx . " },"; + +# all subsequent nodes +print calc_pnode(\%tldtree, \%strtab, \$opidx); +print "\n};\n\n"; + +# lookup code +print <<EOF; + +#define DOMSEP '.' + +static int matchlabel(int parent, const char *start, int len) +{ + int clast = pnodes[parent].child_index + pnodes[parent].child_count; + int cidx; /*child node index */ + int ridx = -1; /* index of match or -1 */ + + if (pnodes[parent].child_count != 0) { + /* there are child nodes present to scan */ + + for (cidx = pnodes[parent].child_index; cidx < clast; cidx++) { + if (pnodes[cidx].label == STAB_WILDCARD) { + /* wildcard match */ + ridx = cidx; + } else { + if ((stab[pnodes[cidx].label] == len) && + (strncasecmp(&stab[pnodes[cidx].label + 1], + start, + len) == 0)) { + + if ((pnodes[cidx].child_count == 1) && + (pnodes[pnodes[cidx].child_index].label == STAB_EXCEPTION)) { + /* exception to previous */ + ridx = -1; + } else { + ridx = cidx; + } + break; + } + } + } + } + return ridx; +} + +/* + * Exported public API + */ +const char *getpublicsuffix(const char *hostname) +{ + int treeidx = 0; /* index to current tree node */ + const char *elem_start; + const char *elem_end; + int lab_count = 0; + + /* deal with obviously bad hostname */ + if ((hostname == NULL) || + (hostname[0]) == 0 || + (hostname[0] == DOMSEP)) { + return NULL; + } + + /* hostnames are ass backwards and we need to consider elemets + * from the end first. + */ + elem_end = hostname + strlen(hostname); + /* fqdn have a separator on the end */ + if (elem_end[-1] == DOMSEP) { + elem_end--; + } + elem_start = elem_end; + + /* extract the element and check for a match in our tree */ + for(;;) { + /* find the start of the element */ + while ((elem_start > hostname) && (*elem_start != DOMSEP)) { + elem_start--; + } + if (*elem_start == DOMSEP) { + elem_start++; + } + + lab_count++; + + /* search child nodes for label */ + treeidx = matchlabel(treeidx, elem_start, elem_end - elem_start); + if (treeidx == -1) { + break; + } + + if (elem_start == hostname) { + /* not valid */ + return NULL; + } + + elem_end = elem_start - 1; + elem_start = elem_end - 1; + } + + /* The public suffix algorithm says: "the domain must match + * the public suffix plus one additional label." This + * requires there to be at least two labels so we need to + * check + */ + if (lab_count == 1) { + if (elem_start == hostname) { + elem_start = NULL; + } else { + /* strip the non matching part */ + elem_start -= 2; + while (elem_start > hostname && *elem_start != DOMSEP) { + elem_start--; + } + if (*elem_start == DOMSEP) + elem_start++; + } + } + + + return elem_start; +} + + +EOF |