diff options
Diffstat (limited to 'utils')
-rw-r--r-- | utils/genpubsuffix.pl | 378 |
1 files changed, 0 insertions, 378 deletions
diff --git a/utils/genpubsuffix.pl b/utils/genpubsuffix.pl deleted file mode 100644 index 37a95e96e..000000000 --- a/utils/genpubsuffix.pl +++ /dev/null @@ -1,378 +0,0 @@ -# -# Public suffix C code generator -# -# Copyright 2015 Vincent Sanders <vince@kyllikki.og> -# -# Permission to use, copy, modify, and/or distribute this software for -# any purpose with or without fee is hereby granted, provided that the -# above copyright notice and this permission notice appear in all -# copies. -# -# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL -# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED -# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE -# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL -# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA -# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER -# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR -# PERFORMANCE OF THIS SOFTWARE. - - -# This program converts the public suffix list data [1] into a C -# program with static data representation and acessor function. -# -# The actual data list [2] should be placed in a file effective_tld_names.dat -# -# The C program is written to stdout, the typical 160K input file -# generates 500K of program and compiles down to a 100K object file -# -# There is a single exported function -# -# const char *getpublicsuffix(const char *hostname) -# -# This returns the public suffix of the passed hostname or NULL if -# there was an error processing the hostname. The returned pointer is -# within the passed hostname so if the returned pointer is the same as -# hostname the whole hostname is a public suffix otherwise the passed -# hostname has a private part. -# -# The resulting C file is mearly a conversion of the input data (the -# added c code is from this source and licenced under the same terms) -# and imposes no additional copyright above that of the source data -# file. -# -# Note: The pnode structure is built assuming there will never be more -# label nodes than can fit in an unsigned 16 bit value (65535) but as -# there are currently around 7500 nodes there is space for another -# 58,000 before this becomes an issue. -# -# [1] https://publicsuffix.org/ -# [2] https://publicsuffix.org/list/effective_tld_names.dat - - -# debian package for ordered hashes: libtie-ixhash-perl - -use strict; -use warnings; -use utf8; -use Tie::IxHash; - - -sub treesubdom -{ - my ($tldtree_ref, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref) = @_; - - my $domelem = pop @{$parts_ref}; - my $isexception = 0; - tie my %node, 'Tie::IxHash'; # this nodes hash - - # deal with explicit domain exceptions - $isexception = ($domelem =~ s/\A!//); - if ($isexception != 0) { - $node{"!"} = {}; - $$nodeidx_ref += 1; - } - - # Update string table - if (! exists $strtab_ref->{$domelem}) { - # add to string table - $strtab_ref->{$domelem} = $$stridx_ref; - { - use bytes; - # update the character count index - $$stridx_ref += length($domelem); - $$stridx_ref += 1; # terminator - } - - } - - # link new node list into tree - if (! exists $tldtree_ref->{$domelem}) { - $tldtree_ref->{$domelem} = \%node; - $$nodeidx_ref += 1; - } - - # recurse down if there are more parts to the domain - if (($isexception == 0) && (scalar(@{$parts_ref}) > 0)) { - treesubdom($tldtree_ref->{$domelem}, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref); - } -} - -sub phexstr -{ - use bytes; - - my ($str) = @_; - my $ret; - - my @bytes = unpack('C*', $str); - - $ret = $ret . sprintf("0x%02x, ", scalar(@bytes)); - - foreach (@bytes) { - $ret = $ret . sprintf("0x%02x, ", $_); - } - - return $ret; -} - -# generate all the children of a parent node and recurse into each of -# those updating optidx to point to the next free node -sub calc_pnode -{ - my ($parent_ref, $strtab_ref, $opidx_ref) = @_; - my $our_dat; - my $child_dat = ""; - my $startidx = $$opidx_ref; - my $lineidx = -1; - - # update the output index to after this node - $$opidx_ref += scalar keys %$parent_ref; - - # entry block - if ($startidx == ($$opidx_ref - 1)) { - $our_dat = "\n /* entry " . $startidx . " */\n "; - } else { - $our_dat = "\n /* entries " . $startidx . " to " . ($$opidx_ref - 1) . " */\n "; - } - - # iterate over each child element domain/ref pair - while ( my ($cdom, $cref) = each(%$parent_ref) ) { - # make array look pretty by limiting entries per line - if ($lineidx == 3) { - $our_dat .= "\n "; - $lineidx = 0; - } elsif ($lineidx == -1) { - $lineidx = 1; - } else { - $our_dat .= " "; - $lineidx += 1; - } - - $our_dat .= "{ "; - $our_dat .= $strtab_ref->{$cdom} . ", "; - my $child_count = scalar keys (%$cref); - $our_dat .= $child_count . ", "; - if ($child_count != 0) { - $our_dat .= $$opidx_ref; - $child_dat .= calc_pnode($cref, $strtab_ref, $opidx_ref); - } else { - $our_dat .= 0; - } - $our_dat .= " },"; - - } - - return $our_dat . $child_dat; -} - -# main -binmode(STDOUT, ":utf8"); - -my $filename = "effective_tld_names.dat"; - -open(my $fh, '<:encoding(UTF-8)', $filename) - or die "Could not open file '$filename' $!"; - -tie my %tldtree, 'Tie::IxHash'; # node tree -my $nodeidx = 1; # count of nodes allowing for the root node -tie my %strtab, 'Tie::IxHash'; # string table -my $stridx = 0; - -# put the wildcard match at 0 in the string table -$strtab{'*'} = $stridx; -$stridx += 2; - -# put the invert match at 2 in the string table -$strtab{'!'} = $stridx; -$stridx += 2; - -# read each line from prefix data and inject into hash tree -while (my $line = <$fh>) { - chomp $line; - - if (($line ne "") && ($line !~ /\/\/.*$/)) { - - # print "$line\n"; - my @parts=split("\\.", $line); - - # recusrsive call to build tree from root - - treesubdom(\%tldtree, \$nodeidx, \%strtab, \$stridx, \@parts); - } -} - -# C program header -print <<EOF; -/* - * Generated with the genpubsuffix tool from effective_tld_names.dat - */ - -#include <stdint.h> -#include <string.h> - -EOF - -# output string table -# -# array of characters each string is prefixed with its length and the -# node table below directly indexes emtries. As labels cannot be more -# than 63 characters a byte length is more than sufficient. - -print "static const char stab[" . $stridx . "] = {\n"; -while ( my ($key, $value) = each(%strtab) ) { - print " " . phexstr($key) . "/* " . $key . " " . $value . " */\n"; -} -print "};\n\n"; - -print "enum stab_entities {\n"; -print " STAB_WILDCARD = 0,\n"; -print " STAB_EXCEPTION = 2\n"; -print "};\n\n"; - - -# output static node array -# -# The constructed array of nodes has all siblings sequentialy and an -# index/count to its children. This yeilds a very compact data -# structure easily traversable. -# -# Additional flags for * (match all) and ! (exception) are omitted as -# they can be infered by having a node with a label of 0 (*) or 2 (!) -# as the string table has those values explicitly created. - -print "struct pnode {\n"; -print " uint32_t label; /* index of domain element in string table */\n"; -print " uint16_t child_count; /* number of children of this node */\n"; -print " uint16_t child_index; /* index of first child node */\n"; -print "};\n\n"; - -my $opidx = 1; # output index of node - -print "static const struct pnode pnodes[" . $nodeidx . "] = {\n"; - -# root node -print " /* root entry */\n { 0," . scalar keys(%tldtree) . ", " . $opidx . " },"; - -# all subsequent nodes -print calc_pnode(\%tldtree, \%strtab, \$opidx); -print "\n};\n\n"; - -# lookup code -print <<EOF; - -#define DOMSEP '.' - -static int matchlabel(int parent, const char *start, int len) -{ - int clast = pnodes[parent].child_index + pnodes[parent].child_count; - int cidx; /*child node index */ - int ridx = -1; /* index of match or -1 */ - - if (pnodes[parent].child_count != 0) { - /* there are child nodes present to scan */ - - for (cidx = pnodes[parent].child_index; cidx < clast; cidx++) { - if (pnodes[cidx].label == STAB_WILDCARD) { - /* wildcard match */ - ridx = cidx; - } else { - if ((stab[pnodes[cidx].label] == len) && - (strncasecmp(&stab[pnodes[cidx].label + 1], - start, - len) == 0)) { - - if ((pnodes[cidx].child_count == 1) && - (pnodes[pnodes[cidx].child_index].label == STAB_EXCEPTION)) { - /* exception to previous */ - ridx = -1; - } else { - ridx = cidx; - } - break; - } - } - } - } - return ridx; -} - -/* - * Exported public API - */ -const char *getpublicsuffix(const char *hostname) -{ - int treeidx = 0; /* index to current tree node */ - const char *elem_start; - const char *elem_end; - int lab_count = 0; - - /* deal with obviously bad hostname */ - if ((hostname == NULL) || - (hostname[0]) == 0 || - (hostname[0] == DOMSEP)) { - return NULL; - } - - /* hostnames are ass backwards and we need to consider elemets - * from the end first. - */ - elem_end = hostname + strlen(hostname); - /* fqdn have a separator on the end */ - if (elem_end[-1] == DOMSEP) { - elem_end--; - } - elem_start = elem_end; - - /* extract the element and check for a match in our tree */ - for(;;) { - /* find the start of the element */ - while ((elem_start > hostname) && (*elem_start != DOMSEP)) { - elem_start--; - } - if (*elem_start == DOMSEP) { - elem_start++; - } - - lab_count++; - - /* search child nodes for label */ - treeidx = matchlabel(treeidx, elem_start, elem_end - elem_start); - if (treeidx == -1) { - break; - } - - if (elem_start == hostname) { - /* not valid */ - return NULL; - } - - elem_end = elem_start - 1; - elem_start = elem_end - 1; - } - - /* The public suffix algorithm says: "the domain must match - * the public suffix plus one additional label." This - * requires there to be at least two labels so we need to - * check - */ - if (lab_count == 1) { - if (elem_start == hostname) { - elem_start = NULL; - } else { - /* strip the non matching part */ - elem_start -= 2; - while (elem_start > hostname && *elem_start != DOMSEP) { - elem_start--; - } - if (*elem_start == DOMSEP) - elem_start++; - } - } - - - return elem_start; -} - - -EOF |