utility to translate public suffix data into a c code

author: Vincent Sanders <vince@kyllikki.org> 2015-05-06 23:45:48 +0100
committer: Vincent Sanders <vince@kyllikki.org> 2015-05-06 23:45:48 +0100
commit: 2092ab252f8526453930e1b5b4d9cb3a169af9e0 (patch)
tree: bb506a1b92f0582bb1f8f9fff206c6f033dba253 /utils
parent: 35751f3faab45f6cedc9249822dbcf1ad508ba8e (diff)
download: netsurf-2092ab252f8526453930e1b5b4d9cb3a169af9e0.tar.gz
netsurf-2092ab252f8526453930e1b5b4d9cb3a169af9e0.tar.bz2
1 files changed, 378 insertions, 0 deletions
diff --git a/utils/genpubsuffix.pl b/utils/genpubsuffix.pl
new file mode 100644
index 000000000..37a95e96e
--- /dev/null
+++ b/utils/genpubsuffix.pl
@@ -0,0 +1,378 @@
+#
+# Public suffix C code generator
+#
+# Copyright 2015 Vincent Sanders <vince@kyllikki.og>
+#
+# Permission to use, copy, modify, and/or distribute this software for
+# any purpose with or without fee is hereby granted, provided that the
+# above copyright notice and this permission notice appear in all
+# copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
+# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
+# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+# PERFORMANCE OF THIS SOFTWARE.
+
+
+# This program converts the public suffix list data [1] into a C
+#  program with static data representation and acessor function.
+#
+# The actual data list [2] should be placed in a file effective_tld_names.dat
+#
+# The C program is written to stdout, the typical 160K input file
+#  generates 500K of program and compiles down to a 100K object file
+#
+# There is a single exported function
+#
+# const char *getpublicsuffix(const char *hostname)
+# 
+# This returns the public suffix of the passed hostname or NULL if
+#  there was an error processing the hostname. The returned pointer is
+#  within the passed hostname so if the returned pointer is the same as
+#  hostname the whole hostname is a public suffix otherwise the passed
+#  hostname has a private part.
+#
+# The resulting C file is mearly a conversion of the input data (the
+#  added c code is from this source and licenced under the same terms)
+#  and imposes no additional copyright above that of the source data
+#  file.
+#
+# Note: The pnode structure is built assuming there will never be more
+#  label nodes than can fit in an unsigned 16 bit value (65535) but as
+#  there are currently around 7500 nodes there is space for another
+#  58,000 before this becomes an issue.
+#
+# [1] https://publicsuffix.org/
+# [2] https://publicsuffix.org/list/effective_tld_names.dat
+
+
+# debian package for ordered hashes: libtie-ixhash-perl
+
+use strict;
+use warnings;
+use utf8;
+use Tie::IxHash;
+
+
+sub treesubdom
+{
+    my ($tldtree_ref, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref) = @_;
+
+    my $domelem = pop @{$parts_ref};
+    my $isexception = 0;
+    tie my %node, 'Tie::IxHash'; # this nodes hash
+
+    # deal with explicit domain exceptions 
+    $isexception = ($domelem =~ s/\A!//);
+    if ($isexception != 0) {
+	$node{"!"} = {};
+	$$nodeidx_ref += 1;
+    }
+
+    # Update string table
+    if (! exists $strtab_ref->{$domelem}) {
+	# add to string table
+	$strtab_ref->{$domelem} = $$stridx_ref;
+	{
+	    use bytes;
+	    # update the character count index
+	    $$stridx_ref += length($domelem);
+	    $$stridx_ref += 1; # terminator
+	}
+	
+    }
+
+    # link new node list into tree
+    if (! exists $tldtree_ref->{$domelem}) {
+	$tldtree_ref->{$domelem} = \%node;
+	$$nodeidx_ref += 1;
+    }
+    
+    # recurse down if there are more parts to the domain
+    if (($isexception == 0) && (scalar(@{$parts_ref}) > 0)) {
+	treesubdom($tldtree_ref->{$domelem}, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref);
+    }
+}
+
+sub phexstr
+{
+    use bytes;
+
+    my ($str) = @_;
+    my $ret;
+
+    my @bytes = unpack('C*', $str);
+
+    $ret = $ret . sprintf("0x%02x, ", scalar(@bytes));
+
+    foreach (@bytes) {
+	$ret = $ret . sprintf("0x%02x, ", $_);
+    }
+
+    return $ret;
+}
+
+# generate all the children of a parent node and recurse into each of
+#  those updating optidx to point to the next free node
+sub calc_pnode 
+{
+    my ($parent_ref, $strtab_ref, $opidx_ref) = @_;
+    my $our_dat;
+    my $child_dat = "";
+    my $startidx = $$opidx_ref;
+    my $lineidx = -1;
+
+    # update the output index to after this node
+    $$opidx_ref += scalar keys %$parent_ref;
+
+    # entry block 
+    if ($startidx == ($$opidx_ref - 1)) {
+	$our_dat = "\n    /* entry " . $startidx . " */\n    ";
+    } else {
+	$our_dat = "\n    /* entries " . $startidx . " to " . ($$opidx_ref - 1) . " */\n    ";
+    }
+
+    # iterate over each child element domain/ref pair
+    while ( my ($cdom, $cref) = each(%$parent_ref) ) {
+        # make array look pretty by limiting entries per line
+	if ($lineidx == 3) {
+	    $our_dat .= "\n    ";
+	    $lineidx = 0;
+	} elsif ($lineidx == -1) {
+	    $lineidx = 1;
+	} else {
+	    $our_dat .= " ";
+	    $lineidx += 1;
+	}
+
+	$our_dat .= "{ ";
+	$our_dat .= $strtab_ref->{$cdom} . ", ";
+	my $child_count = scalar keys (%$cref);
+	$our_dat .= $child_count . ", ";
+	if ($child_count != 0) {
+	    $our_dat .= $$opidx_ref;
+	    $child_dat .= calc_pnode($cref, $strtab_ref, $opidx_ref);
+	} else {
+	    $our_dat .= 0;
+	}
+	$our_dat .= " },";
+
+    }
+
+    return $our_dat . $child_dat;
+}
+
+# main
+binmode(STDOUT, ":utf8");
+
+my $filename = "effective_tld_names.dat";
+
+open(my $fh, '<:encoding(UTF-8)', $filename)
+    or die "Could not open file '$filename' $!";
+
+tie my %tldtree, 'Tie::IxHash'; # node tree
+my $nodeidx = 1; # count of nodes allowing for the root node
+tie my %strtab, 'Tie::IxHash'; # string table
+my $stridx = 0;
+
+# put the wildcard match at 0 in the string table
+$strtab{'*'} = $stridx;
+$stridx += 2;
+
+# put the invert match at 2 in the string table
+$strtab{'!'} = $stridx;
+$stridx += 2;
+
+# read each line from prefix data and inject into hash tree
+while (my $line = <$fh>) {
+    chomp $line;
+
+    if (($line ne "") && ($line !~ /\/\/.*$/)) {
+
+	# print "$line\n";
+	my @parts=split("\\.", $line);
+
+	# recusrsive call to build tree from root
+
+	treesubdom(\%tldtree, \$nodeidx, \%strtab, \$stridx, \@parts);
+    }
+}
+
+# C program header
+print <<EOF;
+/*
+ * Generated with the genpubsuffix tool from effective_tld_names.dat
+ */
+
+#include <stdint.h>
+#include <string.h>
+
+EOF
+
+# output string table
+#
+# array of characters each string is prefixed with its length and the
+#  node table below directly indexes emtries. As labels cannot be more
+#  than 63 characters a byte length is more than sufficient.
+
+print "static const char stab[" . $stridx . "] = {\n";
+while ( my ($key, $value) = each(%strtab) ) {
+    print "    " . phexstr($key) . "/* " . $key . " " . $value . " */\n";
+}
+print "};\n\n";
+
+print "enum stab_entities {\n";
+print "    STAB_WILDCARD = 0,\n";
+print "    STAB_EXCEPTION = 2\n";
+print "};\n\n";
+
+
+# output static node array
+#
+# The constructed array of nodes has all siblings sequentialy and an
+# index/count to its children. This yeilds a very compact data
+# structure easily traversable.
+#
+# Additional flags for * (match all) and ! (exception) are omitted as
+# they can be infered by having a node with a label of 0 (*) or 2 (!)
+# as the string table has those values explicitly created.
+
+print "struct pnode {\n";
+print "    uint32_t label; /* index of domain element in string table */\n";
+print "    uint16_t child_count; /* number of children of this node */\n";
+print "    uint16_t child_index; /* index of first child node */\n";
+print "};\n\n";
+
+my $opidx = 1; # output index of node
+
+print "static const struct pnode pnodes[" . $nodeidx . "] = {\n";
+
+# root node
+print "    /* root entry */\n    { 0," . scalar keys(%tldtree) . ", " . $opidx . " },"; 
+
+# all subsequent nodes
+print calc_pnode(\%tldtree, \%strtab, \$opidx);
+print "\n};\n\n";
+
+# lookup code
+print <<EOF;
+
+#define DOMSEP '.'
+
+static int matchlabel(int parent, const char *start, int len)
+{
+	int clast = pnodes[parent].child_index + pnodes[parent].child_count;
+	int cidx; /*child node index */
+	int ridx = -1; /* index of match or -1 */
+
+	if (pnodes[parent].child_count != 0) {
+		/* there are child nodes present to scan */
+
+		for (cidx = pnodes[parent].child_index; cidx < clast; cidx++) {
+			if (pnodes[cidx].label == STAB_WILDCARD) {
+				/* wildcard match */
+				ridx = cidx;
+			} else {
+				if ((stab[pnodes[cidx].label] == len) &&
+				    (strncasecmp(&stab[pnodes[cidx].label + 1],
+						 start,
+						 len) == 0)) {
+
+					if ((pnodes[cidx].child_count == 1) &&
+					    (pnodes[pnodes[cidx].child_index].label == STAB_EXCEPTION)) {
+						/* exception to previous */
+						ridx = -1;
+					} else {
+						ridx = cidx;
+					}
+					break;
+				}
+			}
+		}
+	}
+	return ridx;
+}
+
+/*
+ * Exported public API 
+ */
+const char *getpublicsuffix(const char *hostname)
+{
+	int treeidx = 0; /* index to current tree node */
+	const char *elem_start;
+	const char *elem_end;
+	int lab_count = 0;
+
+	/* deal with obviously bad hostname */
+	if ((hostname == NULL) ||
+	    (hostname[0]) == 0 ||
+	    (hostname[0] == DOMSEP)) {
+		return NULL;
+	}
+
+	/* hostnames are ass backwards and we need to consider elemets
+	 * from the end first.
+	 */
+	 elem_end = hostname + strlen(hostname);
+	 /* fqdn have a separator on the end */
+	 if (elem_end[-1] == DOMSEP) {
+		 elem_end--;
+	 }
+	 elem_start = elem_end;
+
+	 /* extract the element and check for a match in our tree */
+	 for(;;) {
+		 /* find the start of the element */
+		 while ((elem_start > hostname) && (*elem_start != DOMSEP)) {
+			 elem_start--;
+		 }
+		 if (*elem_start == DOMSEP) {
+			 elem_start++;
+		 }
+
+		 lab_count++;
+
+		 /* search child nodes for label */
+		 treeidx = matchlabel(treeidx, elem_start, elem_end - elem_start);
+		 if (treeidx == -1) {
+			 break;
+		 }
+
+		 if (elem_start == hostname) {
+			 /* not valid */
+			 return NULL;
+		 }
+
+		 elem_end = elem_start - 1;
+		 elem_start = elem_end - 1;
+	 }
+
+	 /* The public suffix algorithm says: "the domain must match
+	  * the public suffix plus one additional label." This
+	  * requires there to be at least two labels so we need to
+	  * check
+	  */
+	 if (lab_count == 1) {
+		 if (elem_start == hostname) {
+			 elem_start = NULL;
+		 } else {
+			 /* strip the non matching part */
+			 elem_start -= 2;
+			 while (elem_start > hostname && *elem_start != DOMSEP) {
+				 elem_start--;
+			 }
+			 if (*elem_start == DOMSEP)
+				 elem_start++;
+		 }
+	 }
+
+
+	 return elem_start;
+}
+
+
+EOF
author	Vincent Sanders <vince@kyllikki.org>	2015-05-06 23:45:48 +0100
committer	Vincent Sanders <vince@kyllikki.org>	2015-05-06 23:45:48 +0100
commit	2092ab252f8526453930e1b5b4d9cb3a169af9e0 (patch)
tree	bb506a1b92f0582bb1f8f9fff206c6f033dba253 /utils
parent	35751f3faab45f6cedc9249822dbcf1ad508ba8e (diff)
download	netsurf-2092ab252f8526453930e1b5b4d9cb3a169af9e0.tar.gz netsurf-2092ab252f8526453930e1b5b4d9cb3a169af9e0.tar.bz2