From aee69a99c89429677f1275626d82b2b5514e1499 Mon Sep 17 00:00:00 2001 From: Vincent Sanders Date: Wed, 14 Sep 2016 22:14:56 +0100 Subject: Initial working library --- COPYING | 19 ++++ Makefile | 48 +++++++++ README | 4 + include/nspsl.h | 10 ++ libnspsl.pc.in | 10 ++ src/Makefile | 13 +++ src/genpubsuffix.pl | 298 ++++++++++++++++++++++++++++++++++++++++++++++++++++ src/nspsl.c | 128 ++++++++++++++++++++++ test/Makefile | 3 + test/nspsl.c | 44 ++++++++ test/runtest.sh | 106 +++++++++++++++++++ 11 files changed, 683 insertions(+) create mode 100644 COPYING create mode 100644 Makefile create mode 100644 README create mode 100644 include/nspsl.h create mode 100644 libnspsl.pc.in create mode 100644 src/Makefile create mode 100644 src/genpubsuffix.pl create mode 100644 src/nspsl.c create mode 100644 test/Makefile create mode 100644 test/nspsl.c create mode 100755 test/runtest.sh diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..d448ee3 --- /dev/null +++ b/COPYING @@ -0,0 +1,19 @@ +Copyright 2016 Vincent Sanders + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1a9c1ed
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,48 @@
+#!/bin/make
+#
+# Makefile for libnspsl
+#
+# Copyright 2016 Vincent Sanders
+
+# Component settings; COMPONENT is reused below for -l and the .pc.in name
+COMPONENT := nspsl
+COMPONENT_VERSION := 0.0.1
+# Default to a static library (?= keeps a value supplied by the caller)
+COMPONENT_TYPE ?= lib-static
+
+# Setup the tooling: NSSHARED defaults to a location under PREFIX
+PREFIX ?= /opt/netsurf
+NSSHARED ?= $(PREFIX)/share/netsurf-buildsystem
+include $(NSSHARED)/makefiles/Makefile.tools
+
+# Recursive (=) so it is reevaluated when used, as BUILDDIR won't be defined yet
+TESTRUNNER = test/runtest.sh $(BUILDDIR) $(EXEEXT)
+
+# Toolchain flags; project flags are placed ahead of any CFLAGS already set
+WARNFLAGS := -Wall -W -Wundef -Wpointer-arith -Wcast-align \
+	-Wwrite-strings -Wstrict-prototypes -Wmissing-prototypes \
+	-Wmissing-declarations -Wnested-externs
+
+CFLAGS := -D_GNU_SOURCE -D_DEFAULT_SOURCE \
+	-I$(CURDIR)/include/ -I$(CURDIR)/src \
+	$(WARNFLAGS) $(CFLAGS)
+ifneq ($(GCCVER),2)
+  CFLAGS := $(CFLAGS) -std=c99
+else
+  # __inline__ is a GCCism
+  CFLAGS := $(CFLAGS) -Dinline="__inline__"
+endif
+CFLAGS := $(CFLAGS) -D_POSIX_C_SOURCE=200809L
+
+REQUIRED_LIBS := nspsl
+
+TESTCFLAGS := -g -O2
+TESTLDFLAGS := -l$(COMPONENT) $(TESTLDFLAGS)
+
+include $(NSBUILD)/Makefile.top
+
+# Extra installation rules, written as DIRECTORY:FILE and appended to the list
+I := /$(INCLUDEDIR)
+INSTALL_ITEMS := $(INSTALL_ITEMS) $(I):include/nspsl.h
+INSTALL_ITEMS := $(INSTALL_ITEMS) /$(LIBDIR)/pkgconfig:lib$(COMPONENT).pc.in
+INSTALL_ITEMS := $(INSTALL_ITEMS) /$(LIBDIR):$(OUTPUT)
diff --git a/README b/README
new file mode 100644
index 0000000..792e62e
--- /dev/null
+++ b/README
@@ -0,0 +1,4 @@
+library to generate static code representation of the Public suffix list
+
+https://publicsuffix.org
+
diff --git a/include/nspsl.h b/include/nspsl.h
new file mode 100644
index
0000000..2358167
--- /dev/null
+++ b/include/nspsl.h
@@ -0,0 +1,10 @@
+/*
+ * Copyright 2016 Vincent Sanders
+ *
+ * This file is part of libnspsl
+ *
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ */
+
+const char *nspsl_getpublicsuffix(const char *hostname);
diff --git a/libnspsl.pc.in b/libnspsl.pc.in
new file mode 100644
index 0000000..91aff99
--- /dev/null
+++ b/libnspsl.pc.in
@@ -0,0 +1,10 @@
+prefix=PREFIX
+exec_prefix=${prefix}
+libdir=${exec_prefix}/LIBDIR
+includedir=${prefix}/INCLUDEDIR
+
+Name: libnspsl
+Description: NetSurf Public Suffix List
+Version: VERSION
+Libs: -L${libdir} LIBRARIES
+Cflags: -I${includedir}
diff --git a/src/Makefile b/src/Makefile
new file mode 100644
index 0000000..1121639
--- /dev/null
+++ b/src/Makefile
@@ -0,0 +1,23 @@
+CFLAGS := $(CFLAGS) -I$(BUILDDIR)
+
+DIR_SOURCES := nspsl.c
+
+# nspsl.c includes the generated psl.inc, so it has to be built first
+src/nspsl.c:$(BUILDDIR)/psl.inc
+
+# Generate into a temporary file and rename it into place only on success,
+# so a failed or interrupted run cannot leave a truncated psl.inc behind
+# that make would treat as up to date.
+$(BUILDDIR)/psl.inc:$(BUILDDIR)/public_suffix_list.dat
+	perl src/genpubsuffix.pl $< > $@.tmp
+	mv -f $@.tmp $@
+
+# Same temporary-then-rename approach for the download: wget -O creates its
+# output file before any data arrives, so a failed fetch would otherwise
+# leave an empty list that is never fetched again.
+$(BUILDDIR)/public_suffix_list.dat:
+	mkdir -p $(@D)
+	wget -O $@.tmp https://publicsuffix.org/list/public_suffix_list.dat
+	mv -f $@.tmp $@
+
+include $(NSBUILD)/Makefile.subdir
diff --git a/src/genpubsuffix.pl b/src/genpubsuffix.pl
new file mode 100644
index 0000000..7323645
--- /dev/null
+++ b/src/genpubsuffix.pl
@@ -0,0 +1,298 @@
+#
+# Public suffix C include generator
+#
+# Copyright 2016 Vincent Sanders
+#
+# Permission to use, copy, modify, and/or distribute this software for
+# any purpose with or without fee is hereby granted, provided that the
+# above copyright notice and this permission notice appear in all
+# copies.
+#
+# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
+# WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
+# WARRANTIES OF MERCHANTABILITY AND FITNESS.
IN NO EVENT SHALL THE
+# AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+# DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA
+# OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+# TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
+# PERFORMANCE OF THIS SOFTWARE.
+
+
+# This program converts the public suffix list data [1] into a C
+# program with static data representation and accessor function.
+#
+# The actual data list [2] is read from the file given as first argument
+#
+# The C program is written to stdout, the typical 160K input file
+# generates 500K of program and compiles down to a 100K object file
+#
+# There is a single exported function
+#
+# const char *getpublicsuffix(const char *hostname)
+#
+# This returns the public suffix of the passed hostname or NULL if
+# there was an error processing the hostname. The returned pointer is
+# within the passed hostname so if the returned pointer is the same as
+# hostname the whole hostname is a public suffix otherwise the passed
+# hostname has a private part.
+#
+# The resulting C file is merely a conversion of the input data (the
+# added c code is from this source and licenced under the same terms)
+# and imposes no additional copyright above that of the source data
+# file.
+#
+# Note: The pnode structure is built assuming there will never be more
+# label nodes than can fit in an unsigned 16 bit value (65535) but as
+# there are currently around 8000 nodes there is space for another
+# 58,000 before this becomes an issue.
+#
+# [1] https://publicsuffix.org/
+# [2] https://publicsuffix.org/list/effective_tld_names.dat
+
+
+# debian package for ordered hashes: libtie-ixhash-perl
+# debian package for punycode encode: libidna-punycode-perl
+
+use strict;
+use warnings;
+use utf8;
+use Tie::IxHash;
+use IDNA::Punycode;
+
+# Insert one rule into the label tree, rightmost label first.
+#
+# $tldtree_ref  hash of the child labels at the current tree level
+# $nodeidx_ref  reference to the running count of nodes
+# $strtab_ref   string table mapping a punycoded label to its offset
+# $stridx_ref   reference to the next free string table offset
+# $parts_ref    remaining labels of the rule; the last one is consumed
+#
+# A label starting with "!" is an exception rule: the "!" is stripped,
+# the node created for the label is given a single "!" child and the
+# remaining labels are not processed.
+sub treesubdom
+{
+    my ($tldtree_ref, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref) = @_;
+
+    my $domelem = pop @{$parts_ref}; # Domain element
+    my $isexception = 0;
+    tie my %node, 'Tie::IxHash'; # this node's hash
+
+    # deal with explicit domain exceptions
+    $isexception = ($domelem =~ s/\A!//);
+    if ($isexception != 0) {
+        $node{"!"} = {};
+        $$nodeidx_ref += 1;
+    }
+    my $domelem_puny = encode_punycode($domelem);
+
+    # Update string table
+    if (! exists $strtab_ref->{$domelem_puny}) {
+        # add to string table
+        $strtab_ref->{$domelem_puny} = $$stridx_ref;
+        {
+            use bytes;
+            # update the character count index
+            $$stridx_ref += length($domelem_puny);
+        }
+
+    }
+
+    # link new node list into tree
+    if (! exists $tldtree_ref->{$domelem_puny}) {
+        $tldtree_ref->{$domelem_puny} = \%node;
+        $$nodeidx_ref += 1;
+    }
+
+    # recurse down if there are more parts to the domain
+    if (($isexception == 0) && (scalar(@{$parts_ref}) > 0)) {
+        treesubdom($tldtree_ref->{$domelem_puny}, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref);
+    }
+}
+
+# output string table
+#
+# array of characters the node table below directly indexes entries.
+#
+# Only $strtab_ref (label => offset) and $stridx_ref (total size) are
+# used; the first two parameters are accepted but ignored.
+sub generate_string_table
+{
+    my ($tldtree_ref, $nodeidx_ref, $strtab_ref, $stridx_ref) = @_;
+
+    print "static const char stab[" . $$stridx_ref . "] = {\n";
+    while ( my ($key, $value) = each(%$strtab_ref) ) {
+        print " " . phexstr($key) . "/* " . $key . " " . $value . " */\n";
+    }
+    print "};\n\n";
+}
+
+# Format the bytes of a string as C hex literals, each followed by ", ".
+# An empty string gives an empty result.
+sub phexstr
+{
+    use bytes;
+
+    my ($str) = @_;
+
+    # Start from "" rather than undef: concatenating onto an undefined
+    # value warns under "use warnings" on every call, and an empty input
+    # would otherwise return undef.
+    my $ret = "";
+
+    foreach my $byte (unpack('C*', $str)) {
+        $ret .= sprintf("0x%02x, ", $byte);
+    }
+
+    return $ret;
+}
+
+# Output the length of the string, in bytes, as a decimal number
+sub pstr_len
+{
+    use bytes;
+
+    my ($str) = @_;
+
+    my @bytes = unpack('C*', $str);
+
+    # returned directly; building it up from an undefined scalar warned
+    return sprintf("%d", scalar(@bytes));
+}
+
+# generate all the children of a parent node and recurse into each of
+# those updating optidx to point to the next free node
+#
+# $parent_ref  hash of child label => child hash for the parent node
+# $strtab_ref  string table mapping a label to its offset
+# $opidx_ref   reference to the next free output index
+#
+# Each child is written as
+#   { label offset, child count, index of first child or 0, label length }
+# The return value is the text for this parent's children followed by
+# the text generated for all of their descendants.
+sub calc_pnode
+{
+    my ($parent_ref, $strtab_ref, $opidx_ref) = @_;
+    my $our_dat;
+    my $child_dat = "";
+    my $startidx = $$opidx_ref;
+    my $lineidx = -1;
+
+    # update the output index to after this node
+    $$opidx_ref += scalar keys %$parent_ref;
+
+    # entry block
+    if ($startidx == ($$opidx_ref - 1)) {
+        $our_dat = "\n /* entry " . $startidx . " */\n ";
+    } else {
+        $our_dat = "\n /* entries " . $startidx . " to " . ($$opidx_ref - 1) . " */\n ";
+    }
+
+    # iterate over each child element domain/ref pair
+    while ( my ($cdom, $cref) = each(%$parent_ref) ) {
+        # make array look pretty by limiting entries per line
+        if ($lineidx == 3) {
+            $our_dat .= "\n ";
+            $lineidx = 0;
+        } elsif ($lineidx == -1) {
+            $lineidx = 1;
+        } else {
+            $our_dat .= " ";
+            $lineidx += 1;
+        }
+
+        $our_dat .= "{ ";
+        $our_dat .= $strtab_ref->{$cdom} . ", ";
+        my $child_count = scalar keys (%$cref);
+        $our_dat .= $child_count . ", ";
+        if ($child_count != 0) {
+            # children of this child start at the next free index
+            $our_dat .= $$opidx_ref . ", ";
+            $child_dat .= calc_pnode($cref, $strtab_ref, $opidx_ref);
+        } else {
+            $our_dat .= 0 . ", ";
+        }
+        $our_dat .= pstr_len($cdom) ;
+        $our_dat .= " },";
+
+    }
+
+    return $our_dat . $child_dat;
+}
+
+# main
+binmode(STDOUT, ":utf8");
+
+my ($filename) = @ARGV;
+
+if (not defined $filename) {
+    die "need filename\n";
+}
+
+open(my $fh, '<:encoding(UTF-8)', $filename)
+    or die "Could not open file '$filename' $!";
+
+tie my %tldtree, 'Tie::IxHash'; # node tree
+my $nodeidx = 1; # count of nodes allowing for the root node
+
+tie my %strtab, 'Tie::IxHash'; # string table
+my $stridx = 0;
+
+# put the wildcard match at 0 in the string table
+$strtab{'*'} = $stridx;
+$stridx += 1;
+
+# put the invert match at 1 in the string table
+$strtab{'!'} = $stridx;
+$stridx += 1;
+
+# read each line from prefix data and inject into hash tree
+# (blank lines and lines containing "//" are skipped)
+while (my $line = <$fh>) {
+    chomp $line;
+
+    if (($line ne "") && ($line !~ /\/\/.*$/)) {
+
+        # print "$line\n";
+        my @parts=split("\\.", $line);
+
+        # recursive call to build tree from root
+
+        treesubdom(\%tldtree, \$nodeidx, \%strtab, \$stridx, \@parts);
+    }
+}
+
+# C program header
+print <
+ *
+ * This file is part of libnspsl
+ *
+ * Licensed under the MIT License,
+ *                http://www.opensource.org/licenses/mit-license.php
+ */
+
+#include
+#include
+
+#include "nspsl.h"
+
+#include "psl.inc"
+
+#define DOMSEP '.'
+ +static int matchlabel(int parent, const char *start, int len) +{ + int clast = pnodes[parent].child_index + pnodes[parent].child_count; + int cidx; /*child node index */ + int ridx = -1; /* index of match or -1 */ + + if (pnodes[parent].child_count != 0) { + /* there are child nodes present to scan */ + + for (cidx = pnodes[parent].child_index; cidx < clast; cidx++) { + if (pnodes[cidx].label == STAB_WILDCARD) { + /* wildcard match */ + ridx = cidx; + } else { + if ((pnodes[cidx].label_len == len) && + (strncasecmp(&stab[pnodes[cidx].label], + start, + len) == 0)) { + + if ((pnodes[cidx].child_count == 1) && + (pnodes[pnodes[cidx].child_index].label == STAB_EXCEPTION)) { + /* exception to previous */ + ridx = -1; + } else { + ridx = cidx; + } + break; + } + } + } + } + return ridx; +} + +/* + * Exported public API + */ +const char *nspsl_getpublicsuffix(const char *hostname) +{ + int treeidx = 0; /* index to current tree node */ + const char *elem_start; + const char *elem_end; + int lab_count = 0; + + /* deal with obviously bad hostname */ + if ((hostname == NULL) || + (hostname[0]) == 0 || + (hostname[0] == DOMSEP)) { + return NULL; + } + + /* hostnames are ass backwards and we need to consider elemets + * from the end first. 
+ */ + elem_end = hostname + strlen(hostname); + /* fqdn have a separator on the end */ + if (elem_end[-1] == DOMSEP) { + elem_end--; + } + elem_start = elem_end; + + /* extract the element and check for a match in our tree */ + for(;;) { + /* find the start of the element */ + while ((elem_start > hostname) && (*elem_start != DOMSEP)) { + elem_start--; + } + if (*elem_start == DOMSEP) { + elem_start++; + } + + lab_count++; + + /* search child nodes for label */ + treeidx = matchlabel(treeidx, elem_start, elem_end - elem_start); + if (treeidx == -1) { + break; + } + + if (elem_start == hostname) { + /* not valid */ + return NULL; + } + + elem_end = elem_start - 1; + elem_start = elem_end - 1; + } + + /* The public suffix algorithm says: "the domain must match + * the public suffix plus one additional label." This + * requires there to be at least two labels so we need to + * check + */ + if (lab_count == 1) { + if (elem_start == hostname) { + elem_start = NULL; + } else { + /* strip the non matching part */ + elem_start -= 2; + while (elem_start > hostname && *elem_start != DOMSEP) { + elem_start--; + } + if (*elem_start == DOMSEP) + elem_start++; + } + } + + + return elem_start; +} diff --git a/test/Makefile b/test/Makefile new file mode 100644 index 0000000..369b078 --- /dev/null +++ b/test/Makefile @@ -0,0 +1,3 @@ +DIR_TEST_ITEMS := nspsl:nspsl.c + +include $(NSBUILD)/Makefile.subdir diff --git a/test/nspsl.c b/test/nspsl.c new file mode 100644 index 0000000..88b41c7 --- /dev/null +++ b/test/nspsl.c @@ -0,0 +1,44 @@ +/* + * Copyright 2016 Vincent Sanders + * + * This file is part of libnspsl + * + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + */ + +/** + * \file + * + * psl test program. first argument is checked against being a public suffix. 
+ */
+
+#include
+#include
+#include
+#include
+
+#include
+/* Print nspsl_getpublicsuffix(argv[1]), or "null" when it returns NULL. */
+int main(int argc, char**argv)
+{
+	const char *output;
+	size_t output_len;
+
+
+	if (argc == 2) {
+		output = nspsl_getpublicsuffix(argv[1]);
+	} else {
+		fprintf(stderr, "Usage: %s data\n", argv[0]);
+		return 1;
+	}
+
+	if (output != NULL) {
+		output_len = strlen(output);
+		printf("%.*s\n", (int)output_len, output);
+	} else {
+		printf("null\n");
+	}
+
+	return 0;
+}
diff --git a/test/runtest.sh b/test/runtest.sh
new file mode 100755
index 0000000..8520232
--- /dev/null
+++ b/test/runtest.sh
@@ -0,0 +1,110 @@
+#!/bin/sh
+# first argument: directory containing the test_nspsl binary
+TEST_PATH=$1
+
+#set -x
+
+# Run the test binary on hostname $1 and compare what it prints with the
+# expected text $2 ("null" is what the binary prints for a NULL result).
+# The whole script exits with status 2 on the first mismatch.
+checkPublicSuffix()
+{
+    ENC=$("${TEST_PATH}/test_nspsl" "${1}")
+    if [ "${ENC}" != "${2}" ];then
+        echo "psl error ${ENC} != ${2}"
+        exit 2
+    fi
+}
+
+## test list derived from mozilla test data
+##
+## https://hg.mozilla.org/mozilla-central/raw-file/82d0a583a9a39bf0b0000bccbf6d5c9ec2596bcc/netwerk/test/unit/data/test_psl.txt
+##
+## Any copyright is dedicated to the Public Domain.
+## http://creativecommons.org/publicdomain/zero/1.0/
+
+## null input.
+#checkPublicSuffix(null, null);
+## Mixed case.
+checkPublicSuffix 'COM' null
+#checkPublicSuffix 'example.COM' 'example.com'
+#checkPublicSuffix 'WwW.example.COM' 'example.com'
+## Leading dot.
+checkPublicSuffix '.com' null
+checkPublicSuffix '.example' null
+checkPublicSuffix '.example.com' null
+checkPublicSuffix '.example.example' null
+## Unlisted TLD.
+checkPublicSuffix 'example' null
+checkPublicSuffix 'example.example' 'example.example'
+checkPublicSuffix 'b.example.example' 'example.example'
+checkPublicSuffix 'a.b.example.example' 'example.example'
+## Listed, but non-Internet, TLD.
+##checkPublicSuffix 'local' null
+##checkPublicSuffix 'example.local' null
+##checkPublicSuffix 'b.example.local' null
+##checkPublicSuffix 'a.b.example.local' null
+## TLD with only 1 rule.
+checkPublicSuffix 'biz' null
+checkPublicSuffix 'domain.biz' 'domain.biz'
+checkPublicSuffix 'b.domain.biz' 'domain.biz'
+checkPublicSuffix 'a.b.domain.biz' 'domain.biz'
+## TLD with some 2-level rules.
+checkPublicSuffix 'com' null
+checkPublicSuffix 'example.com' 'example.com'
+checkPublicSuffix 'b.example.com' 'example.com'
+checkPublicSuffix 'a.b.example.com' 'example.com'
+checkPublicSuffix 'uk.com' null
+checkPublicSuffix 'example.uk.com' 'example.uk.com'
+checkPublicSuffix 'b.example.uk.com' 'example.uk.com'
+checkPublicSuffix 'a.b.example.uk.com' 'example.uk.com'
+checkPublicSuffix 'test.ac' 'test.ac'
+## TLD with only 1 (wildcard) rule.
+checkPublicSuffix 'il' null
+#checkPublicSuffix 'c.il' null
+#checkPublicSuffix 'b.c.il' 'b.c.il'
+#checkPublicSuffix 'a.b.c.il' 'b.c.il'
+## More complex TLD.
+checkPublicSuffix 'jp' null
+checkPublicSuffix 'test.jp' 'test.jp'
+checkPublicSuffix 'www.test.jp' 'test.jp'
+checkPublicSuffix 'ac.jp' null
+checkPublicSuffix 'test.ac.jp' 'test.ac.jp'
+checkPublicSuffix 'www.test.ac.jp' 'test.ac.jp'
+checkPublicSuffix 'kyoto.jp' null
+checkPublicSuffix 'test.kyoto.jp' 'test.kyoto.jp'
+checkPublicSuffix 'ide.kyoto.jp' null
+checkPublicSuffix 'b.ide.kyoto.jp' 'b.ide.kyoto.jp'
+checkPublicSuffix 'a.b.ide.kyoto.jp' 'b.ide.kyoto.jp'
+checkPublicSuffix 'c.kobe.jp' null
+checkPublicSuffix 'b.c.kobe.jp' 'b.c.kobe.jp'
+checkPublicSuffix 'a.b.c.kobe.jp' 'b.c.kobe.jp'
+checkPublicSuffix 'city.kobe.jp' 'city.kobe.jp'
+checkPublicSuffix 'www.city.kobe.jp' 'city.kobe.jp'
+## TLD with a wildcard rule and exceptions.
+checkPublicSuffix 'ck' null
+checkPublicSuffix 'test.ck' null
+checkPublicSuffix 'b.test.ck' 'b.test.ck'
+checkPublicSuffix 'a.b.test.ck' 'b.test.ck'
+checkPublicSuffix 'www.ck' 'www.ck'
+checkPublicSuffix 'www.www.ck' 'www.ck'
+## US K12.
+checkPublicSuffix 'us' null
+checkPublicSuffix 'test.us' 'test.us'
+checkPublicSuffix 'www.test.us' 'test.us'
+checkPublicSuffix 'ak.us' null
+checkPublicSuffix 'test.ak.us' 'test.ak.us'
+checkPublicSuffix 'www.test.ak.us' 'test.ak.us'
+checkPublicSuffix 'k12.ak.us' null
+checkPublicSuffix 'test.k12.ak.us' 'test.k12.ak.us'
+checkPublicSuffix 'www.test.k12.ak.us' 'test.k12.ak.us'
+## punycoded IDN labels.
+checkPublicSuffix 'xn--85x722f.com.cn' 'xn--85x722f.com.cn'
+checkPublicSuffix 'xn--85x722f.xn--55qx5d.cn' 'xn--85x722f.xn--55qx5d.cn'
+checkPublicSuffix 'www.xn--85x722f.xn--55qx5d.cn' 'xn--85x722f.xn--55qx5d.cn'
+checkPublicSuffix 'shishi.xn--55qx5d.cn' 'shishi.xn--55qx5d.cn'
+checkPublicSuffix 'xn--55qx5d.cn' null
+checkPublicSuffix 'xn--85x722f.xn--fiqs8s' 'xn--85x722f.xn--fiqs8s'
+checkPublicSuffix 'www.xn--85x722f.xn--fiqs8s' 'xn--85x722f.xn--fiqs8s'
+checkPublicSuffix 'shishi.xn--fiqs8s' 'shishi.xn--fiqs8s'
+checkPublicSuffix 'xn--fiqs8s' null
--
cgit v1.2.3