From 125245cafceff46686b0858098d62d645956389a Mon Sep 17 00:00:00 2001 From: Vincent Sanders Date: Thu, 15 Sep 2016 22:40:16 +0100 Subject: improve the node tree representation to halve its memory usage --- src/genpubsuffix.pl | 86 ++++++++++++--------- src/nspsl.c | 216 ++++++++++++++++++++++++++++------------------------ 2 files changed, 163 insertions(+), 139 deletions(-) (limited to 'src') diff --git a/src/genpubsuffix.pl b/src/genpubsuffix.pl index 6e90751..e95529b 100644 --- a/src/genpubsuffix.pl +++ b/src/genpubsuffix.pl @@ -92,7 +92,7 @@ sub treesubdom $tldtree_ref->{$domelem_puny} = \%node; $$nodeidx_ref += 1; } - + # recurse down if there are more parts to the domain if (($isexception == 0) && (scalar(@{$parts_ref}) > 0)) { treesubdom($tldtree_ref->{$domelem_puny}, $nodeidx_ref, $strtab_ref, $stridx_ref, $parts_ref); @@ -193,41 +193,39 @@ sub calc_pnode my $lineidx = -1; # update the output index to after this node - $$opidx_ref += scalar keys %$parent_ref; + # need to allow for an additional node for each entry with children + + # iterate over each child element domain/ref pair + while ( my ($cdom, $cref) = each(%$parent_ref) ) { + if (scalar keys (%$cref) != 0) { + $$opidx_ref += 2; + } else { + $$opidx_ref += 1; + } + } # entry block if ($startidx == ($$opidx_ref - 1)) { - $our_dat = "\n /* entry " . $startidx . " */\n "; + $our_dat = "\n /* entry " . $startidx . " */\n"; } else { - $our_dat = "\n /* entries " . $startidx . " to " . ($$opidx_ref - 1) . " */\n "; + $our_dat = "\n /* entries " . $startidx . " to " . ($$opidx_ref - 1) . " */\n"; } # iterate over each child element domain/ref pair while ( my ($cdom, $cref) = each(%$parent_ref) ) { - # make array look pretty by limiting entries per line - if ($lineidx == 3) { - $our_dat .= "\n "; - $lineidx = 0; - } elsif ($lineidx == -1) { - $lineidx = 1; - } else { - $our_dat .= " "; - $lineidx += 1; - } - - $our_dat .= "{ "; - $our_dat .= $strtab_ref->{$cdom} . ", "; my $child_count = scalar keys (%$cref); - $our_dat .= $child_count . ", "; - if ($child_count != 0) { - $our_dat .= $$opidx_ref . ", "; - $child_dat .= calc_pnode($cref, $strtab_ref, $opidx_ref); + + $our_dat .= " { "; + $our_dat .= ".label = {" . $strtab_ref->{$cdom} . ", ". pstr_len($cdom) ; + if ($child_count == 0) { + # complete label for no children + $our_dat .= ", 0 } },\n"; } else { - $our_dat .= 0 . ", "; + # complete label with children + $our_dat .= ", 1 } }, "; + $our_dat .= "{ .child = { " . $$opidx_ref . ", " . $child_count . " } },\n"; + $child_dat .= calc_pnode($cref, $strtab_ref, $opidx_ref); } - $our_dat .= pstr_len($cdom) ; - $our_dat .= " },"; - } return $our_dat . $child_dat; @@ -303,21 +301,33 @@ print "};\n\n"; # As labels cannot be more than 63 characters a byte length is more # than sufficient. -print "struct pnode {\n"; -print " uint16_t label; /**< index of domain element in string table */\n"; -print " uint16_t child_count; /* number of children of this node */\n"; -print " uint16_t child_index; /**< index of first child node */\n"; -print " uint8_t label_len; /**< length of domain element in string table */\n"; -print "};\n\n"; -my $opidx = 1; # output index of node +my $opidx = 2; # output index of node +my $opnodes = ""; # output pnode initialisers + +# root node initialiser +$opnodes .= " /* root entry */\n"; +$opnodes .= " { .label = { 0, 0, 1 } }, { .child = { " . $opidx . ", " . scalar keys(%tldtree) . " } },"; + +# generate node initialiser +$opnodes .= calc_pnode(\%tldtree, \%strtab, \$opidx); + + +print "union pnode {\n"; +print " struct {\n"; +print " uint16_t idx; /**< index of domain element in string table */\n"; +print " uint8_t len; /**< length of domain element in string table */\n"; +print " uint8_t children; /**< has children */\n"; +print " } label;\n"; +print " struct {\n"; +print " uint16_t index; /**< index of first child node */\n"; +print " uint16_t count; /* number of children of this node */\n"; +print " } child;\n"; +print "};\n\n"; -print "static const struct pnode pnodes[" . $nodeidx . "] = {\n"; +print "static const union pnode pnodes[" . $opidx . "] = {\n"; -# root node -print " /* root entry */\n"; -print " { 0, " . scalar keys(%tldtree) . ", " . $opidx . ", 0 },"; +# output node initialisors +print $opnodes; -# all subsequent nodes -print calc_pnode(\%tldtree, \%strtab, \$opidx); print "\n};\n\n"; diff --git a/src/nspsl.c b/src/nspsl.c index 2e6e4dd..83e0f4e 100644 --- a/src/nspsl.c +++ b/src/nspsl.c @@ -16,113 +16,127 @@ #define DOMSEP '.' +/** + * match the label element of the domain from a point in the tree + */ static int matchlabel(int parent, const char *start, int len) { - int clast = pnodes[parent].child_index + pnodes[parent].child_count; - int cidx; /*child node index */ - int ridx = -1; /* index of match or -1 */ - - if (pnodes[parent].child_count != 0) { - /* there are child nodes present to scan */ - - for (cidx = pnodes[parent].child_index; cidx < clast; cidx++) { - if (pnodes[cidx].label == STAB_WILDCARD) { - /* wildcard match */ - ridx = cidx; - } else { - if ((pnodes[cidx].label_len == len) && - (strncasecmp(&stab[pnodes[cidx].label], - start, - len) == 0)) { - - if ((pnodes[cidx].child_count == 1) && - (pnodes[pnodes[cidx].child_index].label == STAB_EXCEPTION)) { - /* exception to previous */ - ridx = -1; - } else { - ridx = cidx; - } - break; - } - } - } - } - return ridx; + int cidx; /* child node index */ + int ccount; /* child node count */ + int ridx = -1; /* index of match or -1 */ + + if (pnodes[parent].label.children != 0) { + /* there are child nodes present to scan */ + + cidx = pnodes[parent + 1].child.index; + + for (ccount = pnodes[parent + 1].child.count; + ccount > 0; + ccount--) { + if (pnodes[cidx].label.idx == STAB_WILDCARD) { + /* wildcard match */ + ridx = cidx; + } else { + if ((pnodes[cidx].label.len == len) && + (strncasecmp(&stab[pnodes[cidx].label.idx], + start, + len) == 0)) { + /* label match */ + if ((pnodes[cidx].label.children != 0) && + (pnodes[cidx + 1].child.count == 1) && + (pnodes[pnodes[cidx + 1].child.index].label.idx == STAB_EXCEPTION)) { + /* exception to previous */ + ridx = -1; + } else { + ridx = cidx; + } + break; + } + } + + /* move index to next sibling */ + if (pnodes[cidx].label.children != 0) { + cidx += 2; + } else { + cidx += 1; + } + } + } + return ridx; } /* - * Exported public API + * Exported public API */ const char *nspsl_getpublicsuffix(const char *hostname) { - int treeidx = 0; /* index to current tree node */ - const char *elem_start; - const char *elem_end; - int lab_count = 0; - - /* deal with obviously bad hostname */ - if ((hostname == NULL) || - (hostname[0]) == 0 || - (hostname[0] == DOMSEP)) { - return NULL; - } - - /* hostnames are ass backwards and we need to consider elemets - * from the end first. - */ - elem_end = hostname + strlen(hostname); - /* fqdn have a separator on the end */ - if (elem_end[-1] == DOMSEP) { - elem_end--; - } - elem_start = elem_end; - - /* extract the element and check for a match in our tree */ - for(;;) { - /* find the start of the element */ - while ((elem_start > hostname) && (*elem_start != DOMSEP)) { - elem_start--; - } - if (*elem_start == DOMSEP) { - elem_start++; - } - - lab_count++; - - /* search child nodes for label */ - treeidx = matchlabel(treeidx, elem_start, elem_end - elem_start); - if (treeidx == -1) { - break; - } - - if (elem_start == hostname) { - /* not valid */ - return NULL; - } - - elem_end = elem_start - 1; - elem_start = elem_end - 1; - } - - /* The public suffix algorithm says: "the domain must match - * the public suffix plus one additional label." This - * requires there to be at least two labels so we need to - * check - */ - if (lab_count == 1) { - if (elem_start == hostname) { - elem_start = NULL; - } else { - /* strip the non matching part */ - elem_start -= 2; - while (elem_start > hostname && *elem_start != DOMSEP) { - elem_start--; - } - if (*elem_start == DOMSEP) - elem_start++; - } - } - - - return elem_start; + int treeidx = 0; /* index to current tree node */ + const char *elem_start; + const char *elem_end; + int lab_count = 0; + + /* deal with obviously bad hostname */ + if ((hostname == NULL) || + (hostname[0]) == 0 || + (hostname[0] == DOMSEP)) { + return NULL; + } + + /* hostnames are ass backwards and we need to consider elemets + * from the end first. + */ + elem_end = hostname + strlen(hostname); + /* fqdn have a separator on the end */ + if (elem_end[-1] == DOMSEP) { + elem_end--; + } + elem_start = elem_end; + + /* extract the element and check for a match in our tree */ + for(;;) { + /* find the start of the element */ + while ((elem_start > hostname) && (*elem_start != DOMSEP)) { + elem_start--; + } + if (*elem_start == DOMSEP) { + elem_start++; + } + + lab_count++; + + /* search child nodes for label */ + treeidx = matchlabel(treeidx, elem_start, elem_end - elem_start); + if (treeidx == -1) { + break; + } + + if (elem_start == hostname) { + /* not valid */ + return NULL; + } + + elem_end = elem_start - 1; + elem_start = elem_end - 1; + } + + /* The public suffix algorithm says: "the domain must match + * the public suffix plus one additional label." This + * requires there to be at least two labels so we need to + * check + */ + if (lab_count == 1) { + if (elem_start == hostname) { + elem_start = NULL; + } else { + /* strip the non matching part */ + elem_start -= 2; + while (elem_start > hostname && *elem_start != DOMSEP) { + elem_start--; + } + if (*elem_start == DOMSEP) + elem_start++; + } + } + + return elem_start; } -- cgit v1.2.3