summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRob Kendrick (humdrum) <rob.kendrick@codethink.co.uk>2013-05-20 13:13:43 +0100
committerRob Kendrick (humdrum) <rob.kendrick@codethink.co.uk>2013-05-20 13:13:43 +0100
commitd78c9ccd1003f7ea933819c84d05ca0332f4afaf (patch)
treecf78d72518e5fe076bda864987353b791bbc168d
parent0088273dba9fb6cfde673f2a951320a5055e7075 (diff)
parentf571aebbdbee36256e3c243e555887996e4b7dff (diff)
downloadnetsurf-d78c9ccd1003f7ea933819c84d05ca0332f4afaf.tar.gz
netsurf-d78c9ccd1003f7ea933819c84d05ca0332f4afaf.tar.bz2
Merge branch 'rjek/bloom'
-rw-r--r--content/urldb.c41
-rw-r--r--utils/Makefile4
-rw-r--r--utils/bloom.c163
-rw-r--r--utils/bloom.h99
4 files changed, 304 insertions, 3 deletions
diff --git a/content/urldb.c b/content/urldb.c
index e3cc1d73d..059c3c687 100644
--- a/content/urldb.c
+++ b/content/urldb.c
@@ -107,6 +107,7 @@
#include "utils/filename.h"
#include "utils/url.h"
#include "utils/utils.h"
+#include "utils/bloom.h"
struct cookie_internal_data {
char *name; /**< Cookie name */
@@ -327,6 +328,17 @@ static int loaded_cookie_file_version;
#define MIN_URL_FILE_VERSION 106
#define URL_FILE_VERSION 106
+/* Bloom filter used for short-circuting the false case of "is this
+ * URL in the database?". BLOOM_SIZE controls how large the filter is
+ * in bytes. Primitive experimentation shows that for a filter of X
+ * bytes filled with X items, searching for X items not in the filter
+ * has a 5% false-positive rate. We set it to 32kB, which should be
+ * enough for all but the largest databases, while not being shockingly
+ * wasteful on memory.
+ */
+static struct bloom_filter *url_bloom;
+#define BLOOM_SIZE (1024 * 32)
+
/**
* Import an URL database from file, replacing any existing database
*
@@ -346,7 +358,10 @@ void urldb_load(const char *filename)
assert(filename);
- LOG(("Loading URL file"));
+ LOG(("Loading URL file %s", filename));
+
+ if (url_bloom == NULL)
+ url_bloom = bloom_create(BLOOM_SIZE);
fp = fopen(filename, "r");
if (!fp) {
@@ -456,6 +471,11 @@ void urldb_load(const char *filename)
die("Memory exhausted whilst loading "
"URL file");
}
+
+ if (url_bloom != NULL) {
+ uint32_t hash = nsurl_hash(nsurl);
+ bloom_insert_hash(url_bloom, hash);
+ }
/* Copy and merge path/query strings */
if (nsurl_get(nsurl, NSURL_PATH | NSURL_QUERY,
@@ -782,6 +802,14 @@ bool urldb_add_url(nsurl *url)
unsigned int port_int;
assert(url);
+
+ if (url_bloom == NULL)
+ url_bloom = bloom_create(BLOOM_SIZE);
+
+ if (url_bloom != NULL) {
+ uint32_t hash = nsurl_hash(url);
+ bloom_insert_hash(url_bloom, hash);
+ }
/* Copy and merge path/query strings */
if (nsurl_get(url, NSURL_PATH | NSURL_QUERY, &path_query, &len) !=
@@ -1857,6 +1885,13 @@ struct path_data *urldb_find_url(nsurl *url)
bool match;
assert(url);
+
+ if (url_bloom != NULL) {
+ if (bloom_search_hash(url_bloom,
+ nsurl_hash(url)) == false) {
+ return NULL;
+ }
+ }
scheme = nsurl_get_component(url, NSURL_SCHEME);
if (scheme == NULL)
@@ -3951,6 +3986,10 @@ void urldb_destroy(void)
b = a->next;
urldb_destroy_host_tree(a);
}
+
+ /* And the bloom filter */
+ if (url_bloom != NULL)
+ bloom_destroy(url_bloom);
}
/**
diff --git a/utils/Makefile b/utils/Makefile
index ed34e9557..071e4fec1 100644
--- a/utils/Makefile
+++ b/utils/Makefile
@@ -2,6 +2,6 @@
S_UTILS := base64.c corestrings.c filename.c filepath.c hashtable.c \
libdom.c locale.c log.c messages.c nsurl.c talloc.c url.c \
- utf8.c utils.c useragent.c
+ utf8.c utils.c useragent.c bloom.c
-S_UTILS := $(addprefix utils/,$(S_UTILS)) \ No newline at end of file
+S_UTILS := $(addprefix utils/,$(S_UTILS))
diff --git a/utils/bloom.c b/utils/bloom.c
new file mode 100644
index 000000000..1b07d6f1b
--- /dev/null
+++ b/utils/bloom.c
@@ -0,0 +1,163 @@
+/*
+ * Copyright 2013 Rob Kendrick <rjek@netsurf-browser.org>
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** \file
+ * Trivial bloom filter */
+
+#include <stdlib.h>
+#include "utils/bloom.h"
+
+/**
+ * Hash a string, returning a 32bit value. The hash algorithm used is
+ * Fowler Noll Vo - a very fast and simple hash, ideal for short strings.
+ * See http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash for more details.
+ *
+ * \param datum The string to hash.
+ * \param len size_t of data length.
+ * \return The calculated hash value for the datum.
+ */
+
+static inline uint32_t fnv(const char *datum, size_t len)
+{
+ uint32_t z = 0x811c9dc5;
+
+ if (datum == NULL)
+ return 0;
+
+ while (len--) {
+ z *= 0x01000193;
+ z ^= *datum++;
+ }
+
+ return z;
+}
+
+struct bloom_filter {
+ size_t size;
+ uint32_t items;
+ uint8_t filter[];
+};
+
+struct bloom_filter *bloom_create(size_t size)
+{
+ struct bloom_filter *r = calloc(sizeof(*r) + size, 1);
+
+ if (r == NULL)
+ return NULL;
+
+ r->size = size;
+
+ return r;
+}
+
+void bloom_destroy(struct bloom_filter *b)
+{
+ free(b);
+}
+
+void bloom_insert_str(struct bloom_filter *b, const char *s, size_t z)
+{
+ uint32_t hash = fnv(s, z);
+ bloom_insert_hash(b, hash);
+}
+
+void bloom_insert_hash(struct bloom_filter *b, uint32_t hash)
+{
+ unsigned int index = hash % (b->size << 3);
+ unsigned int byte_index = index >> 3;
+ unsigned int bit_index = index & 7;
+
+ b->filter[byte_index] |= (1 << bit_index);
+ b->items++;
+}
+
+bool bloom_search_str(struct bloom_filter *b, const char *s, size_t z)
+{
+ uint32_t hash = fnv(s, z);
+ return bloom_search_hash(b, hash);
+}
+
+bool bloom_search_hash(struct bloom_filter *b, uint32_t hash)
+{
+ unsigned int index = hash % (b->size << 3);
+ unsigned int byte_index = index >> 3;
+ unsigned int bit_index = index & 7;
+
+ return (b->filter[byte_index] & (1 << bit_index)) != 0;
+}
+
+uint32_t bloom_items(struct bloom_filter *b)
+{
+ return b->items;
+}
+
+#ifdef TEST_RIG
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+int main(int argc, char *arg[])
+{
+ struct bloom_filter *b = bloom_create(8192);
+ FILE *dict = fopen("/usr/share/dict/words", "r");
+ char buf[BUFSIZ];
+ int false_positives = 0, total = 0;
+
+ for (int i = 0; i < 8192; i++) {
+ fscanf(dict, "%s", buf);
+ printf("adding %s\n", buf);
+ bloom_insert_str(b, buf, strlen(buf));
+ }
+
+ printf("adding NetSurf\n");
+
+ bloom_insert_str(b, "NetSurf", 7);
+ printf("checking NetSurf (should be true)\n");
+ assert(bloom_search_str(b, "NetSurf", 7));
+
+ fseek(dict, 0, SEEK_SET);
+
+ for (int i = 0; i < 8192; i++) {
+ fscanf(dict, "%s", buf);
+ printf("checking %s (should be true)\n", buf);
+ assert(bloom_search_str(b, buf, strlen(buf)));
+
+ total++;
+ }
+
+ for (int i = 0; i < 8192; i++) {
+ fscanf(dict, "%s", buf);
+ printf("checking %s (should be false)\n", buf);
+ if (bloom_search_str(b, buf, strlen(buf)) == true)
+ false_positives++;
+ total++;
+ }
+
+ printf("false positives: %d of %d, %f%%\n",
+ false_positives, total,
+ ((float)false_positives / total) * 100);
+
+ fclose(dict);
+ bloom_destroy(b);
+
+ return 0;
+}
+
+#endif /* TEST_RIG */
+
diff --git a/utils/bloom.h b/utils/bloom.h
new file mode 100644
index 000000000..4a7bd3800
--- /dev/null
+++ b/utils/bloom.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2013 Rob Kendrick <rjek@netsurf-browser.org>
+ *
+ * This file is part of NetSurf, http://www.netsurf-browser.org/
+ *
+ * NetSurf is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * NetSurf is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** \file
+ * Trivial bloom filter */
+
+#ifndef _NETSURF_UTILS_BLOOM_H_
+#define _NETSURF_UTILS_BLOOM_H_
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+
+struct bloom_filter;
+
+/**
+ * Create a new bloom filter.
+ *
+ * \param size Size of bloom filter in bytes
+ * \return Handle for newly-created bloom filter, or NULL
+ */
+struct bloom_filter *bloom_create(size_t size);
+
+/**
+ * Destroy a previously-created bloom filter
+ *
+ * \param b Bloom filter to destroy
+ */
+void bloom_destroy(struct bloom_filter *b);
+
+/**
+ * Insert a string of given length (may include NULs) into the filter,
+ * using an internal hash function.
+ *
+ * \param b Bloom filter to add to
+ * \param s Pointer to data
+ * \param z Length of data
+ */
+void bloom_insert_str(struct bloom_filter *b, const char *s, size_t z);
+
+/**
+ * Insert a given hash value into the filter, should you already have
+ * one to hand.
+ *
+ * \param b Bloom filter to add to
+ * \param hash Value to add
+ */
+void bloom_insert_hash(struct bloom_filter *b, uint32_t hash);
+
+/**
+ * Search the filter for the given string, assuming it was added by
+ * bloom_insert_str(). May return false-positives.
+ *
+ * \param b Bloom filter to search
+ * \param s Pointer to data to search for
+ * \param z Length of data
+ *
+ * \return False if never added, True if it might have been.
+ */
+bool bloom_search_str(struct bloom_filter *b, const char *s, size_t z);
+
+/**
+ * Search the filter for the given hash value, assuming it was added by
+ * bloom_insert_hash(). May return false-positives.
+ *
+ * \param b Bloom filter to search
+ * \param hash Hash value to search for
+ *
+ * \return False if never added, True if it might have been.
+ */
+bool bloom_search_hash(struct bloom_filter *b, uint32_t hash);
+
+/**
+ * Find out how many items have been added to this bloom filter. This
+ * is useful for deciding the size of a new bloom filter should you
+ * need to rehash it.
+ *
+ * \param b Bloom filter to examine
+ *
+ * \return Number of items that have been added
+ */
+uint32_t bloom_items(struct bloom_filter *b);
+
+#endif