diff options
author | Rob Kendrick (humdrum) <rob.kendrick@codethink.co.uk> | 2013-05-16 17:15:29 +0100 |
---|---|---|
committer | Rob Kendrick (humdrum) <rob.kendrick@codethink.co.uk> | 2013-05-16 17:15:29 +0100 |
commit | 7d60132816341e15a853f2f66f06d0755d0f2daa (patch) | |
tree | db50ed8306258dcd4b242561d76e85cac4263772 /utils/bloom.c | |
parent | 023c014ac439b03de4e45dd4cf567fe3a393bbf7 (diff) | |
download | netsurf-7d60132816341e15a853f2f66f06d0755d0f2daa.tar.gz netsurf-7d60132816341e15a853f2f66f06d0755d0f2daa.tar.bz2 |
Painfully trivial bloom filter implementation to experiment with optimising urldb lookups
Diffstat (limited to 'utils/bloom.c')
-rw-r--r-- | utils/bloom.c | 162 |
1 files changed, 162 insertions, 0 deletions
diff --git a/utils/bloom.c b/utils/bloom.c new file mode 100644 index 000000000..e6b9dcf92 --- /dev/null +++ b/utils/bloom.c @@ -0,0 +1,162 @@ +/* + * Copyright 2013 Rob Kendrick <rjek@netsurf-browser.org> + * + * This file is part of NetSurf, http://www.netsurf-browser.org/ + * + * NetSurf is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * NetSurf is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** \file + * Trivial bloom filter */ + +#include <stdlib.h> +#include <stdio.h> +#include "utils/bloom.h" + +/** + * Hash a string, returning a 32bit value. The hash algorithm used is + * Fowler Noll Vo - a very fast and simple hash, ideal for short strings. + * See http://en.wikipedia.org/wiki/Fowler_Noll_Vo_hash for more details. + * + * \param datum The string to hash. + * \param len size_t of data length. + * \return The calculated hash value for the datum. + */ + +static inline uint32_t fnv(const char *datum, size_t len) +{ + uint32_t z = 0x811c9dc5; + + if (datum == NULL) + return 0; + + while (len--) { + z *= 0x01000193; + z ^= *datum++; + } + + return z; +} + +struct bloom_filter { + size_t size; + uint32_t items; + uint8_t filter[]; +}; + +struct bloom_filter *bloom_create(size_t size) +{ + struct bloom_filter *r = calloc(sizeof(*r) + size, 1); + + if (r == NULL) + return NULL; + + r->size = size; + + return r; +} + +void bloom_destroy(struct bloom_filter *b) +{ + free(b); +} + +void bloom_insert_str(struct bloom_filter *b, const char *s, size_t z) +{ + uint32_t hash = fnv(s, z); + bloom_insert_hash(b, hash); +} + +void bloom_insert_hash(struct bloom_filter *b, uint32_t hash) +{ + int index = hash % b->size; + int bit = hash % 8; + + b->filter[index] |= (1 << bit); + b->items++; +} + +bool bloom_search_str(struct bloom_filter *b, const char *s, size_t z) +{ + uint32_t hash = fnv(s, z); + return bloom_search_hash(b, hash); +} + +bool bloom_search_hash(struct bloom_filter *b, uint32_t hash) +{ + int index = hash % b->size; + int bit = hash % 8; + + return (b->filter[index] & (1 << bit)) != 0; +} + +uint32_t bloom_items(struct bloom_filter *b) +{ + return b->items; +} + +#ifdef TEST_RIG + +#include <stdio.h> +#include <string.h> +#include <assert.h> + +int main(int argc, char *arg[]) +{ + struct bloom_filter *b = bloom_create(8192); + FILE *dict = fopen("/usr/share/dict/words", "r"); + char buf[BUFSIZ]; + int false_positives = 0, total = 0; + + for (int i = 0; i < 8192; i++) { + fscanf(dict, "%s", buf); + printf("adding %s\n", buf); + bloom_insert_str(b, buf, strlen(buf)); + } + + printf("adding NetSurf\n"); + + bloom_insert_str(b, "NetSurf", 7); + printf("checking NetSurf (should be true)\n"); + assert(bloom_search_str(b, "NetSurf", 7)); + + fseek(dict, 0, SEEK_SET); + + for (int i = 0; i < 8192; i++) { + fscanf(dict, "%s", buf); + printf("checking %s (should be true)\n", buf); + assert(bloom_search_str(b, buf, strlen(buf))); + + total++; + } + + for (int i = 0; i < 8192; i++) { + fscanf(dict, "%s", buf); + printf("checking %s (should be false)\n", buf); + if (bloom_search_str(b, buf, strlen(buf)) == true) + false_positives++; + total++; + } + + printf("false positives: %d of %d, %f%%\n", + false_positives, total, + ((float)false_positives / total) * 100); + + fclose(dict); + bloom_destroy(b); + + return 0; +} + +#endif /* TEST_RIG */ + |