From aa0aa44aaab4b0693400dc4307822ac493014224 Mon Sep 17 00:00:00 2001 From: John Mark Bell Date: Sun, 30 Nov 2008 16:35:19 +0000 Subject: New datastructures: + Chunked array + Hash table (open addressing) Constify parameter to parserutils_stack_push svn path=/trunk/libparserutils/; revision=5850 --- src/utils/Makefile | 3 +- src/utils/chunkarray.c | 213 +++++++++++++++++++++++++++++++ src/utils/chunkarray.h | 26 ++++ src/utils/hash.c | 338 +++++++++++++++++++++++++++++++++++++++++++++++++ src/utils/stack.c | 3 +- 5 files changed, 581 insertions(+), 2 deletions(-) create mode 100644 src/utils/chunkarray.c create mode 100644 src/utils/chunkarray.h create mode 100644 src/utils/hash.c (limited to 'src') diff --git a/src/utils/Makefile b/src/utils/Makefile index c786a09..f32ef72 100644 --- a/src/utils/Makefile +++ b/src/utils/Makefile @@ -35,7 +35,8 @@ d := $(DIR) CFLAGS := $(CFLAGS) -I$(d) # Sources -SRCS_$(d) := buffer.c dict.c errors.c rbtree.c stack.c vector.c +SRCS_$(d) := chunkarray.c buffer.c dict.c errors.c hash.c rbtree.c stack.c \ + vector.c # Append to sources for component SOURCES += $(addprefix $(d), $(SRCS_$(d))) diff --git a/src/utils/chunkarray.c b/src/utils/chunkarray.c new file mode 100644 index 0000000..308f537 --- /dev/null +++ b/src/utils/chunkarray.c @@ -0,0 +1,213 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2008 John-Mark Bell + */ + +#include +#include + +#include "utils/chunkarray.h" + +typedef struct chunk chunk; + +struct chunk { +#define CHUNK_SIZE (4096) + chunk *next; + + uint32_t used; + + uint8_t data[CHUNK_SIZE]; +}; + +struct parserutils_chunkarray { + chunk *used_chunks; + + chunk *free_chunks; + + parserutils_alloc alloc; + void *pw; +}; + +/** + * Create a chunked array + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data + * \param array Pointer to location to receive array instance + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_chunkarray_create(parserutils_alloc alloc, + void *pw, parserutils_chunkarray **array) +{ + parserutils_chunkarray *c; + + if (alloc == NULL || array == NULL) + return PARSERUTILS_BADPARM; + + c = alloc(0, sizeof(parserutils_chunkarray), pw); + if (c == NULL) + return PARSERUTILS_NOMEM; + + c->used_chunks = NULL; + c->free_chunks = NULL; + + c->alloc = alloc; + c->pw = pw; + + *array = c; + + return PARSERUTILS_OK; +} + +/** + * Destroy a chunked array + * + * \param array The array to destroy + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_chunkarray_destroy(parserutils_chunkarray *array) +{ + chunk *c, *d; + + if (array == NULL) + return PARSERUTILS_BADPARM; + + for (c = array->used_chunks; c != NULL; c = d) { + d = c->next; + array->alloc(c, 0, array->pw); + } + + for (c = array->free_chunks; c != NULL; c = d) { + d = c->next; + array->alloc(c, 0, array->pw); + } + + array->alloc(array, 0, array->pw); + + return PARSERUTILS_OK; +} + +/** + * Insert an item into an array + * + * \param array The array to insert into + * \param data Pointer to data to insert + * \param len Length, in bytes, of data + * \param inserted Pointer to location to receive pointer to inserted data + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_chunkarray_insert(parserutils_chunkarray *array, + const void *data, size_t len, const void **inserted) +{ + if (array == NULL || data == NULL || inserted == NULL) + return PARSERUTILS_BADPARM; + + /* If we're trying to insert data larger than CHUNK_SIZE, then it + * gets its own chunk. */ + if (len > CHUNK_SIZE) { + chunk *c = array->alloc(0, sizeof(chunk) + len - CHUNK_SIZE, + array->pw); + if (c == NULL) + return PARSERUTILS_NOMEM; + + /* Populate chunk */ + (*inserted) = c->data; + memcpy(c->data, data, len); + c->used = len; + + /* And put it in the used list */ + c->next = array->used_chunks; + array->used_chunks = c; + } else { + /* Scan the free list until we find a chunk with enough space */ + chunk *c, *prev; + + for (prev = NULL, c = array->free_chunks; c != NULL; + prev = c, c = c->next) { + if (CHUNK_SIZE - c->used >= len) + break; + } + + if (c == NULL) { + /* None big enough, create a new one */ + c = array->alloc(0, sizeof(chunk), array->pw); + if (c == NULL) + return PARSERUTILS_NOMEM; + + c->used = 0; + + /* Insert it at the head of the free list */ + c->next = array->free_chunks; + array->free_chunks = c; + + /* And ensure that prev is kept in sync */ + prev = NULL; + } + + /* Populate chunk */ + (*inserted) = c->data + c->used; + memcpy(c->data + c->used, data, len); + c->used += len; + + /* If we've now filled the chunk, move it to the used list */ + if (c->used == CHUNK_SIZE) { + if (prev != NULL) + prev->next = c->next; + else + array->free_chunks = c->next; + + c->next = array->used_chunks; + array->used_chunks = c; + } + } + + return PARSERUTILS_OK; +} + +/****************************************************************************** + * Private functions * + ******************************************************************************/ + +extern void parserutils_chunkarray_dump(parserutils_chunkarray *array); + +/** + * Dump details of a chunked array to stdout + * + * \param array The array to dump + */ +void parserutils_chunkarray_dump(parserutils_chunkarray *array) +{ + uint32_t n = 0; + size_t count = 0; + size_t total = sizeof(parserutils_chunkarray); + + for (chunk *c = array->used_chunks; c != NULL; c = c->next) { + n++; + count += c->used; + if (c->used == CHUNK_SIZE) + total += sizeof(chunk); + else + total += sizeof(chunk) + c->used - CHUNK_SIZE; + } + + printf("%u full blocks: %zu bytes\n", n, count); + + n = 0; + count = 0; + + for (chunk *c = array->free_chunks; c != NULL; c = c->next) { + n++; + count += c->used; + total += sizeof(chunk); + } + + printf("%u partial blocks: %zu bytes (of %u => %f%%)\n", n, count, + n * CHUNK_SIZE, + (count * 100) / ((float) n * CHUNK_SIZE)); + + printf("Total: %zu (%lu) (%lu)\n", total, sizeof(chunk), + sizeof(parserutils_chunkarray)); +} + diff --git a/src/utils/chunkarray.h b/src/utils/chunkarray.h new file mode 100644 index 0000000..4ee98cd --- /dev/null +++ b/src/utils/chunkarray.h @@ -0,0 +1,26 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2008 John-Mark Bell + */ + +#ifndef parserutils_utils_chunkarray_h_ +#define parserutils_utils_chunkarray_h_ + +#include +#include + +struct parserutils_chunkarray; +typedef struct parserutils_chunkarray parserutils_chunkarray; + +parserutils_error parserutils_chunkarray_create(parserutils_alloc alloc, + void *pw, parserutils_chunkarray **array); +parserutils_error parserutils_chunkarray_destroy(parserutils_chunkarray *array); + +parserutils_error parserutils_chunkarray_insert(parserutils_chunkarray *array, + const void *data, size_t len, + const void **inserted); + +#endif + diff --git a/src/utils/hash.c b/src/utils/hash.c new file mode 100644 index 0000000..1694d72 --- /dev/null +++ b/src/utils/hash.c @@ -0,0 +1,338 @@ +/* + * This file is part of LibParserUtils. + * Licensed under the MIT License, + * http://www.opensource.org/licenses/mit-license.php + * Copyright 2008 John-Mark Bell + */ + +#include +#include + +#include + +#include "utils/chunkarray.h" + +typedef struct slot_table slot_table; + +struct parserutils_hash { + slot_table *slots; + + parserutils_chunkarray *data; + parserutils_chunkarray *entries; + + parserutils_alloc alloc; + void *pw; +}; + +struct slot_table { +#define DEFAULT_SLOTS (1<<6) + size_t n_slots; + size_t n_used; + + const parserutils_hash_entry *slots[]; +}; + +static inline uint32_t _hash(const uint8_t *data, size_t len); +static inline int _cmp(const uint8_t *a, size_t alen, + const uint8_t *b, size_t blen); +static inline void grow_slots(parserutils_hash *hash); + +/** + * Create a hash + * + * \param alloc Memory (de)allocation function + * \param pw Pointer to client-specific private data + * \param hash Pointer to location to receive result + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_hash_create(parserutils_alloc alloc, void *pw, + parserutils_hash **hash) +{ + parserutils_hash *h; + parserutils_error error; + + if (alloc == NULL || hash == NULL) + return PARSERUTILS_BADPARM; + + h = alloc(0, sizeof(parserutils_hash), pw); + if (h == NULL) + return PARSERUTILS_NOMEM; + + h->slots = alloc(0, sizeof(slot_table) + + DEFAULT_SLOTS * sizeof(parserutils_hash_entry *), + pw); + if (h->slots == NULL) { + alloc(h, 0, pw); + return PARSERUTILS_NOMEM; + } + + memset(h->slots, 0, sizeof(slot_table) + + DEFAULT_SLOTS * sizeof(parserutils_hash_entry *)); + h->slots->n_slots = DEFAULT_SLOTS; + + error = parserutils_chunkarray_create(alloc, pw, &h->data); + if (error != PARSERUTILS_OK) { + alloc(h->slots, 0, pw); + alloc(h, 0, pw); + return error; + } + + error = parserutils_chunkarray_create(alloc, pw, &h->entries); + if (error != PARSERUTILS_OK) { + alloc(h->data, 0, pw); + alloc(h->slots, 0, pw); + alloc(h, 0, pw); + return error; + } + + h->alloc = alloc; + h->pw = pw; + + *hash = h; + + return PARSERUTILS_OK; +} + +/** + * Destroy a hash + * + * \param hash The hash to destroy + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_hash_destroy(parserutils_hash *hash) +{ + if (hash == NULL) + return PARSERUTILS_BADPARM; + + parserutils_chunkarray_destroy(hash->data); + + parserutils_chunkarray_destroy(hash->entries); + + hash->alloc(hash->slots, 0, hash->pw); + + hash->alloc(hash, 0, hash->pw); + + return PARSERUTILS_OK; +} + +/** + * Insert an item into a hash + * + * \param hash The hash to insert into + * \param data Pointer to data + * \param len Length, in bytes, of data + * \param inserted Pointer to location to receive pointer to data in hash + * \return PARSERUTILS_OK on success, appropriate error otherwise + */ +parserutils_error parserutils_hash_insert(parserutils_hash *hash, + const uint8_t *data, size_t len, + const parserutils_hash_entry **inserted) +{ + uint32_t index; + slot_table *slots; + const void *idata, *ientry; + parserutils_hash_entry entry; + parserutils_error error; + + if (hash == NULL || data == NULL || inserted == NULL) + return PARSERUTILS_BADPARM; + + slots = hash->slots; + + /* Find index */ + index = _hash(data, len) % slots->n_slots; + + /* Use linear probing to resolve collisions */ + while (slots->slots[index] != NULL) { + /* If this data is already in the hash, return it */ + if (_cmp(data, len, slots->slots[index]->data, + slots->slots[index]->len) == 0) { + (*inserted) = slots->slots[index]; + return PARSERUTILS_OK; + } + + index = (index + 1) % slots->n_slots; + } + + /* The data isn't in the hash. Index identifies the slot to use */ + error = parserutils_chunkarray_insert(hash->data, data, len, &idata); + if (error != PARSERUTILS_OK) + return error; + + entry.len = len; + entry.data = idata; + + error = parserutils_chunkarray_insert(hash->entries, + &entry, sizeof(parserutils_hash_entry), &ientry); + if (error != PARSERUTILS_OK) { + /* We effectively leak the inserted data, as chunkarray + * doesn't support deletion. */ + return error; + } + + (*inserted) = slots->slots[index] = ientry; + + /* If the table is 75% full, then increase its size */ + if (++slots->n_used >= (slots->n_slots >> 1) + (slots->n_slots >> 2)) + grow_slots(hash); + + return PARSERUTILS_OK; +} + +/****************************************************************************** + * Private functions * + ******************************************************************************/ + +/** + * Hsieh hash function + * + * \param data Pointer to data to hash + * \param len Length, in bytes, of data + * \return hash value + */ +uint32_t _hash(const uint8_t *data, size_t len) +{ + uint32_t hash = len, tmp; + int rem = len & 3; + +#define read16(d) ((((uint32_t)((d)[1])) << 8) | ((uint32_t)((d)[0]))) + + for (len = len / 4; len > 0; len--) { + hash += read16(data); + tmp = (read16(data + 2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 4; + hash += hash >> 11; + } + + switch (rem) { + case 3: + hash += read16(data); + hash ^= hash << 16; + hash ^= data[2] << 18; + hash += hash >> 11; + break; + case 2: + hash += read16(data); + hash ^= hash << 11; + hash += hash >> 17; + break; + case 1: + hash += data[0]; + hash ^= hash << 10; + hash += hash >> 1; + } + +#undef read16 + + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; +} + +/** + * Comparator for hash entries + * + * \param a First item to consider + * \param alen Length, in bytes, of a + * \param b Second item to consider + * \param blen Length, in bytes, of b + * \return <0 iff a0 iff a>b + */ +int _cmp(const uint8_t *a, size_t alen, const uint8_t *b, size_t blen) +{ + int result; + + /* Check for identity first */ + if (a == b) + return 0; + + /* Sort first by length, then by data equality */ + if ((result = alen - blen) == 0) + result = memcmp(a, b, alen); + + return result; +} + +/** + * Increase the size of the slot table + * + * \param hash The hash to process + */ +void grow_slots(parserutils_hash *hash) +{ + slot_table *s; + size_t new_size; + + if (hash == NULL) + return; + + new_size = hash->slots->n_slots << 1; + + /* Create new slot table */ + s = hash->alloc(0, sizeof(slot_table) + + new_size * sizeof(parserutils_hash_entry *), + hash->pw); + if (s == NULL) + return; + + memset(s, 0, sizeof(slot_table) + + new_size * sizeof(parserutils_hash_entry *)); + s->n_slots = new_size; + + /* Now, populate it with the contents of the current one */ + for (uint32_t i = 0; i < hash->slots->n_slots; i++) { + const parserutils_hash_entry *e = hash->slots->slots[i]; + + if (e == NULL) + continue; + + /* Find new index */ + uint32_t index = _hash(e->data, e->len) % s->n_slots; + + /* Use linear probing to resolve collisions */ + while (s->slots[index] != NULL) + index = (index + 1) % s->n_slots; + + /* Insert into new slot table */ + s->slots[index] = e; + s->n_used++; + } + + /* Destroy current table, and replace it with the new one */ + hash->alloc(hash->slots, 0, hash->pw); + hash->slots = s; + + return; +} + +extern void parserutils_chunkarray_dump(parserutils_chunkarray *array); +extern void parserutils_hash_dump(parserutils_hash *hash); + +/** + * Dump details of a hash to stdout + * + * \param hash The hash to dump + */ +void parserutils_hash_dump(parserutils_hash *hash) +{ + printf("%zu slots used (of %zu => %f%%)\n", hash->slots->n_used, + hash->slots->n_slots, + hash->slots->n_used * 100 / (float) hash->slots->n_slots); + + printf("Data:\n"); + parserutils_chunkarray_dump(hash->data); + + printf("Entries:\n"); + parserutils_chunkarray_dump(hash->entries); + + printf("Hash structures: %zu\n", + sizeof(parserutils_hash) + sizeof(slot_table) + + hash->slots->n_slots * sizeof(parserutils_hash_entry *)); +} + diff --git a/src/utils/stack.c b/src/utils/stack.c index a157597..725a23a 100644 --- a/src/utils/stack.c +++ b/src/utils/stack.c @@ -92,7 +92,8 @@ parserutils_error parserutils_stack_destroy(parserutils_stack *stack) * \param item The item to push * \return PARSERUTILS_OK on success, appropriate error otherwise */ -parserutils_error parserutils_stack_push(parserutils_stack *stack, void *item) +parserutils_error parserutils_stack_push(parserutils_stack *stack, + const void *item) { int32_t slot; -- cgit v1.2.3