From 2fee3114b46682b220c73aae7c4f2466dedb81ff Mon Sep 17 00:00:00 2001 From: Daniel Silverstone Date: Sat, 4 Dec 2010 15:28:50 +0000 Subject: Remove init/final code and turn aliases into static data structure. r=vince svn path=/trunk/libparserutils/; revision=10961 --- src/charset/Makefile | 10 +- src/charset/aliases.c | 459 ++++++++------------------------------------------ src/charset/aliases.h | 14 +- src/charset/charset.c | 54 ------ src/charset/charset.h | 24 --- 5 files changed, 80 insertions(+), 481 deletions(-) delete mode 100644 src/charset/charset.c delete mode 100644 src/charset/charset.h (limited to 'src/charset') diff --git a/src/charset/Makefile b/src/charset/Makefile index a4c8f64..d851b8e 100644 --- a/src/charset/Makefile +++ b/src/charset/Makefile @@ -1,4 +1,12 @@ # Sources -DIR_SOURCES := aliases.c charset.c codec.c +DIR_SOURCES := aliases.c codec.c + +$(DIR)aliases.c: $(DIR)aliases.inc + +$(DIR)aliases.inc: build/make-aliases.pl build/Aliases + $(VQ)$(ECHO) " ALIAS: $@" + $(Q)$(PERL) build/make-aliases.pl + +CLEAN_ITEMS := $(CLEAN_ITEMS) $(DIR)aliases.inc include build/makefiles/Makefile.subdir diff --git a/src/charset/aliases.c b/src/charset/aliases.c index 188a275..5c173d0 100644 --- a/src/charset/aliases.c +++ b/src/charset/aliases.c @@ -17,151 +17,77 @@ #include "charset/aliases.h" #include "utils/utils.h" -struct alias { - struct alias *next; - parserutils_charset_aliases_canon *canon; - uint16_t name_len; - char name[1]; -}; +/* Bring in the aliases tables */ +#include "aliases.inc" -#define HASH_SIZE (43) -static parserutils_charset_aliases_canon *canon_tab[HASH_SIZE]; -static struct alias *alias_tab[HASH_SIZE]; +typedef struct { + size_t slen; + const char *s; +} lengthed_string; -static parserutils_error parserutils_charset_create_alias(const char *alias, - parserutils_charset_aliases_canon *c, - parserutils_alloc alloc, void *pw); -static parserutils_charset_aliases_canon *parserutils_charset_create_canon( - const char *canon, uint16_t mibenum, - parserutils_alloc alloc, void *pw); -static int aliascmp(const char *s1, const char *s2, size_t s2_len); -static uint32_t parserutils_charset_hash_val(const char *alias, size_t len); -/** - * Create alias data from Aliases file - * - * \param filename The path to the Aliases file - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return PARSERUTILS_OK on success, appropriate error otherwise. - */ -parserutils_error parserutils_charset_aliases_create(const char *filename, - parserutils_alloc alloc, void *pw) -{ - char buf[300]; - FILE *fp; - - if (filename == NULL || alloc == NULL) - return PARSERUTILS_BADPARM; - - fp = fopen(filename, "r"); - if (fp == NULL) - return PARSERUTILS_FILENOTFOUND; - - while (fgets(buf, sizeof buf, fp)) { - char *p, *aliases = 0, *mib, *end; - parserutils_charset_aliases_canon *cf; - - if (buf[0] == 0 || buf[0] == '#') - /* skip blank lines or comments */ - continue; - - buf[strlen(buf) - 1] = 0; /* lose terminating newline */ - end = buf + strlen(buf); - - /* find end of canonical form */ - for (p = buf; *p && !isspace(*p) && !iscntrl(*p); p++) - ; /* do nothing */ - if (p >= end) - continue; - *p++ = '\0'; /* terminate canonical form */ - - /* skip whitespace */ - for (; *p && isspace(*p); p++) - ; /* do nothing */ - if (p >= end) - continue; - mib = p; - - /* find end of mibenum */ - for (; *p && !isspace(*p) && !iscntrl(*p); p++) - ; /* do nothing */ - if (p < end) - *p++ = '\0'; /* terminate mibenum */ - - cf = parserutils_charset_create_canon(buf, atoi(mib), alloc, pw); - if (cf == NULL) - continue; - - /* skip whitespace */ - for (; p < end && *p && isspace(*p); p++) - ; /* do nothing */ - if (p >= end) - continue; - aliases = p; - - while (p < end) { - /* find end of alias */ - for (; *p && !isspace(*p) && !iscntrl(*p); p++) - ; /* do nothing */ - if (p > end) - /* stop if we've gone past the end */ - break; - /* terminate current alias */ - *p++ = '\0'; +#define IS_PUNCT_OR_SPACE(x) \ + (!(((x) >= 'A' && (x) <= 'Z') || \ + ((x) >= 'a' && (x) <= 'z') || \ + ((x) >= '0' && (x) <= '9'))) - if (parserutils_charset_create_alias(aliases, cf, - alloc, pw) != PARSERUTILS_OK) - break; - /* in terminating, we may have advanced - * past the end - check this here */ - if (p >= end) - break; - - /* skip whitespace */ - for (; *p && isspace(*p); p++) - ; /* do nothing */ - - if (p >= end) - /* gone past end => stop */ - break; - - /* update pointer to current alias */ - aliases = p; - } - } - - fclose(fp); - - return PARSERUTILS_OK; +static int parserutils_charset_alias_match(const void *a, const void *b) +{ + lengthed_string *s = (lengthed_string *)a; + parserutils_charset_aliases_alias *alias = (parserutils_charset_aliases_alias*)b; + size_t key_left = s->slen; + size_t alias_left = alias->name_len; + const char *s_alias = alias->name; + const char *s_key = s->s; + int cmpret; + + while ((key_left > 0) && (alias_left > 0)) { + while ((key_left > 0) && IS_PUNCT_OR_SPACE(*s_key)) { + key_left--; s_key++; + } + + if (key_left == 0) + break; + + cmpret = tolower(*s_key) - *s_alias; + + if (cmpret != 0) { + return cmpret; + } + + key_left--; + s_key++; + alias_left--; + s_alias++; + } + + return key_left - alias_left; } /** - * Free all alias data + * Retrieve the canonical form of an alias name * - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data + * \param alias The alias name + * \param len The length of the alias name + * \return Pointer to canonical form or NULL if not found */ -void parserutils_charset_aliases_destroy(parserutils_alloc alloc, void *pw) +parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise( + const char *alias, size_t len) { - parserutils_charset_aliases_canon *c, *d; - struct alias *a, *b; - int i; - - for (i = 0; i != HASH_SIZE; i++) { - for (c = canon_tab[i]; c; c = d) { - d = c->next; - alloc(c, 0, pw); - } - canon_tab[i] = NULL; - - for (a = alias_tab[i]; a; a = b) { - b = a->next; - alloc(a, 0, pw); - } - alias_tab[i] = NULL; - } + parserutils_charset_aliases_alias *c; + lengthed_string s = {len, alias}; + + c = (parserutils_charset_aliases_alias*)bsearch(&s, + &charset_aliases[0], + charset_aliases_count, + sizeof(parserutils_charset_aliases_alias), + parserutils_charset_alias_match); + + if (c == NULL) + return NULL; + + return c->canon; } /** @@ -195,13 +121,14 @@ const char *parserutils_charset_mibenum_to_name(uint16_t mibenum) { int i; parserutils_charset_aliases_canon *c; - - for (i = 0; i != HASH_SIZE; i++) - for (c = canon_tab[i]; c; c = c->next) - if (c->mib_enum == mibenum) - return c->name; - - return NULL; + + for (i = 0; i < charset_aliases_canon_count; ++i) { + c = &canonical_charset_names[i]; + if (c->mib_enum == mibenum) + return c->name; + } + + return NULL; } /** @@ -212,253 +139,5 @@ const char *parserutils_charset_mibenum_to_name(uint16_t mibenum) */ bool parserutils_charset_mibenum_is_unicode(uint16_t mibenum) { - static uint16_t ucs4; - static uint16_t ucs2; - static uint16_t utf8; - static uint16_t utf16; - static uint16_t utf16be; - static uint16_t utf16le; - static uint16_t utf32; - static uint16_t utf32be; - static uint16_t utf32le; - - if (ucs4 == 0) { - ucs4 = parserutils_charset_mibenum_from_name("UCS-4", - SLEN("UCS-4")); - ucs2 = parserutils_charset_mibenum_from_name("UCS-2", - SLEN("UCS-2")); - utf8 = parserutils_charset_mibenum_from_name("UTF-8", - SLEN("UTF-8")); - utf16 = parserutils_charset_mibenum_from_name("UTF-16", - SLEN("UTF-16")); - utf16be = parserutils_charset_mibenum_from_name("UTF-16BE", - SLEN("UTF-16BE")); - utf16le = parserutils_charset_mibenum_from_name("UTF-16LE", - SLEN("UTF-16LE")); - utf32 = parserutils_charset_mibenum_from_name("UTF-32", - SLEN("UTF-32")); - utf32be = parserutils_charset_mibenum_from_name("UTF-32BE", - SLEN("UTF-32BE")); - utf32le = parserutils_charset_mibenum_from_name("UTF-32LE", - SLEN("UTF-32LE")); - } - - return (mibenum == ucs4 || mibenum == ucs2 || mibenum == utf8 || - mibenum == utf16 || mibenum == utf16be || - mibenum == utf16le || mibenum == utf32 || - mibenum == utf32be || mibenum == utf32le); -} - -#define IS_PUNCT_OR_SPACE(x) \ - ((0x09 <= (x) && (x) <= 0x0D) || \ - (0x20 <= (x) && (x) <= 0x2F) || \ - (0x3A <= (x) && (x) <= 0x40) || \ - (0x5B <= (x) && (x) <= 0x60) || \ - (0x7B <= (x) && (x) <= 0x7E)) - - -/** - * Compare name "s1" to name "s2" (of size s2_len) case-insensitively - * and ignoring ASCII punctuation characters. - * - * See http://www.whatwg.org/specs/web-apps/current-work/#character0 - * - * \param s1 Alias to compare to - * \param s2 Alias to compare - * \param s2_len Length of "s2" - * \returns 0 if equal, 1 otherwise - */ -int aliascmp(const char *s1, const char *s2, size_t s2_len) -{ - size_t s2_pos = 0; - - if (s1 == NULL || s2_len == 0) - return 1; - - while (true) { - while (IS_PUNCT_OR_SPACE(*s1)) - s1++; - while (s2_pos < s2_len && - IS_PUNCT_OR_SPACE(s2[s2_pos])) { - s2_pos++; - } - - if (s2_pos == s2_len) - return (*s1 != '\0') ? 1 : 0; - - if (tolower(*s1) != tolower(s2[s2_pos])) - break; - s1++; - s2_pos++; - } - - return 1; -} - - -/** - * Retrieve the canonical form of an alias name - * - * \param alias The alias name - * \param len The length of the alias name - * \return Pointer to canonical form or NULL if not found - */ -parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise( - const char *alias, size_t len) -{ - uint32_t hash; - parserutils_charset_aliases_canon *c; - struct alias *a; - - if (alias == NULL) - return NULL; - - hash = parserutils_charset_hash_val(alias, len); - - for (c = canon_tab[hash]; c; c = c->next) - if (aliascmp(c->name, alias, len) == 0) - break; - if (c) - return c; - - for (a = alias_tab[hash]; a; a = a->next) - if (aliascmp(a->name, alias, len) == 0) - break; - if (a) - return a->canon; - - return NULL; -} - - -/** - * Create an alias - * - * \param alias The alias name - * \param c The canonical form - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return PARSERUTILS_OK on success, appropriate error otherwise - */ -parserutils_error parserutils_charset_create_alias(const char *alias, - parserutils_charset_aliases_canon *c, - parserutils_alloc alloc, void *pw) -{ - struct alias *a; - uint32_t hash; - - if (alias == NULL || c == NULL || alloc == NULL) - return PARSERUTILS_BADPARM; - - a = alloc(NULL, sizeof(struct alias) + strlen(alias) + 1, pw); - if (a == NULL) - return PARSERUTILS_NOMEM; - - a->canon = c; - a->name_len = strlen(alias); - strcpy(a->name, alias); - a->name[a->name_len] = '\0'; - - hash = parserutils_charset_hash_val(alias, a->name_len); - - a->next = alias_tab[hash]; - alias_tab[hash] = a; - - return PARSERUTILS_OK; -} - -/** - * Create a canonical form - * - * \param canon The canonical name - * \param mibenum The MIB enum value - * \param alloc Memory (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return Pointer to canonical form or NULL on error - */ -parserutils_charset_aliases_canon *parserutils_charset_create_canon( - const char *canon, uint16_t mibenum, - parserutils_alloc alloc, void *pw) -{ - parserutils_charset_aliases_canon *c; - uint32_t hash, len; - - if (canon == NULL || alloc == NULL) - return NULL; - - len = strlen(canon); - - c = alloc(NULL, sizeof(parserutils_charset_aliases_canon) + len + 1, pw); - if (c == NULL) - return NULL; - - c->mib_enum = mibenum; - c->name_len = len; - strcpy(c->name, canon); - c->name[len] = '\0'; - - hash = parserutils_charset_hash_val(canon, len); - - c->next = canon_tab[hash]; - canon_tab[hash] = c; - - return c; -} - -/** - * Hash function - * - * \param alias String to hash - * \param len Number of bytes to hash (<= strlen(alias)) - * \return The hashed value - */ -uint32_t parserutils_charset_hash_val(const char *alias, size_t len) -{ - const char *s = alias; - uint32_t h = 5381; - - if (alias == NULL) - return 0; - - while (len--) { - if (IS_PUNCT_OR_SPACE(*s)) { - s++; - } else { - h = (h * 33) ^ (*s++ & ~0x20); /* case insensitive */ - } - } - - return h % HASH_SIZE; -} - - -#ifndef NDEBUG -/** - * Dump all alias data to stdout - */ -void parserutils_charset_aliases_dump(void) -{ - parserutils_charset_aliases_canon *c; - struct alias *a; - int i; - size_t size = 0; - - for (i = 0; i != HASH_SIZE; i++) { - for (c = canon_tab[i]; c; c = c->next) { - printf("%d %s\n", i, c->name); - size += offsetof(parserutils_charset_aliases_canon, - name) + c->name_len; - } - - for (a = alias_tab[i]; a; a = a->next) { - printf("%d %s\n", i, a->name); - size += offsetof(struct alias, name) + a->name_len; - } - } - - size += (sizeof(canon_tab) / sizeof(canon_tab[0])); - size += (sizeof(alias_tab) / sizeof(alias_tab[0])); - - printf("%u\n", (unsigned int) size); + return MIBENUM_IS_UNICODE(mibenum); } -#endif diff --git a/src/charset/aliases.h b/src/charset/aliases.h index 9abd2c8..189f8d5 100644 --- a/src/charset/aliases.h +++ b/src/charset/aliases.h @@ -13,24 +13,14 @@ #include typedef struct parserutils_charset_aliases_canon { - struct parserutils_charset_aliases_canon *next; + /* Do not change the ordering here without changing make-aliases.pl */ uint16_t mib_enum; uint16_t name_len; - char name[1]; + const char *name; } parserutils_charset_aliases_canon; -/* Load encoding aliases from file */ -parserutils_error parserutils_charset_aliases_create(const char *filename, - parserutils_alloc alloc, void *pw); -/* Destroy encoding aliases */ -void parserutils_charset_aliases_destroy(parserutils_alloc alloc, void *pw); - /* Canonicalise an alias name */ parserutils_charset_aliases_canon *parserutils_charset_alias_canonicalise( const char *alias, size_t len); -#ifndef NDEBUG -void parserutils_charset_aliases_dump(void); -#endif - #endif diff --git a/src/charset/charset.c b/src/charset/charset.c deleted file mode 100644 index 3ef1a71..0000000 --- a/src/charset/charset.c +++ /dev/null @@ -1,54 +0,0 @@ -/* - * This file is part of LibParserUtils. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -#include "charset/aliases.h" -#include "charset/charset.h" - -/** - * Initialise the Charset library for use. - * - * This _must_ be called before using any libparserutils charset functions - * - * \param aliases_file Pointer to name of file containing encoding alias data - * \param alloc Pointer to (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return PARSERUTILS_OK on success, applicable error otherwise. - */ -parserutils_error parserutils_charset_initialise(const char *aliases_file, - parserutils_alloc alloc, void *pw) -{ - parserutils_error error; - - if (aliases_file == NULL || alloc == NULL) - return PARSERUTILS_BADPARM; - - error = parserutils_charset_aliases_create(aliases_file, alloc, pw); - if (error != PARSERUTILS_OK) - return error; - - return PARSERUTILS_OK; -} - -/** - * Clean up after Libparserutils - * - * \param alloc Pointer to (de)allocation function - * \param pw Pointer to client-specific private data (may be NULL) - * \return PARSERUTILS_OK on success, applicable error otherwise. - */ -parserutils_error parserutils_charset_finalise(parserutils_alloc alloc, - void *pw) -{ - if (alloc == NULL) - return PARSERUTILS_BADPARM; - - parserutils_charset_aliases_destroy(alloc, pw); - - return PARSERUTILS_OK; -} - - diff --git a/src/charset/charset.h b/src/charset/charset.h deleted file mode 100644 index 4b07577..0000000 --- a/src/charset/charset.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * This file is part of LibParserUtils. - * Licensed under the MIT License, - * http://www.opensource.org/licenses/mit-license.php - * Copyright 2007 John-Mark Bell - */ - -#ifndef parserutils_charset_charset_h_ -#define parserutils_charset_charset_h_ - -#include -#include -#include - -/* Initialise the Charset library for use */ -parserutils_error parserutils_charset_initialise(const char *aliases_file, - parserutils_alloc alloc, void *pw); - -/* Clean up after Charset */ -parserutils_error parserutils_charset_finalise(parserutils_alloc alloc, - void *pw); - -#endif - -- cgit v1.2.3