diff options
author | Vincent Sanders <vince@kyllikki.org> | 2014-11-01 23:27:29 +0000 |
---|---|---|
committer | Vincent Sanders <vince@kyllikki.org> | 2014-11-01 23:27:29 +0000 |
commit | cf7abb4a0ad6a6de3acf3215ca6d31fdebbf4708 (patch) | |
tree | 68dea68cf0f765e3bbc18adccebc303864ccbc9d | |
parent | 1ea422a9d11f7253f08cc4cc39cf0cbfee79c030 (diff) | |
download | netsurf-cf7abb4a0ad6a6de3acf3215ca6d31fdebbf4708.tar.gz netsurf-cf7abb4a0ad6a6de3acf3215ca6d31fdebbf4708.tar.bz2 |
restructure urldb source
remove forward declarations and restructure. exported functions are
also now documented in the urldb.h header.
-rw-r--r-- | content/urldb.c | 4800 | ||||
-rw-r--r-- | content/urldb.h | 216 |
2 files changed, 2530 insertions, 2486 deletions
diff --git a/content/urldb.c b/content/urldb.c index bf873c62e..8af6ae150 100644 --- a/content/urldb.c +++ b/content/urldb.c @@ -17,8 +17,9 @@ * along with this program. If not, see <http://www.gnu.org/licenses/>. */ -/** \file - * Unified URL information database (implementation) +/** + * \file + * Unified URL information database implementation * * URLs are stored in a tree-based structure as follows: * @@ -81,8 +82,8 @@ * simpler implementation. Entries in this tree comprise pointers to the * leaf nodes of the host tree described above. * - * REALLY IMPORTANT NOTE: urldb expects all URLs to be normalised. Use of - * non-normalised URLs with urldb will result in undefined behaviour and + * REALLY IMPORTANT NOTE: urldb expects all URLs to be normalised. Use of + * non-normalised URLs with urldb will result in undefined behaviour and * potential crashes. */ @@ -217,94 +218,6 @@ struct search_node { struct search_node *right; /**< Right subtree */ }; -/* Destruction */ -static void urldb_destroy_host_tree(struct host_part *root); -static void urldb_destroy_path_tree(struct path_data *root); -static void urldb_destroy_path_node_content(struct path_data *node); -static void urldb_destroy_cookie(struct cookie_internal_data *c); -static void urldb_destroy_prot_space(struct prot_space_data *space); -static void urldb_destroy_search_tree(struct search_node *root); - -/* Saving */ -static void urldb_save_search_tree(struct search_node *root, FILE *fp); -static void urldb_count_urls(const struct path_data *root, time_t expiry, - unsigned int *count); -static void urldb_write_paths(const struct path_data *parent, - const char *host, FILE *fp, char **path, int *path_alloc, - int *path_used, time_t expiry); - -/* Iteration */ -static bool urldb_iterate_partial_host(struct search_node *root, - const char *prefix, bool (*callback)(nsurl *url, - const struct url_data *data)); -static bool urldb_iterate_partial_path(const struct path_data *parent, - const char *prefix, bool 
(*callback)(nsurl *url, - const struct url_data *data)); -static bool urldb_iterate_entries_host(struct search_node *parent, - bool (*url_callback)(nsurl *url, - const struct url_data *data), - bool (*cookie_callback)(const struct cookie_data *data)); -static bool urldb_iterate_entries_path(const struct path_data *parent, - bool (*url_callback)(nsurl *url, - const struct url_data *data), - bool (*cookie_callback)(const struct cookie_data *data)); - -/* Insertion */ -static struct host_part *urldb_add_host_node(const char *part, - struct host_part *parent); -static struct path_data *urldb_add_path_node(lwc_string *scheme, - unsigned int port, const char *segment, lwc_string *fragment, - struct path_data *parent); -static int urldb_add_path_fragment_cmp(const void *a, const void *b); -static struct path_data *urldb_add_path_fragment(struct path_data *segment, - lwc_string *fragment); - -/* Lookup */ -static struct path_data *urldb_find_url(nsurl *url); -static struct path_data *urldb_match_path(const struct path_data *parent, - const char *path, lwc_string *scheme, unsigned short port); -static struct search_node **urldb_get_search_tree_direct(const char *host); -static struct search_node *urldb_get_search_tree(const char *host); - -/* Dump */ -static void urldb_dump_hosts(struct host_part *parent); -static void urldb_dump_paths(struct path_data *parent); -static void urldb_dump_search(struct search_node *parent, int depth); - -/* Search tree */ -static struct search_node *urldb_search_insert(struct search_node *root, - const struct host_part *data); -static struct search_node *urldb_search_insert_internal( - struct search_node *root, struct search_node *n); -/* for urldb_search_remove, see r5531 which removed it */ -static const struct host_part *urldb_search_find(struct search_node *root, - const char *host); -static struct search_node *urldb_search_skew(struct search_node *root); -static struct search_node *urldb_search_split(struct search_node *root); -static int 
urldb_search_match_host(const struct host_part *a, - const struct host_part *b); -static int urldb_search_match_string(const struct host_part *a, - const char *b); -static int urldb_search_match_prefix(const struct host_part *a, - const char *b); - -/* Cookies */ -static struct cookie_internal_data *urldb_parse_cookie(nsurl *url, - const char **cookie); -static bool urldb_parse_avpair(struct cookie_internal_data *c, char *n, - char *v, bool was_quoted); -static bool urldb_insert_cookie(struct cookie_internal_data *c, - lwc_string *scheme, nsurl *url); -static void urldb_free_cookie(struct cookie_internal_data *c); -static bool urldb_concat_cookie(struct cookie_internal_data *c, int version, - int *used, int *alloc, char **buf); -static void urldb_delete_cookie_hosts(const char *domain, const char *path, - const char *name, struct host_part *parent); -static void urldb_delete_cookie_paths(const char *domain, const char *path, - const char *name, struct path_data *parent); -static void urldb_save_cookie_hosts(FILE *fp, struct host_part *parent); -static void urldb_save_cookie_paths(FILE *fp, struct path_data *parent); - /** Root database handle */ static struct host_part db_root; @@ -325,6 +238,8 @@ static struct search_node *search_trees[NUM_SEARCH_TREES] = { #define COOKIE_FILE_VERSION 102 static int loaded_cookie_file_version; #define MIN_URL_FILE_VERSION 106 + +/** URL database file version */ #define URL_FILE_VERSION 106 /* Bloom filter used for short-circuting the false case of "is this @@ -338,323 +253,7 @@ static int loaded_cookie_file_version; static struct bloom_filter *url_bloom; #define BLOOM_SIZE (1024 * 32) -/** - * Import an URL database from file, replacing any existing database - * - * \param filename Name of file containing data - */ -nserror urldb_load(const char *filename) -{ -#define MAXIMUM_URL_LENGTH 4096 - char s[MAXIMUM_URL_LENGTH]; - char host[256]; - struct host_part *h; - int urls; - int i; - int version; - int length; - FILE *fp; - - 
assert(filename); - - LOG(("Loading URL file %s", filename)); - if (url_bloom == NULL) - url_bloom = bloom_create(BLOOM_SIZE); - - fp = fopen(filename, "r"); - if (!fp) { - LOG(("Failed to open file '%s' for reading", filename)); - return NSERROR_NOT_FOUND; - } - - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) { - fclose(fp); - return NSERROR_NEED_DATA; - } - - version = atoi(s); - if (version < MIN_URL_FILE_VERSION) { - LOG(("Unsupported URL file version.")); - fclose(fp); - return NSERROR_INVALID; - } - if (version > URL_FILE_VERSION) { - LOG(("Unknown URL file version.")); - fclose(fp); - return NSERROR_INVALID; - } - - while (fgets(host, sizeof host, fp)) { - /* get the hostname */ - length = strlen(host) - 1; - host[length] = '\0'; - - /* skip data that has ended up with a host of '' */ - if (length == 0) { - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - urls = atoi(s); - /* Eight fields/url */ - for (i = 0; i < (8 * urls); i++) { - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - } - continue; - } - - /* read number of URLs */ - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - urls = atoi(s); - - /* no URLs => try next host */ - if (urls == 0) { - LOG(("No URLs for '%s'", host)); - continue; - } - - h = urldb_add_host(host); - if (!h) { - LOG(("Failed adding host: '%s'", host)); - fclose(fp); - return NSERROR_NOMEM; - } - - /* load the non-corrupt data */ - for (i = 0; i < urls; i++) { - struct path_data *p = NULL; - char scheme[64], ports[10]; - char url[64 + 3 + 256 + 6 + 4096 + 1]; - unsigned int port; - bool is_file = false; - nsurl *nsurl; - lwc_string *scheme_lwc, *fragment_lwc; - char *path_query; - size_t len; - - if (!fgets(scheme, sizeof scheme, fp)) - break; - length = strlen(scheme) - 1; - scheme[length] = '\0'; - - if (!fgets(ports, sizeof ports, fp)) - break; - length = strlen(ports) - 1; - ports[length] = '\0'; - port = atoi(ports); - - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - length = strlen(s) - 1; - s[length] = '\0'; - - if 
(!strcasecmp(host, "localhost") && - !strcasecmp(scheme, "file")) - is_file = true; - - snprintf(url, sizeof url, "%s://%s%s%s%s", - scheme, - /* file URLs have no host */ - (is_file ? "" : host), - (port ? ":" : ""), - (port ? ports : ""), - s); - - /* TODO: store URLs in pre-parsed state, and make - * a nsurl_load to generate the nsurl more - * swiftly. - * Need a nsurl_save too. - */ - if (nsurl_create(url, &nsurl) != NSERROR_OK) { - LOG(("Failed inserting '%s'", url)); - fclose(fp); - return NSERROR_NOMEM; - } - - if (url_bloom != NULL) { - uint32_t hash = nsurl_hash(nsurl); - bloom_insert_hash(url_bloom, hash); - } - - /* Copy and merge path/query strings */ - if (nsurl_get(nsurl, NSURL_PATH | NSURL_QUERY, - &path_query, &len) != NSERROR_OK) { - LOG(("Failed inserting '%s'", url)); - fclose(fp); - return NSERROR_NOMEM; - } - - scheme_lwc = nsurl_get_component(nsurl, NSURL_SCHEME); - fragment_lwc = nsurl_get_component(nsurl, - NSURL_FRAGMENT); - p = urldb_add_path(scheme_lwc, port, h, path_query, - fragment_lwc, nsurl); - if (!p) { - LOG(("Failed inserting '%s'", url)); - fclose(fp); - return NSERROR_NOMEM; - } - nsurl_unref(nsurl); - lwc_string_unref(scheme_lwc); - if (fragment_lwc != NULL) - lwc_string_unref(fragment_lwc); - - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - if (p) - p->urld.visits = (unsigned int)atoi(s); - - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - if (p) - p->urld.last_visit = (time_t)atoi(s); - - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - if (p) - p->urld.type = (content_type)atoi(s); - - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - - - if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) - break; - length = strlen(s) - 1; - if (p && length > 0) { - s[length] = '\0'; - p->urld.title = malloc(length + 1); - if (p->urld.title) - memcpy(p->urld.title, s, length + 1); - } - } - } - - fclose(fp); - LOG(("Successfully loaded URL file")); -#undef MAXIMUM_URL_LENGTH - - return NSERROR_OK; -} - -/** - * Export the current database to file 
- * - * \param filename Name of file to export to - */ -void urldb_save(const char *filename) -{ - FILE *fp; - int i; - - assert(filename); - - fp = fopen(filename, "w"); - if (!fp) { - LOG(("Failed to open file '%s' for writing", filename)); - return; - } - - /* file format version number */ - fprintf(fp, "%d\n", URL_FILE_VERSION); - - for (i = 0; i != NUM_SEARCH_TREES; i++) { - urldb_save_search_tree(search_trees[i], fp); - } - - fclose(fp); -} - -/** - * Save a search (sub)tree - * - * \param root Root of (sub)tree to save - * \param fp File to write to - */ -void urldb_save_search_tree(struct search_node *parent, FILE *fp) -{ - char host[256]; - const struct host_part *h; - unsigned int path_count = 0; - char *path, *p, *end; - int path_alloc = 64, path_used = 1; - time_t expiry; - - expiry = time(NULL) - ((60 * 60 * 24) * nsoption_int(expire_url)); - - if (parent == &empty) - return; - - urldb_save_search_tree(parent->left, fp); - - path = malloc(path_alloc); - if (!path) - return; - - path[0] = '\0'; - - for (h = parent->data, p = host, end = host + sizeof host; - h && h != &db_root && p < end; h = h->parent) { - int written = snprintf(p, end - p, "%s%s", h->part, - (h->parent && h->parent->parent) ? "." 
: ""); - if (written < 0) { - free(path); - return; - } - p += written; - } - - urldb_count_urls(&parent->data->paths, expiry, &path_count); - - if (path_count > 0) { - fprintf(fp, "%s\n%i\n", host, path_count); - - urldb_write_paths(&parent->data->paths, host, fp, - &path, &path_alloc, &path_used, expiry); - } - - free(path); - - urldb_save_search_tree(parent->right, fp); -} - -/** - * Count number of URLs associated with a host - * - * \param root Root of path data tree - * \param expiry Expiry time for URLs - * \param count Pointer to count - */ -void urldb_count_urls(const struct path_data *root, time_t expiry, - unsigned int *count) -{ - const struct path_data *p = root; - - do { - if (p->children != NULL) { - /* Drill down into children */ - p = p->children; - } else { - /* No more children, increment count if required */ - if (p->persistent || ((p->urld.last_visit > expiry) && - (p->urld.visits > 0))) - (*count)++; - - /* Now, find next node to process. */ - while (p != root) { - if (p->next != NULL) { - /* Have a sibling, process that */ - p = p->next; - break; - } - - /* Ascend tree */ - p = p->parent; - } - } - } while (p != root); -} /** * Write paths associated with a host @@ -667,7 +266,7 @@ void urldb_count_urls(const struct path_data *root, time_t expiry, * \param path_used Used size of path * \param expiry Expiry time of URLs */ -void urldb_write_paths(const struct path_data *parent, const char *host, +static void urldb_write_paths(const struct path_data *parent, const char *host, FILE *fp, char **path, int *path_alloc, int *path_used, time_t expiry) { @@ -706,7 +305,7 @@ void urldb_write_paths(const struct path_data *parent, const char *host, } else { /* leaf node */ if (p->persistent ||((p->urld.last_visit > expiry) && - (p->urld.visits > 0))) { + (p->urld.visits > 0))) { fprintf(fp, "%s\n", lwc_string_data(p->scheme)); if (p->port) @@ -719,8 +318,8 @@ void urldb_write_paths(const struct path_data *parent, const char *host, /** \todo handle 
fragments? */ fprintf(fp, "%i\n%i\n%i\n", p->urld.visits, - (int)p->urld.last_visit, - (int)p->urld.type); + (int)p->urld.last_visit, + (int)p->urld.type); fprintf(fp, "\n"); @@ -730,8 +329,8 @@ void urldb_write_paths(const struct path_data *parent, const char *host, for (i = 0; s[i] != '\0'; i++) if (s[i] < 32) s[i] = ' '; - for (--i; ((i > 0) && (s[i] == ' ')); - i--) + for (--i; ((i > 0) && (s[i] == ' ')); + i--) s[i] = '\0'; fprintf(fp, "%s\n", p->urld.title); } else @@ -740,8 +339,8 @@ void urldb_write_paths(const struct path_data *parent, const char *host, /* Now, find next node to process. */ while (p != parent) { - int seglen = p->segment != NULL - ? strlen(p->segment) : 0; + int seglen = p->segment != NULL + ? strlen(p->segment) : 0; /* Remove our segment from the path */ *path_used -= seglen; @@ -764,522 +363,319 @@ void urldb_write_paths(const struct path_data *parent, const char *host, } while (p != parent); } -/** - * Set the cross-session persistence of the entry for an URL - * - * \param url Absolute URL to persist - * \param persist True to persist, false otherwise - */ -void urldb_set_url_persistence(nsurl *url, bool persist) -{ - struct path_data *p; - - assert(url); - - p = urldb_find_url(url); - if (!p) - return; - - p->persistent = persist; -} - -/** - * Insert an URL into the database - * - * \param url Absolute URL to insert - * \return true on success, false otherwise - */ -bool urldb_add_url(nsurl *url) -{ - struct host_part *h; - struct path_data *p; - lwc_string *scheme; - lwc_string *port; - lwc_string *host; - lwc_string *fragment; - const char *host_str; - char *path_query = NULL; - size_t len; - bool match; - unsigned int port_int; - - assert(url); - - if (url_bloom == NULL) - url_bloom = bloom_create(BLOOM_SIZE); - - if (url_bloom != NULL) { - uint32_t hash = nsurl_hash(url); - bloom_insert_hash(url_bloom, hash); - } - - /* Copy and merge path/query strings */ - if (nsurl_get(url, NSURL_PATH | NSURL_QUERY, &path_query, &len) != - 
NSERROR_OK) { - return false; - } - assert(path_query != NULL); - - scheme = nsurl_get_component(url, NSURL_SCHEME); - if (scheme == NULL) { - free(path_query); - return false; - } - - host = nsurl_get_component(url, NSURL_HOST); - if (host != NULL) { - host_str = lwc_string_data(host); - lwc_string_unref(host); - - } else if (lwc_string_isequal(scheme, corestring_lwc_file, &match) == - lwc_error_ok && match == true) { - host_str = "localhost"; - - } else { - lwc_string_unref(scheme); - free(path_query); - return false; - } - - fragment = nsurl_get_component(url, NSURL_FRAGMENT); - - port = nsurl_get_component(url, NSURL_PORT); - if (port != NULL) { - port_int = atoi(lwc_string_data(port)); - lwc_string_unref(port); - } else { - port_int = 0; - } - - /* Get host entry */ - h = urldb_add_host(host_str); - - /* Get path entry */ - p = (h != NULL) ? urldb_add_path(scheme, port_int, h, path_query, - fragment, url) : NULL; - - lwc_string_unref(scheme); - if (fragment != NULL) - lwc_string_unref(fragment); - - return (p != NULL); -} /** - * Set an URL's title string, replacing any existing one + * Count number of URLs associated with a host * - * \param url The URL to look for - * \param title The title string to use (copied) + * \param root Root of path data tree + * \param expiry Expiry time for URLs + * \param count Pointer to count */ -void urldb_set_url_title(nsurl *url, const char *title) +static void urldb_count_urls(const struct path_data *root, time_t expiry, + unsigned int *count) { - struct path_data *p; - char *temp; - - assert(url && title); + const struct path_data *p = root; - p = urldb_find_url(url); - if (!p) - return; + do { + if (p->children != NULL) { + /* Drill down into children */ + p = p->children; + } else { + /* No more children, increment count if required */ + if (p->persistent || ((p->urld.last_visit > expiry) && + (p->urld.visits > 0))) { + (*count)++; + } - temp = strdup(title); - if (!temp) - return; + /* Now, find next node to process. 
*/ + while (p != root) { + if (p->next != NULL) { + /* Have a sibling, process that */ + p = p->next; + break; + } - free(p->urld.title); - p->urld.title = temp; + /* Ascend tree */ + p = p->parent; + } + } + } while (p != root); } -/** - * Set an URL's content type - * - * \param url The URL to look for - * \param type The type to set - */ -void urldb_set_url_content_type(nsurl *url, content_type type) -{ - struct path_data *p; - - assert(url); - - p = urldb_find_url(url); - if (!p) - return; - - p->urld.type = type; -} /** - * Update an URL's visit data + * Save a search (sub)tree * - * \param url The URL to update + * \param root Root of (sub)tree to save + * \param fp File to write to */ -void urldb_update_url_visit_data(nsurl *url) +static void urldb_save_search_tree(struct search_node *parent, FILE *fp) { - struct path_data *p; + char host[256]; + const struct host_part *h; + unsigned int path_count = 0; + char *path, *p, *end; + int path_alloc = 64, path_used = 1; + time_t expiry; - assert(url); + expiry = time(NULL) - ((60 * 60 * 24) * nsoption_int(expire_url)); - p = urldb_find_url(url); - if (!p) + if (parent == &empty) return; - p->urld.last_visit = time(NULL); - p->urld.visits++; -} - -/** - * Reset an URL's visit statistics - * - * \param url The URL to reset - */ -void urldb_reset_url_visit_data(nsurl *url) -{ - struct path_data *p; - - assert(url); + urldb_save_search_tree(parent->left, fp); - p = urldb_find_url(url); - if (!p) + path = malloc(path_alloc); + if (!path) return; - p->urld.last_visit = (time_t)0; - p->urld.visits = 0; -} + path[0] = '\0'; + for (h = parent->data, p = host, end = host + sizeof host; + h && h != &db_root && p < end; h = h->parent) { + int written = snprintf(p, end - p, "%s%s", h->part, + (h->parent && h->parent->parent) ? "." : ""); + if (written < 0) { + free(path); + return; + } + p += written; + } -/** - * Find data for an URL. 
- * - * \param url Absolute URL to look for - * \return Pointer to result struct, or NULL - */ -const struct url_data *urldb_get_url_data(nsurl *url) -{ - struct path_data *p; - struct url_internal_data *u; + urldb_count_urls(&parent->data->paths, expiry, &path_count); - assert(url); + if (path_count > 0) { + fprintf(fp, "%s\n%i\n", host, path_count); - p = urldb_find_url(url); - if (!p) - return NULL; + urldb_write_paths(&parent->data->paths, host, fp, + &path, &path_alloc, &path_used, expiry); + } - u = &p->urld; + free(path); - return (const struct url_data *) u; + urldb_save_search_tree(parent->right, fp); } -/** - * Extract an URL from the db - * - * \param url URL to extract - * \return Pointer to database's copy of URL or NULL if not found - */ -nsurl *urldb_get_url(nsurl *url) -{ - struct path_data *p; - - assert(url); - - p = urldb_find_url(url); - if (!p) - return NULL; - - return p->url; -} /** - * Look up authentication details in database + * Path data iterator (internal) * - * \param url Absolute URL to search for - * \param realm When non-NULL, it is realm which can be used to determine - * the protection space when that's not been done before for given URL. 
- * \return Pointer to authentication details, or NULL if not found + * \param parent Root of subtree to iterate over + * \param url_callback Callback function + * \param cookie_callback Callback function + * \return true to continue, false otherwise */ -const char *urldb_get_auth_details(nsurl *url, const char *realm) +static bool urldb_iterate_entries_path(const struct path_data *parent, + bool (*url_callback)(nsurl *url, const struct url_data *data), + bool (*cookie_callback)(const struct cookie_data *data)) { - struct path_data *p, *p_cur, *p_top; + const struct path_data *p = parent; + const struct cookie_data *c; - assert(url); + do { + if (p->children != NULL) { + /* Drill down into children */ + p = p->children; + } else { + /* All leaf nodes in the path tree should have an URL or + * cookies attached to them. If this is not the case, it + * indicates that there's a bug in the file loader/URL + * insertion code. Therefore, assert this here. */ + assert(url_callback || cookie_callback); - /* add to the db, so our lookup will work */ - urldb_add_url(url); + /** \todo handle fragments? */ + if (url_callback) { + const struct url_internal_data *u = &p->urld; - p = urldb_find_url(url); - if (!p) - return NULL; + assert(p->url); - /* Check for any auth details attached to the path_data node or any of - * its parents. */ - for (p_cur = p; p_cur != NULL; p_top = p_cur, p_cur = p_cur->parent) { - if (p_cur->prot_space) { - return p_cur->prot_space->auth; - } - } + if (!url_callback(p->url, + (const struct url_data *) u)) + return false; + } else { + c = (const struct cookie_data *)p->cookies; + for (; c != NULL; c = c->next) { + if (!cookie_callback(c)) + return false; + } + } - /* Only when we have a realm (and canonical root of given URL), we can - * uniquely locate the protection space. */ - if (realm != NULL) { - const struct host_part *h = (const struct host_part *)p_top; - const struct prot_space_data *space; - bool match; + /* Now, find next node to process. 
*/ + while (p != parent) { + if (p->next != NULL) { + /* Have a sibling, process that */ + p = p->next; + break; + } - /* Search for a possible matching protection space. */ - for (space = h->prot_space; space != NULL; - space = space->next) { - if (!strcmp(space->realm, realm) && - lwc_string_isequal(space->scheme, - p->scheme, &match) == - lwc_error_ok && - match == true && - space->port == p->port) { - p->prot_space = space; - return p->prot_space->auth; + /* Ascend tree */ + p = p->parent; } } - } + } while (p != parent); - return NULL; + return true; } -/** - * Retrieve certificate verification permissions from database - * - * \param url Absolute URL to search for - * \return true to permit connections to hosts with invalid certificates, - * false otherwise. - */ -bool urldb_get_cert_permissions(nsurl *url) -{ - struct path_data *p; - const struct host_part *h; - - assert(url); - - p = urldb_find_url(url); - if (!p) - return false; - - for (; p && p->parent; p = p->parent) - /* do nothing */; - assert(p); - - h = (const struct host_part *)p; - - return h->permit_invalid_certs; -} /** - * Set authentication data for an URL + * Check whether a host string is an IP address. * - * \param url The URL to consider - * \param realm The authentication realm - * \param auth The authentication details (in form username:password) + * This call detects IPv4 addresses (all of dotted-quad or subsets, + * decimal or hexadecimal notations) and IPv6 addresses (including + * those containing embedded IPv4 addresses.) 
+ * + * \param host a hostname terminated by '\0' + * \return true if the hostname is an IP address, false otherwise */ -void urldb_set_auth_details(nsurl *url, const char *realm, - const char *auth) +static bool urldb__host_is_ip_address(const char *host) { - struct path_data *p, *pi; - struct host_part *h; - struct prot_space_data *space, *space_alloc; - char *realm_alloc, *auth_alloc; - bool match; - - assert(url && realm && auth); - - /* add url, in case it's missing */ - urldb_add_url(url); - - p = urldb_find_url(url); - - if (!p) - return; - - /* Search for host_part */ - for (pi = p; pi->parent != NULL; pi = pi->parent) - ; - h = (struct host_part *)pi; - - /* Search if given URL belongs to a protection space we already know of. */ - for (space = h->prot_space; space; space = space->next) { - if (!strcmp(space->realm, realm) && - lwc_string_isequal(space->scheme, p->scheme, - &match) == lwc_error_ok && - match == true && - space->port == p->port) - break; - } + struct in_addr ipv4; + size_t host_len = strlen(host); + const char *sane_host; + const char *slash; +#ifndef NO_IPV6 + struct in6_addr ipv6; + char ipv6_addr[64]; +#endif + /** @todo FIXME Some parts of urldb.c make confusions between hosts + * and "prefixes", we can sometimes be erroneously passed more than + * just a host. Sometimes we may be passed trailing slashes, or even + * whole path segments. A specific criminal in this class is + * urldb_iterate_partial, which takes a prefix to search for, but + * passes that prefix to functions that expect only hosts. + * + * For the time being, we will accept such calls; we check if there + * is a / in the host parameter, and if there is, we take a copy and + * replace the / with a \0. This is not a permanent solution; we + * should search through NetSurf and find all the callers that are + * in error and fix them. 
When doing this task, it might be wise + * to replace the hideousness below with code that doesn't have to do + * this, and add assert(strchr(host, '/') == NULL); somewhere. + * -- rjek - 2010-11-04 + */ - if (space != NULL) { - /* Overrule existing auth. */ - free(space->auth); - space->auth = strdup(auth); + slash = strchr(host, '/'); + if (slash == NULL) { + sane_host = host; } else { - /* Create a new protection space. */ - space = space_alloc = malloc(sizeof(struct prot_space_data)); - realm_alloc = strdup(realm); - auth_alloc = strdup(auth); - - if (!space_alloc || !realm_alloc || !auth_alloc) { - free(space_alloc); - free(realm_alloc); - free(auth_alloc); - return; - } - - space->scheme = lwc_string_ref(p->scheme); - space->port = p->port; - space->realm = realm_alloc; - space->auth = auth_alloc; - space->next = h->prot_space; - h->prot_space = space; + char *c = strdup(host); + c[slash - host] = '\0'; + sane_host = c; + host_len = slash - host - 1; + LOG(("WARNING: called with non-host '%s'", host)); } - p->prot_space = space; -} - -/** - * Set certificate verification permissions - * - * \param url URL to consider - * \param permit Set to true to allow invalid certificates - */ -void urldb_set_cert_permissions(nsurl *url, bool permit) -{ - struct path_data *p; - struct host_part *h; - - assert(url); - - /* add url, in case it's missing */ - urldb_add_url(url); - - p = urldb_find_url(url); - if (!p) - return; + if (strspn(sane_host, "0123456789abcdefABCDEF[].:") < host_len) + goto out_false; - for (; p && p->parent; p = p->parent) - /* do nothing */; - assert(p); + if (inet_aton(sane_host, &ipv4) != 0) { + /* This can only be a sane IPv4 address if it contains 3 dots. + * Helpfully, inet_aton is happy to treat "a", "a.b", "a.b.c", + * and "a.b.c.d" as valid IPv4 address strings where we only + * support the full, dotted-quad, form. 
+ */ + int num_dots = 0; + size_t index; - h = (struct host_part *)p; + for (index = 0; index < host_len; index++) { + if (sane_host[index] == '.') + num_dots++; + } - h->permit_invalid_certs = permit; -} + if (num_dots == 3) + goto out_true; + else + goto out_false; + } -/** - * Set thumbnail for url, replacing any existing thumbnail - * - * \param url Absolute URL to consider - * \param bitmap Opaque pointer to thumbnail data, or NULL to invalidate - */ -void urldb_set_thumbnail(nsurl *url, struct bitmap *bitmap) -{ - struct path_data *p; +#ifndef NO_IPV6 + if (sane_host[0] != '[' || sane_host[host_len] != ']') + goto out_false; - assert(url); + strncpy(ipv6_addr, sane_host + 1, sizeof(ipv6_addr)); + ipv6_addr[sizeof(ipv6_addr) - 1] = '\0'; - p = urldb_find_url(url); - if (!p) - return; + if (inet_pton(AF_INET6, ipv6_addr, &ipv6) == 1) + goto out_true; +#endif - if (p->thumb && p->thumb != bitmap) - bitmap_destroy(p->thumb); +out_false: + if (slash != NULL) free((void *)sane_host); + return false; - p->thumb = bitmap; +out_true: + if (slash != NULL) free((void *)sane_host); + return true; } -/** - * Retrieve thumbnail data for given URL - * - * \param url Absolute URL to search for - * \return Pointer to thumbnail data, or NULL if not found. - */ -struct bitmap *urldb_get_thumbnail(nsurl *url) -{ - struct path_data *p; - - assert(url); - - p = urldb_find_url(url); - if (!p) - return NULL; - - return p->thumb; -} /** - * Iterate over entries in the database which match the given prefix + * Compare host_part with prefix * - * \param prefix Prefix to match - * \param callback Callback function + * \param a + * \param b + * \return 0 if match, non-zero, otherwise */ -void urldb_iterate_partial(const char *prefix, - bool (*callback)(nsurl *url, - const struct url_data *data)) +static int urldb_search_match_prefix(const struct host_part *a, const char *b) { - char host[256]; - char buf[260]; /* max domain + "www." 
*/ - const char *slash, *scheme_sep; - struct search_node *tree; - const struct host_part *h; - - assert(prefix && callback); - - /* strip scheme */ - scheme_sep = strstr(prefix, "://"); - if (scheme_sep) - prefix = scheme_sep + 3; + const char *end, *dot; + int plen, ret; - slash = strchr(prefix, '/'); - tree = urldb_get_search_tree(prefix); + assert(a && a != &db_root && b); - if (slash) { - /* if there's a slash in the input, then we can - * assume that we're looking for a path */ - snprintf(host, sizeof host, "%.*s", - (int) (slash - prefix), prefix); + if (urldb__host_is_ip_address(b)) { + /* IP address */ + return strncasecmp(a->part, b, strlen(b)); + } - h = urldb_search_find(tree, host); - if (!h) { - int len = slash - prefix; + end = b + strlen(b) + 1; - if (len <= 3 || strncasecmp(host, "www.", 4) != 0) { - snprintf(buf, sizeof buf, "www.%s", host); - h = urldb_search_find( - search_trees[ST_DN + 'w' - 'a'], - buf); - if (!h) - return; - } else - return; + while (b < end && a && a != &db_root) { + dot = strchr(b, '.'); + if (!dot) { + /* last segment */ + dot = end - 1; } - if (h->paths.children) { - /* Have paths, iterate them */ - urldb_iterate_partial_path(&h->paths, slash + 1, - callback); + /* Compare strings (length limited) */ + if ((ret = strncasecmp(a->part, b, dot - b)) != 0) + /* didn't match => return difference */ + return ret; + + /* The strings matched */ + if (dot < end - 1) { + /* Consider segment lengths only in the case + * where the prefix contains segments */ + plen = strlen(a->part); + if (plen > dot - b) + /* len(a) > len(b) */ + return 1; + else if (plen < dot - b) + /* len(a) < len(b) */ + return -1; } - } else { - int len = strlen(prefix); + b = dot + 1; + a = a->parent; + } - /* looking for hosts */ - if (!urldb_iterate_partial_host(tree, prefix, callback)) - return; + /* If we get here then either: + * a) The path lengths differ + * or b) The hosts are identical + */ + if (a && a != &db_root && b >= end) + /* len(a) > len(b) => 
prefix matches */ + return 0; + else if ((!a || a == &db_root) && b < end) + /* len(a) < len(b) => prefix does not match */ + return -1; - if (len <= 3 || strncasecmp(prefix, "www.", 4) != 0) { - /* now look for www.prefix */ - snprintf(buf, sizeof buf, "www.%s", prefix); - if(!urldb_iterate_partial_host( - search_trees[ST_DN + 'w' - 'a'], - buf, callback)) - return; - } - } + /* Identical */ + return 0; } + /** * Partial host iterator (internal) * @@ -1288,7 +684,9 @@ void urldb_iterate_partial(const char *prefix, * \param callback Callback function * \return true to continue, false otherwise */ -bool urldb_iterate_partial_host(struct search_node *root, const char *prefix, +static bool +urldb_iterate_partial_host(struct search_node *root, + const char *prefix, bool (*callback)(nsurl *url, const struct url_data *data)) { int c; @@ -1330,6 +728,7 @@ bool urldb_iterate_partial_host(struct search_node *root, const char *prefix, return true; } + /** * Partial path iterator (internal) * @@ -1338,14 +737,14 @@ bool urldb_iterate_partial_host(struct search_node *root, const char *prefix, * \param callback Callback function * \return true to continue, false otherwise */ -bool urldb_iterate_partial_path(const struct path_data *parent, +static bool urldb_iterate_partial_path(const struct path_data *parent, const char *prefix, bool (*callback)(nsurl *url, const struct url_data *data)) { const struct path_data *p = parent->children; const char *slash, *end = prefix + strlen(prefix); - /* + /* * Given: http://www.example.org/a/b/c/d//e * and assuming a path tree: * . 
@@ -1387,13 +786,13 @@ bool urldb_iterate_partial_path(const struct path_data *parent, prefix++; continue; } - + if (strncasecmp(p->segment, prefix, slash - prefix) == 0) { /* prefix matches so far */ if (slash == end) { /* we've run out of prefix, so all * paths below this one match */ - if (!urldb_iterate_entries_path(p, callback, + if (!urldb_iterate_entries_path(p, callback, NULL)) return false; @@ -1414,42 +813,6 @@ bool urldb_iterate_partial_path(const struct path_data *parent, return true; } -/** - * Iterate over all entries in database - * - * \param callback Function to callback for each entry - */ -void urldb_iterate_entries(bool (*callback)(nsurl *url, - const struct url_data *data)) -{ - int i; - - assert(callback); - - for (i = 0; i < NUM_SEARCH_TREES; i++) { - if (!urldb_iterate_entries_host(search_trees[i], - callback, NULL)) - break; - } -} - -/** - * Iterate over all cookies in database - * - * \param callback Function to callback for each entry - */ -void urldb_iterate_cookies(bool (*callback)(const struct cookie_data *data)) -{ - int i; - - assert(callback); - - for (i = 0; i < NUM_SEARCH_TREES; i++) { - if (!urldb_iterate_entries_host(search_trees[i], - NULL, callback)) - break; - } -} /** * Host data iterator (internal) @@ -1459,7 +822,7 @@ void urldb_iterate_cookies(bool (*callback)(const struct cookie_data *data)) * \param cookie_callback Callback function * \return true to continue, false otherwise */ -bool urldb_iterate_entries_host(struct search_node *parent, +static bool urldb_iterate_entries_host(struct search_node *parent, bool (*url_callback)(nsurl *url, const struct url_data *data), bool (*cookie_callback)(const struct cookie_data *data)) @@ -1487,65 +850,6 @@ bool urldb_iterate_entries_host(struct search_node *parent, return true; } -/** - * Path data iterator (internal) - * - * \param parent Root of subtree to iterate over - * \param url_callback Callback function - * \param cookie_callback Callback function - * \return true to 
continue, false otherwise - */ -bool urldb_iterate_entries_path(const struct path_data *parent, - bool (*url_callback)(nsurl *url, - const struct url_data *data), - bool (*cookie_callback)(const struct cookie_data *data)) -{ - const struct path_data *p = parent; - const struct cookie_data *c; - - do { - if (p->children != NULL) { - /* Drill down into children */ - p = p->children; - } else { - /* All leaf nodes in the path tree should have an URL or - * cookies attached to them. If this is not the case, it - * indicates that there's a bug in the file loader/URL - * insertion code. Therefore, assert this here. */ - assert(url_callback || cookie_callback); - - /** \todo handle fragments? */ - if (url_callback) { - const struct url_internal_data *u = &p->urld; - - assert(p->url); - - if (!url_callback(p->url, - (const struct url_data *) u)) - return false; - } else { - c = (const struct cookie_data *)p->cookies; - for (; c != NULL; c = c->next) - if (!cookie_callback(c)) - return false; - } - - /* Now, find next node to process. */ - while (p != parent) { - if (p->next != NULL) { - /* Have a sibling, process that */ - p = p->next; - break; - } - - /* Ascend tree */ - p = p->parent; - } - } - } while (p != parent); - - return true; -} /** * Add a host node to the tree @@ -1554,7 +858,7 @@ bool urldb_iterate_entries_path(const struct path_data *parent, * \param parent Parent node to add to * \return Pointer to added node, or NULL on memory exhaustion */ -struct host_part *urldb_add_host_node(const char *part, +static struct host_part *urldb_add_host_node(const char *part, struct host_part *parent) { struct host_part *d; @@ -1582,186 +886,58 @@ struct host_part *urldb_add_host_node(const char *part, /** - * Check whether a host string is an IP address. - * - * This call detects IPv4 addresses (all of dotted-quad or subsets, - * decimal or hexadecimal notations) and IPv6 addresses (including - * those containing embedded IPv4 addresses.) 
- * - * \param host a hostname terminated by '\0' - * \return true if the hostname is an IP address, false otherwise + * Fragment comparator callback for qsort */ -static bool urldb__host_is_ip_address(const char *host) +static int urldb_add_path_fragment_cmp(const void *a, const void *b) { - struct in_addr ipv4; - size_t host_len = strlen(host); - const char *sane_host; - const char *slash; -#ifndef NO_IPV6 - struct in6_addr ipv6; - char ipv6_addr[64]; -#endif - /** @todo FIXME Some parts of urldb.c make confusions between hosts - * and "prefixes", we can sometimes be erroneously passed more than - * just a host. Sometimes we may be passed trailing slashes, or even - * whole path segments. A specific criminal in this class is - * urldb_iterate_partial, which takes a prefix to search for, but - * passes that prefix to functions that expect only hosts. - * - * For the time being, we will accept such calls; we check if there - * is a / in the host parameter, and if there is, we take a copy and - * replace the / with a \0. This is not a permanent solution; we - * should search through NetSurf and find all the callers that are - * in error and fix them. When doing this task, it might be wise - * to replace the hideousness below with code that doesn't have to do - * this, and add assert(strchr(host, '/') == NULL); somewhere. - * -- rjek - 2010-11-04 - */ - - slash = strchr(host, '/'); - if (slash == NULL) { - sane_host = host; - } else { - char *c = strdup(host); - c[slash - host] = '\0'; - sane_host = c; - host_len = slash - host - 1; - LOG(("WARNING: called with non-host '%s'", host)); - } - - if (strspn(sane_host, "0123456789abcdefABCDEF[].:") < host_len) - goto out_false; - - if (inet_aton(sane_host, &ipv4) != 0) { - /* This can only be a sane IPv4 address if it contains 3 dots. - * Helpfully, inet_aton is happy to treat "a", "a.b", "a.b.c", - * and "a.b.c.d" as valid IPv4 address strings where we only - * support the full, dotted-quad, form. 
- */ - int num_dots = 0; - size_t index; - - for (index = 0; index < host_len; index++) { - if (sane_host[index] == '.') - num_dots++; - } - - if (num_dots == 3) - goto out_true; - else - goto out_false; - } - -#ifndef NO_IPV6 - if (sane_host[0] != '[' || sane_host[host_len] != ']') - goto out_false; - - strncpy(ipv6_addr, sane_host + 1, sizeof(ipv6_addr)); - ipv6_addr[sizeof(ipv6_addr) - 1] = '\0'; - - if (inet_pton(AF_INET6, ipv6_addr, &ipv6) == 1) - goto out_true; -#endif - -out_false: - if (slash != NULL) free((void *)sane_host); - return false; - -out_true: - if (slash != NULL) free((void *)sane_host); - return true; + return strcasecmp(*((const char **) a), *((const char **) b)); } /** - * Add a host to the database, creating any intermediate entries + * Add a fragment to a path segment * - * \param host Hostname to add - * \return Pointer to leaf node, or NULL on memory exhaustion + * \param segment Path segment to add to + * \param fragment Fragment to add (copied), or NULL + * \return segment or NULL on memory exhaustion */ -struct host_part *urldb_add_host(const char *host) +static struct path_data * +urldb_add_path_fragment(struct path_data *segment, lwc_string *fragment) { - struct host_part *d = (struct host_part *) &db_root, *e; - struct search_node *s; - char buf[256]; /* 256 bytes is sufficient - domain names are - * limited to 255 chars. 
*/ - char *part; - - assert(host); - - if (urldb__host_is_ip_address(host)) { - /* Host is an IP, so simply add as TLD */ + char **temp; - /* Check for existing entry */ - for (e = d->children; e; e = e->next) - if (strcasecmp(host, e->part) == 0) - /* found => return it */ - return e; + assert(segment); - d = urldb_add_host_node(host, d); + /* If no fragment, this function is a NOP + * This may seem strange, but it makes the rest + * of the code cleaner */ + if (!fragment) + return segment; - s = urldb_search_insert(search_trees[ST_IP], d); - if (!s) { - /* failed */ - d = NULL; - } else { - search_trees[ST_IP] = s; - } + temp = realloc(segment->fragment, + (segment->frag_cnt + 1) * sizeof(char *)); + if (!temp) + return NULL; - return d; + segment->fragment = temp; + segment->fragment[segment->frag_cnt] = + strdup(lwc_string_data(fragment)); + if (!segment->fragment[segment->frag_cnt]) { + /* Don't free temp - it's now our buffer */ + return NULL; } - /* Copy host string, so we can corrupt it */ - strncpy(buf, host, sizeof buf); - buf[sizeof buf - 1] = '\0'; - - /* Process FQDN segments backwards */ - do { - part = strrchr(buf, '.'); - if (!part) { - /* last segment */ - /* Check for existing entry */ - for (e = d->children; e; e = e->next) - if (strcasecmp(buf, e->part) == 0) - break; - - if (e) { - d = e; - } else { - d = urldb_add_host_node(buf, d); - } - - /* And insert into search tree */ - if (d) { - struct search_node **r; - - r = urldb_get_search_tree_direct(buf); - s = urldb_search_insert(*r, d); - if (!s) { - /* failed */ - d = NULL; - } else { - *r = s; - } - } - break; - } - - /* Check for existing entry */ - for (e = d->children; e; e = e->next) - if (strcasecmp(part + 1, e->part) == 0) - break; - - d = e ? 
e : urldb_add_host_node(part + 1, d); - if (!d) - break; + segment->frag_cnt++; - *part = '\0'; - } while (1); + /* We want fragments in alphabetical order, so sort them + * It may prove better to insert in alphabetical order instead */ + qsort(segment->fragment, segment->frag_cnt, sizeof (char *), + urldb_add_path_fragment_cmp); - return d; + return segment; } + /** * Add a path node to the tree * @@ -1772,7 +948,8 @@ struct host_part *urldb_add_host(const char *host) * \param parent Parent node to add to * \return Pointer to added node, or NULL on memory exhaustion */ -struct path_data *urldb_add_path_node(lwc_string *scheme, unsigned int port, +static struct path_data * +urldb_add_path_node(lwc_string *scheme, unsigned int port, const char *segment, lwc_string *fragment, struct path_data *parent) { @@ -1804,9 +981,10 @@ struct path_data *urldb_add_path_node(lwc_string *scheme, unsigned int port, } } - for (e = parent->children; e; e = e->next) + for (e = parent->children; e; e = e->next) { if (strcmp(e->segment, d->segment) > 0) break; + } if (e) { d->prev = e->prev; @@ -1830,143 +1008,193 @@ struct path_data *urldb_add_path_node(lwc_string *scheme, unsigned int port, return d; } + /** - * Add a path to the database, creating any intermediate entries + * Get the search tree for a particular host * - * \param scheme URL scheme associated with path - * \param port Port number on host associated with path - * \param host Host tree node to attach to - * \param path_query Absolute path plus query to add (freed) - * \param fragment URL fragment, or NULL - * \param url URL (fragment ignored) - * \return Pointer to leaf node, or NULL on memory exhaustion + * \param host the host to lookup + * \return the corresponding search tree */ -struct path_data *urldb_add_path(lwc_string *scheme, unsigned int port, - const struct host_part *host, char *path_query, - lwc_string *fragment, nsurl *url) +static struct search_node **urldb_get_search_tree_direct(const char *host) { - 
struct path_data *d, *e; - char *buf = path_query; - char *segment, *slash; - bool match; + assert(host); - assert(scheme && host && url); + if (urldb__host_is_ip_address(host)) + return &search_trees[ST_IP]; + else if (isalpha(*host)) + return &search_trees[ST_DN + tolower(*host) - 'a']; + return &search_trees[ST_EE]; +} - d = (struct path_data *) &host->paths; - /* skip leading '/' */ - segment = buf; - if (*segment == '/') - segment++; +/** + * Get the search tree for a particular host + * + * \param host the host to lookup + * \return the corresponding search tree + */ +static struct search_node *urldb_get_search_tree(const char *host) +{ + return *urldb_get_search_tree_direct(host); +} - /* Process path segments */ - do { - slash = strchr(segment, '/'); - if (!slash) { - /* last segment */ - /* look for existing entry */ - for (e = d->children; e; e = e->next) - if (strcmp(segment, e->segment) == 0 && - lwc_string_isequal(scheme, - e->scheme, &match) == - lwc_error_ok && - match == true && - e->port == port) - break; - d = e ? urldb_add_path_fragment(e, fragment) : - urldb_add_path_node(scheme, port, - segment, fragment, d); - break; - } +/** + * Compare host_part with a string + * + * \param a + * \param b + * \return 0 if match, non-zero, otherwise + */ +static int urldb_search_match_string(const struct host_part *a, const char *b) +{ + const char *end, *dot; + int plen, ret; - *slash = '\0'; + assert(a && a != &db_root && b); - /* look for existing entry */ - for (e = d->children; e; e = e->next) - if (strcmp(segment, e->segment) == 0 && - lwc_string_isequal(scheme, e->scheme, - &match) == lwc_error_ok && - match == true && - e->port == port) - break; + if (urldb__host_is_ip_address(b)) { + /* IP address */ + return strcasecmp(a->part, b); + } - d = e ? 
e : urldb_add_path_node(scheme, port, segment, NULL, d); - if (!d) - break; + end = b + strlen(b) + 1; - segment = slash + 1; - } while (1); + while (b < end && a && a != &db_root) { + dot = strchr(b, '.'); + if (!dot) { + /* last segment */ + dot = end - 1; + } - free(path_query); + /* Compare strings (length limited) */ + if ((ret = strncasecmp(a->part, b, dot - b)) != 0) + /* didn't match => return difference */ + return ret; - if (d && !d->url) { - /* Insert defragmented URL */ - if (nsurl_defragment(url, &d->url) != NSERROR_OK) - return NULL; + /* The strings matched, now check that the lengths do, too */ + plen = strlen(a->part); + + if (plen > dot - b) + /* len(a) > len(b) */ + return 1; + else if (plen < dot - b) + /* len(a) < len(b) */ + return -1; + + b = dot + 1; + a = a->parent; } - return d; + /* If we get here then either: + * a) The path lengths differ + * or b) The hosts are identical + */ + if (a && a != &db_root && b >= end) + /* len(a) > len(b) */ + return 1; + else if ((!a || a == &db_root) && b < end) + /* len(a) < len(b) */ + return -1; + + /* Identical */ + return 0; } + /** - * Fragment comparator callback for qsort + * Find a node in a search tree + * + * \param root Tree to look in + * \param host Host to find + * \return Pointer to host tree node, or NULL if not found */ -int urldb_add_path_fragment_cmp(const void *a, const void *b) +static const struct host_part * +urldb_search_find(struct search_node *root, const char *host) { - return strcasecmp(*((const char **) a), *((const char **) b)); + int c; + + assert(root && host); + + if (root == &empty) { + return NULL; + } + + c = urldb_search_match_string(root->data, host); + + if (c > 0) + return urldb_search_find(root->left, host); + else if (c < 0) + return urldb_search_find(root->right, host); + else + return root->data; } + /** - * Add a fragment to a path segment + * Match a path string * - * \param segment Path segment to add to - * \param fragment Fragment to add (copied), or NULL 
- * \return segment or NULL on memory exhaustion + * \param parent Path (sub)tree to look in + * \param path The path to search for + * \param scheme The URL scheme associated with the path + * \param port The port associated with the path + * \return Pointer to path data or NULL if not found. */ -struct path_data *urldb_add_path_fragment(struct path_data *segment, - lwc_string *fragment) +static struct path_data *urldb_match_path(const struct path_data *parent, + const char *path, lwc_string *scheme, unsigned short port) { - char **temp; + const struct path_data *p; + const char *slash; + bool match; - assert(segment); + assert(parent != NULL); + assert(parent->segment == NULL); - /* If no fragment, this function is a NOP - * This may seem strange, but it makes the rest - * of the code cleaner */ - if (!fragment) - return segment; + if (path[0] != '/') { + LOG(("path is %s", path)); + } - temp = realloc(segment->fragment, - (segment->frag_cnt + 1) * sizeof(char *)); - if (!temp) - return NULL; + assert(path[0] == '/'); - segment->fragment = temp; - segment->fragment[segment->frag_cnt] = - strdup(lwc_string_data(fragment)); - if (!segment->fragment[segment->frag_cnt]) { - /* Don't free temp - it's now our buffer */ - return NULL; - } + /* Start with children, as parent has no segment */ + p = parent->children; - segment->frag_cnt++; + while (p != NULL) { + slash = strchr(path + 1, '/'); + if (!slash) + slash = path + strlen(path); - /* We want fragments in alphabetical order, so sort them - * It may prove better to insert in alphabetical order instead */ - qsort(segment->fragment, segment->frag_cnt, sizeof (char *), - urldb_add_path_fragment_cmp); + if (strncmp(p->segment, path + 1, slash - path - 1) == 0 && + lwc_string_isequal(p->scheme, scheme, &match) == + lwc_error_ok && + match == true && + p->port == port) { + if (*slash == '\0') { + /* Complete match */ + return (struct path_data *) p; + } - return segment; + /* Match so far, go down tree */ + p = 
p->children; + + path = slash; + } else { + /* No match, try next sibling */ + p = p->next; + } + } + + return NULL; } + /** * Find an URL in the database * * \param url Absolute URL to find * \return Pointer to path data, or NULL if not found */ -struct path_data *urldb_find_url(nsurl *url) +static struct path_data *urldb_find_url(nsurl *url) { const struct host_part *h; struct path_data *p; @@ -1979,7 +1207,7 @@ struct path_data *urldb_find_url(nsurl *url) bool match; assert(url); - + if (url_bloom != NULL) { if (bloom_search_hash(url_bloom, nsurl_hash(url)) == false) { @@ -2036,102 +1264,49 @@ struct path_data *urldb_find_url(nsurl *url) return p; } + /** - * Match a path string + * Dump URL database paths to stderr * - * \param parent Path (sub)tree to look in - * \param path The path to search for - * \param scheme The URL scheme associated with the path - * \param port The port associated with the path - * \return Pointer to path data or NULL if not found. + * \param parent Parent node of tree to dump */ -struct path_data *urldb_match_path(const struct path_data *parent, - const char *path, lwc_string *scheme, unsigned short port) +static void urldb_dump_paths(struct path_data *parent) { - const struct path_data *p; - const char *slash; - bool match; - - assert(parent != NULL); - assert(parent->segment == NULL); - assert(path[0] == '/'); + const struct path_data *p = parent; + unsigned int i; - /* Start with children, as parent has no segment */ - p = parent->children; + do { + if (p->segment != NULL) { + LOG(("\t%s : %u", lwc_string_data(p->scheme), p->port)); - while (p != NULL) { - slash = strchr(path + 1, '/'); - if (!slash) - slash = path + strlen(path); + LOG(("\t\t'%s'", p->segment)); - if (strncmp(p->segment, path + 1, slash - path - 1) == 0 && - lwc_string_isequal(p->scheme, scheme, &match) == - lwc_error_ok && - match == true && - p->port == port) { - if (*slash == '\0') { - /* Complete match */ - return (struct path_data *) p; - } + for (i = 0; i 
!= p->frag_cnt; i++) + LOG(("\t\t\t#%s", p->fragment[i])); + } - /* Match so far, go down tree */ + if (p->children != NULL) { p = p->children; - - path = slash; } else { - /* No match, try next sibling */ - p = p->next; - } - } - - return NULL; -} - -/** - * Get the search tree for a particular host - * - * \param host the host to lookup - * \return the corresponding search tree - */ -struct search_node **urldb_get_search_tree_direct(const char *host) { - assert(host); - - if (urldb__host_is_ip_address(host)) - return &search_trees[ST_IP]; - else if (isalpha(*host)) - return &search_trees[ST_DN + tolower(*host) - 'a']; - return &search_trees[ST_EE]; -} + while (p != parent) { + if (p->next != NULL) { + p = p->next; + break; + } -/** - * Get the search tree for a particular host - * - * \param host the host to lookup - * \return the corresponding search tree - */ -struct search_node *urldb_get_search_tree(const char *host) { - return *urldb_get_search_tree_direct(host); + p = p->parent; + } + } + } while (p != parent); } -/** - * Dump URL database to stderr - */ -void urldb_dump(void) -{ - int i; - - urldb_dump_hosts(&db_root); - - for (i = 0; i != NUM_SEARCH_TREES; i++) - urldb_dump_search(search_trees[i], 0); -} /** * Dump URL database hosts to stderr * * \param parent Parent node of tree to dump */ -void urldb_dump_hosts(struct host_part *parent) +static void urldb_dump_hosts(struct host_part *parent) { struct host_part *h; @@ -2150,40 +1325,6 @@ void urldb_dump_hosts(struct host_part *parent) urldb_dump_hosts(h); } -/** - * Dump URL database paths to stderr - * - * \param parent Parent node of tree to dump - */ -void urldb_dump_paths(struct path_data *parent) -{ - const struct path_data *p = parent; - unsigned int i; - - do { - if (p->segment != NULL) { - LOG(("\t%s : %u", lwc_string_data(p->scheme), p->port)); - - LOG(("\t\t'%s'", p->segment)); - - for (i = 0; i != p->frag_cnt; i++) - LOG(("\t\t\t#%s", p->fragment[i])); - } - - if (p->children != NULL) { - p = 
p->children; - } else { - while (p != parent) { - if (p->next != NULL) { - p = p->next; - break; - } - - p = p->parent; - } - } - } while (p != parent); -} /** * Dump search tree @@ -2191,7 +1332,7 @@ void urldb_dump_paths(struct path_data *parent) * \param parent Parent node of tree to dump * \param depth Tree depth */ -void urldb_dump_search(struct search_node *parent, int depth) +static void urldb_dump_search(struct search_node *parent, int depth) { const struct host_part *h; int i; @@ -2217,96 +1358,6 @@ void urldb_dump_search(struct search_node *parent, int depth) urldb_dump_search(parent->right, depth + 1); } -/** - * Insert a node into the search tree - * - * \param root Root of tree to insert into - * \param data User data to insert - * \return Pointer to updated root, or NULL if failed - */ -struct search_node *urldb_search_insert(struct search_node *root, - const struct host_part *data) -{ - struct search_node *n; - - assert(root && data); - - n = malloc(sizeof(struct search_node)); - if (!n) - return NULL; - - n->level = 1; - n->data = data; - n->left = n->right = ∅ - - root = urldb_search_insert_internal(root, n); - - return root; -} - -/** - * Insert node into search tree - * - * \param root Root of (sub)tree to insert into - * \param n Node to insert - * \return Pointer to updated root - */ -struct search_node *urldb_search_insert_internal(struct search_node *root, - struct search_node *n) -{ - assert(root && n); - - if (root == &empty) { - root = n; - } else { - int c = urldb_search_match_host(root->data, n->data); - - if (c > 0) { - root->left = urldb_search_insert_internal( - root->left, n); - } else if (c < 0) { - root->right = urldb_search_insert_internal( - root->right, n); - } else { - /* exact match */ - free(n); - return root; - } - - root = urldb_search_skew(root); - root = urldb_search_split(root); - } - - return root; -} - -/** - * Find a node in a search tree - * - * \param root Tree to look in - * \param host Host to find - * \return 
Pointer to host tree node, or NULL if not found - */ -const struct host_part *urldb_search_find(struct search_node *root, - const char *host) -{ - int c; - - assert(root && host); - - if (root == &empty) { - return NULL; - } - - c = urldb_search_match_string(root->data, host); - - if (c > 0) - return urldb_search_find(root->left, host); - else if (c < 0) - return urldb_search_find(root->right, host); - else - return root->data; -} /** * Compare a pair of host_parts @@ -2315,8 +1366,8 @@ const struct host_part *urldb_search_find(struct search_node *root, * \param b * \return 0 if match, non-zero, otherwise */ -int urldb_search_match_host(const struct host_part *a, - const struct host_part *b) +static int +urldb_search_match_host(const struct host_part *a, const struct host_part *b) { int ret; @@ -2344,134 +1395,6 @@ int urldb_search_match_host(const struct host_part *a, return 0; } -/** - * Compare host_part with a string - * - * \param a - * \param b - * \return 0 if match, non-zero, otherwise - */ -int urldb_search_match_string(const struct host_part *a, - const char *b) -{ - const char *end, *dot; - int plen, ret; - - assert(a && a != &db_root && b); - - if (urldb__host_is_ip_address(b)) { - /* IP address */ - return strcasecmp(a->part, b); - } - - end = b + strlen(b) + 1; - - while (b < end && a && a != &db_root) { - dot = strchr(b, '.'); - if (!dot) { - /* last segment */ - dot = end - 1; - } - - /* Compare strings (length limited) */ - if ((ret = strncasecmp(a->part, b, dot - b)) != 0) - /* didn't match => return difference */ - return ret; - - /* The strings matched, now check that the lengths do, too */ - plen = strlen(a->part); - - if (plen > dot - b) - /* len(a) > len(b) */ - return 1; - else if (plen < dot - b) - /* len(a) < len(b) */ - return -1; - - b = dot + 1; - a = a->parent; - } - - /* If we get here then either: - * a) The path lengths differ - * or b) The hosts are identical - */ - if (a && a != &db_root && b >= end) - /* len(a) > len(b) */ - 
return 1; - else if ((!a || a == &db_root) && b < end) - /* len(a) < len(b) */ - return -1; - - /* Identical */ - return 0; -} - -/** - * Compare host_part with prefix - * - * \param a - * \param b - * \return 0 if match, non-zero, otherwise - */ -int urldb_search_match_prefix(const struct host_part *a, - const char *b) -{ - const char *end, *dot; - int plen, ret; - - assert(a && a != &db_root && b); - - if (urldb__host_is_ip_address(b)) { - /* IP address */ - return strncasecmp(a->part, b, strlen(b)); - } - - end = b + strlen(b) + 1; - - while (b < end && a && a != &db_root) { - dot = strchr(b, '.'); - if (!dot) { - /* last segment */ - dot = end - 1; - } - - /* Compare strings (length limited) */ - if ((ret = strncasecmp(a->part, b, dot - b)) != 0) - /* didn't match => return difference */ - return ret; - - /* The strings matched */ - if (dot < end - 1) { - /* Consider segment lengths only in the case - * where the prefix contains segments */ - plen = strlen(a->part); - if (plen > dot - b) - /* len(a) > len(b) */ - return 1; - else if (plen < dot - b) - /* len(a) < len(b) */ - return -1; - } - - b = dot + 1; - a = a->parent; - } - - /* If we get here then either: - * a) The path lengths differ - * or b) The hosts are identical - */ - if (a && a != &db_root && b >= end) - /* len(a) > len(b) => prefix matches */ - return 0; - else if ((!a || a == &db_root) && b < end) - /* len(a) < len(b) => prefix does not match */ - return -1; - - /* Identical */ - return 0; -} /** * Rotate a subtree right @@ -2479,7 +1402,7 @@ int urldb_search_match_prefix(const struct host_part *a, * \param root Root of subtree to rotate * \return new root of subtree */ -struct search_node *urldb_search_skew(struct search_node *root) +static struct search_node *urldb_search_skew(struct search_node *root) { struct search_node *temp; @@ -2495,13 +1418,14 @@ struct search_node *urldb_search_skew(struct search_node *root) return root; } + /** * Rotate a node left, increasing the parent's level * * 
\param root Root of subtree to rotate * \return New root of subtree */ -struct search_node *urldb_search_split(struct search_node *root) +static struct search_node *urldb_search_split(struct search_node *root) { struct search_node *temp; @@ -2519,538 +1443,198 @@ struct search_node *urldb_search_split(struct search_node *root) return root; } + /** - * Retrieve cookies for an URL + * Insert node into search tree * - * \param url URL being fetched - * \param include_http_only Whether to include HTTP(S) only cookies. - * \return Cookies string for libcurl (on heap), or NULL on error/no cookies + * \param root Root of (sub)tree to insert into + * \param n Node to insert + * \return Pointer to updated root */ -char *urldb_get_cookie(nsurl *url, bool include_http_only) +static struct search_node * +urldb_search_insert_internal(struct search_node *root, struct search_node *n) { - const struct path_data *p, *q; - const struct host_part *h; - lwc_string *path_lwc; - struct cookie_internal_data *c; - int count = 0, version = COOKIE_RFC2965; - struct cookie_internal_data **matched_cookies; - int matched_cookies_size = 20; - int ret_alloc = 4096, ret_used = 1; - const char *path; - char *ret; - lwc_string *scheme; - time_t now; - int i; - bool match; - - assert(url != NULL); - - /* The URL must exist in the db in order to find relevant cookies, since - * we search up the tree from the URL node, and cookies from further - * up also apply. 
*/ - urldb_add_url(url); - - p = urldb_find_url(url); - if (!p) - return NULL; - - scheme = p->scheme; - - matched_cookies = malloc(matched_cookies_size * - sizeof(struct cookie_internal_data *)); - if (!matched_cookies) - return NULL; - -#define GROW_MATCHED_COOKIES \ - do { \ - if (count == matched_cookies_size) { \ - struct cookie_internal_data **temp; \ - temp = realloc(matched_cookies, \ - (matched_cookies_size + 20) * \ - sizeof(struct cookie_internal_data *)); \ - \ - if (temp == NULL) { \ - free(ret); \ - free(matched_cookies); \ - return NULL; \ - } \ - \ - matched_cookies = temp; \ - matched_cookies_size += 20; \ - } \ - } while(0) - - ret = malloc(ret_alloc); - if (!ret) { - free(matched_cookies); - return NULL; - } - - ret[0] = '\0'; - - path_lwc = nsurl_get_component(url, NSURL_PATH); - if (path_lwc == NULL) { - free(ret); - free(matched_cookies); - return NULL; - } - path = lwc_string_data(path_lwc); - lwc_string_unref(path_lwc); - - now = time(NULL); - - if (*(p->segment) != '\0') { - /* Match exact path, unless directory, when prefix matching - * will handle this case for us. */ - for (q = p->parent->children; q; q = q->next) { - if (strcmp(q->segment, p->segment)) - continue; - - /* Consider all cookies associated with - * this exact path */ - for (c = q->cookies; c; c = c->next) { - if (c->expires != -1 && c->expires < now) - /* cookie has expired => ignore */ - continue; - - if (c->secure && lwc_string_isequal( - q->scheme, - corestring_lwc_https, - &match) && - match == false) - /* secure cookie for insecure host. 
- * ignore */ - continue; - - if (c->http_only && !include_http_only) - /* Ignore HttpOnly */ - continue; - - matched_cookies[count++] = c; - - GROW_MATCHED_COOKIES; - - if (c->version < (unsigned int)version) - version = c->version; - - c->last_used = now; - - cookie_manager_add((struct cookie_data *)c); - } - } - } - - /* Now consider cookies whose paths prefix-match ours */ - for (p = p->parent; p; p = p->parent) { - /* Find directory's path entry(ies) */ - /* There are potentially multiple due to differing schemes */ - for (q = p->children; q; q = q->next) { - if (*(q->segment) != '\0') - continue; - - for (c = q->cookies; c; c = c->next) { - if (c->expires != -1 && c->expires < now) - /* cookie has expired => ignore */ - continue; - - if (c->secure && lwc_string_isequal( - q->scheme, - corestring_lwc_https, - &match) && - match == false) - /* Secure cookie for insecure server - * => ignore */ - continue; - - matched_cookies[count++] = c; - - GROW_MATCHED_COOKIES; - - if (c->version < (unsigned int) version) - version = c->version; - - c->last_used = now; - - cookie_manager_add((struct cookie_data *)c); - } - } - - if (!p->parent) { - /* No parent, so bail here. This can't go in - * the loop exit condition as we also want to - * process the top-level node. - * - * If p->parent is NULL then p->cookies are - * the domain cookies and thus we don't even - * try matching against them. 
- */ - break; - } - - /* Consider p itself - may be the result of Path=/foo */ - for (c = p->cookies; c; c = c->next) { - if (c->expires != -1 && c->expires < now) - /* cookie has expired => ignore */ - continue; - - /* Ensure cookie path is a prefix of the resource */ - if (strncmp(c->path, path, strlen(c->path)) != 0) - /* paths don't match => ignore */ - continue; - - if (c->secure && lwc_string_isequal(p->scheme, - corestring_lwc_https, - &match) && - match == false) - /* Secure cookie for insecure server - * => ignore */ - continue; - - matched_cookies[count++] = c; - - GROW_MATCHED_COOKIES; - - if (c->version < (unsigned int) version) - version = c->version; + assert(root && n); - c->last_used = now; + if (root == &empty) { + root = n; + } else { + int c = urldb_search_match_host(root->data, n->data); - cookie_manager_add((struct cookie_data *)c); + if (c > 0) { + root->left = urldb_search_insert_internal( + root->left, n); + } else if (c < 0) { + root->right = urldb_search_insert_internal( + root->right, n); + } else { + /* exact match */ + free(n); + return root; } + root = urldb_search_skew(root); + root = urldb_search_split(root); } - /* Finally consider domain cookies for hosts which domain match ours */ - for (h = (const struct host_part *)p; h && h != &db_root; - h = h->parent) { - for (c = h->paths.cookies; c; c = c->next) { - if (c->expires != -1 && c->expires < now) - /* cookie has expired => ignore */ - continue; - - /* Ensure cookie path is a prefix of the resource */ - if (strncmp(c->path, path, strlen(c->path)) != 0) - /* paths don't match => ignore */ - continue; - - if (c->secure && lwc_string_isequal(scheme, - corestring_lwc_https, - &match) && - match == false) - /* secure cookie for insecure host. 
ignore */ - continue; - - matched_cookies[count++] = c; + return root; +} - GROW_MATCHED_COOKIES; - if (c->version < (unsigned int)version) - version = c->version; - - c->last_used = now; +/** + * Insert a node into the search tree + * + * \param root Root of tree to insert into + * \param data User data to insert + * \return Pointer to updated root, or NULL if failed + */ +static struct search_node * +urldb_search_insert(struct search_node *root, const struct host_part *data) +{ + struct search_node *n; - cookie_manager_add((struct cookie_data *)c); - } - } + assert(root && data); - if (count == 0) { - /* No cookies found */ - free(ret); - free(matched_cookies); + n = malloc(sizeof(struct search_node)); + if (!n) return NULL; - } - - /* and build output string */ - if (version > COOKIE_NETSCAPE) { - sprintf(ret, "$Version=%d", version); - ret_used = strlen(ret) + 1; - } - - for (i = 0; i < count; i++) { - if (!urldb_concat_cookie(matched_cookies[i], version, - &ret_used, &ret_alloc, &ret)) { - free(ret); - free(matched_cookies); - return NULL; - } - } - - if (version == COOKIE_NETSCAPE) { - /* Old-style cookies => no version & skip "; " */ - memmove(ret, ret + 2, ret_used - 2); - ret_used -= 2; - } - /* Now, shrink the output buffer to the required size */ - { - char *temp = realloc(ret, ret_used); - if (!temp) { - free(ret); - free(matched_cookies); - return NULL; - } - - ret = temp; - } - - free(matched_cookies); + n->level = 1; + n->data = data; + n->left = n->right = &empty; - return ret; + root = urldb_search_insert_internal(root, n); -#undef GROW_MATCHED_COOKIES + return root; } + /** - * Parse Set-Cookie header and insert cookie(s) into database + * Parse a cookie avpair * - * \param header Header to parse, with Set-Cookie: stripped - * \param url URL being fetched - * \param referer Referring resource, or 0 for verifiable transaction - * \return true on success, false otherwise + * \param c Cookie struct to populate + * \param n Name component + * \param v Value 
component + * \param was_quoted Whether ::v was quoted in the input + * \return true on success, false on memory exhaustion */ -bool urldb_set_cookie(const char *header, nsurl *url, nsurl *referer) +static bool urldb_parse_avpair(struct cookie_internal_data *c, char *n, + char *v, bool was_quoted) { - const char *cur = header, *end; - lwc_string *path, *host, *scheme; - nsurl *urlt; - bool match; - - assert(url && header); + int vlen; - /* Get defragmented URL, as 'urlt' */ - if (nsurl_defragment(url, &urlt) != NSERROR_OK) - return NULL; + assert(c && n && v); - scheme = nsurl_get_component(url, NSURL_SCHEME); - if (scheme == NULL) { - nsurl_unref(urlt); - return false; + /* Strip whitespace from start of name */ + for (; *n; n++) { + if (*n != ' ' && *n != '\t') + break; } - path = nsurl_get_component(url, NSURL_PATH); - if (path == NULL) { - lwc_string_unref(scheme); - nsurl_unref(urlt); - return false; + /* Strip whitespace from end of name */ + for (vlen = strlen(n); vlen; vlen--) { + if (n[vlen] == ' ' || n[vlen] == '\t') + n[vlen] = '\0'; + else + break; } - host = nsurl_get_component(url, NSURL_HOST); - if (host == NULL) { - lwc_string_unref(path); - lwc_string_unref(scheme); - nsurl_unref(urlt); - return false; + /* Strip whitespace from start of value */ + for (; *v; v++) { + if (*v != ' ' && *v != '\t') + break; } - if (referer) { - lwc_string *rhost; - - /* Ensure that url's host name domain matches - * referer's (4.3.5) */ - rhost = nsurl_get_component(referer, NSURL_HOST); - if (rhost == NULL) { - goto error; - } - - /* Domain match host names */ - if (lwc_string_isequal(host, rhost, &match) == lwc_error_ok && - match == false) { - const char *hptr; - const char *rptr; - const char *dot; - const char *host_data = lwc_string_data(host); - const char *rhost_data = lwc_string_data(rhost); - - /* Ensure neither host nor rhost are IP addresses */ - if (urldb__host_is_ip_address(host_data) || - urldb__host_is_ip_address(rhost_data)) { - /* IP address, so no 
partial match */ - lwc_string_unref(rhost); - goto error; - } - - /* Not exact match, so try the following: - * - * 1) Find the longest common suffix of host and rhost - * (may be all of host/rhost) - * 2) Discard characters from the start of the suffix - * until the suffix starts with a dot - * (prevents foobar.com matching bar.com) - * 3) Ensure the suffix is non-empty and contains - * embedded dots (to avoid permitting .com as a - * suffix) - * - * Note that the above in no way resembles the - * domain matching algorithm found in RFC2109. - * It does, however, model the real world rather - * more accurately. - */ - - /** \todo In future, we should consult a TLD service - * instead of just looking for embedded dots. - */ - - hptr = host_data + lwc_string_length(host) - 1; - rptr = rhost_data + lwc_string_length(rhost) - 1; - - /* 1 */ - while (hptr >= host_data && rptr >= rhost_data) { - if (*hptr != *rptr) - break; - hptr--; - rptr--; - } - /* Ensure we end up pointing at the start of the - * common suffix. The above loop will exit pointing - * to the byte before the start of the suffix. 
*/ - hptr++; - - /* 2 */ - while (*hptr != '\0' && *hptr != '.') - hptr++; - - /* 3 */ - if (*hptr == '\0' || - (dot = strchr(hptr + 1, '.')) == NULL || - *(dot + 1) == '\0') { - lwc_string_unref(rhost); - goto error; - } - } - - lwc_string_unref(rhost); + /* Strip whitespace from end of value */ + for (vlen = strlen(v); vlen; vlen--) { + if (v[vlen] == ' ' || v[vlen] == '\t') + v[vlen] = '\0'; + else + break; } - end = cur + strlen(cur) - 2 /* Trailing CRLF */; - - do { - struct cookie_internal_data *c; - char *dot; - size_t len; - - c = urldb_parse_cookie(url, &cur); - if (!c) { - /* failed => stop parsing */ - goto error; - } - - /* validate cookie */ - - /* 4.2.2:i Cookie must have NAME and VALUE */ - if (!c->name || !c->value) { - urldb_free_cookie(c); - goto error; - } - - /* 4.3.2:i Cookie path must be a prefix of URL path */ - len = strlen(c->path); - if (len > lwc_string_length(path) || - strncmp(c->path, lwc_string_data(path), - len) != 0) { - urldb_free_cookie(c); - goto error; - } - - /* 4.3.2:ii Cookie domain must contain embedded dots */ - dot = strchr(c->domain + 1, '.'); - if (!dot || *(dot + 1) == '\0') { - /* no embedded dots */ - urldb_free_cookie(c); - goto error; + if (!c->comment && strcasecmp(n, "Comment") == 0) { + c->comment = strdup(v); + if (!c->comment) + return false; + } else if (!c->domain && strcasecmp(n, "Domain") == 0) { + if (v[0] == '.') { + /* Domain must start with a dot */ + c->domain_from_set = true; + c->domain = strdup(v); + if (!c->domain) + return false; } + } else if (strcasecmp(n, "Max-Age") == 0) { + int temp = atoi(v); + if (temp == 0) + /* Special case - 0 means delete */ + c->expires = 0; + else + c->expires = time(NULL) + temp; + } else if (!c->path && strcasecmp(n, "Path") == 0) { + c->path_from_set = true; + c->path = strdup(v); + if (!c->path) + return false; + } else if (strcasecmp(n, "Version") == 0) { + c->version = atoi(v); + } else if (strcasecmp(n, "Expires") == 0) { + char *datenoday; + time_t expires; - 
/* Domain match fetch host with cookie domain */ - if (strcasecmp(lwc_string_data(host), c->domain) != 0) { - int hlen, dlen; - char *domain = c->domain; - - /* c->domain must be a domain cookie here because: - * c->domain is either: - * + specified in the header as a domain cookie - * (non-domain cookies in the header are ignored - * by urldb_parse_cookie / urldb_parse_avpair) - * + defaulted to the URL's host part - * (by urldb_parse_cookie if no valid domain was - * specified in the header) - * - * The latter will pass the strcasecmp above, which - * leaves the former (i.e. a domain cookie) - */ - assert(c->domain[0] == '.'); - - /* 4.3.2:iii */ - if (urldb__host_is_ip_address(lwc_string_data(host))) { - /* IP address, so no partial match */ - urldb_free_cookie(c); - goto error; - } - - hlen = lwc_string_length(host); - dlen = strlen(c->domain); - - if (hlen <= dlen && hlen != dlen - 1) { - /* Partial match not possible */ - urldb_free_cookie(c); - goto error; - } - - if (hlen == dlen - 1) { - /* Relax matching to allow - * host a.com to match .a.com */ - domain++; - dlen--; - } - - if (strcasecmp(lwc_string_data(host) + (hlen - dlen), - domain)) { - urldb_free_cookie(c); - goto error; - } + /* Strip dayname from date (these are hugely + * variable and liable to break the parser. + * They also serve no useful purpose) */ + for (datenoday = v; *datenoday && !isdigit(*datenoday); + datenoday++) + ; /* do nothing */ - /* 4.3.2:iv Ensure H contains no dots - * - * If you believe the spec, H should contain no - * dots in _any_ cookie. Unfortunately, however, - * reality differs in that many sites send domain - * cookies of the form .foo.com from hosts such - * as bar.bat.foo.com and then expect domain - * matching to work. Thus we have to do what they - * expect, regardless of any potential security - * implications. 
- * - * This is what code conforming to the spec would - * look like: - * - * for (int i = 0; i < (hlen - dlen); i++) { - * if (host[i] == '.') { - * urldb_free_cookie(c); - * goto error; - * } - * } - */ + expires = curl_getdate(datenoday, NULL); + if (expires == -1) { + /* assume we have an unrepresentable + * date => force it to the maximum + * possible value of a 32bit time_t + * (this may break in 2038. We'll + * deal with that once we come to + * it) */ + expires = (time_t)0x7fffffff; } - - /* Now insert into database */ - if (!urldb_insert_cookie(c, scheme, urlt)) - goto error; - } while (cur < end); - - lwc_string_unref(host); - lwc_string_unref(path); - lwc_string_unref(scheme); - nsurl_unref(urlt); + c->expires = expires; + } else if (strcasecmp(n, "Secure") == 0) { + c->secure = true; + } else if (strcasecmp(n, "HttpOnly") == 0) { + c->http_only = true; + } else if (!c->name) { + c->name = strdup(n); + c->value = strdup(v); + c->value_was_quoted = was_quoted; + if (!c->name || !c->value) + return false; + } return true; +} -error: - lwc_string_unref(host); - lwc_string_unref(path); - lwc_string_unref(scheme); - nsurl_unref(urlt); - return false; +/** + * Free a cookie + * + * \param c The cookie to free + */ +static void urldb_free_cookie(struct cookie_internal_data *c) +{ + assert(c); + + free(c->comment); + free(c->domain); + free(c->path); + free(c->name); + free(c->value); + free(c); } + /** * Parse a cookie * @@ -3058,8 +1642,8 @@ error: * \param cookie Pointer to cookie string (updated on exit) * \return Pointer to cookie structure (on heap, caller frees) or NULL */ -struct cookie_internal_data *urldb_parse_cookie(nsurl *url, - const char **cookie) +static struct cookie_internal_data * +urldb_parse_cookie(nsurl *url, const char **cookie) { struct cookie_internal_data *c; const char *cur; @@ -3091,34 +1675,12 @@ struct cookie_internal_data *urldb_parse_cookie(nsurl *url, /* Match Firefox 2.0.0.11 */ value[0] = '\0'; -#if 0 - /* This is what IE6/7 & 
Safari 3 do */ - /* Opera 9.25 discards the entire cookie */ - - /* Shuffle value up by 1 */ - memmove(value + 1, value, - min(v - value, sizeof(value) - 2)); - v++; - /* And insert " character at the start */ - value[0] = '"'; - - /* Now, run forwards through the value - * looking for a semicolon. If one exists, - * terminate the value at this point. */ - for (char *s = value; s < v; s++) { - if (*s == ';') { - *s = '\0'; - v = s; - break; - } - } -#endif } break; } else if (*cur == '\r') { /* Spurious linefeed */ - continue; + continue; } else if (*cur == '\n') { /* Spurious newline */ continue; @@ -3131,7 +1693,7 @@ struct cookie_internal_data *urldb_parse_cookie(nsurl *url, } else { had_value_data = true; - /* Value is taken verbatim if first non-space + /* Value is taken verbatim if first non-space * character is not a " */ if (*cur != '"') { value_verbatim = true; @@ -3157,7 +1719,7 @@ struct cookie_internal_data *urldb_parse_cookie(nsurl *url, } if (!quoted && (was_quoted || *cur == ';')) { - /* Semicolon or after quoted value + /* Semicolon or after quoted value * => end of current avpair */ /* NUL-terminate tokens */ @@ -3316,112 +1878,6 @@ struct cookie_internal_data *urldb_parse_cookie(nsurl *url, return c; } -/** - * Parse a cookie avpair - * - * \param c Cookie struct to populate - * \param n Name component - * \param v Value component - * \param was_quoted Whether ::v was quoted in the input - * \return true on success, false on memory exhaustion - */ -bool urldb_parse_avpair(struct cookie_internal_data *c, char *n, char *v, - bool was_quoted) -{ - int vlen; - - assert(c && n && v); - - /* Strip whitespace from start of name */ - for (; *n; n++) { - if (*n != ' ' && *n != '\t') - break; - } - - /* Strip whitespace from end of name */ - for (vlen = strlen(n); vlen; vlen--) { - if (n[vlen] == ' ' || n[vlen] == '\t') - n[vlen] = '\0'; - else - break; - } - - /* Strip whitespace from start of value */ - for (; *v; v++) { - if (*v != ' ' && *v != '\t') - 
break; - } - - /* Strip whitespace from end of value */ - for (vlen = strlen(v); vlen; vlen--) { - if (v[vlen] == ' ' || v[vlen] == '\t') - v[vlen] = '\0'; - else - break; - } - - if (!c->comment && strcasecmp(n, "Comment") == 0) { - c->comment = strdup(v); - if (!c->comment) - return false; - } else if (!c->domain && strcasecmp(n, "Domain") == 0) { - if (v[0] == '.') { - /* Domain must start with a dot */ - c->domain_from_set = true; - c->domain = strdup(v); - if (!c->domain) - return false; - } - } else if (strcasecmp(n, "Max-Age") == 0) { - int temp = atoi(v); - if (temp == 0) - /* Special case - 0 means delete */ - c->expires = 0; - else - c->expires = time(NULL) + temp; - } else if (!c->path && strcasecmp(n, "Path") == 0) { - c->path_from_set = true; - c->path = strdup(v); - if (!c->path) - return false; - } else if (strcasecmp(n, "Version") == 0) { - c->version = atoi(v); - } else if (strcasecmp(n, "Expires") == 0) { - char *datenoday; - time_t expires; - - /* Strip dayname from date (these are hugely - * variable and liable to break the parser. - * They also serve no useful purpose) */ - for (datenoday = v; *datenoday && !isdigit(*datenoday); - datenoday++) - ; /* do nothing */ - - expires = curl_getdate(datenoday, NULL); - if (expires == -1) { - /* assume we have an unrepresentable - * date => force it to the maximum - * possible value of a 32bit time_t - * (this may break in 2038. 
We'll - * deal with that once we come to - * it) */ - expires = (time_t)0x7fffffff; - } - c->expires = expires; - } else if (strcasecmp(n, "Secure") == 0) { - c->secure = true; - } else if (strcasecmp(n, "HttpOnly") == 0) { - c->http_only = true; - } else if (!c->name) { - c->name = strdup(n); - c->value = strdup(v); - c->value_was_quoted = was_quoted; - if (!c->name || !c->value) - return false; - } - - return true; -} /** * Insert a cookie into the database @@ -3431,8 +1887,8 @@ bool urldb_parse_avpair(struct cookie_internal_data *c, char *n, char *v, * \param url URL (sans fragment) associated with cookie * \return true on success, false on memory exhaustion (c will be freed) */ -bool urldb_insert_cookie(struct cookie_internal_data *c, lwc_string *scheme, - nsurl *url) +static bool urldb_insert_cookie(struct cookie_internal_data *c, + lwc_string *scheme, nsurl *url) { struct cookie_internal_data *d; const struct host_part *h; @@ -3537,22 +1993,6 @@ bool urldb_insert_cookie(struct cookie_internal_data *c, lwc_string *scheme, return true; } -/** - * Free a cookie - * - * \param c The cookie to free - */ -void urldb_free_cookie(struct cookie_internal_data *c) -{ - assert(c); - - free(c->comment); - free(c->domain); - free(c->path); - free(c->name); - free(c->value); - free(c); -} /** * Concatenate a cookie into the provided buffer @@ -3564,16 +2004,16 @@ void urldb_free_cookie(struct cookie_internal_data *c) * \param buf Pointer to Pointer to buffer (updated) * \return true on success, false on memory exhaustion */ -bool urldb_concat_cookie(struct cookie_internal_data *c, int version, +static bool urldb_concat_cookie(struct cookie_internal_data *c, int version, int *used, int *alloc, char **buf) { /* Combined (A)BNF for the Cookie: request header: - * + * * CHAR = <any US-ASCII character (octets 0 - 127)> * CTL = <any US-ASCII control character * (octets 0 - 31) and DEL (127)> * CR = <US-ASCII CR, carriage return (13)> - * LF = <US-ASCII LF, linefeed (10)> + * LF = 
<US-ASCII LF, linefeed (10)> * SP = <US-ASCII SP, space (32)> * HT = <US-ASCII HT, horizontal-tab (9)> * <"> = <US-ASCII double-quote mark (34)> @@ -3610,22 +2050,22 @@ bool urldb_concat_cookie(struct cookie_internal_data *c, int version, * * A note on quoted-string handling: * The cookie data stored in the db is verbatim (i.e. sans enclosing - * <">, if any, and with all quoted-pairs intact) thus all that we + * <">, if any, and with all quoted-pairs intact) thus all that we * need to do here is ensure that value strings which were quoted - * in Set-Cookie or which include any of the separators are quoted + * in Set-Cookie or which include any of the separators are quoted * before use. * * A note on cookie-value separation: - * We use semicolons for all separators, including between + * We use semicolons for all separators, including between * cookie-values. This simplifies things and is backwards compatible. - */ + */ const char * const separators = "()<>@,;:\\\"/[]?={} \t"; int max_len; assert(c && used && alloc && buf && *buf); - /* "; " cookie-value + /* "; " cookie-value * We allow for the possibility that values are quoted */ max_len = 2 + strlen(c->name) + 1 + strlen(c->value) + 2 + @@ -3663,7 +2103,7 @@ bool urldb_concat_cookie(struct cookie_internal_data *c, int version, *used += strlen(c->value); } - /* We don't send path/domain information -- that's what the + /* We don't send path/domain information -- that's what the * Netscape spec suggests we should do, anyway. */ } else { /* RFC2109 or RFC2965 cookie */ @@ -3713,11 +2153,1567 @@ bool urldb_concat_cookie(struct cookie_internal_data *c, int version, return true; } + +/** + * deletes paths from a cookie. 
+ */ +static void urldb_delete_cookie_paths(const char *domain, const char *path, + const char *name, struct path_data *parent) +{ + struct cookie_internal_data *c; + struct path_data *p = parent; + + assert(parent); + + do { + for (c = p->cookies; c; c = c->next) { + if (strcmp(c->domain, domain) == 0 && + strcmp(c->path, path) == 0 && + strcmp(c->name, name) == 0) { + if (c->prev) + c->prev->next = c->next; + else + p->cookies = c->next; + + if (c->next) + c->next->prev = c->prev; + else + p->cookies_end = c->prev; + + urldb_free_cookie(c); + + return; + } + } + + if (p->children) { + p = p->children; + } else { + while (p != parent) { + if (p->next != NULL) { + p = p->next; + break; + } + + p = p->parent; + } + } + } while (p != parent); +} + + +/** + * Deletes cookie hosts and their associated paths + */ +static void urldb_delete_cookie_hosts(const char *domain, const char *path, + const char *name, struct host_part *parent) +{ + struct host_part *h; + assert(parent); + + urldb_delete_cookie_paths(domain, path, name, &parent->paths); + + for (h = parent->children; h; h = h->next) + urldb_delete_cookie_hosts(domain, path, name, h); +} + + +/** + * Save a path subtree's cookies + * + * \param fp File pointer to write to + * \param parent Parent path + */ +static void urldb_save_cookie_paths(FILE *fp, struct path_data *parent) +{ + struct path_data *p = parent; + time_t now = time(NULL); + + assert(fp && parent); + + do { + if (p->cookies != NULL) { + struct cookie_internal_data *c; + + for (c = p->cookies; c != NULL; c = c->next) { + if (c->expires == -1 || c->expires < now) + /* Skip expired & session cookies */ + continue; + + fprintf(fp, + "%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t" + "%s\t%s\t%d\t%s\t%s\t%s\n", + c->version, c->domain, + c->domain_from_set, c->path, + c->path_from_set, c->secure, + c->http_only, + (int)c->expires, (int)c->last_used, + c->no_destroy, c->name, c->value, + c->value_was_quoted, + p->scheme ? 
lwc_string_data(p->scheme) : + "unused", + p->url ? nsurl_access(p->url) : + "unused", + c->comment ? c->comment : ""); + } + } + + if (p->children != NULL) { + p = p->children; + } else { + while (p != parent) { + if (p->next != NULL) { + p = p->next; + break; + } + + p = p->parent; + } + } + } while (p != parent); +} + + +/** + * Save a host subtree's cookies + * + * \param fp File pointer to write to + * \param parent Parent host + */ +static void urldb_save_cookie_hosts(FILE *fp, struct host_part *parent) +{ + struct host_part *h; + assert(fp && parent); + + urldb_save_cookie_paths(fp, &parent->paths); + + for (h = parent->children; h; h = h->next) + urldb_save_cookie_hosts(fp, h); +} + + +/** + * Destroy a cookie node + * + * \param c Cookie to destroy + */ +static void urldb_destroy_cookie(struct cookie_internal_data *c) +{ + free(c->name); + free(c->value); + free(c->comment); + free(c->domain); + free(c->path); + + free(c); +} + + +/** + * Destroy the contents of a path node + * + * \param node Node to destroy contents of (does not destroy node) + */ +static void urldb_destroy_path_node_content(struct path_data *node) +{ + struct cookie_internal_data *a, *b; + unsigned int i; + + if (node->url != NULL) + nsurl_unref(node->url); + + if (node->scheme != NULL) + lwc_string_unref(node->scheme); + + free(node->segment); + for (i = 0; i < node->frag_cnt; i++) + free(node->fragment[i]); + free(node->fragment); + + if (node->thumb) + bitmap_destroy(node->thumb); + + free(node->urld.title); + + for (a = node->cookies; a; a = b) { + b = a->next; + urldb_destroy_cookie(a); + } +} + + +/** + * Destroy protection space data + * + * \param space Protection space to destroy + */ +static void urldb_destroy_prot_space(struct prot_space_data *space) +{ + lwc_string_unref(space->scheme); + free(space->realm); + free(space->auth); + + free(space); +} + + +/** + * Destroy a path tree + * + * \param root Root node of tree to destroy + */ +static void 
urldb_destroy_path_tree(struct path_data *root) +{ + struct path_data *p = root; + + do { + if (p->children != NULL) { + p = p->children; + } else { + struct path_data *q = p; + + while (p != root) { + if (p->next != NULL) { + p = p->next; + break; + } + + p = p->parent; + + urldb_destroy_path_node_content(q); + free(q); + + q = p; + } + + urldb_destroy_path_node_content(q); + free(q); + } + } while (p != root); +} + + /** - * Load a cookie file into the database + * Destroy a host tree * - * \param filename File to load + * \param root Root node of tree to destroy */ +static void urldb_destroy_host_tree(struct host_part *root) +{ + struct host_part *a, *b; + struct path_data *p, *q; + struct prot_space_data *s, *t; + + /* Destroy children */ + for (a = root->children; a; a = b) { + b = a->next; + urldb_destroy_host_tree(a); + } + + /* Now clean up paths */ + for (p = root->paths.children; p; p = q) { + q = p->next; + urldb_destroy_path_tree(p); + } + + /* Root path */ + urldb_destroy_path_node_content(&root->paths); + + /* Protection space data */ + for (s = root->prot_space; s; s = t) { + t = s->next; + urldb_destroy_prot_space(s); + } + + /* And ourselves */ + free(root->part); + free(root); +} + + +/** + * Destroy a search tree + * + * \param root Root node of tree to destroy + */ +static void urldb_destroy_search_tree(struct search_node *root) +{ + /* Destroy children */ + if (root->left != &empty) + urldb_destroy_search_tree(root->left); + if (root->right != &empty) + urldb_destroy_search_tree(root->right); + + /* And destroy ourselves */ + free(root); +} + + +/*************** External interface ***************/ + + +/* exported interface documented in content/urldb.h */ +void urldb_destroy(void) +{ + struct host_part *a, *b; + int i; + + /* Clean up search trees */ + for (i = 0; i < NUM_SEARCH_TREES; i++) { + if (search_trees[i] != &empty) + urldb_destroy_search_tree(search_trees[i]); + } + + /* And database */ + for (a = db_root.children; a; a = b) { + b = 
a->next; + urldb_destroy_host_tree(a); + } + + /* And the bloom filter */ + if (url_bloom != NULL) + bloom_destroy(url_bloom); +} + + +/* exported interface documented in content/urldb.h */ +nserror urldb_load(const char *filename) +{ +#define MAXIMUM_URL_LENGTH 4096 + char s[MAXIMUM_URL_LENGTH]; + char host[256]; + struct host_part *h; + int urls; + int i; + int version; + int length; + FILE *fp; + + assert(filename); + + LOG(("Loading URL file %s", filename)); + + if (url_bloom == NULL) + url_bloom = bloom_create(BLOOM_SIZE); + + fp = fopen(filename, "r"); + if (!fp) { + LOG(("Failed to open file '%s' for reading", filename)); + return NSERROR_NOT_FOUND; + } + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) { + fclose(fp); + return NSERROR_NEED_DATA; + } + + version = atoi(s); + if (version < MIN_URL_FILE_VERSION) { + LOG(("Unsupported URL file version.")); + fclose(fp); + return NSERROR_INVALID; + } + if (version > URL_FILE_VERSION) { + LOG(("Unknown URL file version.")); + fclose(fp); + return NSERROR_INVALID; + } + + while (fgets(host, sizeof host, fp)) { + /* get the hostname */ + length = strlen(host) - 1; + host[length] = '\0'; + + /* skip data that has ended up with a host of '' */ + if (length == 0) { + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + urls = atoi(s); + /* Eight fields/url */ + for (i = 0; i < (8 * urls); i++) { + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + } + continue; + } + + /* read number of URLs */ + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + urls = atoi(s); + + /* no URLs => try next host */ + if (urls == 0) { + LOG(("No URLs for '%s'", host)); + continue; + } + + h = urldb_add_host(host); + if (!h) { + LOG(("Failed adding host: '%s'", host)); + fclose(fp); + return NSERROR_NOMEM; + } + + /* load the non-corrupt data */ + for (i = 0; i < urls; i++) { + struct path_data *p = NULL; + char scheme[64], ports[10]; + char url[64 + 3 + 256 + 6 + 4096 + 1]; + unsigned int port; + bool is_file = false; + nsurl *nsurl; + lwc_string 
*scheme_lwc, *fragment_lwc; + char *path_query; + size_t len; + + if (!fgets(scheme, sizeof scheme, fp)) + break; + length = strlen(scheme) - 1; + scheme[length] = '\0'; + + if (!fgets(ports, sizeof ports, fp)) + break; + length = strlen(ports) - 1; + ports[length] = '\0'; + port = atoi(ports); + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + length = strlen(s) - 1; + s[length] = '\0'; + + if (!strcasecmp(host, "localhost") && + !strcasecmp(scheme, "file")) + is_file = true; + + snprintf(url, sizeof url, "%s://%s%s%s%s", + scheme, + /* file URLs have no host */ + (is_file ? "" : host), + (port ? ":" : ""), + (port ? ports : ""), + s); + + /* TODO: store URLs in pre-parsed state, and make + * a nsurl_load to generate the nsurl more + * swiftly. + * Need a nsurl_save too. + */ + if (nsurl_create(url, &nsurl) != NSERROR_OK) { + LOG(("Failed inserting '%s'", url)); + fclose(fp); + return NSERROR_NOMEM; + } + + if (url_bloom != NULL) { + uint32_t hash = nsurl_hash(nsurl); + bloom_insert_hash(url_bloom, hash); + } + + /* Copy and merge path/query strings */ + if (nsurl_get(nsurl, NSURL_PATH | NSURL_QUERY, + &path_query, &len) != NSERROR_OK) { + LOG(("Failed inserting '%s'", url)); + fclose(fp); + return NSERROR_NOMEM; + } + + scheme_lwc = nsurl_get_component(nsurl, NSURL_SCHEME); + fragment_lwc = nsurl_get_component(nsurl, + NSURL_FRAGMENT); + p = urldb_add_path(scheme_lwc, port, h, path_query, + fragment_lwc, nsurl); + if (!p) { + LOG(("Failed inserting '%s'", url)); + fclose(fp); + return NSERROR_NOMEM; + } + nsurl_unref(nsurl); + lwc_string_unref(scheme_lwc); + if (fragment_lwc != NULL) + lwc_string_unref(fragment_lwc); + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + if (p) + p->urld.visits = (unsigned int)atoi(s); + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + if (p) + p->urld.last_visit = (time_t)atoi(s); + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + if (p) + p->urld.type = (content_type)atoi(s); + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + 
break; + + + if (!fgets(s, MAXIMUM_URL_LENGTH, fp)) + break; + length = strlen(s) - 1; + if (p && length > 0) { + s[length] = '\0'; + p->urld.title = malloc(length + 1); + if (p->urld.title) + memcpy(p->urld.title, s, length + 1); + } + } + } + + fclose(fp); + LOG(("Successfully loaded URL file")); +#undef MAXIMUM_URL_LENGTH + + return NSERROR_OK; +} + +/* exported interface documented in content/urldb.h */ +nserror urldb_save(const char *filename) +{ + FILE *fp; + int i; + + assert(filename); + + fp = fopen(filename, "w"); + if (!fp) { + LOG(("Failed to open file '%s' for writing", filename)); + return NSERROR_SAVE_FAILED; + } + + /* file format version number */ + fprintf(fp, "%d\n", URL_FILE_VERSION); + + for (i = 0; i != NUM_SEARCH_TREES; i++) { + urldb_save_search_tree(search_trees[i], fp); + } + + fclose(fp); + + return NSERROR_OK; +} + + +/* exported interface documented in content/urldb.h */ +void urldb_set_url_persistence(nsurl *url, bool persist) +{ + struct path_data *p; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return; + + p->persistent = persist; +} + + +/* exported interface documented in content/urldb.h */ +bool urldb_add_url(nsurl *url) +{ + struct host_part *h; + struct path_data *p; + lwc_string *scheme; + lwc_string *port; + lwc_string *host; + lwc_string *fragment; + const char *host_str; + char *path_query = NULL; + size_t len; + bool match; + unsigned int port_int; + + assert(url); + + if (url_bloom == NULL) + url_bloom = bloom_create(BLOOM_SIZE); + + if (url_bloom != NULL) { + uint32_t hash = nsurl_hash(url); + bloom_insert_hash(url_bloom, hash); + } + + /* Copy and merge path/query strings */ + if (nsurl_get(url, NSURL_PATH | NSURL_QUERY, &path_query, &len) != + NSERROR_OK) { + return false; + } + assert(path_query != NULL); + + scheme = nsurl_get_component(url, NSURL_SCHEME); + if (scheme == NULL) { + free(path_query); + return false; + } + + host = nsurl_get_component(url, NSURL_HOST); + if (host != NULL) { + host_str = 
lwc_string_data(host); + lwc_string_unref(host); + + } else if (lwc_string_isequal(scheme, corestring_lwc_file, &match) == + lwc_error_ok && match == true) { + host_str = "localhost"; + + } else { + lwc_string_unref(scheme); + free(path_query); + return false; + } + + fragment = nsurl_get_component(url, NSURL_FRAGMENT); + + port = nsurl_get_component(url, NSURL_PORT); + if (port != NULL) { + port_int = atoi(lwc_string_data(port)); + lwc_string_unref(port); + } else { + port_int = 0; + } + + /* Get host entry */ + h = urldb_add_host(host_str); + + /* Get path entry */ + p = (h != NULL) ? urldb_add_path(scheme, port_int, h, path_query, + fragment, url) : NULL; + + lwc_string_unref(scheme); + if (fragment != NULL) + lwc_string_unref(fragment); + + return (p != NULL); +} + + +/* exported interface documented in content/urldb.h */ +void urldb_set_url_title(nsurl *url, const char *title) +{ + struct path_data *p; + char *temp; + + assert(url && title); + + p = urldb_find_url(url); + if (!p) + return; + + temp = strdup(title); + if (!temp) + return; + + free(p->urld.title); + p->urld.title = temp; +} + + +/* exported interface documented in content/urldb.h */ +void urldb_set_url_content_type(nsurl *url, content_type type) +{ + struct path_data *p; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return; + + p->urld.type = type; +} + + +/* exported interface documented in content/urldb.h */ +void urldb_update_url_visit_data(nsurl *url) +{ + struct path_data *p; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return; + + p->urld.last_visit = time(NULL); + p->urld.visits++; +} + + +/* exported interface documented in content/urldb.h */ +void urldb_reset_url_visit_data(nsurl *url) +{ + struct path_data *p; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return; + + p->urld.last_visit = (time_t)0; + p->urld.visits = 0; +} + + +/* exported interface documented in content/urldb.h */ +const struct url_data *urldb_get_url_data(nsurl *url) +{ + struct 
path_data *p; + struct url_internal_data *u; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return NULL; + + u = &p->urld; + + return (const struct url_data *) u; +} + + +/* exported interface documented in content/urldb.h */ +nsurl *urldb_get_url(nsurl *url) +{ + struct path_data *p; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return NULL; + + return p->url; +} + + +/* exported interface documented in content/urldb.h */ +void urldb_set_auth_details(nsurl *url, const char *realm, const char *auth) +{ + struct path_data *p, *pi; + struct host_part *h; + struct prot_space_data *space, *space_alloc; + char *realm_alloc, *auth_alloc; + bool match; + + assert(url && realm && auth); + + /* add url, in case it's missing */ + urldb_add_url(url); + + p = urldb_find_url(url); + + if (!p) + return; + + /* Search for host_part */ + for (pi = p; pi->parent != NULL; pi = pi->parent) + ; + h = (struct host_part *)pi; + + /* Search if given URL belongs to a protection space we already know of. */ + for (space = h->prot_space; space; space = space->next) { + if (!strcmp(space->realm, realm) && + lwc_string_isequal(space->scheme, p->scheme, + &match) == lwc_error_ok && + match == true && + space->port == p->port) + break; + } + + if (space != NULL) { + /* Overrule existing auth. */ + free(space->auth); + space->auth = strdup(auth); + } else { + /* Create a new protection space. 
*/ + space = space_alloc = malloc(sizeof(struct prot_space_data)); + realm_alloc = strdup(realm); + auth_alloc = strdup(auth); + + if (!space_alloc || !realm_alloc || !auth_alloc) { + free(space_alloc); + free(realm_alloc); + free(auth_alloc); + return; + } + + space->scheme = lwc_string_ref(p->scheme); + space->port = p->port; + space->realm = realm_alloc; + space->auth = auth_alloc; + space->next = h->prot_space; + h->prot_space = space; + } + + p->prot_space = space; +} + + +/* exported interface documented in content/urldb.h */ +const char *urldb_get_auth_details(nsurl *url, const char *realm) +{ + struct path_data *p, *p_cur, *p_top; + + assert(url); + + /* add to the db, so our lookup will work */ + urldb_add_url(url); + + p = urldb_find_url(url); + if (!p) + return NULL; + + /* Check for any auth details attached to the path_data node or any of + * its parents. + */ + for (p_cur = p; p_cur != NULL; p_top = p_cur, p_cur = p_cur->parent) { + if (p_cur->prot_space) { + return p_cur->prot_space->auth; + } + } + + /* Only when we have a realm (and canonical root of given URL), we can + * uniquely locate the protection space. + */ + if (realm != NULL) { + const struct host_part *h = (const struct host_part *)p_top; + const struct prot_space_data *space; + bool match; + + /* Search for a possible matching protection space. 
*/ + for (space = h->prot_space; space != NULL; + space = space->next) { + if (!strcmp(space->realm, realm) && + lwc_string_isequal(space->scheme, + p->scheme, &match) == + lwc_error_ok && + match == true && + space->port == p->port) { + p->prot_space = space; + return p->prot_space->auth; + } + } + } + + return NULL; +} + + +/* exported interface documented in content/urldb.h */ +void urldb_set_cert_permissions(nsurl *url, bool permit) +{ + struct path_data *p; + struct host_part *h; + + assert(url); + + /* add url, in case it's missing */ + urldb_add_url(url); + + p = urldb_find_url(url); + if (!p) + return; + + for (; p && p->parent; p = p->parent) + /* do nothing */; + assert(p); + + h = (struct host_part *)p; + + h->permit_invalid_certs = permit; +} + + +/* exported interface documented in content/urldb.h */ +bool urldb_get_cert_permissions(nsurl *url) +{ + struct path_data *p; + const struct host_part *h; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return false; + + for (; p && p->parent; p = p->parent) + /* do nothing */; + assert(p); + + h = (const struct host_part *)p; + + return h->permit_invalid_certs; +} + + +/* exported interface documented in content/urldb.h */ +void urldb_set_thumbnail(nsurl *url, struct bitmap *bitmap) +{ + struct path_data *p; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return; + + if (p->thumb && p->thumb != bitmap) + bitmap_destroy(p->thumb); + + p->thumb = bitmap; +} + + +/* exported interface documented in content/urldb.h */ +struct bitmap *urldb_get_thumbnail(nsurl *url) +{ + struct path_data *p; + + assert(url); + + p = urldb_find_url(url); + if (!p) + return NULL; + + return p->thumb; +} + + +/* exported interface documented in content/urldb.h */ +void urldb_iterate_partial(const char *prefix, + bool (*callback)(nsurl *url, + const struct url_data *data)) +{ + char host[256]; + char buf[260]; /* max domain + "www." 
*/ + const char *slash, *scheme_sep; + struct search_node *tree; + const struct host_part *h; + + assert(prefix && callback); + + /* strip scheme */ + scheme_sep = strstr(prefix, "://"); + if (scheme_sep) + prefix = scheme_sep + 3; + + slash = strchr(prefix, '/'); + tree = urldb_get_search_tree(prefix); + + if (slash) { + /* if there's a slash in the input, then we can + * assume that we're looking for a path */ + snprintf(host, sizeof host, "%.*s", + (int) (slash - prefix), prefix); + + h = urldb_search_find(tree, host); + if (!h) { + int len = slash - prefix; + + if (len <= 3 || strncasecmp(host, "www.", 4) != 0) { + snprintf(buf, sizeof buf, "www.%s", host); + h = urldb_search_find( + search_trees[ST_DN + 'w' - 'a'], + buf); + if (!h) + return; + } else + return; + } + + if (h->paths.children) { + /* Have paths, iterate them */ + urldb_iterate_partial_path(&h->paths, slash + 1, + callback); + } + + } else { + int len = strlen(prefix); + + /* looking for hosts */ + if (!urldb_iterate_partial_host(tree, prefix, callback)) + return; + + if (len <= 3 || strncasecmp(prefix, "www.", 4) != 0) { + /* now look for www.prefix */ + snprintf(buf, sizeof buf, "www.%s", prefix); + if(!urldb_iterate_partial_host( + search_trees[ST_DN + 'w' - 'a'], + buf, callback)) + return; + } + } +} + + +/* exported interface documented in content/urldb.h */ +void urldb_iterate_entries(bool (*callback)(nsurl *url, + const struct url_data *data)) +{ + int i; + + assert(callback); + + for (i = 0; i < NUM_SEARCH_TREES; i++) { + if (!urldb_iterate_entries_host(search_trees[i], + callback, NULL)) + break; + } +} + + +/* exported interface documented in content/urldb.h */ +void urldb_iterate_cookies(bool (*callback)(const struct cookie_data *data)) +{ + int i; + + assert(callback); + + for (i = 0; i < NUM_SEARCH_TREES; i++) { + if (!urldb_iterate_entries_host(search_trees[i], + NULL, callback)) + break; + } +} + + +/* exported interface documented in content/urldb.h */ +bool 
urldb_set_cookie(const char *header, nsurl *url, nsurl *referer) +{ + const char *cur = header, *end; + lwc_string *path, *host, *scheme; + nsurl *urlt; + bool match; + + assert(url && header); + + /* Get defragmented URL, as 'urlt' */ + if (nsurl_defragment(url, &urlt) != NSERROR_OK) + return NULL; + + scheme = nsurl_get_component(url, NSURL_SCHEME); + if (scheme == NULL) { + nsurl_unref(urlt); + return false; + } + + path = nsurl_get_component(url, NSURL_PATH); + if (path == NULL) { + lwc_string_unref(scheme); + nsurl_unref(urlt); + return false; + } + + host = nsurl_get_component(url, NSURL_HOST); + if (host == NULL) { + lwc_string_unref(path); + lwc_string_unref(scheme); + nsurl_unref(urlt); + return false; + } + + if (referer) { + lwc_string *rhost; + + /* Ensure that url's host name domain matches + * referer's (4.3.5) */ + rhost = nsurl_get_component(referer, NSURL_HOST); + if (rhost == NULL) { + goto error; + } + + /* Domain match host names */ + if (lwc_string_isequal(host, rhost, &match) == lwc_error_ok && + match == false) { + const char *hptr; + const char *rptr; + const char *dot; + const char *host_data = lwc_string_data(host); + const char *rhost_data = lwc_string_data(rhost); + + /* Ensure neither host nor rhost are IP addresses */ + if (urldb__host_is_ip_address(host_data) || + urldb__host_is_ip_address(rhost_data)) { + /* IP address, so no partial match */ + lwc_string_unref(rhost); + goto error; + } + + /* Not exact match, so try the following: + * + * 1) Find the longest common suffix of host and rhost + * (may be all of host/rhost) + * 2) Discard characters from the start of the suffix + * until the suffix starts with a dot + * (prevents foobar.com matching bar.com) + * 3) Ensure the suffix is non-empty and contains + * embedded dots (to avoid permitting .com as a + * suffix) + * + * Note that the above in no way resembles the + * domain matching algorithm found in RFC2109. + * It does, however, model the real world rather + * more accurately. 
+ */ + + /** \todo In future, we should consult a TLD service + * instead of just looking for embedded dots. + */ + + hptr = host_data + lwc_string_length(host) - 1; + rptr = rhost_data + lwc_string_length(rhost) - 1; + + /* 1 */ + while (hptr >= host_data && rptr >= rhost_data) { + if (*hptr != *rptr) + break; + hptr--; + rptr--; + } + /* Ensure we end up pointing at the start of the + * common suffix. The above loop will exit pointing + * to the byte before the start of the suffix. */ + hptr++; + + /* 2 */ + while (*hptr != '\0' && *hptr != '.') + hptr++; + + /* 3 */ + if (*hptr == '\0' || + (dot = strchr(hptr + 1, '.')) == NULL || + *(dot + 1) == '\0') { + lwc_string_unref(rhost); + goto error; + } + } + + lwc_string_unref(rhost); + } + + end = cur + strlen(cur) - 2 /* Trailing CRLF */; + + do { + struct cookie_internal_data *c; + char *dot; + size_t len; + + c = urldb_parse_cookie(url, &cur); + if (!c) { + /* failed => stop parsing */ + goto error; + } + + /* validate cookie */ + + /* 4.2.2:i Cookie must have NAME and VALUE */ + if (!c->name || !c->value) { + urldb_free_cookie(c); + goto error; + } + + /* 4.3.2:i Cookie path must be a prefix of URL path */ + len = strlen(c->path); + if (len > lwc_string_length(path) || + strncmp(c->path, lwc_string_data(path), + len) != 0) { + urldb_free_cookie(c); + goto error; + } + + /* 4.3.2:ii Cookie domain must contain embedded dots */ + dot = strchr(c->domain + 1, '.'); + if (!dot || *(dot + 1) == '\0') { + /* no embedded dots */ + urldb_free_cookie(c); + goto error; + } + + /* Domain match fetch host with cookie domain */ + if (strcasecmp(lwc_string_data(host), c->domain) != 0) { + int hlen, dlen; + char *domain = c->domain; + + /* c->domain must be a domain cookie here because: + * c->domain is either: + * + specified in the header as a domain cookie + * (non-domain cookies in the header are ignored + * by urldb_parse_cookie / urldb_parse_avpair) + * + defaulted to the URL's host part + * (by urldb_parse_cookie if no 
valid domain was + * specified in the header) + * + * The latter will pass the strcasecmp above, which + * leaves the former (i.e. a domain cookie) + */ + assert(c->domain[0] == '.'); + + /* 4.3.2:iii */ + if (urldb__host_is_ip_address(lwc_string_data(host))) { + /* IP address, so no partial match */ + urldb_free_cookie(c); + goto error; + } + + hlen = lwc_string_length(host); + dlen = strlen(c->domain); + + if (hlen <= dlen && hlen != dlen - 1) { + /* Partial match not possible */ + urldb_free_cookie(c); + goto error; + } + + if (hlen == dlen - 1) { + /* Relax matching to allow + * host a.com to match .a.com */ + domain++; + dlen--; + } + + if (strcasecmp(lwc_string_data(host) + (hlen - dlen), + domain)) { + urldb_free_cookie(c); + goto error; + } + + /* 4.3.2:iv Ensure H contains no dots + * + * If you believe the spec, H should contain no + * dots in _any_ cookie. Unfortunately, however, + * reality differs in that many sites send domain + * cookies of the form .foo.com from hosts such + * as bar.bat.foo.com and then expect domain + * matching to work. Thus we have to do what they + * expect, regardless of any potential security + * implications. 
+ * + * This is what code conforming to the spec would + * look like: + * + * for (int i = 0; i < (hlen - dlen); i++) { + * if (host[i] == '.') { + * urldb_free_cookie(c); + * goto error; + * } + * } + */ + } + + /* Now insert into database */ + if (!urldb_insert_cookie(c, scheme, urlt)) + goto error; + } while (cur < end); + + lwc_string_unref(host); + lwc_string_unref(path); + lwc_string_unref(scheme); + nsurl_unref(urlt); + + return true; + +error: + lwc_string_unref(host); + lwc_string_unref(path); + lwc_string_unref(scheme); + nsurl_unref(urlt); + + return false; +} + + +/* exported interface documented in content/urldb.h */ +char *urldb_get_cookie(nsurl *url, bool include_http_only) +{ + const struct path_data *p, *q; + const struct host_part *h; + lwc_string *path_lwc; + struct cookie_internal_data *c; + int count = 0, version = COOKIE_RFC2965; + struct cookie_internal_data **matched_cookies; + int matched_cookies_size = 20; + int ret_alloc = 4096, ret_used = 1; + const char *path; + char *ret; + lwc_string *scheme; + time_t now; + int i; + bool match; + + assert(url != NULL); + + /* The URL must exist in the db in order to find relevant cookies, since + * we search up the tree from the URL node, and cookies from further + * up also apply. 
*/ + urldb_add_url(url); + + p = urldb_find_url(url); + if (!p) + return NULL; + + scheme = p->scheme; + + matched_cookies = malloc(matched_cookies_size * + sizeof(struct cookie_internal_data *)); + if (!matched_cookies) + return NULL; + +#define GROW_MATCHED_COOKIES \ + do { \ + if (count == matched_cookies_size) { \ + struct cookie_internal_data **temp; \ + temp = realloc(matched_cookies, \ + (matched_cookies_size + 20) * \ + sizeof(struct cookie_internal_data *)); \ + \ + if (temp == NULL) { \ + free(ret); \ + free(matched_cookies); \ + return NULL; \ + } \ + \ + matched_cookies = temp; \ + matched_cookies_size += 20; \ + } \ + } while(0) + + ret = malloc(ret_alloc); + if (!ret) { + free(matched_cookies); + return NULL; + } + + ret[0] = '\0'; + + path_lwc = nsurl_get_component(url, NSURL_PATH); + if (path_lwc == NULL) { + free(ret); + free(matched_cookies); + return NULL; + } + path = lwc_string_data(path_lwc); + lwc_string_unref(path_lwc); + + now = time(NULL); + + if (*(p->segment) != '\0') { + /* Match exact path, unless directory, when prefix matching + * will handle this case for us. */ + for (q = p->parent->children; q; q = q->next) { + if (strcmp(q->segment, p->segment)) + continue; + + /* Consider all cookies associated with + * this exact path */ + for (c = q->cookies; c; c = c->next) { + if (c->expires != -1 && c->expires < now) + /* cookie has expired => ignore */ + continue; + + if (c->secure && lwc_string_isequal( + q->scheme, + corestring_lwc_https, + &match) && + match == false) + /* secure cookie for insecure host. 
+ * ignore */ + continue; + + if (c->http_only && !include_http_only) + /* Ignore HttpOnly */ + continue; + + matched_cookies[count++] = c; + + GROW_MATCHED_COOKIES; + + if (c->version < (unsigned int)version) + version = c->version; + + c->last_used = now; + + cookie_manager_add((struct cookie_data *)c); + } + } + } + + /* Now consider cookies whose paths prefix-match ours */ + for (p = p->parent; p; p = p->parent) { + /* Find directory's path entry(ies) */ + /* There are potentially multiple due to differing schemes */ + for (q = p->children; q; q = q->next) { + if (*(q->segment) != '\0') + continue; + + for (c = q->cookies; c; c = c->next) { + if (c->expires != -1 && c->expires < now) + /* cookie has expired => ignore */ + continue; + + if (c->secure && lwc_string_isequal( + q->scheme, + corestring_lwc_https, + &match) && + match == false) + /* Secure cookie for insecure server + * => ignore */ + continue; + + matched_cookies[count++] = c; + + GROW_MATCHED_COOKIES; + + if (c->version < (unsigned int) version) + version = c->version; + + c->last_used = now; + + cookie_manager_add((struct cookie_data *)c); + } + } + + if (!p->parent) { + /* No parent, so bail here. This can't go in + * the loop exit condition as we also want to + * process the top-level node. + * + * If p->parent is NULL then p->cookies are + * the domain cookies and thus we don't even + * try matching against them. 
+ */ + break; + } + + /* Consider p itself - may be the result of Path=/foo */ + for (c = p->cookies; c; c = c->next) { + if (c->expires != -1 && c->expires < now) + /* cookie has expired => ignore */ + continue; + + /* Ensure cookie path is a prefix of the resource */ + if (strncmp(c->path, path, strlen(c->path)) != 0) + /* paths don't match => ignore */ + continue; + + if (c->secure && lwc_string_isequal(p->scheme, + corestring_lwc_https, + &match) && + match == false) + /* Secure cookie for insecure server + * => ignore */ + continue; + + matched_cookies[count++] = c; + + GROW_MATCHED_COOKIES; + + if (c->version < (unsigned int) version) + version = c->version; + + c->last_used = now; + + cookie_manager_add((struct cookie_data *)c); + } + + } + + /* Finally consider domain cookies for hosts which domain match ours */ + for (h = (const struct host_part *)p; h && h != &db_root; + h = h->parent) { + for (c = h->paths.cookies; c; c = c->next) { + if (c->expires != -1 && c->expires < now) + /* cookie has expired => ignore */ + continue; + + /* Ensure cookie path is a prefix of the resource */ + if (strncmp(c->path, path, strlen(c->path)) != 0) + /* paths don't match => ignore */ + continue; + + if (c->secure && lwc_string_isequal(scheme, + corestring_lwc_https, + &match) && + match == false) + /* secure cookie for insecure host. 
ignore */ + continue; + + matched_cookies[count++] = c; + + GROW_MATCHED_COOKIES; + + if (c->version < (unsigned int)version) + version = c->version; + + c->last_used = now; + + cookie_manager_add((struct cookie_data *)c); + } + } + + if (count == 0) { + /* No cookies found */ + free(ret); + free(matched_cookies); + return NULL; + } + + /* and build output string */ + if (version > COOKIE_NETSCAPE) { + sprintf(ret, "$Version=%d", version); + ret_used = strlen(ret) + 1; + } + + for (i = 0; i < count; i++) { + if (!urldb_concat_cookie(matched_cookies[i], version, + &ret_used, &ret_alloc, &ret)) { + free(ret); + free(matched_cookies); + return NULL; + } + } + + if (version == COOKIE_NETSCAPE) { + /* Old-style cookies => no version & skip "; " */ + memmove(ret, ret + 2, ret_used - 2); + ret_used -= 2; + } + + /* Now, shrink the output buffer to the required size */ + { + char *temp = realloc(ret, ret_used); + if (!temp) { + free(ret); + free(matched_cookies); + return NULL; + } + + ret = temp; + } + + free(matched_cookies); + + return ret; + +#undef GROW_MATCHED_COOKIES +} + + +/* exported interface documented in content/urldb.h */ +void urldb_delete_cookie(const char *domain, const char *path, + const char *name) +{ + urldb_delete_cookie_hosts(domain, path, name, &db_root); +} + + +/* exported interface documented in content/urldb.h */ void urldb_load_cookies(const char *filename) { FILE *fp; @@ -3770,7 +3766,7 @@ void urldb_load_cookies(const char *filename) if (strncasecmp(s, "Version:", 8) == 0) { FIND_T; SKIP_T; loaded_cookie_file_version = atoi(p); - if (loaded_cookie_file_version < + if (loaded_cookie_file_version < MIN_COOKIE_FILE_VERSION) { LOG(("Unsupported Cookie file version")); break; @@ -3882,84 +3878,12 @@ void urldb_load_cookies(const char *filename) fclose(fp); } -/** - * Delete a cookie - * - * \param domain The cookie's domain - * \param path The cookie's path - * \param name The cookie's name - */ -void urldb_delete_cookie(const char *domain, const 
char *path, - const char *name) -{ - urldb_delete_cookie_hosts(domain, path, name, &db_root); -} - -void urldb_delete_cookie_hosts(const char *domain, const char *path, - const char *name, struct host_part *parent) -{ - struct host_part *h; - assert(parent); - - urldb_delete_cookie_paths(domain, path, name, &parent->paths); - - for (h = parent->children; h; h = h->next) - urldb_delete_cookie_hosts(domain, path, name, h); -} - -void urldb_delete_cookie_paths(const char *domain, const char *path, - const char *name, struct path_data *parent) -{ - struct cookie_internal_data *c; - struct path_data *p = parent; - - assert(parent); - - do { - for (c = p->cookies; c; c = c->next) { - if (strcmp(c->domain, domain) == 0 && - strcmp(c->path, path) == 0 && - strcmp(c->name, name) == 0) { - if (c->prev) - c->prev->next = c->next; - else - p->cookies = c->next; - - if (c->next) - c->next->prev = c->prev; - else - p->cookies_end = c->prev; - - urldb_free_cookie(c); - - return; - } - } - - if (p->children) { - p = p->children; - } else { - while (p != parent) { - if (p->next != NULL) { - p = p->next; - break; - } - - p = p->parent; - } - } - } while(p != parent); -} -/** - * Save persistent cookies to file - * - * \param filename Path to save to - */ +/* exported interface documented in content/urldb.h */ void urldb_save_cookies(const char *filename) { FILE *fp; - int cookie_file_version = max(loaded_cookie_file_version, + int cookie_file_version = max(loaded_cookie_file_version, COOKIE_FILE_VERSION); assert(filename); @@ -3988,253 +3912,173 @@ void urldb_save_cookies(const char *filename) fclose(fp); } -/** - * Save a host subtree's cookies - * - * \param fp File pointer to write to - * \param parent Parent host - */ -void urldb_save_cookie_hosts(FILE *fp, struct host_part *parent) + +/* exported interface documented in content/urldb.h */ +void urldb_dump(void) { - struct host_part *h; - assert(fp && parent); + int i; - urldb_save_cookie_paths(fp, &parent->paths); + 
urldb_dump_hosts(&db_root); - for (h = parent->children; h; h = h->next) - urldb_save_cookie_hosts(fp, h); + for (i = 0; i != NUM_SEARCH_TREES; i++) + urldb_dump_search(search_trees[i], 0); } -/** - * Save a path subtree's cookies - * - * \param fp File pointer to write to - * \param parent Parent path - */ -void urldb_save_cookie_paths(FILE *fp, struct path_data *parent) + +/* exported interface documented in content/urldb.h */ +struct host_part *urldb_add_host(const char *host) { - struct path_data *p = parent; - time_t now = time(NULL); + struct host_part *d = (struct host_part *) &db_root, *e; + struct search_node *s; + char buf[256]; /* 256 bytes is sufficient - domain names are + * limited to 255 chars. */ + char *part; - assert(fp && parent); + assert(host); - do { - if (p->cookies != NULL) { - struct cookie_internal_data *c; + if (urldb__host_is_ip_address(host)) { + /* Host is an IP, so simply add as TLD */ - for (c = p->cookies; c != NULL; c = c->next) { - if (c->expires == -1 || c->expires < now) - /* Skip expired & session cookies */ - continue; + /* Check for existing entry */ + for (e = d->children; e; e = e->next) + if (strcasecmp(host, e->part) == 0) + /* found => return it */ + return e; - fprintf(fp, - "%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\t%d\t%d\t" - "%s\t%s\t%d\t%s\t%s\t%s\n", - c->version, c->domain, - c->domain_from_set, c->path, - c->path_from_set, c->secure, - c->http_only, - (int)c->expires, (int)c->last_used, - c->no_destroy, c->name, c->value, - c->value_was_quoted, - p->scheme ? lwc_string_data(p->scheme) : - "unused", - p->url ? nsurl_access(p->url) : - "unused", - c->comment ? 
c->comment : ""); - } - } + d = urldb_add_host_node(host, d); - if (p->children != NULL) { - p = p->children; + s = urldb_search_insert(search_trees[ST_IP], d); + if (!s) { + /* failed */ + d = NULL; } else { - while (p != parent) { - if (p->next != NULL) { - p = p->next; - break; - } - - p = p->parent; - } + search_trees[ST_IP] = s; } - } while (p != parent); -} - - -/** - * Destroy urldb - */ -void urldb_destroy(void) -{ - struct host_part *a, *b; - int i; - - /* Clean up search trees */ - for (i = 0; i < NUM_SEARCH_TREES; i++) { - if (search_trees[i] != &empty) - urldb_destroy_search_tree(search_trees[i]); - } - - /* And database */ - for (a = db_root.children; a; a = b) { - b = a->next; - urldb_destroy_host_tree(a); - } - - /* And the bloom filter */ - if (url_bloom != NULL) - bloom_destroy(url_bloom); -} - -/** - * Destroy a host tree - * - * \param root Root node of tree to destroy - */ -void urldb_destroy_host_tree(struct host_part *root) -{ - struct host_part *a, *b; - struct path_data *p, *q; - struct prot_space_data *s, *t; - - /* Destroy children */ - for (a = root->children; a; a = b) { - b = a->next; - urldb_destroy_host_tree(a); - } - /* Now clean up paths */ - for (p = root->paths.children; p; p = q) { - q = p->next; - urldb_destroy_path_tree(p); + return d; } - /* Root path */ - urldb_destroy_path_node_content(&root->paths); - - /* Proctection space data */ - for (s = root->prot_space; s; s = t) { - t = s->next; - urldb_destroy_prot_space(s); - } + /* Copy host string, so we can corrupt it */ + strncpy(buf, host, sizeof buf); + buf[sizeof buf - 1] = '\0'; - /* And ourselves */ - free(root->part); - free(root); -} + /* Process FQDN segments backwards */ + do { + part = strrchr(buf, '.'); + if (!part) { + /* last segment */ + /* Check for existing entry */ + for (e = d->children; e; e = e->next) + if (strcasecmp(buf, e->part) == 0) + break; -/** - * Destroy a path tree - * - * \param root Root node of tree to destroy - */ -void 
urldb_destroy_path_tree(struct path_data *root) -{ - struct path_data *p = root; + if (e) { + d = e; + } else { + d = urldb_add_host_node(buf, d); + } - do { - if (p->children != NULL) { - p = p->children; - } else { - struct path_data *q = p; + /* And insert into search tree */ + if (d) { + struct search_node **r; - while (p != root) { - if (p->next != NULL) { - p = p->next; - break; + r = urldb_get_search_tree_direct(buf); + s = urldb_search_insert(*r, d); + if (!s) { + /* failed */ + d = NULL; + } else { + *r = s; } + } + break; + } - p = p->parent; + /* Check for existing entry */ + for (e = d->children; e; e = e->next) + if (strcasecmp(part + 1, e->part) == 0) + break; - urldb_destroy_path_node_content(q); - free(q); + d = e ? e : urldb_add_host_node(part + 1, d); + if (!d) + break; - q = p; - } + *part = '\0'; + } while (1); - urldb_destroy_path_node_content(q); - free(q); - } - } while (p != root); + return d; } -/** - * Destroy the contents of a path node - * - * \param node Node to destroy contents of (does not destroy node) - */ -void urldb_destroy_path_node_content(struct path_data *node) -{ - struct cookie_internal_data *a, *b; - unsigned int i; - if (node->url != NULL) - nsurl_unref(node->url); +/* exported interface documented in content/urldb.h */ +struct path_data * +urldb_add_path(lwc_string *scheme, + unsigned int port, + const struct host_part *host, + char *path_query, + lwc_string *fragment, + nsurl *url) +{ + struct path_data *d, *e; + char *buf = path_query; + char *segment, *slash; + bool match; - if (node->scheme != NULL) - lwc_string_unref(node->scheme); + assert(scheme && host && url); - free(node->segment); - for (i = 0; i < node->frag_cnt; i++) - free(node->fragment[i]); - free(node->fragment); + d = (struct path_data *) &host->paths; - if (node->thumb) - bitmap_destroy(node->thumb); + /* skip leading '/' */ + segment = buf; + if (*segment == '/') + segment++; - free(node->urld.title); + /* Process path segments */ + do { + slash = 
strchr(segment, '/'); + if (!slash) { + /* last segment */ + /* look for existing entry */ + for (e = d->children; e; e = e->next) + if (strcmp(segment, e->segment) == 0 && + lwc_string_isequal(scheme, + e->scheme, &match) == + lwc_error_ok && + match == true && + e->port == port) + break; - for (a = node->cookies; a; a = b) { - b = a->next; - urldb_destroy_cookie(a); - } -} + d = e ? urldb_add_path_fragment(e, fragment) : + urldb_add_path_node(scheme, port, + segment, fragment, d); + break; + } -/** - * Destroy a cookie node - * - * \param c Cookie to destroy - */ -void urldb_destroy_cookie(struct cookie_internal_data *c) -{ - free(c->name); - free(c->value); - free(c->comment); - free(c->domain); - free(c->path); + *slash = '\0'; - free(c); -} + /* look for existing entry */ + for (e = d->children; e; e = e->next) + if (strcmp(segment, e->segment) == 0 && + lwc_string_isequal(scheme, e->scheme, + &match) == lwc_error_ok && + match == true && + e->port == port) + break; -/** - * Destroy protection space data - * - * \param space Protection space to destroy - */ -void urldb_destroy_prot_space(struct prot_space_data *space) -{ - lwc_string_unref(space->scheme); - free(space->realm); - free(space->auth); + d = e ? 
e : urldb_add_path_node(scheme, port, segment, NULL, d); + if (!d) + break; - free(space); -} + segment = slash + 1; + } while (1); + free(path_query); -/** - * Destroy a search tree - * - * \param root Root node of tree to destroy - */ -void urldb_destroy_search_tree(struct search_node *root) -{ - /* Destroy children */ - if (root->left != &empty) - urldb_destroy_search_tree(root->left); - if (root->right != &empty) - urldb_destroy_search_tree(root->right); + if (d && !d->url) { + /* Insert defragmented URL */ + if (nsurl_defragment(url, &d->url) != NSERROR_OK) + return NULL; + } - /* And destroy ourselves */ - free(root); + return d; } - diff --git a/content/urldb.h b/content/urldb.h index c0fece24e..d7ca8b0f8 100644 --- a/content/urldb.h +++ b/content/urldb.h @@ -64,62 +64,262 @@ struct cookie_data { struct bitmap; -/* Destruction */ +/** + * Destroy urldb + */ void urldb_destroy(void); + /* Persistence support */ + +/** + * Import an URL database from file, replacing any existing database + * + * \param filename Name of file containing data + */ nserror urldb_load(const char *filename); -void urldb_save(const char *filename); + +/** + * Export the current database to file + * + * \param filename Name of file to export to + */ +nserror urldb_save(const char *filename); + +/** + * Set the cross-session persistence of the entry for an URL + * + * \param url Absolute URL to persist + * \param persist True to persist, false otherwise + */ void urldb_set_url_persistence(nsurl *url, bool persist); + /* URL insertion */ + +/** + * Insert an URL into the database + * + * \param url Absolute URL to insert + * \return true on success, false otherwise + */ bool urldb_add_url(nsurl *url); /* URL data modification / lookup */ + +/** + * Set an URL's title string, replacing any existing one + * + * \param url The URL to look for + * \param title The title string to use (copied) + */ void urldb_set_url_title(nsurl *url, const char *title); + +/** + * Set an URL's content type 
+ * + * \param url The URL to look for + * \param type The type to set + */ void urldb_set_url_content_type(nsurl *url, content_type type); + +/** + * Update an URL's visit data + * + * \param url The URL to update + */ void urldb_update_url_visit_data(nsurl *url); + +/** + * Reset an URL's visit statistics + * + * \param url The URL to reset + */ void urldb_reset_url_visit_data(nsurl *url); + +/** + * Find data for an URL. + * + * \param url Absolute URL to look for + * \return Pointer to result struct, or NULL + */ const struct url_data *urldb_get_url_data(nsurl *url); + +/** + * Extract an URL from the db + * + * \param url URL to extract + * \return Pointer to database's copy of URL or NULL if not found + */ nsurl *urldb_get_url(nsurl *url); + /* Authentication modification / lookup */ -void urldb_set_auth_details(nsurl *url, const char *realm, - const char *auth); + +/** + * Set authentication data for an URL + * + * \param url The URL to consider + * \param realm The authentication realm + * \param auth The authentication details (in form username:password) + */ +void urldb_set_auth_details(nsurl *url, const char *realm, const char *auth); + +/** + * Look up authentication details in database + * + * \param url Absolute URL to search for + * \param realm When non-NULL, it is realm which can be used to determine + * the protection space when that's not been done before for given URL. 
+ * \return Pointer to authentication details, or NULL if not found + */ const char *urldb_get_auth_details(nsurl *url, const char *realm); + /* SSL certificate permissions */ + +/** + * Set certificate verification permissions + * + * \param url URL to consider + * \param permit Set to true to allow invalid certificates + */ void urldb_set_cert_permissions(nsurl *url, bool permit); + +/** + * Retrieve certificate verification permissions from database + * + * \param url Absolute URL to search for + * \return true to permit connections to hosts with invalid certificates, + * false otherwise. + */ bool urldb_get_cert_permissions(nsurl *url); + /* Thumbnail handling */ + +/** + * Set thumbnail for url, replacing any existing thumbnail + * + * \param url Absolute URL to consider + * \param bitmap Opaque pointer to thumbnail data, or NULL to invalidate + */ void urldb_set_thumbnail(nsurl *url, struct bitmap *bitmap); + +/** + * Retrieve thumbnail data for given URL + * + * \param url Absolute URL to search for + * \return Pointer to thumbnail data, or NULL if not found. 
+ */ struct bitmap *urldb_get_thumbnail(nsurl *url); + /* URL completion */ + +/** + * Iterate over entries in the database which match the given prefix + * + * \param prefix Prefix to match + * \param callback Callback function + */ void urldb_iterate_partial(const char *prefix, - bool (*callback)(nsurl *url, - const struct url_data *data)); + bool (*callback)(nsurl *url, const struct url_data *data)); + /* Iteration */ + +/** + * Iterate over all entries in database + * + * \param callback Function to callback for each entry + */ void urldb_iterate_entries(bool (*callback)(nsurl *url, const struct url_data *data)); + +/** + * Iterate over all cookies in database + * + * \param callback Function to callback for each entry + */ void urldb_iterate_cookies(bool (*callback)(const struct cookie_data *cookie)); -/* Debug */ -void urldb_dump(void); /* Cookies */ + +/** + * Parse Set-Cookie header and insert cookie(s) into database + * + * \param header Header to parse, with Set-Cookie: stripped + * \param url URL being fetched + * \param referer Referring resource, or 0 for verifiable transaction + * \return true on success, false otherwise + */ bool urldb_set_cookie(const char *header, nsurl *url, nsurl *referer); + +/** + * Retrieve cookies for an URL + * + * \param url URL being fetched + * \param include_http_only Whether to include HTTP(S) only cookies. 
+ * \return Cookies string for libcurl (on heap), or NULL on error/no cookies + */ char *urldb_get_cookie(nsurl *url, bool include_http_only); + +/** + * Delete a cookie + * + * \param domain The cookie's domain + * \param path The cookie's path + * \param name The cookie's name + */ void urldb_delete_cookie(const char *domain, const char *path, const char *name); + +/** + * Load a cookie file into the database + * + * \param filename File to load + */ void urldb_load_cookies(const char *filename); + +/** + * Save persistent cookies to file + * + * \param filename Path to save to + */ void urldb_save_cookies(const char *filename); +/* Debug */ + +/** + * Dump URL database to stderr + */ +void urldb_dump(void); + + /* test harness only */ + +/** + * Add a host to the database, creating any intermediate entries + * + * \param host Hostname to add + * \return Pointer to leaf node, or NULL on memory exhaustion + */ struct host_part *urldb_add_host(const char *host); + +/** + * Add a path to the database, creating any intermediate entries + * + * \param scheme URL scheme associated with path + * \param port Port number on host associated with path + * \param host Host tree node to attach to + * \param path_query Absolute path plus query to add (freed) + * \param fragment URL fragment, or NULL + * \param url URL (fragment ignored) + * \return Pointer to leaf node, or NULL on memory exhaustion + */ struct path_data *urldb_add_path(lwc_string *scheme, unsigned int port, const struct host_part *host, char *path_query, lwc_string *fragment, nsurl *url); |