/* * Copyright 2006 Richard Wilson * Copyright 2005 James Bursa * Copyright 2005 John M Bell * * This file is part of NetSurf, http://www.netsurf-browser.org/ * * NetSurf is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; version 2 of the License. * * NetSurf is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. If not, see . */ /** \file * URL parsing and joining (implementation). */ #include #include #include #include #include #include #include #include "curl/curl.h" #include "utils/config.h" #include "utils/log.h" #include "utils/url.h" #include "utils/utils.h" struct url_components_internal { char *buffer; /* buffer used for all the following data */ char *scheme; char *authority; char *path; char *query; char *fragment; }; regex_t url_re, url_up_re; /** * Initialise URL routines. * * Compiles regular expressions required by the url_ functions. */ void url_init(void) { /* regex from RFC 2396 */ regcomp_wrapper(&url_re, "^[[:space:]]*" #define URL_RE_SCHEME 2 "(([a-zA-Z][-a-zA-Z0-9+.]*):)?" #define URL_RE_AUTHORITY 4 "(//([^/?#[:space:]]*))?" #define URL_RE_PATH 5 "([^?#[:space:]]*)" #define URL_RE_QUERY 7 "(\\?([^#[:space:]]*))?" #define URL_RE_FRAGMENT 9 "(#([^[:space:]]*))?" "[[:space:]]*$", REG_EXTENDED); regcomp_wrapper(&url_up_re, "/([^/]?|[.][^./]|[^./][.]|[^./][^./]|[^/][^/][^/]+)" "/[.][.](/|$)", REG_EXTENDED); } /** * Check whether a host string is an IP address. It should support and * detect IPv4 addresses (all of dotted-quad or subsets, decimal or * hexadecimal notations) and IPv6 addresses (including those containing * embedded IPv4 addresses.) * * \param host a hostname terminated by '\0' * \return true if the hostname is an IP address, false otherwise */ bool url_host_is_ip_address(const char *host) { struct in_addr ipv4; size_t host_len = strlen(host); const char *sane_host; const char *slash; #ifndef NO_IPV6 struct in6_addr ipv6; char ipv6_addr[64]; #endif /* FIXME TODO: Some parts of urldb.c (and perhaps other parts of * NetSurf) make confusions between hosts and "prefixes", we can * sometimes be erroneously passed more than just a host. Sometimes * we may be passed trailing slashes, or even whole path segments. * A specific criminal in this class is urldb_iterate_partial, which * takes a prefix to search for, but passes that prefix to functions * that expect only hosts. * * For the time being, we will accept such calls; we check if there * is a / in the host parameter, and if there is, we take a copy and * replace the / with a \0. This is not a permanent solution; we * should search through NetSurf and find all the callers that are * in error and fix them. When doing this task, it might be wise * to replace the hideousness below with code that doesn't have to do * this, and add assert(strchr(host, '/') == NULL); somewhere. * -- rjek - 2010-11-04 */ slash = strchr(host, '/'); if (slash == NULL) { sane_host = host; } else { char *c = strdup(host); c[slash - host] = '\0'; sane_host = c; host_len = slash - host - 1; LOG(("WARNING: called with non-host '%s'", host)); } if (strspn(sane_host, "0123456789abcdefABCDEF[].:") < host_len) goto out_false; if (inet_aton(sane_host, &ipv4) != 0) { /* This can only be a sane IPv4 address if it contains 3 dots. * Helpfully, inet_aton is happy to treat "a", "a.b", "a.b.c", * and "a.b.c.d" as valid IPv4 address strings where we only * support the full, dotted-quad, form. */ int num_dots = 0; size_t index; for (index = 0; index < host_len; index++) { if (sane_host[index] == '.') num_dots++; } if (num_dots == 3) goto out_true; else goto out_false; } #ifndef NO_IPV6 if (sane_host[0] != '[' || sane_host[host_len] != ']') goto out_false; strncpy(ipv6_addr, sane_host + 1, sizeof(ipv6_addr)); ipv6_addr[sizeof(ipv6_addr) - 1] = '\0'; if (inet_pton(AF_INET6, ipv6_addr, &ipv6) == 1) goto out_true; #endif out_false: if (slash != NULL) free((void *)sane_host); return false; out_true: if (slash != NULL) free((void *)sane_host); return true; } /** * Split a URL into separate components * * URLs passed to this function are assumed to be valid and no error checking * or recovery is attempted. * * See RFC 3986 for reference. * * \param url a valid absolute or relative URL * \param result pointer to buffer to hold components * \return URL_FUNC_OK on success */ static url_func_result url_get_components(const char *url, struct url_components *result) { int storage_length; char *storage_end; const char *scheme; const char *authority; const char *path; const char *query; const char *fragment; struct url_components_internal *internal; assert(url); /* clear our return value */ internal = (struct url_components_internal *)result; memset(result, 0x00, sizeof(struct url_components)); /* get enough storage space for a URL with termination at each node */ storage_length = strlen(url) + 8; internal->buffer = malloc(storage_length); if (!internal->buffer) return URL_FUNC_NOMEM; storage_end = internal->buffer; /* look for a valid scheme */ scheme = url; if (isalpha(*scheme)) { for (scheme = url + 1; ((*scheme != ':') && (*scheme != '\0')); scheme++) { if (!isalnum(*scheme) && (*scheme != '+') && (*scheme != '-') && (*scheme != '.')) break; } if (*scheme == ':') { memcpy(storage_end, url, scheme - url); storage_end[scheme - url] = '\0'; result->scheme = storage_end; storage_end += scheme - url + 1; scheme++; } else { scheme = url; } } /* look for an authority */ authority = scheme; if ((authority[0] == '/') && (authority[1] == '/')) { authority = strpbrk(scheme + 2, "/?#"); if (!authority) authority = scheme + strlen(scheme); memcpy(storage_end, scheme + 2, authority - scheme - 2); storage_end[authority - scheme - 2] = '\0'; result->authority = storage_end; storage_end += authority - scheme - 1; } /* look for a path */ path = authority; if ((*path != '?') && (*path != '#') && (*path != '\0')) { path = strpbrk(path, "?#"); if (!path) path = authority + strlen(authority); memcpy(storage_end, authority, path - authority); storage_end[path - authority] = '\0'; result->path = storage_end; storage_end += path - authority + 1; } /* look for a query */ query = path; if (*query == '?') { query = strchr(query, '#'); if (!query) query = path + strlen(path); memcpy(storage_end, path + 1, query - path - 1); storage_end[query - path - 1] = '\0'; result->query = storage_end; storage_end += query - path; } /* look for a fragment */ fragment = query; if (*fragment == '#') { fragment = query + strlen(query); /* make a copy of the result for the caller */ memcpy(storage_end, query + 1, fragment - query - 1); storage_end[fragment - query - 1] = '\0'; result->fragment = storage_end; storage_end += fragment - query; } assert((result->buffer + storage_length) >= storage_end); return URL_FUNC_OK; } /** * Reform a URL from separate components * * See RFC 3986 for reference. * * \param components the components to reform into a URL * \return a new URL allocated on the heap, or NULL on failure */ static char *url_reform_components(const struct url_components *components) { int scheme_len = 0, authority_len = 0, path_len = 0, query_len = 0, fragment_len = 0; char *result, *url; /* 5.3 */ if (components->scheme) scheme_len = strlen(components->scheme) + 1; if (components->authority) authority_len = strlen(components->authority) + 2; if (components->path) path_len = strlen(components->path); if (components->query) query_len = strlen(components->query) + 1; if (components->fragment) fragment_len = strlen(components->fragment) + 1; /* claim memory */ url = result = malloc(scheme_len + authority_len + path_len + query_len + fragment_len + 1); if (!url) { LOG(("malloc failed")); return NULL; } /* rebuild URL */ if (components->scheme) { sprintf(url, "%s:", components->scheme); url += scheme_len; } if (components->authority) { sprintf(url, "//%s", components->authority); url += authority_len; } if (components->path) { sprintf(url, "%s", components->path); url += path_len; } if (components->query) { sprintf(url, "?%s", components->query); url += query_len; } if (components->fragment) sprintf(url, "#%s", components->fragment); return result; } /** * Release some url components from memory * * \param result pointer to buffer containing components */ static void url_destroy_components(const struct url_components *components) { const struct url_components_internal *internal; assert(components); internal = (const struct url_components_internal *)components; if (internal->buffer) free(internal->buffer); } /** * Resolve a relative URL to absolute form. * * \param rel relative URL * \param base base URL, must be absolute and cleaned as by nsurl_create() * \param result pointer to pointer to buffer to hold absolute url * \return URL_FUNC_OK on success */ url_func_result url_join(const char *rel, const char *base, char **result) { url_func_result status = URL_FUNC_NOMEM; struct url_components_internal base_components = {0,0,0,0,0,0}; struct url_components_internal *base_ptr = &base_components; struct url_components_internal rel_components = {0,0,0,0,0,0}; struct url_components_internal *rel_ptr = &rel_components; struct url_components_internal merged_components = {0,0,0,0,0,0}; struct url_components_internal *merged_ptr = &merged_components; char *merge_path = NULL, *split_point; char *input, *output, *start = NULL; int len, buf_len; (*result) = 0; assert(base); assert(rel); /* break down the relative URL (not cached, corruptable) */ status = url_get_components(rel, (struct url_components *) rel_ptr); if (status != URL_FUNC_OK) { LOG(("relative url '%s' failed to get components", rel)); return URL_FUNC_FAILED; } /* [1] relative URL is absolute, use it entirely */ merged_components = rel_components; if (rel_components.scheme) goto url_join_reform_url; /* break down the base URL (possibly cached, not corruptable) */ status = url_get_components(base, (struct url_components *) base_ptr); if (status != URL_FUNC_OK) { url_destroy_components((struct url_components *) rel_ptr); LOG(("base url '%s' failed to get components", base)); return URL_FUNC_FAILED; } /* [2] relative authority takes presidence */ merged_components.scheme = base_components.scheme; if (rel_components.authority) goto url_join_reform_url; /* [3] handle empty paths */ merged_components.authority = base_components.authority; if (!rel_components.path) { merged_components.path = base_components.path; if (!rel_components.query) merged_components.query = base_components.query; goto url_join_reform_url; } /* [4] handle valid paths */ if (rel_components.path[0] == '/') merged_components.path = rel_components.path; else { /* 5.2.3 */ if ((base_components.authority) && (!base_components.path)) { merge_path = malloc(strlen(rel_components.path) + 2); if (!merge_path) { LOG(("malloc failed")); goto url_join_no_mem; } sprintf(merge_path, "/%s", rel_components.path); merged_components.path = merge_path; } else { split_point = base_components.path ? strrchr(base_components.path, '/') : NULL; if (!split_point) { merged_components.path = rel_components.path; } else { len = ++split_point - base_components.path; buf_len = len + 1 + strlen(rel_components.path); merge_path = malloc(buf_len); if (!merge_path) { LOG(("malloc failed")); goto url_join_no_mem; } memcpy(merge_path, base_components.path, len); memcpy(merge_path + len, rel_components.path, strlen(rel_components.path)); merge_path[buf_len - 1] = '\0'; merged_components.path = merge_path; } } } url_join_reform_url: /* 5.2.4 */ input = merged_components.path; if ((input) && (strchr(input, '.'))) { /* [1] remove all dot references */ output = start = malloc(strlen(input) + 1); if (!output) { LOG(("malloc failed")); goto url_join_no_mem; } merged_components.path = output; *output = '\0'; while (*input != '\0') { /* [2A] */ if (input[0] == '.') { if (input[1] == '/') { input = input + 2; continue; } else if ((input[1] == '.') && (input[2] == '/')) { input = input + 3; continue; } } /* [2B] */ if ((input[0] == '/') && (input[1] == '.')) { if (input[2] == '/') { input = input + 2; continue; } else if (input[2] == '\0') { input = input + 1; *input = '/'; continue; } /* [2C] */ if ((input[2] == '.') && ((input[3] == '/') || (input[3] == '\0'))) { if (input[3] == '/') { input = input + 3; } else { input = input + 2; *input = '/'; } if ((output > start) && (output[-1] == '/')) *--output = '\0'; split_point = strrchr(start, '/'); if (!split_point) output = start; else output = split_point; *output = '\0'; continue; } } /* [2D] */ if (input[0] == '.') { if (input[1] == '\0') { input = input + 1; continue; } else if ((input[1] == '.') && (input[2] == '\0')) { input = input + 2; continue; } } /* [2E] */ if (*input == '/') *output++ = *input++; while ((*input != '/') && (*input != '\0')) *output++ = *input++; *output = '\0'; } /* [3] */ merged_components.path = start; } /* 5.3 */ *result = url_reform_components((struct url_components *) merged_ptr); if (!(*result)) goto url_join_no_mem; /* return success */ status = URL_FUNC_OK; url_join_no_mem: free(start); free(merge_path); url_destroy_components((struct url_components *) base_ptr); url_destroy_components((struct url_components *) rel_ptr); return status; } /** * Return the host name from an URL. * * \param url an absolute URL * \param result pointer to pointer to buffer to hold host name * \return URL_FUNC_OK on success */ url_func_result url_host(const char *url, char **result) { url_func_result status; struct url_components components; const char *host_start, *host_end; assert(url); status = url_get_components(url, &components); if (status == URL_FUNC_OK) { if (!components.authority) { url_destroy_components(&components); return URL_FUNC_FAILED; } host_start = strchr(components.authority, '@'); host_start = host_start ? host_start + 1 : components.authority; /* skip over an IPv6 address if there is one */ if (host_start[0] == '[') { host_end = strchr(host_start, ']') + 1; } else { host_end = strchr(host_start, ':'); } if (!host_end) host_end = components.authority + strlen(components.authority); *result = malloc(host_end - host_start + 1); if (!(*result)) { url_destroy_components(&components); return URL_FUNC_FAILED; } memcpy((*result), host_start, host_end - host_start); (*result)[host_end - host_start] = '\0'; } url_destroy_components(&components); return status; } /** * Return the scheme name from an URL. * * See RFC 3986, 3.1 for reference. * * \param url an absolute URL * \param result pointer to pointer to buffer to hold scheme name * \return URL_FUNC_OK on success */ url_func_result url_scheme(const char *url, char **result) { url_func_result status; struct url_components components; assert(url); status = url_get_components(url, &components); if (status == URL_FUNC_OK) { if (!components.scheme) { status = URL_FUNC_FAILED; } else { *result = strdup(components.scheme); if (!(*result)) status = URL_FUNC_NOMEM; } } url_destroy_components(&components); return status; } /** * Extract path segment from an URL * * \param url an absolute URL * \param result pointer to pointer to buffer to hold result * \return URL_FUNC_OK on success */ url_func_result url_path(const char *url, char **result) { url_func_result status; struct url_components components; assert(url); status = url_get_components(url, &components); if (status == URL_FUNC_OK) { if (!components.path) { status = URL_FUNC_FAILED; } else { *result = strdup(components.path); if (!(*result)) status = URL_FUNC_NOMEM; } } url_destroy_components(&components); return status; } /** * Attempt to find a nice filename for a URL. * * \param url an absolute URL * \param result pointer to pointer to buffer to hold filename * \param remove_extensions remove any extensions from the filename * \return URL_FUNC_OK on success */ url_func_result url_nice(const char *url, char **result, bool remove_extensions) { int m; regmatch_t match[10]; regoff_t start, end; size_t i; char *dot; *result = 0; m = regexec(&url_re, url, 10, match, 0); if (m) { LOG(("url '%s' failed to match regex", url)); return URL_FUNC_FAILED; } /* extract the last component of the path, if possible */ if (match[URL_RE_PATH].rm_so == -1 || match[URL_RE_PATH].rm_so == match[URL_RE_PATH].rm_eo) goto no_path; /* no path, or empty */ for (end = match[URL_RE_PATH].rm_eo - 1; end != match[URL_RE_PATH].rm_so && url[end] == '/'; end--) ; if (end == match[URL_RE_PATH].rm_so) goto no_path; /* path is a string of '/' */ end++; for (start = end - 1; start != match[URL_RE_PATH].rm_so && url[start] != '/'; start--) ; if (url[start] == '/') start++; if (!strncasecmp(url + start, "index.", 6) || !strncasecmp(url + start, "default.", 8)) { /* try again */ if (start == match[URL_RE_PATH].rm_so) goto no_path; for (end = start - 1; end != match[URL_RE_PATH].rm_so && url[end] == '/'; end--) ; if (end == match[URL_RE_PATH].rm_so) goto no_path; end++; for (start = end - 1; start != match[URL_RE_PATH].rm_so && url[start] != '/'; start--) ; if (url[start] == '/') start++; } *result = malloc(end - start + 1); if (!*result) { LOG(("malloc failed")); return URL_FUNC_NOMEM; } strncpy(*result, url + start, end - start); (*result)[end - start] = 0; if (remove_extensions) { dot = strchr(*result, '.'); if (dot && dot != *result) *dot = 0; } return URL_FUNC_OK; no_path: /* otherwise, use the host name, with '.' replaced by '_' */ if (match[URL_RE_AUTHORITY].rm_so != -1 && match[URL_RE_AUTHORITY].rm_so != match[URL_RE_AUTHORITY].rm_eo) { *result = malloc(match[URL_RE_AUTHORITY].rm_eo - match[URL_RE_AUTHORITY].rm_so + 1); if (!*result) { LOG(("malloc failed")); return URL_FUNC_NOMEM; } strncpy(*result, url + match[URL_RE_AUTHORITY].rm_so, match[URL_RE_AUTHORITY].rm_eo - match[URL_RE_AUTHORITY].rm_so); (*result)[match[URL_RE_AUTHORITY].rm_eo - match[URL_RE_AUTHORITY].rm_so] = 0; for (i = 0; (*result)[i]; i++) if ((*result)[i] == '.') (*result)[i] = '_'; return URL_FUNC_OK; } return URL_FUNC_FAILED; } /** * Convert an escaped string to plain. * \param result unescaped string owned by caller must be freed with free() * \return URL_FUNC_OK on success */ url_func_result url_unescape(const char *str, char **result) { char *curlstr; char *retstr; curlstr = curl_unescape(str, 0); if (curlstr == NULL) { return URL_FUNC_NOMEM; } retstr = strdup(curlstr); curl_free(curlstr); if (retstr == NULL) { return URL_FUNC_NOMEM; } *result = retstr; return URL_FUNC_OK; } /** * Escape a string suitable for inclusion in an URL. * * \param unescaped the unescaped string * \param toskip number of bytes to skip in unescaped string * \param sptoplus true iff spaces should be converted to + * \param escexceptions NULL or a string of characters excluded to be escaped * \param result pointer to pointer to buffer to hold escaped string * \return URL_FUNC_OK on success */ url_func_result url_escape(const char *unescaped, size_t toskip, bool sptoplus, const char *escexceptions, char **result) { size_t len; char *escaped, *d, *tmpres; const char *c; if (!unescaped || !result) return URL_FUNC_FAILED; *result = NULL; len = strlen(unescaped); if (len < toskip) return URL_FUNC_FAILED; len -= toskip; escaped = malloc(len * 3 + 1); if (!escaped) return URL_FUNC_NOMEM; for (c = unescaped + toskip, d = escaped; *c; c++) { /* Check if we should escape this byte. * '~' is unreserved and should not be percent encoded, if * you believe the spec; however, leaving it unescaped * breaks a bunch of websites, so we escape it anyway. */ if (!isascii(*c) || (strchr(":/?#[]@" /* gen-delims */ "!$&'()*+,;=" /* sub-delims */ "<>%\"{}|\\^`~" /* others */, *c) && (!escexceptions || !strchr(escexceptions, *c))) || *c <= 0x20 || *c == 0x7f) { if (*c == 0x20 && sptoplus) { *d++ = '+'; } else { *d++ = '%'; *d++ = "0123456789ABCDEF"[((*c >> 4) & 0xf)]; *d++ = "0123456789ABCDEF"[(*c & 0xf)]; } } else { /* unreserved characters: [a-zA-Z0-9-._] */ *d++ = *c; } } *d++ = '\0'; tmpres = malloc(d - escaped + toskip); if (!tmpres) { free(escaped); return URL_FUNC_NOMEM; } memcpy(tmpres, unescaped, toskip); memcpy(tmpres + toskip, escaped, d - escaped); *result = tmpres; free(escaped); return URL_FUNC_OK; } #ifdef TEST int main(int argc, char *argv[]) { int i; url_func_result res; char *s; url_init(); for (i = 1; i != argc; i++) { /* printf("==> '%s'\n", argv[i]); res = url_normalize(argv[i], &s); if (res == URL_FUNC_OK) { printf("<== '%s'\n", s); free(s); }*/ /* printf("==> '%s'\n", argv[i]); res = url_host(argv[i], &s); if (res == URL_FUNC_OK) { printf("<== '%s'\n", s); free(s); }*/ if (1 != i) { res = url_join(argv[i], argv[1], &s); if (res == URL_FUNC_OK) { printf("'%s' + '%s' \t= '%s'\n", argv[1], argv[i], s); free(s); } } /* printf("'%s' => ", argv[i]); res = url_nice(argv[i], &s, true); if (res == URL_FUNC_OK) { printf("'%s', ", s); free(s); } else { printf("failed %u, ", res); } res = url_nice(argv[i], &s, false); if (res == URL_FUNC_OK) { printf("'%s', ", s); free(s); } else { printf("failed %u, ", res); } printf("\n");*/ } return 0; } void regcomp_wrapper(regex_t *preg, const char *regex, int cflags) { char errbuf[200]; int r; r = regcomp(preg, regex, cflags); if (r) { regerror(r, preg, errbuf, sizeof errbuf); fprintf(stderr, "Failed to compile regexp '%s'\n", regex); fprintf(stderr, "error: %s\n", errbuf); exit(1); } } #endif