diff options
author | James Bursa <james@netsurf-browser.org> | 2004-03-02 18:02:41 +0000 |
---|---|---|
committer | James Bursa <james@netsurf-browser.org> | 2004-03-02 18:02:41 +0000 |
commit | 1c85bf04293cfba663c5170bbe762825b7e72af1 (patch) | |
tree | 6db0b94b8e875ae20514b334c9f9acb3b380f362 /utils | |
parent | 2d816dda237a80f413a9d90031c7f5aff01e7a83 (diff) | |
download | netsurf-1c85bf04293cfba663c5170bbe762825b7e72af1.tar.gz netsurf-1c85bf04293cfba663c5170bbe762825b7e72af1.tar.bz2 |
[project @ 2004-03-02 18:02:17 by bursa]
Add new url functions and modify to use them.
svn path=/import/netsurf/; revision=578
Diffstat (limited to 'utils')
-rw-r--r-- | utils/url.c | 437 | ||||
-rw-r--r-- | utils/url.h | 20 | ||||
-rw-r--r-- | utils/utils.c | 105 | ||||
-rw-r--r-- | utils/utils.h | 4 |
4 files changed, 459 insertions, 107 deletions
diff --git a/utils/url.c b/utils/url.c
new file mode 100644
index 000000000..c22144495
--- /dev/null
+++ b/utils/url.c
@@ -0,0 +1,437 @@
+/*
+ * This file is part of NetSurf, http://netsurf.sourceforge.net/
+ * Licensed under the GNU General Public License,
+ *                http://www.opensource.org/licenses/gpl-license
+ * Copyright 2004 James Bursa <bursa@users.sourceforge.net>
+ */
+
+/** \file
+ * URL parsing and joining (implementation).
+ */
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <regex.h>
+#include "netsurf/utils/log.h"
+#include "netsurf/utils/url.h"
+#include "netsurf/utils/utils.h"
+
+
+regex_t url_re, url_up_re;
+
+/**
+ * Initialise URL routines.
+ *
+ * Compiles regular expressions required by the url_ functions.
+ */
+
+void url_init(void)
+{
+	/* regex from RFC 2396 */
+	regcomp_wrapper(&url_re, "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)"
+			"(\\?([^#]*))?(#(.*))?$", REG_EXTENDED);
+	/* matches "/<segment>/.." where <segment> is anything but ".." */
+	regcomp_wrapper(&url_up_re,
+			"/(|[^/]|[.][^./]|[^./][^/]|[^/][^/][^/]+)/[.][.](/|$)",
+			REG_EXTENDED);
+}
+
+
+/**
+ * Normalize a URL.
+ *
+ * \param  url  an absolute URL
+ * \return  cleaned up url, allocated on the heap, or 0 on failure
+ *
+ * If there is no scheme, http:// is added. The scheme and host are
+ * lower-cased. Default ports are removed (http only). An empty path is
+ * replaced with "/". Characters are unescaped if safe.
+ */
+
+char *url_normalize(const char *url)
+{
+	char c;
+	char *res = 0;
+	int m;
+	int i;
+	int len;
+	int host;
+	bool http = false;
+	regmatch_t match[10];
+
+	m = regexec(&url_re, url, 10, match, 0);
+	if (m) {
+		LOG(("url '%s' failed to match regex", url));
+		return 0;
+	}
+
+	len = strlen(url);
+
+	if (match[1].rm_so == -1) {
+		/* scheme missing: add http:// and reparse */
+		LOG(("scheme missing: using http"));
+		res = malloc(len + 13);
+		if (!res) {
+			LOG(("malloc failed"));
+			return 0;
+		}
+		strcpy(res, "http://");
+		strcpy(res + 7, url);
+		m = regexec(&url_re, res, 10, match, 0);
+		if (m) {
+			LOG(("url '%s' failed to match regex", res));
+			free(res);
+			return 0;
+		}
+		len += 7;
+	} else {
+		res = malloc(len + 6);
+		if (!res) {
+			LOG(("malloc failed"));
+			return 0;
+		}
+		strcpy(res, url);
+	}
+
+	/* see RFC 2616 section 3.2.3 */
+	/* make scheme lower-case */
+	if (match[2].rm_so != -1) {
+		for (i = match[2].rm_so; i != match[2].rm_eo; i++)
+			res[i] = tolower((unsigned char) res[i]);
+		if (match[2].rm_eo == 4 && res[0] == 'h' && res[1] == 't' &&
+				res[2] == 't' && res[3] == 'p')
+			http = true;
+	}
+
+	/* make empty path into "/" */
+	if (match[5].rm_so != -1 && match[5].rm_so == match[5].rm_eo) {
+		memmove(res + match[5].rm_so + 1, res + match[5].rm_so,
+				len - match[5].rm_so + 1);
+		res[match[5].rm_so] = '/';
+		len++;
+	}
+
+	/* make host lower-case and strip a redundant port */
+	if (match[4].rm_so != -1) {
+		/* userinfo ("user:password@") is case-sensitive: skip it */
+		host = match[4].rm_so;
+		for (i = match[4].rm_so; i != match[4].rm_eo; i++)
+			if (res[i] == '@')
+				host = i + 1;
+		for (i = host; i != match[4].rm_eo; i++) {
+			if (res[i] == ':') {
+				if (http && res[i + 1] == '8' &&
+						res[i + 2] == '0' &&
+						i + 3 == match[4].rm_eo) {
+					/* ":80" is the default for http */
+					memmove(res + i, res + i + 3,
+							len - match[4].rm_eo);
+					len -= 3;
+					res[len] = '\0';
+				} else if (i + 1 == match[4].rm_eo) {
+					/* ":" with no port number */
+					memmove(res + i, res + i + 1,
+							len - match[4].rm_eo);
+					len--;
+					res[len] = '\0';
+				}
+				break;
+			}
+			res[i] = tolower((unsigned char) res[i]);
+		}
+	}
+
+	/* unescape non-"reserved" escaped characters */
+	for (i = 0; i != len; i++) {
+		if (res[i] != '%')
+			continue;
+		c = tolower((unsigned char) res[i + 1]);
+		if ('0' <= c && c <= '9')
+			m = 16 * (c - '0');
+		else if ('a' <= c && c <= 'f')
+			m = 16 * (c - 'a' + 10);
+		else
+			continue;
+		c = tolower((unsigned char) res[i + 2]);
+		if ('0' <= c && c <= '9')
+			m += c - '0';
+		else if ('a' <= c && c <= 'f')
+			m += c - 'a' + 10;
+		else
+			continue;
+
+		/* keep controls, space, non-ASCII, reserved and unwise
+		 * characters escaped: unescaping them changes the URL */
+		if (m <= 0x20 || 0x7f <= m || strchr(";/?:@&=+$," "<>#%\""
+				"{}|\\^[]`", m)) {
+			i += 2;
+			continue;
+		}
+
+		res[i] = m;
+		memmove(res + i + 1, res + i + 3, len - i - 2);
+		len -= 2;
+	}
+
+	return res;
+}
+
+
+/**
+ * Resolve a relative URL to absolute form.
+ *
+ * \param  rel   relative URL
+ * \param  base  base URL, must be absolute and cleaned as by url_normalize()
+ * \return  an absolute URL, allocated on the heap, or 0 on failure
+ */
+
+char *url_join(const char *rel, const char *base)
+{
+	int m;
+	int i, j;
+	char *buf = 0;
+	char *res;
+	const char *scheme = 0, *authority = 0, *path = 0, *query = 0,
+			*fragment = 0;
+	int scheme_len = 0, authority_len = 0, path_len = 0, query_len = 0,
+			fragment_len = 0;
+	regmatch_t base_match[10];
+	regmatch_t rel_match[10];
+	regmatch_t up_match[3];
+
+	/* see RFC 2396 section 5.2 */
+	m = regexec(&url_re, base, 10, base_match, 0);
+	if (m) {
+		LOG(("base url '%s' failed to match regex", base));
+		return 0;
+	}
+	if (base_match[2].rm_so == -1) {
+		LOG(("base url '%s' is not absolute", base));
+		return 0;
+	}
+	scheme = base + base_match[2].rm_so;
+	scheme_len = base_match[2].rm_eo - base_match[2].rm_so;
+	if (base_match[4].rm_so != -1) {
+		authority = base + base_match[4].rm_so;
+		authority_len = base_match[4].rm_eo - base_match[4].rm_so;
+	}
+	path = base + base_match[5].rm_so;
+	path_len = base_match[5].rm_eo - base_match[5].rm_so;
+
+	/* 1) */
+	m = regexec(&url_re, rel, 10, rel_match, 0);
+	if (m) {
+		LOG(("relative url '%s' failed to match regex", rel));
+		return 0;
+	}
+	/* the fragment, if any, always comes from the reference */
+	if (rel_match[9].rm_so != -1) {
+		fragment = rel + rel_match[9].rm_so;
+		fragment_len = rel_match[9].rm_eo - rel_match[9].rm_so;
+	}
+
+	/* 2) reference to the current document: keep the base query */
+	if (rel_match[5].rm_so == rel_match[5].rm_eo &&
+			rel_match[2].rm_so == -1 &&
+			rel_match[4].rm_so == -1 &&
+			rel_match[6].rm_so == -1) {
+		if (base_match[7].rm_so != -1) {
+			query = base + base_match[7].rm_so;
+			query_len = base_match[7].rm_eo - base_match[7].rm_so;
+		}
+		goto step7;
+	}
+	if (rel_match[7].rm_so != -1) {
+		query = rel + rel_match[7].rm_so;
+		query_len = rel_match[7].rm_eo - rel_match[7].rm_so;
+	}
+
+	/* 3) */
+	if (rel_match[2].rm_so != -1) {
+		scheme = rel + rel_match[2].rm_so;
+		scheme_len = rel_match[2].rm_eo - rel_match[2].rm_so;
+		authority = 0;
+		authority_len = 0;
+		if (rel_match[4].rm_so != -1) {
+			authority = rel + rel_match[4].rm_so;
+			authority_len = rel_match[4].rm_eo - rel_match[4].rm_so;
+		}
+		path = rel + rel_match[5].rm_so;
+		path_len = rel_match[5].rm_eo - rel_match[5].rm_so;
+		goto step7;
+	}
+
+	/* 4) */
+	if (rel_match[4].rm_so != -1) {
+		authority = rel + rel_match[4].rm_so;
+		authority_len = rel_match[4].rm_eo - rel_match[4].rm_so;
+		path = rel + rel_match[5].rm_so;
+		path_len = rel_match[5].rm_eo - rel_match[5].rm_so;
+		goto step7;
+	}
+
+	/* 5) */
+	if (rel[rel_match[5].rm_so] == '/') {
+		path = rel + rel_match[5].rm_so;
+		path_len = rel_match[5].rm_eo - rel_match[5].rm_so;
+		goto step7;
+	}
+
+	/* 6) */
+	buf = malloc(path_len + rel_match[5].rm_eo + 10);
+	if (!buf) {
+		LOG(("malloc failed"));
+		return 0;
+	}
+	/* a) */
+	memcpy(buf, path, path_len);
+	for (; path_len != 0 && buf[path_len - 1] != '/'; path_len--)
+		;
+	/* b) */
+	memcpy(buf + path_len, rel + rel_match[5].rm_so,
+			rel_match[5].rm_eo - rel_match[5].rm_so);
+	path_len += rel_match[5].rm_eo - rel_match[5].rm_so;
+	/* c) "." is a complete segment at the start or after a '/' */
+	buf[path_len] = 0;
+	for (i = j = 0; j != path_len; ) {
+		if ((j == 0 || buf[j - 1] == '/') && buf[j] == '.' &&
+				buf[j + 1] == '/')
+			j += 2;
+		else
+			buf[i++] = buf[j++];
+	}
+	path_len = i;
+	/* d) the length checks stop us reading before the start of buf */
+	if (path_len != 0 && buf[path_len - 1] == '.' &&
+			(path_len == 1 || buf[path_len - 2] == '/'))
+		path_len--;
+	/* e) and f) */
+	while (1) {
+		buf[path_len] = 0;
+		m = regexec(&url_up_re, buf, 3, up_match, 0);
+		if (m)
+			break;
+		if (up_match[1].rm_eo + 4 <= path_len) {
+			memmove(buf + up_match[1].rm_so,
+					buf + up_match[1].rm_eo + 4,
+					path_len - up_match[1].rm_eo - 4);
+			path_len -= up_match[1].rm_eo - up_match[1].rm_so + 4;
+		} else
+			path_len -= up_match[1].rm_eo - up_match[1].rm_so + 3;
+	}
+	buf[path_len] = 0;
+	path = buf;
+
+step7:	/* 7) */
+	res = malloc(scheme_len + 1 + 2 + authority_len + path_len + 1 +
+			query_len + 1 + fragment_len + 1);
+	if (!res) {
+		LOG(("malloc failed"));
+		free(buf);
+		return 0;
+	}
+
+	memcpy(res, scheme, scheme_len);
+	res[scheme_len] = ':';
+	i = scheme_len + 1;
+	if (authority) {
+		res[i++] = '/';
+		res[i++] = '/';
+		memcpy(res + i, authority, authority_len);
+		i += authority_len;
+	}
+	memcpy(res + i, path, path_len);
+	i += path_len;
+	if (query) {
+		res[i++] = '?';
+		memcpy(res + i, query, query_len);
+		i += query_len;
+	}
+	if (fragment) {
+		res[i++] = '#';
+		memcpy(res + i, fragment, fragment_len);
+		i += fragment_len;
+	}
+	res[i] = 0;
+
+	free(buf);
+
+	return res;
+}
+
+
+/**
+ * Return the host name from an URL.
+ *
+ * \param  url  an absolute URL
+ * \returns  host name allocated on heap, or 0 on failure
+ *
+ * The whole authority component is returned, so any userinfo or port
+ * present in the URL is included in the result.
+ */
+
+char *url_host(const char *url)
+{
+	int m;
+	int len;
+	char *host;
+	regmatch_t match[10];
+
+	m = regexec(&url_re, url, 10, match, 0);
+	if (m) {
+		LOG(("url '%s' failed to match regex", url));
+		return 0;
+	}
+	if (match[4].rm_so == -1)
+		return 0;
+
+	len = match[4].rm_eo - match[4].rm_so;
+	host = malloc(len + 1);
+	if (!host) {
+		LOG(("malloc failed"));
+		return 0;
+	}
+	memcpy(host, url + match[4].rm_so, len);
+	host[len] = 0;
+
+	return host;
+}
+
+
+#ifdef TEST
+
+int main(int argc, char *argv[])
+{
+	int i;
+	char *s;
+	url_init();
+	/* argv[1] is the base URL; join every later argument to it */
+	for (i = 2; i < argc; i++) {
+		s = url_join(argv[i], argv[1]);
+		if (s) {
+			printf("'%s' + '%s' \t= '%s'\n", argv[1],
+					argv[i], s);
+			free(s);
+		}
+	}
+	return 0;
+}
+
+void regcomp_wrapper(regex_t *preg, const char *regex, int cflags)
+{
+	char errbuf[200];
+	int r;
+	r = regcomp(preg, regex, cflags);
+	if (r) {
+		regerror(r, preg, errbuf, sizeof errbuf);
+		fprintf(stderr, "Failed to compile regexp '%s'\n", regex);
+		fprintf(stderr, "error: %s\n", errbuf);
+		exit(1);
+	}
+}
+
+#endif
diff --git a/utils/url.h b/utils/url.h
new file mode 100644
index 000000000..f908e8f9a
--- /dev/null
+++ b/utils/url.h
@@ -0,0 +1,20 @@
+/*
+ * This file is part of NetSurf, http://netsurf.sourceforge.net/
+ * Licensed under the GNU General Public License,
+ *                http://www.opensource.org/licenses/gpl-license
+ * Copyright 2004 James Bursa <bursa@users.sourceforge.net>
+ */
+
+/** \file
+ * URL parsing and joining (interface).
+ */
+
+#ifndef _NETSURF_UTILS_URL_H_
+#define _NETSURF_UTILS_URL_H_
+
+void url_init(void);
+char *url_normalize(const char *url);
+char *url_join(const char *rel, const char *base);
+char *url_host(const char *url);
+
+#endif
diff --git a/utils/utils.c b/utils/utils.c
index 889985178..f2c7188ea 100644
--- a/utils/utils.c
+++ b/utils/utils.c
@@ -2,7 +2,7 @@
  * This file is part of NetSurf, http://netsurf.sourceforge.net/
  * Licensed under the GNU General Public License,
  *                http://www.opensource.org/licenses/gpl-license
- * Copyright 2003 James Bursa <bursa@users.sourceforge.net>
+ * Copyright 2004 James Bursa <bursa@users.sourceforge.net>
  * Copyright 2003 Phil Mellor <monkeyson@users.sourceforge.net>
  * Copyright 2003 John M Bell <jmb202@ecs.soton.ac.uk>
  */
@@ -12,17 +12,11 @@
 #include <stdio.h>
 #include <string.h>
 #include <sys/stat.h>
-#include <uri.h>
 #include <sys/types.h>
 #include <regex.h>
 #include <time.h>
 #include "libxml/encoding.h"
-#include "libxml/uri.h"
 #include "netsurf/utils/config.h"
-#ifdef riscos
-#include "netsurf/riscos/about.h"
-#include "netsurf/riscos/constdata.h"
-#endif
 #define NDEBUG
 #include "netsurf/utils/log.h"
 #include "netsurf/utils/messages.h"
@@ -190,103 +184,6 @@ char *squash_tolat1(xmlChar *s)
 
 
 /**
- * Calculate an URL from a relative and base URL.
- *
- * base may be 0 for a new URL, in which case the URL is canonicalized and
- * returned. Returns 0 in case of error.
- */
-
-char *url_join(char *rel_url, char *base_url)
-{
-	char *res;
-	uri_t *base = 0, *rel = 0, *abs;
-
-	LOG(("rel_url = %s, base_url = %s", rel_url, base_url));
-
-#ifdef riscos
-	/* hacky, hacky, hacky...
-	 * It is, however, best to do this here as it avoids
-	 * duplicating code for clicking links and url bar handling.
-	 * It simplifies the code it the other places too (they just
-	 * call this as usual, then we handle it here).
-	 */
-#ifdef WITH_ABOUT
-	if (strcasecmp(rel_url, "about:") == 0) {
-		about_create();
-		return xstrdup(ABOUT_URL);
-	}
-#ifdef WITH_COOKIES
-	if (strcasecmp(rel_url, "about:cookies") == 0) {
-		cookie_create();
-		return xstrdup(COOKIE_URL);
-	}
-#endif
-#endif
-#endif
-
-	if (!base_url) {
-		res = uri_cannonicalize_string(rel_url,
-				(int)(strlen(rel_url)),
-				URI_STRING_URI_STYLE);
-		LOG(("res = %s", res));
-		if (res)
-			return xstrdup(res);
-		return 0;
-	}
-
-	base = uri_alloc(base_url, (int)(strlen(base_url)));
-	rel = uri_alloc(rel_url, (int)(strlen(rel_url)));
-	if (!base || !rel)
-		goto fail;
-	if (!base->scheme)
-		goto fail;
-
-	abs = uri_abs_1(base, rel);
-
-	res = xstrdup(uri_uri(abs));
-
-	uri_free(base);
-	uri_free(rel);
-
-	LOG(("res = %s", res));
-	return res;
-
-fail:
-	if (base)
-		uri_free(base);
-	if (rel)
-		uri_free(rel);
-
-	LOG(("error"));
-
-	return 0;
-}
-
-
-/**
- * Extract the host name from a url.
- *
- * \param  url  an absolute URL
- * \return  a new string, or 0 in case of error
- */
-
-char *get_host_from_url(char *url)
-{
-	char *host = 0;
-	uri_t *uri;
-
-	uri = uri_alloc(url, (int)(strlen(url)));
-	if (!uri)
-		return 0;
-	if (uri->host)
-		host = xstrdup(uri->host);
-	uri_free(uri);
-
-	return host;
-}
-
-
-/**
  * Check if a directory exists.
  */
 
diff --git a/utils/utils.h b/utils/utils.h
index 02b927711..1faef449d 100644
--- a/utils/utils.h
+++ b/utils/utils.h
@@ -2,7 +2,7 @@
  * This file is part of NetSurf, http://netsurf.sourceforge.net/
  * Licensed under the GNU General Public License,
  *                http://www.opensource.org/licenses/gpl-license
- * Copyright 2003 James Bursa <bursa@users.sourceforge.net>
+ * Copyright 2004 James Bursa <bursa@users.sourceforge.net>
  */
 
 #ifndef _NETSURF_UTILS_UTILS_H_
@@ -26,8 +26,6 @@ char * squash_whitespace(const char * s);
 char * tolat1(xmlChar * s);
 char * tolat1_pre(xmlChar * s);
 char *squash_tolat1(xmlChar *s);
-char *url_join(char *rel_url, char *base_url);
-char *get_host_from_url(char* url);
 bool is_dir(const char *path);
 void regcomp_wrapper(regex_t *preg, const char *regex, int cflags);
 void clean_cookiejar(void);