[project @ 2004-03-02 18:02:17 by bursa]

Add new url functions and modify to use them. svn path=/import/netsurf/; revision=578
author: James Bursa <james@netsurf-browser.org> 2004-03-02 18:02:41 +0000
committer: James Bursa <james@netsurf-browser.org> 2004-03-02 18:02:41 +0000
commit: 1c85bf04293cfba663c5170bbe762825b7e72af1 (patch)
tree: 6db0b94b8e875ae20514b334c9f9acb3b380f362 /utils
parent: 2d816dda237a80f413a9d90031c7f5aff01e7a83 (diff)
download: netsurf-1c85bf04293cfba663c5170bbe762825b7e72af1.tar.gz
netsurf-1c85bf04293cfba663c5170bbe762825b7e72af1.tar.bz2
4 files changed, 459 insertions, 107 deletions
diff --git a/utils/url.c b/utils/url.c
new file mode 100644
index 000000000..c22144495
--- /dev/null
+++ b/utils/url.c
@@ -0,0 +1,437 @@
+/*
+ * This file is part of NetSurf, http://netsurf.sourceforge.net/
+ * Licensed under the GNU General Public License,
+ *                http://www.opensource.org/licenses/gpl-license
+ * Copyright 2004 James Bursa <bursa@users.sourceforge.net>
+ */
+
+/** \file
+ * URL parsing and joining (implementation).
+ */
+
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <regex.h>
+#include "netsurf/utils/log.h"
+#include "netsurf/utils/url.h"
+#include "netsurf/utils/utils.h"
+
+
+regex_t url_re, url_up_re;
+
+/**
+ * Initialise URL routines.
+ *
+ * Compiles url_re (splits a URL into its RFC 2396 components) and url_up_re
+ * (finds "<segment>/.." where <segment> is any path segment except "..").
+ */
+
+void url_init(void)
+{
+	/* regex from RFC 2396 appendix B */
+	regcomp_wrapper(&url_re, "^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)"
+			"(\\?([^#]*))?(#(.*))?$", REG_EXTENDED);
+	regcomp_wrapper(&url_up_re, "/([^/]?|[.][^./]|[^./][^/]|[^/][^/][^/]+)"
+			"/[.][.](/|$)", REG_EXTENDED);
+}
+
+
+/**
+ * Normalize a URL.
+ *
+ * \param  url  an absolute URL
+ * \return  cleaned up url, allocated on the heap, or 0 on failure
+ *
+ * If there is no scheme, http:// is added. The scheme and host are
+ * lower-cased; userinfo before '@' is left alone. A default (http only) or
+ * empty port is removed. An empty path is replaced with "/". Escapes of
+ * printable ASCII characters that need no escaping are decoded.
+ */
+
+char *url_normalize(const char *url)
+{
+	char c;
+	char *res = 0;
+	int m;
+	int i;
+	int len;
+	bool http = false;
+	regmatch_t match[10];
+
+	m = regexec(&url_re, url, 10, match, 0);
+	if (m) {
+		LOG(("url '%s' failed to match regex", url));
+		return 0;
+	}
+
+	len = strlen(url);
+
+	if (match[1].rm_so == -1) {
+		/* scheme missing: add http:// and reparse */
+		LOG(("scheme missing: using http"));
+		res = malloc(strlen(url) + 13);
+		if (!res) {
+			LOG(("malloc failed"));
+			return 0;
+		}
+		strcpy(res, "http://");
+		strcpy(res + 7, url);
+		m = regexec(&url_re, res, 10, match, 0);
+		if (m) {
+			LOG(("url '%s' failed to match regex", res));
+			free(res);
+			return 0;
+		}
+		len += 7;
+	} else {
+		/* spare room for the "/" that may be inserted below */
+		res = malloc(len + 6);
+		if (!res) {
+			LOG(("malloc failed"));
+			return 0;
+		}
+		strcpy(res, url);
+	}
+
+	/* see RFC 2616 section 3.2.3 */
+	/* make scheme lower-case */
+	if (match[2].rm_so != -1) {
+		for (i = match[2].rm_so; i != match[2].rm_eo; i++)
+			res[i] = tolower((unsigned char) res[i]);
+		if (match[2].rm_eo == 4 && res[0] == 'h' && res[1] == 't' &&
+				res[2] == 't' && res[3] == 'p')
+			http = true;
+	}
+
+	/* make empty path into "/" */
+	if (match[5].rm_so != -1 && match[5].rm_so == match[5].rm_eo) {
+		memmove(res + match[5].rm_so + 1, res + match[5].rm_so,
+				len - match[5].rm_so + 1);
+		res[match[5].rm_so] = '/';
+		len++;
+	}
+
+	/* make host lower-case and remove a default or empty port */
+	if (match[4].rm_so != -1) {
+		/* userinfo is case-sensitive: start after the last '@' */
+		for (i = match[4].rm_eo; i != match[4].rm_so; i--)
+			if (res[i - 1] == '@')
+				break;
+		for (; i != match[4].rm_eo; i++) {
+			if (res[i] == ':') {
+				if (http && res[i + 1] == '8' &&
+						res[i + 2] == '0' &&
+						i + 3 == match[4].rm_eo) {
+					memmove(res + i, res + i + 3,
+							len - match[4].rm_eo);
+					len -= 3;
+					res[len] = '\0';
+				} else if (i + 1 == match[4].rm_eo) {
+					memmove(res + i, res + i + 1,
+							len - match[4].rm_eo);
+					len--;
+					res[len] = '\0';
+				}
+				break;
+			}
+			res[i] = tolower((unsigned char) res[i]);
+		}
+	}
+
+	/* unescape non-"reserved" escaped characters */
+	for (i = 0; i != len; i++) {
+		if (res[i] != '%')
+			continue;
+		c = tolower((unsigned char) res[i + 1]);
+		if ('0' <= c && c <= '9')
+			m = 16 * (c - '0');
+		else if ('a' <= c && c <= 'f')
+			m = 16 * (c - 'a' + 10);
+		else
+			continue;
+		c = tolower((unsigned char) res[i + 2]);
+		if ('0' <= c && c <= '9')
+			m += c - '0';
+		else if ('a' <= c && c <= 'f')
+			m += c - 'a' + 10;
+		else
+			continue;
+
+		/* keep controls, space, DEL, non-ASCII and reserved escaped */
+		if (m <= 0x20 || 0x7f <= m || strchr(";/?:@&=+$," "<>#%\""
+				"{}|\\^[]`", m)) {
+			i += 2;
+			continue;
+		}
+
+		/* replace "%XX" with the character, closing the gap */
+		res[i] = m;
+		memmove(res + i + 1, res + i + 3, len - i - 2);
+		len -= 2;
+	}
+
+	return res;
+}
+
+
+/**
+ * Resolve a relative URL to absolute form.
+ *
+ * \param  rel   relative URL
+ * \param  base  base URL, must be absolute and cleaned as by url_normalize()
+ * \return  an absolute URL, allocated on the heap, or 0 on failure
+ */
+
+char *url_join(const char *rel, const char *base)
+{
+	int m;
+	int i, j;
+	char *buf = 0;
+	char *res;
+	const char *scheme = 0, *authority = 0, *path = 0, *query = 0,
+			*fragment = 0;
+	int scheme_len = 0, authority_len = 0, path_len = 0, query_len = 0,
+			fragment_len = 0;
+	regmatch_t base_match[10];
+	regmatch_t rel_match[10];
+	regmatch_t up_match[3];
+
+	/* see RFC 2396 section 5.2 */
+	m = regexec(&url_re, base, 10, base_match, 0);
+	if (m) {
+		LOG(("base url '%s' failed to match regex", base));
+		return 0;
+	}
+	if (base_match[2].rm_so == -1) {
+		LOG(("base url '%s' is not absolute", base));
+		return 0;
+	}
+	scheme = base + base_match[2].rm_so;
+	scheme_len = base_match[2].rm_eo - base_match[2].rm_so;
+	if (base_match[4].rm_so != -1) {
+		authority = base + base_match[4].rm_so;
+		authority_len = base_match[4].rm_eo - base_match[4].rm_so;
+	}
+	path = base + base_match[5].rm_so;
+	path_len = base_match[5].rm_eo - base_match[5].rm_so;
+
+	/* 1) */
+	m = regexec(&url_re, rel, 10, rel_match, 0);
+	if (m) {
+		LOG(("relative url '%s' failed to match regex", rel));
+		return 0;
+	}
+	/* the fragment always comes from rel, even for "#frag" alone */
+	if (rel_match[9].rm_so != -1) {
+		fragment = rel + rel_match[9].rm_so;
+		fragment_len = rel_match[9].rm_eo - rel_match[9].rm_so;
+	}
+
+	/* 2) reference to the current document: keep the base query */
+	if (rel_match[5].rm_so == rel_match[5].rm_eo &&
+			rel_match[2].rm_so == -1 &&
+			rel_match[4].rm_so == -1 &&
+			rel_match[6].rm_so == -1) {
+		if (base_match[7].rm_so != -1) {
+			query = base + base_match[7].rm_so;
+			query_len = base_match[7].rm_eo - base_match[7].rm_so;
+		}
+		goto step7;
+	}
+	if (rel_match[7].rm_so != -1) {
+		query = rel + rel_match[7].rm_so;
+		query_len = rel_match[7].rm_eo - rel_match[7].rm_so;
+	}
+
+	/* 3) */
+	if (rel_match[2].rm_so != -1) {
+		scheme = rel + rel_match[2].rm_so;
+		scheme_len = rel_match[2].rm_eo - rel_match[2].rm_so;
+		authority = 0;
+		authority_len = 0;
+		if (rel_match[4].rm_so != -1) {
+			authority = rel + rel_match[4].rm_so;
+			authority_len = rel_match[4].rm_eo - rel_match[4].rm_so;
+		}
+		path = rel + rel_match[5].rm_so;
+		path_len = rel_match[5].rm_eo - rel_match[5].rm_so;
+		goto step7;
+	}
+
+	/* 4) */
+	if (rel_match[4].rm_so != -1) {
+		authority = rel + rel_match[4].rm_so;
+		authority_len = rel_match[4].rm_eo - rel_match[4].rm_so;
+		path = rel + rel_match[5].rm_so;
+		path_len = rel_match[5].rm_eo - rel_match[5].rm_so;
+		goto step7;
+	}
+
+	/* 5) */
+	if (rel[rel_match[5].rm_so] == '/') {
+		path = rel + rel_match[5].rm_so;
+		path_len = rel_match[5].rm_eo - rel_match[5].rm_so;
+		goto step7;
+	}
+
+	/* 6) */
+	buf = malloc(path_len + rel_match[5].rm_eo + 10);
+	if (!buf) {
+		LOG(("malloc failed"));
+		return 0;
+	}
+	/* a) */
+	strncpy(buf, path, path_len);
+	for (; path_len != 0 && buf[path_len - 1] != '/'; path_len--)
+		;
+	/* b) */
+	strncpy(buf + path_len, rel + rel_match[5].rm_so,
+			rel_match[5].rm_eo - rel_match[5].rm_so);
+	path_len += rel_match[5].rm_eo - rel_match[5].rm_so;
+	/* c) */
+	buf[path_len] = 0;
+	for (i = j = 0; j != path_len; ) {
+		if (j && buf[j - 1] == '/' && buf[j] == '.' &&
+				buf[j + 1] == '/')
+			j += 2;
+		else
+			buf[i++] = buf[j++];
+	}
+	path_len = i;
+	/* d) */
+	if (path_len >= 2 && buf[path_len - 2] == '/' &&
+			buf[path_len - 1] == '.')
+		path_len--;
+	/* e) and f) */
+	while (1) {
+		buf[path_len] = 0;
+		m = regexec(&url_up_re, buf, 3, up_match, 0);
+		if (m)
+			break;
+		/* cut "<segment>/../", or a trailing "<segment>/.." */
+		if (up_match[1].rm_eo + 4 <= path_len) {
+			memmove(buf + up_match[1].rm_so,
+					buf + up_match[1].rm_eo + 4,
+					path_len - up_match[1].rm_eo - 4);
+			path_len -= up_match[1].rm_eo - up_match[1].rm_so + 4;
+		} else
+			path_len -= up_match[1].rm_eo - up_match[1].rm_so + 3;
+	}
+	buf[path_len] = 0;
+	path = buf;
+
+step7:	/* 7) */
+	res = malloc(scheme_len + 1 + 2 + authority_len + path_len + 1 +
+			query_len + 1 + fragment_len + 1);
+	if (!res) {
+		LOG(("malloc failed"));
+		free(buf);
+		return 0;
+	}
+
+	strncpy(res, scheme, scheme_len);
+	res[scheme_len] = ':';
+	i = scheme_len + 1;
+	if (authority) {
+		res[i++] = '/';
+		res[i++] = '/';
+		strncpy(res + i, authority, authority_len);
+		i += authority_len;
+	}
+	strncpy(res + i, path, path_len);
+	i += path_len;
+	if (query) {
+		res[i++] = '?';
+		strncpy(res + i, query, query_len);
+		i += query_len;
+	}
+	if (fragment) {
+		res[i++] = '#';
+		strncpy(res + i, fragment, fragment_len);
+		i += fragment_len;
+	}
+	res[i] = 0;
+
+	free(buf);
+
+	return res;
+}
+
+
+/**
+ * Return the host (the whole authority component) from an URL.
+ *
+ * \param  url  an absolute URL
+ * \returns  host allocated on heap, or 0 if absent or on failure
+ */
+
+char *url_host(const char *url)
+{
+	char *copy;
+	int length;
+	regmatch_t part[10];
+
+	if (regexec(&url_re, url, 10, part, 0) != 0) {
+		LOG(("url '%s' failed to match regex", url));
+		return 0;
+	}
+	if (part[4].rm_so == -1)
+		return 0;
+
+	length = part[4].rm_eo - part[4].rm_so;
+	copy = malloc(length + 1);
+	if (copy == 0) {
+		LOG(("malloc failed"));
+		return 0;
+	}
+	memcpy(copy, url + part[4].rm_so, length);
+	copy[length] = 0;
+
+	return copy;
+}
+
+
+
+#ifdef TEST
+
+/* Test driver: resolve each of argv[2..] against the base URL argv[1]
+ * and print the result. Build with -DTEST. */
+int main(int argc, char *argv[])
+{
+	int i;
+	char *s;
+
+	url_init();
+
+	/* url_normalize() and url_host() can be exercised the same way by
+	 * calling them on each argument instead of url_join() */
+	for (i = 2; i < argc; i++) {
+		s = url_join(argv[i], argv[1]);
+		if (!s)
+			continue;
+		printf("'%s' + '%s' \t= '%s'\n", argv[1],
+				argv[i], s);
+		/* the result is heap-allocated and owned by the caller */
+		free(s);
+	}
+
+	return 0;
+}
+
+/* Compile regex into preg; on failure print the reason and exit(1). */
+void regcomp_wrapper(regex_t *preg, const char *regex, int cflags)
+{
+	char message[200];
+	int status = regcomp(preg, regex, cflags);
+	if (status == 0)
+		return;
+	regerror(status, preg, message, sizeof message);
+	fprintf(stderr, "Failed to compile regexp '%s'\n", regex);
+	fprintf(stderr, "error: %s\n", message);
+	exit(1);
+}
+
+#endif
diff --git a/utils/url.h b/utils/url.h
new file mode 100644
index 000000000..f908e8f9a
--- /dev/null
+++ b/utils/url.h
@@ -0,0 +1,20 @@
+/*
+ * This file is part of NetSurf, http://netsurf.sourceforge.net/
+ * Licensed under the GNU General Public License,
+ *                http://www.opensource.org/licenses/gpl-license
+ * Copyright 2004 James Bursa <bursa@users.sourceforge.net>
+ */
+
+/** \file
+ * URL parsing and joining (interface).
+ */
+
+#ifndef _NETSURF_UTILS_URL_H_
+#define _NETSURF_UTILS_URL_H_
+
+void url_init(void);
+char *url_normalize(const char *url);
+char *url_join(const char *rel, const char *base);
+char *url_host(const char *url);
+
+#endif
diff --git a/utils/utils.c b/utils/utils.c
index 889985178..f2c7188ea 100644
--- a/utils/utils.c
+++ b/utils/utils.c
@@ -2,7 +2,7 @@
  * This file is part of NetSurf, http://netsurf.sourceforge.net/
  * Licensed under the GNU General Public License,
  *                http://www.opensource.org/licenses/gpl-license
- * Copyright 2003 James Bursa <bursa@users.sourceforge.net>
+ * Copyright 2004 James Bursa <bursa@users.sourceforge.net>
  * Copyright 2003 Phil Mellor <monkeyson@users.sourceforge.net>
  * Copyright 2003 John M Bell <jmb202@ecs.soton.ac.uk>
  */
@@ -12,17 +12,11 @@
 #include <stdio.h>
 #include <string.h>
 #include <sys/stat.h>
-#include <uri.h>
 #include <sys/types.h>
 #include <regex.h>
 #include <time.h>
 #include "libxml/encoding.h"
-#include "libxml/uri.h"
 #include "netsurf/utils/config.h"
-#ifdef riscos
-#include "netsurf/riscos/about.h"
-#include "netsurf/riscos/constdata.h"
-#endif
 #define NDEBUG
 #include "netsurf/utils/log.h"
 #include "netsurf/utils/messages.h"
@@ -190,103 +184,6 @@ char *squash_tolat1(xmlChar *s)
 
 
 /**
- * Calculate an URL from a relative and base URL.
- *
- * base may be 0 for a new URL, in which case the URL is canonicalized and
- * returned. Returns 0 in case of error.
- */
-
-char *url_join(char *rel_url, char *base_url)
-{
-	char *res;
-	uri_t *base = 0, *rel = 0, *abs;
-
-	LOG(("rel_url = %s, base_url = %s", rel_url, base_url));
-
-#ifdef riscos
-       /* hacky, hacky, hacky...
-        * It is, however, best to do this here as it avoids
-        * duplicating code for clicking links and url bar handling.
-        * It simplifies the code it the other places too (they just
-        * call this as usual, then we handle it here).
-        */
-#ifdef WITH_ABOUT
-       if (strcasecmp(rel_url, "about:") == 0) {
-               about_create();
-               return xstrdup(ABOUT_URL);
-       }
-#ifdef WITH_COOKIES
-       if (strcasecmp(rel_url, "about:cookies") == 0) {
-               cookie_create();
-               return xstrdup(COOKIE_URL);
-       }
-#endif
-#endif
-#endif
-
-	if (!base_url) {
-		res = uri_cannonicalize_string(rel_url,
-		                (int)(strlen(rel_url)),
-				URI_STRING_URI_STYLE);
-		LOG(("res = %s", res));
-		if (res)
-			return xstrdup(res);
-		return 0;
-	}
-
-	base = uri_alloc(base_url, (int)(strlen(base_url)));
-	rel = uri_alloc(rel_url, (int)(strlen(rel_url)));
-	if (!base || !rel)
-		goto fail;
-	if (!base->scheme)
-		goto fail;
-
-	abs = uri_abs_1(base, rel);
-
-	res = xstrdup(uri_uri(abs));
-
-	uri_free(base);
-	uri_free(rel);
-
-	LOG(("res = %s", res));
-	return res;
-
-fail:
-	if (base)
-		uri_free(base);
-	if (rel)
-		uri_free(rel);
-
-	LOG(("error"));
-
-	return 0;
-}
-
-
-/**
- * Extract the host name from a url.
- *
- * \param url an absolute URL
- * \return a new string, or 0 in case of error
- */
-
-char *get_host_from_url(char *url)
-{
-	char *host = 0;
-	uri_t *uri;
-
-	uri = uri_alloc(url, (int)(strlen(url)));
-	if (!uri)
-		return 0;
-	if (uri->host)
-		host = xstrdup(uri->host);
-	uri_free(uri);
-
-	return host;
-}
-
-
-/**
  * Check if a directory exists.
  */
 
diff --git a/utils/utils.h b/utils/utils.h
index 02b927711..1faef449d 100644
--- a/utils/utils.h
+++ b/utils/utils.h
@@ -2,7 +2,7 @@
  * This file is part of NetSurf, http://netsurf.sourceforge.net/
  * Licensed under the GNU General Public License,
  *                http://www.opensource.org/licenses/gpl-license
- * Copyright 2003 James Bursa <bursa@users.sourceforge.net>
+ * Copyright 2004 James Bursa <bursa@users.sourceforge.net>
  */
 
 #ifndef _NETSURF_UTILS_UTILS_H_
@@ -26,8 +26,6 @@ char * squash_whitespace(const char * s);
 char * tolat1(xmlChar * s);
 char * tolat1_pre(xmlChar * s);
 char *squash_tolat1(xmlChar *s);
-char *url_join(char *rel_url, char *base_url);
-char *get_host_from_url(char* url);
 bool is_dir(const char *path);
 void regcomp_wrapper(regex_t *preg, const char *regex, int cflags);
 void clean_cookiejar(void);
author	James Bursa <james@netsurf-browser.org>	2004-03-02 18:02:41 +0000
committer	James Bursa <james@netsurf-browser.org>	2004-03-02 18:02:41 +0000
commit	1c85bf04293cfba663c5170bbe762825b7e72af1 (patch)
tree	6db0b94b8e875ae20514b334c9f9acb3b380f362 /utils
parent	2d816dda237a80f413a9d90031c7f5aff01e7a83 (diff)
download	netsurf-1c85bf04293cfba663c5170bbe762825b7e72af1.tar.gz netsurf-1c85bf04293cfba663c5170bbe762825b7e72af1.tar.bz2