summary | refs | log | tree | commit | diff
path: root/utils/url.c
diff options
context:
space:
mode:
Diffstat (limited to 'utils/url.c')
-rw-r--r-- utils/url.c 199
1 files changed, 1 insertions, 198 deletions
diff --git a/utils/url.c b/utils/url.c
index 831f16761..a44bd8f3e 100644
--- a/utils/url.c
+++ b/utils/url.c
@@ -28,7 +28,6 @@
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
-#include <regex.h>
#include <unistd.h>
#include "curl/curl.h"
@@ -168,208 +167,12 @@ out_true:
return true;
}
-/**
- * Normalize a URL.
- *
- * \param url an absolute URL
- * \param result pointer to pointer to buffer to hold cleaned up url. Caller
- * gets ownership of pointer to buffer value. On failure the
- * pointer to buffer value will be NULL.
- * \return URL_FUNC_OK on success
- *
- * If there is no scheme, http:// is added. The scheme and host are
- * lower-cased. Default ports are removed (http only). An empty path is
- * replaced with "/". Characters are unescaped if safe.
- */
-
-url_func_result url_normalize(const char *url, char **result)
-{
- char c;
- int m;
- size_t i;
- size_t len;
- size_t bufsize;
- char* norm;
- bool http = false;
- regmatch_t match[10];
-
- *result = NULL;
-
- /* skip past any leading whitespace (likely if URL was copy-pasted) */
- while (isspace(*url))
- url++;
-
- /* allocate sufficiently large buffer for new URL */
- len = strlen(url);
- /* "+ 1" for the terminating NUL character. */
- bufsize = len + 1 + SLEN("http://") + SLEN("/");
- /* work out how much extra to leave for internal whitespace */
- for(i = 0; i < len; i++) {
- if(isspace(url[i])) bufsize += 2; /* ' ' -> '%20' */
- }
- if ((norm = malloc(bufsize)) == NULL) {
- LOG(("malloc failed"));
- return URL_FUNC_NOMEM;
- }
- *result = norm;
- strcpy(norm, url);
-
- /* truncate trailing whitespace (significant should be uriencoded) */
- for (i = len - 1; (i > 0) && isspace(norm[i]); i--) {
- norm[i] = '\0';
- len--;
- }
-
- /* encode any remaining (internal) whitespace */
- for (i = 0; i < len; i++) {
- if(isspace(norm[i])) {
- char space = norm[i];
- memmove(norm + i + 2, norm + i, 1 + len - i);
- len += 2;
- norm[ i] = '%';
- norm[++i] = digit2lowcase_hex(space >> 4);
- norm[++i] = digit2lowcase_hex(space & 0xf);
- }
- }
-
- /* finally verify that it's actually an URL we're working on
- * (RFC regex too fussy to tolerate above WSP problems) */
- if (regexec(&url_re, norm, 10, match, 0)) {
- LOG(("url '%s' failed to match regex", url));
- free(norm);
- *result = NULL;
- return URL_FUNC_FAILED;
- }
-
- if (match[URL_RE_SCHEME].rm_so == -1) {
- /* scheme missing: add http:// and reparse */
- memmove(norm + SLEN("http://"), norm, len + 1);
- memcpy(norm, "http://", SLEN("http://")); /* do NOT copy NUL */
- len += SLEN("http://");
- if (regexec(&url_re, norm, 10, match, 0)) {
- LOG(("url '%s' failed to match regex", norm));
- free(norm);
- *result = NULL;
- return URL_FUNC_FAILED;
- }
- }
-
- /*for (unsigned int i = 0; i != 10; i++) {
- if (match[i].rm_so == -1)
- continue;
- fprintf(stderr, "%i: '%.*s'\n", i,
- match[i].rm_eo - match[i].rm_so,
- res + match[i].rm_so);
- }*/
-
- /* see RFC 2616 section 3.2.3 */
- /* make scheme lower-case */
- if (match[URL_RE_SCHEME].rm_so != -1) {
- for (i = match[URL_RE_SCHEME].rm_so;
- (regoff_t) i != match[URL_RE_SCHEME].rm_eo; i++)
- norm[i] = tolower(norm[i]);
- if (match[URL_RE_SCHEME].rm_eo == 4
- && norm[0] == 'h'
- && norm[1] == 't'
- && norm[2] == 't'
- && norm[3] == 'p')
- http = true;
- }
-
- /* make empty path into "/" */
- if (match[URL_RE_PATH].rm_so != -1 &&
- match[URL_RE_PATH].rm_so == match[URL_RE_PATH].rm_eo) {
- memmove(norm + match[URL_RE_PATH].rm_so + 1,
- norm + match[URL_RE_PATH].rm_so,
- len - match[URL_RE_PATH].rm_so + 1);
- norm[match[URL_RE_PATH].rm_so] = '/';
- len++;
- }
-
- /* make host lower-case */
- if (match[URL_RE_AUTHORITY].rm_so != -1) {
- /* Find @ delimiting credentials from host, if any */
- for (i = match[URL_RE_AUTHORITY].rm_so;
- (regoff_t) i != match[URL_RE_AUTHORITY].rm_eo;
- i++) {
- if (norm[i] == '@') {
- i++;
- break;
- }
- }
-
- /* No credentials; transform entire host */
- if ((regoff_t) i == match[URL_RE_AUTHORITY].rm_eo)
- i = match[URL_RE_AUTHORITY].rm_so;
-
- for (; (regoff_t) i != match[URL_RE_AUTHORITY].rm_eo; i++) {
- if (norm[i] == ':' && (i + 3) < len) {
- if (http && norm[i + 1] == '8' &&
- norm[i + 2] == '0' &&
- (regoff_t) i + 3 ==
- match[URL_RE_AUTHORITY].rm_eo) {
- memmove(norm + i,
- norm + i + 3,
- len -
- match[URL_RE_AUTHORITY].
- rm_eo);
- len -= 3;
- norm[len] = '\0';
- } else if ((regoff_t) i + 1 == match[4].rm_eo) {
- memmove(norm + i,
- norm + i + 1,
- len -
- match[URL_RE_AUTHORITY].
- rm_eo);
- len--;
- norm[len] = '\0';
- }
- break;
- }
- norm[i] = tolower(norm[i]);
- }
- }
-
- /* unescape non-"reserved" escaped characters */
- for (i = 0; i + 2 < len; i++) {
- if (norm[i] != '%')
- continue;
- c = tolower(norm[i + 1]);
- if ('0' <= c && c <= '9')
- m = 16 * (c - '0');
- else if ('a' <= c && c <= 'f')
- m = 16 * (c - 'a' + 10);
- else
- continue;
- c = tolower(norm[i + 2]);
- if ('0' <= c && c <= '9')
- m += c - '0';
- else if ('a' <= c && c <= 'f')
- m += c - 'a' + 10;
- else
- continue;
-
- if (m <= 0x20 || strchr(";/?:@&=+$," "<>#%\"{}|\\^[]`", m) ||
- m >= 0x7f) {
- i += 2;
- continue;
- }
-
- norm[i] = m;
- memmove(norm + i + 1, norm + i + 3, len - i - 2);
- len -= 2;
- }
-
- /* norm and *result point to same memory, so just return ok */
- return URL_FUNC_OK;
-}
-
/**
* Resolve a relative URL to absolute form.
*
* \param rel relative URL
- * \param base base URL, must be absolute and cleaned as by url_normalize()
+ * \param base base URL, must be absolute and cleaned as by nsurl_create()
* \param result pointer to pointer to buffer to hold absolute url
* \return URL_FUNC_OK on success
*/