From 299a85fa7a4ee259e757b8ffd6e482410e6726f1 Mon Sep 17 00:00:00 2001 From: Michael Drake Date: Sun, 15 Jan 2017 12:47:38 +0000 Subject: nusrl: Move into utils/nsurl directory. --- utils/nsurl/nsurl.c | 2500 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 2500 insertions(+) create mode 100644 utils/nsurl/nsurl.c (limited to 'utils/nsurl/nsurl.c') diff --git a/utils/nsurl/nsurl.c b/utils/nsurl/nsurl.c new file mode 100644 index 000000000..c5c614c55 --- /dev/null +++ b/utils/nsurl/nsurl.c @@ -0,0 +1,2500 @@ +/* + * Copyright 2011 Michael Drake + * + * This file is part of NetSurf, http://www.netsurf-browser.org/ + * + * NetSurf is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * NetSurf is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +/** + * \file + * NetSurf URL handling implementation. + * + * This is the common implementation of all URL handling within the + * browser. This implementation is based upon RFC3986 although this has + * been superseded by https://url.spec.whatwg.org/ which is based on + * actual contemporary implementations. + * + * Care must be taken with character encodings within this module as + * the specifications work with specific ascii ranges and must not be + * affected by locale. Hence the c library character type functions + * are not used. 
/**
 * nsurl scheme type
 *
 * Schemes the parser recognises explicitly; any other scheme is classed
 * as NSURL_SCHEME_OTHER.
 */
enum scheme_type {
	NSURL_SCHEME_OTHER,
	NSURL_SCHEME_HTTP,
	NSURL_SCHEME_HTTPS,
	NSURL_SCHEME_FTP,
	NSURL_SCHEME_MAILTO
};

/**
 * nsurl components
 *
 * [scheme]://[username]:[password]@[host]:[port][path][?query]#[fragment]
 *
 * Note:
 *   "path" string includes preceding '/', if needed for the scheme
 *   "query" string always includes preceding '?'
 *
 * The other spanned punctuation is to be inserted when building URLs from
 * components.
 *
 * A component that is not present in the URL is held as NULL.
 */
struct nsurl_components {
	lwc_string *scheme;
	lwc_string *username;
	lwc_string *password;
	lwc_string *host;
	lwc_string *port;
	lwc_string *path;
	lwc_string *query;
	lwc_string *fragment;

	enum scheme_type scheme_type;
};


/**
 * NetSurf URL object
 */
struct nsurl {
	struct nsurl_components components;

	int count;	/* Number of references to NetSurf URL object */
	uint32_t hash;	/* Hash value for nsurl identification */

	size_t length;	/* Length of string */
	char string[FLEX_ARRAY_LEN_DECL]; /* Full URL as a string */
};


/**
 * Marker set, indicating positions of sections within a URL string
 *
 * Every member is a byte offset from the beginning of the URL string.
 */
struct url_markers {
	size_t start; /**< start of URL (first non-whitespace character) */
	size_t scheme_end; /**< ':' ending the scheme; == start if no scheme */
	size_t authority; /**< start of the authority section */

	size_t colon_first; /**< first ':' in authority; == authority if none */
	size_t at; /**< first '@' in authority; == authority if none */
	size_t colon_last; /**< last of any further ':' in the authority;
			    *   == authority if there is no second colon */

	size_t path; /**< start of path (end of authority) */
	size_t query; /**< start of query (end of path) */
	size_t fragment; /**< start of fragment (end of query) */

	size_t end; /**< end of URL (one past last non-whitespace character) */

	enum scheme_type scheme_type;
};


/** Lengths, in bytes, of the individual components of a URL string */
struct nsurl_component_lengths {
	size_t scheme;
	size_t username;
	size_t password;
	size_t host;
	size_t port;
	size_t path;
	size_t query;
	size_t fragment;
};
/** Flags indicating which parts of a URL string are required for a nsurl */
enum nsurl_string_flags {
	NSURL_F_SCHEME			= (1 << 0),
	NSURL_F_SCHEME_PUNCTUATION	= (1 << 1),
	NSURL_F_AUTHORITY_PUNCTUATION	= (1 << 2),
	NSURL_F_USERNAME		= (1 << 3),
	NSURL_F_PASSWORD		= (1 << 4),
	NSURL_F_CREDENTIALS_PUNCTUATION	= (1 << 5),
	NSURL_F_HOST			= (1 << 6),
	NSURL_F_PORT			= (1 << 7),
	NSURL_F_AUTHORITY		= (NSURL_F_USERNAME |
						NSURL_F_PASSWORD |
						NSURL_F_HOST |
						NSURL_F_PORT),
	NSURL_F_PATH			= (1 << 8),
	NSURL_F_QUERY			= (1 << 9),
	NSURL_F_FRAGMENT_PUNCTUATION	= (1 << 10),
	NSURL_F_FRAGMENT		= (1 << 11)
};


/** Sections of a URL */
enum url_sections {
	URL_SCHEME,
	URL_CREDENTIALS,
	URL_HOST,
	URL_PATH,
	URL_QUERY,
	URL_FRAGMENT
};


/**
 * Take a new reference to a component, passing NULL through unchanged.
 *
 * The expansion is fully parenthesised so that it can be used as an operand
 * of any operator.  Note that \a c is evaluated more than once.
 */
#define nsurl__component_copy(c) (((c) == NULL) ? NULL : lwc_string_ref(c))

/**
 * Compare two components, clearing *match if they differ.
 *
 * When both components are present and the comparison succeeds, *match is
 * whatever lwc_string_isequal() stored in it.  When exactly one is present,
 * or the comparison reports an error, *match is set to false.  When both are
 * absent, *match is left untouched.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement;
 * use it with a trailing semicolon.  Arguments are evaluated more than once.
 */
#define nsurl__component_compare(c1, c2, match)				\
	do {								\
		if ((c1) && (c2) && lwc_error_ok ==			\
				lwc_string_isequal(c1, c2, match)) {	\
			/* do nothing */				\
		} else if ((c1) || (c2)) {				\
			*(match) = false;				\
		}							\
	} while (0)

/**
 * Return a hex digit for the given numerical value.
 *
 * \param digit the value to get the hex digit for; must be less than 16.
 * \return character in range 0-9A-F
 */
static inline char digit2uppercase_hex(unsigned char digit)
{
	assert(digit < 16);
	return "0123456789ABCDEF"[digit];
}
/ "_" / "~" + * + */ + static const bool unreserved[256] = { + false, false, false, false, false, false, false, false, /* 00 */ + false, false, false, false, false, false, false, false, /* 08 */ + false, false, false, false, false, false, false, false, /* 10 */ + false, false, false, false, false, false, false, false, /* 18 */ + false, false, false, false, false, false, false, false, /* 20 */ + false, false, false, false, false, true, true, false, /* 28 */ + true, true, true, true, true, true, true, true, /* 30 */ + true, true, false, false, false, false, false, false, /* 38 */ + false, true, true, true, true, true, true, true, /* 40 */ + true, true, true, true, true, true, true, true, /* 48 */ + true, true, true, true, true, true, true, true, /* 50 */ + true, true, true, false, false, false, false, true, /* 58 */ + false, true, true, true, true, true, true, true, /* 60 */ + true, true, true, true, true, true, true, true, /* 68 */ + true, true, true, true, true, true, true, true, /* 70 */ + true, true, true, false, false, false, true, false, /* 78 */ + false, false, false, false, false, false, false, false, /* 80 */ + false, false, false, false, false, false, false, false, /* 88 */ + false, false, false, false, false, false, false, false, /* 90 */ + false, false, false, false, false, false, false, false, /* 98 */ + false, false, false, false, false, false, false, false, /* A0 */ + false, false, false, false, false, false, false, false, /* A8 */ + false, false, false, false, false, false, false, false, /* B0 */ + false, false, false, false, false, false, false, false, /* B8 */ + false, false, false, false, false, false, false, false, /* C0 */ + false, false, false, false, false, false, false, false, /* C8 */ + false, false, false, false, false, false, false, false, /* D0 */ + false, false, false, false, false, false, false, false, /* D8 */ + false, false, false, false, false, false, false, false, /* E0 */ + false, false, false, false, false, false, false, false, /* 
/**
 * Classify a byte as safe to leave unescaped in a normalised URL.
 *
 * Only printable ASCII (0x21 to 0x7E) can be left alone, and within that
 * range nine characters must still be percent escaped.  Everything else,
 * including space, control characters, DEL and all bytes with the top bit
 * set, needs escaping.
 *
 * Byte values are spelled out numerically so that the result cannot depend
 * on locale or on the execution character set.
 *
 * \param c character to classify.
 * \return true if the character should not be escaped else false.
 */
static bool nsurl__is_no_escape(unsigned char c)
{
	if (c <= 0x20 || c >= 0x7f) {
		/* Outside the printable ASCII range */
		return false;
	}

	switch (c) {
	case 0x22: /* double quote */
	case 0x25: /* percent */
	case 0x3c: /* less than */
	case 0x3e: /* greater than */
	case 0x5c: /* backslash */
	case 0x5e: /* caret */
	case 0x60: /* backtick */
	case 0x7b: /* left brace */
	case 0x7d: /* right brace */
		return false;

	default:
		return true;
	}
}
pos - url_s; + + off = marker.scheme_end - marker.start; + + /* Detect http(s) and mailto for scheme specifc + * normalisation */ + if (off == SLEN("http") && + (((*(pos - off + 0) == 'h') || + (*(pos - off + 0) == 'H')) && + ((*(pos - off + 1) == 't') || + (*(pos - off + 1) == 'T')) && + ((*(pos - off + 2) == 't') || + (*(pos - off + 2) == 'T')) && + ((*(pos - off + 3) == 'p') || + (*(pos - off + 3) == 'P')))) { + marker.scheme_type = NSURL_SCHEME_HTTP; + is_http = true; + } else if (off == SLEN("https") && + (((*(pos - off + 0) == 'h') || + (*(pos - off + 0) == 'H')) && + ((*(pos - off + 1) == 't') || + (*(pos - off + 1) == 'T')) && + ((*(pos - off + 2) == 't') || + (*(pos - off + 2) == 'T')) && + ((*(pos - off + 3) == 'p') || + (*(pos - off + 3) == 'P')) && + ((*(pos - off + 4) == 's') || + (*(pos - off + 4) == 'S')))) { + marker.scheme_type = NSURL_SCHEME_HTTPS; + is_http = true; + } else if (off == SLEN("ftp") && + (((*(pos - off + 0) == 'f') || + (*(pos - off + 0) == 'F')) && + ((*(pos - off + 1) == 't') || + (*(pos - off + 1) == 'T')) && + ((*(pos - off + 2) == 'p') || + (*(pos - off + 2) == 'P')))) { + marker.scheme_type = NSURL_SCHEME_FTP; + } else if (off == SLEN("mailto") && + (((*(pos - off + 0) == 'm') || + (*(pos - off + 0) == 'M')) && + ((*(pos - off + 1) == 'a') || + (*(pos - off + 1) == 'A')) && + ((*(pos - off + 2) == 'i') || + (*(pos - off + 2) == 'I')) && + ((*(pos - off + 3) == 'l') || + (*(pos - off + 3) == 'L')) && + ((*(pos - off + 4) == 't') || + (*(pos - off + 4) == 'T')) && + ((*(pos - off + 5) == 'o') || + (*(pos - off + 5) == 'O')))) { + marker.scheme_type = NSURL_SCHEME_MAILTO; + } + + /* Skip over colon */ + pos++; + + /* Mark place as start of authority */ + marker.authority = marker.colon_first = marker.at = + marker.colon_last = marker.path = + pos - url_s; + + } else { + /* Not found a scheme */ + if (joining == false) { + /* Assuming no scheme == http */ + marker.scheme_type = NSURL_SCHEME_HTTP; + is_http = true; + } + } + } + + 
/* Get authority + * + * Two slashes always indicates the start of an authority. + * + * We are more relaxed in the case of http: + * a. when joining, one or more slashes indicates start of authority + * b. when not joining, we assume authority if no scheme was present + * and in the case of mailto: when we assume there is an authority. + */ + if ((*pos == '/' && *(pos + 1) == '/') || + (is_http && ((joining && *pos == '/') || + (joining == false && + marker.scheme_end != marker.start))) || + marker.scheme_type == NSURL_SCHEME_MAILTO) { + + /* Skip over leading slashes */ + if (*pos == '/') { + if (is_http == false) { + if (*pos == '/') pos++; + if (*pos == '/') pos++; + } else { + while (*pos == '/') + pos++; + } + + marker.authority = marker.colon_first = marker.at = + marker.colon_last = marker.path = + pos - url_s; + } + + /* Need to get (or complete) the authority */ + while (*pos != '\0') { + if (*pos == '/' || *pos == '?' || *pos == '#') { + /* End of the authority */ + break; + + } else if (marker.scheme_type != NSURL_SCHEME_MAILTO && + *pos == ':' && marker.colon_first == + marker.authority) { + /* could be username:password or host:port + * separator */ + marker.colon_first = pos - url_s; + + } else if (marker.scheme_type != NSURL_SCHEME_MAILTO && + *pos == ':' && marker.colon_first != + marker.authority) { + /* could be host:port separator */ + marker.colon_last = pos - url_s; + + } else if (*pos == '@' && marker.at == + marker.authority) { + /* Credentials @ host separator */ + marker.at = pos - url_s; + } + + pos++; + } + + marker.path = pos - url_s; + + } else if ((*pos == '\0' || *pos == '/') && + joining == false && is_http == true) { + marker.path = pos - url_s; + } + + /* Get path + * + * Needs to start with '/' if there's no authority + */ + if (*pos == '/' || ((marker.path == marker.authority) && + (*pos != '?') && (*pos != '#') && (*pos != '\0'))) { + while (*(++pos) != '\0') { + if (*pos == '?' 
/**
 * Remove dot segments from a path, as per rfc 3986, 5.2.4
 *
 * The output is never longer than the input.  The output is not '\0'
 * terminated; use the returned size.
 *
 * \param path    path to remove dot segments from ('\0' terminated)
 * \param output  path with dot segments removed; must have room for at
 *                least as many bytes as \a path contains
 * \return size of output
 */
static size_t nsurl__remove_dot_segments(char *path, char *output)
{
	char *path_pos = path;
	char *output_pos = output;

	while (*path_pos != '\0') {
#ifdef NSURL_DEBUG
		LOG(" in:%s", path_pos);
		LOG("out:%.*s", (int)(output_pos - output), output);
#endif
		if (*path_pos == '.') {
			/* The input can only begin with '.' at the very start
			 * of the path, or straight after one of the prefixes
			 * below has been stripped. */
			if (*(path_pos + 1) == '.' &&
					*(path_pos + 2) == '/') {
				/* Found prefix of "../" */
				path_pos += sizeof("../") - 1;
				continue;

			} else if (*(path_pos + 1) == '/') {
				/* Found prefix of "./" */
				path_pos += sizeof("./") - 1;
				continue;

			} else if (*(path_pos + 1) == '\0') {
				/* Found "." at end of path */

				/* End of input path */
				break;

			} else if (*(path_pos + 1) == '.' &&
					*(path_pos + 2) == '\0') {
				/* Found ".." at end of path */

				/* End of input path */
				break;
			}
		} else if (*path_pos == '/' && *(path_pos + 1) == '.') {
			if (*(path_pos + 2) == '/') {
				/* Found prefix of "/./" */
				path_pos += sizeof("/.") - 1;
				continue;

			} else if (*(path_pos + 2) == '\0') {
				/* Found "/." at end of path */
				*(output_pos++) = '/';

				/* End of input path */
				break;

			} else if (*(path_pos + 2) == '.') {
				if (*(path_pos + 3) == '/') {
					/* Found prefix of "/../" */
					path_pos += sizeof("/..") - 1;

					/* Drop the last output segment along
					 * with its preceding '/' */
					if (output_pos > output)
						output_pos--;
					while (output_pos > output &&
							*output_pos != '/')
						output_pos--;

					continue;

				} else if (*(path_pos + 3) == '\0') {
					/* Found "/.." at end of path */

					/* Drop the last output segment, but
					 * keep the '/' before it */
					while (output_pos > output &&
							*(output_pos - 1) != '/')
						output_pos--;

					/* The "/.." is replaced by "/"; when
					 * nothing is left in the output there
					 * is no kept '/' to stand for it */
					if (output_pos == output)
						*(output_pos++) = '/';

					/* End of input path */
					break;
				}
			}
		}

		/* Copy first character into output path */
		*output_pos++ = *path_pos++;

		/* Copy up to but not including next '/' */
		while ((*path_pos != '/') && (*path_pos != '\0'))
			*output_pos++ = *path_pos++;
	}

	return output_pos - output;
}
+ */ +static inline int nsurl__get_ascii_offset(char c1, char c2) +{ + int offset; + + /* Use 1st char as most significant hex digit */ + if (ascii_is_digit(c1)) + offset = 16 * (c1 - '0'); + else if (c1 >= 'a' && c1 <= 'f') + offset = 16 * (c1 - 'a' + 10); + else if (c1 >= 'A' && c1 <= 'F') + offset = 16 * (c1 - 'A' + 10); + else + /* Not valid hex */ + return -1; + + /* Use 2nd char as least significant hex digit and sum */ + if (ascii_is_digit(c2)) + offset += c2 - '0'; + else if (c2 >= 'a' && c2 <= 'f') + offset += c2 - 'a' + 10; + else if (c2 >= 'A' && c2 <= 'F') + offset += c2 - 'A' + 10; + else + /* Not valid hex */ + return -1; + + return offset; +} + + +/** + * Create the components of a NetSurf URL object for a section of a URL string + * + * \param url_s URL string + * \param section Sets which section of URL string is to be normalised + * \param pegs Set of markers delimiting the URL string's sections + * \param pos_norm A buffer large enough for the normalised string (*3 + 1) + * \param url A NetSurf URL object, to which components may be added + * \return NSERROR_OK on success, appropriate error otherwise + * + * The section of url_s is normalised appropriately. + */ +static nserror nsurl__create_from_section(const char * const url_s, + const enum url_sections section, + const struct url_markers *pegs, + char *pos_norm, + struct nsurl_components *url) +{ + nserror ret; + int ascii_offset; + int start = 0; + int end = 0; + const char *pos; + const char *pos_url_s; + char *norm_start = pos_norm; + char *host; + size_t copy_len; + size_t length; + size_t host_len; + enum { + NSURL_F_NO_PORT = (1 << 0) + } flags = 0; + + switch (section) { + case URL_SCHEME: + start = pegs->start; + end = pegs->scheme_end; + break; + + case URL_CREDENTIALS: + start = pegs->authority; + end = pegs->at; + break; + + case URL_HOST: + start = (pegs->at == pegs->authority && + *(url_s + pegs->at) != '@') ? 
+ pegs->at : + pegs->at + 1; + end = pegs->path; + break; + + case URL_PATH: + start = pegs->path; + end = pegs->query; + break; + + case URL_QUERY: + start = pegs->query; + end = pegs->fragment; + break; + + case URL_FRAGMENT: + start = (*(url_s + pegs->fragment) != '#') ? + pegs->fragment : + pegs->fragment + 1; + end = pegs->end; + break; + } + + if (end < start) + end = start; + + length = end - start; + + /* Stage 1: Normalise the required section */ + + pos = pos_url_s = url_s + start; + copy_len = 0; + for (; pos < url_s + end; pos++) { + if (*pos == '%' && (pos + 2 < url_s + end)) { + /* Might be an escaped character needing unescaped */ + + /* Find which character which was escaped */ + ascii_offset = nsurl__get_ascii_offset(*(pos + 1), + *(pos + 2)); + + if (ascii_offset < 0) { + /* % with invalid hex digits. */ + copy_len++; + continue; + } + + if ((section != URL_SCHEME && section != URL_HOST) && + (nsurl__is_unreserved(ascii_offset) == false)) { + /* This character should be escaped after all, + * just let it get copied */ + copy_len += 3; + pos += 2; + continue; + } + + if (copy_len > 0) { + /* Copy up to here */ + memcpy(pos_norm, pos_url_s, copy_len); + pos_norm += copy_len; + copy_len = 0; + } + + /* Put the unescaped character in the normalised URL */ + *(pos_norm++) = (char)ascii_offset; + pos += 2; + pos_url_s = pos + 1; + + length -= 2; + + } else if ((section != URL_SCHEME && section != URL_HOST) && + (nsurl__is_no_escape(*pos) == false)) { + + /* This needs to be escaped */ + if (copy_len > 0) { + /* Copy up to here */ + memcpy(pos_norm, pos_url_s, copy_len); + pos_norm += copy_len; + copy_len = 0; + } + + /* escape */ + *(pos_norm++) = '%'; + *(pos_norm++) = digit2uppercase_hex( + ((unsigned char)*pos) >> 4); + *(pos_norm++) = digit2uppercase_hex( + ((unsigned char)*pos) & 0xf); + pos_url_s = pos + 1; + + length += 2; + + } else if ((section == URL_SCHEME || section == URL_HOST) && + ascii_is_alpha_upper(*pos)) { + /* Lower case this letter 
*/ + + if (copy_len > 0) { + /* Copy up to here */ + memcpy(pos_norm, pos_url_s, copy_len); + pos_norm += copy_len; + copy_len = 0; + } + /* Copy lower cased letter into normalised URL */ + *(pos_norm++) = ascii_to_lower(*pos); + pos_url_s = pos + 1; + + } else { + /* This character is safe in normalised URL */ + copy_len++; + } + } + + if (copy_len > 0) { + /* Copy up to here */ + memcpy(pos_norm, pos_url_s, copy_len); + pos_norm += copy_len; + } + + /* Mark end of section */ + (*pos_norm) = '\0'; + + /* Stage 2: Create the URL components for the required section */ + switch (section) { + case URL_SCHEME: + if (length == 0) { + /* No scheme, assuming http */ + url->scheme = lwc_string_ref(corestring_lwc_http); + } else { + /* Add scheme to URL */ + if (lwc_intern_string(norm_start, length, + &url->scheme) != lwc_error_ok) { + return NSERROR_NOMEM; + } + } + + break; + + case URL_CREDENTIALS: + url->username = NULL; + url->password = NULL; + + if (length != 0 && *norm_start != ':') { + char *sec_start = norm_start; + if (pegs->colon_first != pegs->authority && + pegs->at > pegs->colon_first + 1) { + /* there's a password */ + sec_start += pegs->colon_first - + pegs->authority + 1; + if (lwc_intern_string(sec_start, + pegs->at - pegs->colon_first -1, + &url->password) != + lwc_error_ok) { + return NSERROR_NOMEM; + } + + /* update start pos and length for username */ + sec_start = norm_start; + length -= pegs->at - pegs->colon_first; + } else if (pegs->colon_first != pegs->authority && + pegs->at == pegs->colon_first + 1) { + /* strip username colon */ + length--; + } + + /* Username */ + if (lwc_intern_string(sec_start, length, + &url->username) != lwc_error_ok) { + return NSERROR_NOMEM; + } + } + + break; + + case URL_HOST: + url->host = NULL; + url->port = NULL; + + if (length != 0) { + size_t colon = 0; + char *sec_start = norm_start; + if (pegs->at < pegs->colon_first && + pegs->colon_last == pegs->authority) { + /* There's one colon and it's after @ marker */ + 
colon = pegs->colon_first; + } else if (pegs->colon_last != pegs->authority) { + /* There's more than one colon */ + colon = pegs->colon_last; + } else { + /* There's no colon that could be a port + * separator */ + flags |= NSURL_F_NO_PORT; + } + + if (!(flags & NSURL_F_NO_PORT)) { + /* Determine whether colon is a port separator + */ + sec_start += colon - pegs->at; + while (++sec_start < norm_start + length) { + if (!ascii_is_digit(*sec_start)) { + /* Character after port isn't a + * digit; not a port separator + */ + flags |= NSURL_F_NO_PORT; + break; + } + } + } + + if (!(flags & NSURL_F_NO_PORT)) { + /* There's a port */ + size_t skip = (pegs->at == pegs->authority) ? + 1 : 0; + sec_start = norm_start + colon - pegs->at + + skip; + if (url->scheme != NULL && + url->scheme_type == + NSURL_SCHEME_HTTP && + length - + (colon - pegs->at + skip) == 2 && + *sec_start == '8' && + *(sec_start + 1) == '0') { + /* Scheme is http, and port is default + * (80) */ + flags |= NSURL_F_NO_PORT; + } + + if (length <= (colon - pegs->at + skip)) { + /* No space for a port after the colon + */ + flags |= NSURL_F_NO_PORT; + } + + /* Add non-redundant ports to NetSurf URL */ + sec_start = norm_start + colon - pegs->at + + skip; + if (!(flags & NSURL_F_NO_PORT) && + lwc_intern_string(sec_start, + length - + (colon - pegs->at + skip), + &url->port) != lwc_error_ok) { + return NSERROR_NOMEM; + } + + /* update length for host */ + skip = (pegs->at == pegs->authority) ? 
0 : 1; + length = colon - pegs->at - skip; + } + + /* host */ + /* Encode host according to IDNA2008 */ + ret = idna_encode(norm_start, length, &host, &host_len); + if (ret == NSERROR_OK) { + /* valid idna encoding */ + if (lwc_intern_string(host, host_len, + &url->host) != lwc_error_ok) { + return NSERROR_NOMEM; + } + free(host); + } else { + /* fall back to straight interning */ + if (lwc_intern_string(norm_start, length, + &url->host) != lwc_error_ok) { + return NSERROR_NOMEM; + } + } + } + + break; + + case URL_PATH: + if (length != 0) { + if (lwc_intern_string(norm_start, length, + &url->path) != lwc_error_ok) { + return NSERROR_NOMEM; + } + } else if (url->host != NULL && + url->scheme_type != NSURL_SCHEME_MAILTO) { + /* Set empty path to "/", if there's a host */ + if (lwc_intern_string("/", SLEN("/"), + &url->path) != lwc_error_ok) { + return NSERROR_NOMEM; + } + } else { + url->path = NULL; + } + + break; + + case URL_QUERY: + if (length != 0) { + if (lwc_intern_string(norm_start, length, + &url->query) != lwc_error_ok) { + return NSERROR_NOMEM; + } + } else { + url->query = NULL; + } + + break; + + case URL_FRAGMENT: + if (length != 0) { + if (lwc_intern_string(norm_start, length, + &url->fragment) != lwc_error_ok) { + return NSERROR_NOMEM; + } + } else { + url->fragment = NULL; + } + + break; + } + + return NSERROR_OK; +} + + +/** + * Get nsurl string info; total length, component lengths, & components present + * + * \param url NetSurf URL components + * \param parts Which parts of the URL are required in the string + * \param url_l Updated to total string length + * \param lengths Updated with individual component lengths + * \param pflags Updated to contain relevant string flags + */ +static void nsurl__get_string_data(const struct nsurl_components *url, + nsurl_component parts, size_t *url_l, + struct nsurl_component_lengths *lengths, + enum nsurl_string_flags *pflags) +{ + enum nsurl_string_flags flags = *pflags; + *url_l = 0; + + /* Intersection of 
required parts and available parts gives + * the output parts */ + if (url->scheme && parts & NSURL_SCHEME) { + flags |= NSURL_F_SCHEME; + + lengths->scheme = lwc_string_length(url->scheme); + *url_l += lengths->scheme; + } + + if (url->username && parts & NSURL_USERNAME) { + flags |= NSURL_F_USERNAME; + + lengths->username = lwc_string_length(url->username); + *url_l += lengths->username; + } + + if (url->password && parts & NSURL_PASSWORD) { + flags |= NSURL_F_PASSWORD; + + lengths->password = lwc_string_length(url->password); + *url_l += SLEN(":") + lengths->password; + } + + if (url->host && parts & NSURL_HOST) { + flags |= NSURL_F_HOST; + + lengths->host = lwc_string_length(url->host); + *url_l += lengths->host; + } + + if (url->port && parts & NSURL_PORT) { + flags |= NSURL_F_PORT; + + lengths->port = lwc_string_length(url->port); + *url_l += SLEN(":") + lengths->port; + } + + if (url->path && parts & NSURL_PATH) { + flags |= NSURL_F_PATH; + + lengths->path = lwc_string_length(url->path); + *url_l += lengths->path; + } + + if (url->query && parts & NSURL_QUERY) { + flags |= NSURL_F_QUERY; + + lengths->query = lwc_string_length(url->query); + *url_l += lengths->query; + } + + if (url->fragment && parts & NSURL_FRAGMENT) { + flags |= NSURL_F_FRAGMENT; + + lengths->fragment = lwc_string_length(url->fragment); + *url_l += lengths->fragment; + } + + /* Turn on any spanned punctuation */ + if ((flags & NSURL_F_SCHEME) && (parts > NSURL_SCHEME)) { + flags |= NSURL_F_SCHEME_PUNCTUATION; + + *url_l += SLEN(":"); + } + + if ((flags & NSURL_F_SCHEME) && (flags > NSURL_F_SCHEME) && + url->path && lwc_string_data(url->path)[0] == '/') { + flags |= NSURL_F_AUTHORITY_PUNCTUATION; + + *url_l += SLEN("//"); + } + + if ((flags & (NSURL_F_USERNAME | NSURL_F_PASSWORD)) && + flags & NSURL_F_HOST) { + flags |= NSURL_F_CREDENTIALS_PUNCTUATION; + + *url_l += SLEN("@"); + } + + if ((flags & ~NSURL_F_FRAGMENT) && (flags & NSURL_F_FRAGMENT)) { + flags |= NSURL_F_FRAGMENT_PUNCTUATION; + 
+ *url_l += SLEN("#"); + } + + *pflags = flags; +} + + +/** + * Get nsurl string info; total length, component lengths, & components present + * + * \param url NetSurf URL components + * \param url_s Updated to contain the string + * \param l Individual component lengths + * \param flags String flags + */ +static void nsurl_get_string(const struct nsurl_components *url, char *url_s, + struct nsurl_component_lengths *l, + enum nsurl_string_flags flags) +{ + char *pos; + + /* Copy the required parts into the url string */ + pos = url_s; + + if (flags & NSURL_F_SCHEME) { + memcpy(pos, lwc_string_data(url->scheme), l->scheme); + pos += l->scheme; + } + + if (flags & NSURL_F_SCHEME_PUNCTUATION) { + *(pos++) = ':'; + } + + if (flags & NSURL_F_AUTHORITY_PUNCTUATION) { + *(pos++) = '/'; + *(pos++) = '/'; + } + + if (flags & NSURL_F_USERNAME) { + memcpy(pos, lwc_string_data(url->username), l->username); + pos += l->username; + } + + if (flags & NSURL_F_PASSWORD) { + *(pos++) = ':'; + memcpy(pos, lwc_string_data(url->password), l->password); + pos += l->password; + } + + if (flags & NSURL_F_CREDENTIALS_PUNCTUATION) { + *(pos++) = '@'; + } + + if (flags & NSURL_F_HOST) { + memcpy(pos, lwc_string_data(url->host), l->host); + pos += l->host; + } + + if (flags & NSURL_F_PORT) { + *(pos++) = ':'; + memcpy(pos, lwc_string_data(url->port), l->port); + pos += l->port; + } + + if (flags & NSURL_F_PATH) { + memcpy(pos, lwc_string_data(url->path), l->path); + pos += l->path; + } + + if (flags & NSURL_F_QUERY) { + memcpy(pos, lwc_string_data(url->query), l->query); + pos += l->query; + } + + if (flags & NSURL_F_FRAGMENT) { + if (flags & NSURL_F_FRAGMENT_PUNCTUATION) + *(pos++) = '#'; + memcpy(pos, lwc_string_data(url->fragment), l->fragment); + pos += l->fragment; + } + + *pos = '\0'; +} + + +/** + * Calculate hash value + * + * \param url NetSurf URL object to set hash value for + */ +static void nsurl_calc_hash(nsurl *url) +{ + uint32_t hash = 0; + + if (url->components.scheme) + hash 
^= lwc_string_hash_value(url->components.scheme); + + if (url->components.username) + hash ^= lwc_string_hash_value(url->components.username); + + if (url->components.password) + hash ^= lwc_string_hash_value(url->components.password); + + if (url->components.host) + hash ^= lwc_string_hash_value(url->components.host); + + if (url->components.port) + hash ^= lwc_string_hash_value(url->components.port); + + if (url->components.path) + hash ^= lwc_string_hash_value(url->components.path); + + if (url->components.query) + hash ^= lwc_string_hash_value(url->components.query); + + url->hash = hash; +} + + +/** + * Destroy components + * + * \param c url components + */ +static void nsurl_destroy_components(struct nsurl_components *c) +{ + if (c->scheme) + lwc_string_unref(c->scheme); + + if (c->username) + lwc_string_unref(c->username); + + if (c->password) + lwc_string_unref(c->password); + + if (c->host) + lwc_string_unref(c->host); + + if (c->port) + lwc_string_unref(c->port); + + if (c->path) + lwc_string_unref(c->path); + + if (c->query) + lwc_string_unref(c->query); + + if (c->fragment) + lwc_string_unref(c->fragment); +} + + +#ifdef NSURL_DEBUG +/** + * Dump a NetSurf URL's internal components + * + * \param url The NetSurf URL to dump components of + */ +static void nsurl__dump(const nsurl *url) +{ + if (url->components.scheme) + LOG(" Scheme: %s", lwc_string_data(url->components.scheme)); + + if (url->components.username) + LOG("Username: %s", lwc_string_data(url->components.username)); + + if (url->components.password) + LOG("Password: %s", lwc_string_data(url->components.password)); + + if (url->components.host) + LOG(" Host: %s", lwc_string_data(url->components.host)); + + if (url->components.port) + LOG(" Port: %s", lwc_string_data(url->components.port)); + + if (url->components.path) + LOG(" Path: %s", lwc_string_data(url->components.path)); + + if (url->components.query) + LOG(" Query: %s", lwc_string_data(url->components.query)); + + if 
(url->components.fragment) + LOG("Fragment: %s", lwc_string_data(url->components.fragment)); +} +#endif + +/****************************************************************************** + * NetSurf URL Public API * + ******************************************************************************/ + +/* exported interface, documented in nsurl.h */ +nserror nsurl_create(const char * const url_s, nsurl **url) +{ + struct url_markers m; + struct nsurl_components c; + size_t length; + char *buff; + struct nsurl_component_lengths str_len = { 0, 0, 0, 0, 0, 0, 0, 0 }; + enum nsurl_string_flags str_flags = 0; + nserror e = NSERROR_OK; + bool match; + + assert(url_s != NULL); + + /* Peg out the URL sections */ + nsurl__get_string_markers(url_s, &m, false); + + /* Get the length of the longest section */ + length = nsurl__get_longest_section(&m); + + /* Allocate enough memory to url escape the longest section */ + buff = malloc(length * 3 + 1); + if (buff == NULL) + return NSERROR_NOMEM; + + /* Set scheme type */ + c.scheme_type = m.scheme_type; + + /* Build NetSurf URL object from sections */ + e |= nsurl__create_from_section(url_s, URL_SCHEME, &m, buff, &c); + e |= nsurl__create_from_section(url_s, URL_CREDENTIALS, &m, buff, &c); + e |= nsurl__create_from_section(url_s, URL_HOST, &m, buff, &c); + e |= nsurl__create_from_section(url_s, URL_PATH, &m, buff, &c); + e |= nsurl__create_from_section(url_s, URL_QUERY, &m, buff, &c); + e |= nsurl__create_from_section(url_s, URL_FRAGMENT, &m, buff, &c); + + /* Finished with buffer */ + free(buff); + + if (e != NSERROR_OK) { + nsurl_destroy_components(&c); + return NSERROR_NOMEM; + } + + /* Validate URL */ + if ((lwc_string_isequal(c.scheme, corestring_lwc_http, + &match) == lwc_error_ok && match == true) || + (lwc_string_isequal(c.scheme, corestring_lwc_https, + &match) == lwc_error_ok && match == true)) { + /* http, https must have host */ + if (c.host == NULL) { + nsurl_destroy_components(&c); + return NSERROR_BAD_URL; + } + } + + 
/* Get the string length and find which parts of url are present */ + nsurl__get_string_data(&c, NSURL_WITH_FRAGMENT, &length, + &str_len, &str_flags); + + /* Create NetSurf URL object */ + *url = malloc(sizeof(nsurl) + length + 1); /* Add 1 for \0 */ + if (*url == NULL) { + nsurl_destroy_components(&c); + return NSERROR_NOMEM; + } + + (*url)->components = c; + (*url)->length = length; + + /* Fill out the url string */ + nsurl_get_string(&c, (*url)->string, &str_len, str_flags); + + /* Get the nsurl's hash */ + nsurl_calc_hash(*url); + + /* Give the URL a reference */ + (*url)->count = 1; + + return NSERROR_OK; +} + + +/* exported interface, documented in nsurl.h */ +nsurl *nsurl_ref(nsurl *url) +{ + assert(url != NULL); + + url->count++; + + return url; +} + + +/* exported interface, documented in nsurl.h */ +void nsurl_unref(nsurl *url) +{ + assert(url != NULL); + assert(url->count > 0); + + if (--url->count > 0) + return; + +#ifdef NSURL_DEBUG + nsurl__dump(url); +#endif + + /* Release lwc strings */ + nsurl_destroy_components(&url->components); + + /* Free the NetSurf URL */ + free(url); +} + + +/* exported interface, documented in nsurl.h */ +bool nsurl_compare(const nsurl *url1, const nsurl *url2, nsurl_component parts) +{ + bool match = true; + + assert(url1 != NULL); + assert(url2 != NULL); + + /* Compare URL components */ + + /* Path, host and query first, since they're most likely to differ */ + + if (parts & NSURL_PATH) { + nsurl__component_compare(url1->components.path, + url2->components.path, &match); + + if (match == false) + return false; + } + + if (parts & NSURL_HOST) { + nsurl__component_compare(url1->components.host, + url2->components.host, &match); + + if (match == false) + return false; + } + + if (parts & NSURL_QUERY) { + nsurl__component_compare(url1->components.query, + url2->components.query, &match); + + if (match == false) + return false; + } + + if (parts & NSURL_SCHEME) { + nsurl__component_compare(url1->components.scheme, + 
url2->components.scheme, &match); + + if (match == false) + return false; + } + + if (parts & NSURL_USERNAME) { + nsurl__component_compare(url1->components.username, + url2->components.username, &match); + + if (match == false) + return false; + } + + if (parts & NSURL_PASSWORD) { + nsurl__component_compare(url1->components.password, + url2->components.password, &match); + + if (match == false) + return false; + } + + if (parts & NSURL_PORT) { + nsurl__component_compare(url1->components.port, + url2->components.port, &match); + + if (match == false) + return false; + } + + if (parts & NSURL_FRAGMENT) { + nsurl__component_compare(url1->components.fragment, + url2->components.fragment, &match); + + if (match == false) + return false; + } + + return true; +} + + +/* exported interface, documented in nsurl.h */ +nserror nsurl_get(const nsurl *url, nsurl_component parts, + char **url_s, size_t *url_l) +{ + struct nsurl_component_lengths str_len = { 0, 0, 0, 0, 0, 0, 0, 0 }; + enum nsurl_string_flags str_flags = 0; + + assert(url != NULL); + + /* Get the string length and find which parts of url need copied */ + nsurl__get_string_data(&(url->components), parts, url_l, + &str_len, &str_flags); + + if (*url_l == 0) { + return NSERROR_BAD_URL; + } + + /* Allocate memory for url string */ + *url_s = malloc(*url_l + 1); /* adding 1 for '\0' */ + if (*url_s == NULL) { + return NSERROR_NOMEM; + } + + /* Copy the required parts into the url string */ + nsurl_get_string(&(url->components), *url_s, &str_len, str_flags); + + return NSERROR_OK; +} + + +/* exported interface, documented in nsurl.h */ +lwc_string *nsurl_get_component(const nsurl *url, nsurl_component part) +{ + assert(url != NULL); + + switch (part) { + case NSURL_SCHEME: + return (url->components.scheme != NULL) ? + lwc_string_ref(url->components.scheme) : NULL; + + case NSURL_USERNAME: + return (url->components.username != NULL) ? 
+ lwc_string_ref(url->components.username) : NULL; + + case NSURL_PASSWORD: + return (url->components.password != NULL) ? + lwc_string_ref(url->components.password) : NULL; + + case NSURL_HOST: + return (url->components.host != NULL) ? + lwc_string_ref(url->components.host) : NULL; + + case NSURL_PORT: + return (url->components.port != NULL) ? + lwc_string_ref(url->components.port) : NULL; + + case NSURL_PATH: + return (url->components.path != NULL) ? + lwc_string_ref(url->components.path) : NULL; + + case NSURL_QUERY: + return (url->components.query != NULL) ? + lwc_string_ref(url->components.query) : NULL; + + case NSURL_FRAGMENT: + return (url->components.fragment != NULL) ? + lwc_string_ref(url->components.fragment) : NULL; + + default: + LOG("Unsupported value passed to part param."); + assert(0); + } + + return NULL; +} + + +/* exported interface, documented in nsurl.h */ +bool nsurl_has_component(const nsurl *url, nsurl_component part) +{ + assert(url != NULL); + + switch (part) { + case NSURL_SCHEME: + if (url->components.scheme != NULL) + return true; + else + return false; + + case NSURL_CREDENTIALS: + /* Only username required for credentials section */ + /* Fall through */ + case NSURL_USERNAME: + if (url->components.username != NULL) + return true; + else + return false; + + case NSURL_PASSWORD: + if (url->components.password != NULL) + return true; + else + return false; + + case NSURL_HOST: + if (url->components.host != NULL) + return true; + else + return false; + + case NSURL_PORT: + if (url->components.port != NULL) + return true; + else + return false; + + case NSURL_PATH: + if (url->components.path != NULL) + return true; + else + return false; + + case NSURL_QUERY: + if (url->components.query != NULL) + return true; + else + return false; + + case NSURL_FRAGMENT: + if (url->components.fragment != NULL) + return true; + else + return false; + + default: + LOG("Unsupported value passed to part param."); + assert(0); + } + + return false; +} + + 
+/* exported interface, documented in nsurl.h */ +const char *nsurl_access(const nsurl *url) +{ + assert(url != NULL); + + return url->string; +} + + +/* exported interface, documented in nsurl.h */ +nserror nsurl_get_utf8(const nsurl *url, char **url_s, size_t *url_l) +{ + nserror err; + lwc_string *host; + char *idna_host = NULL; + size_t idna_host_len; + char *scheme = NULL; + size_t scheme_len; + char *path = NULL; + size_t path_len; + + assert(url != NULL); + + if (url->components.host == NULL) { + return nsurl_get(url, NSURL_WITH_FRAGMENT, url_s, url_l); + } + + host = url->components.host; + err = idna_decode(lwc_string_data(host), lwc_string_length(host), + &idna_host, &idna_host_len); + if (err != NSERROR_OK) { + goto cleanup; + } + + err = nsurl_get(url, + NSURL_SCHEME | NSURL_CREDENTIALS, + &scheme, &scheme_len); + if (err != NSERROR_OK) { + goto cleanup; + } + + err = nsurl_get(url, + NSURL_PORT | NSURL_PATH | NSURL_QUERY | NSURL_FRAGMENT, + &path, &path_len); + if (err != NSERROR_OK) { + goto cleanup; + } + + *url_l = scheme_len + idna_host_len + path_len + 1; /* +1 for \0 */ + *url_s = malloc(*url_l); + + if (*url_s == NULL) { + err = NSERROR_NOMEM; + goto cleanup; + } + + snprintf(*url_s, *url_l, "%s%s%s", scheme, idna_host, path); + + err = NSERROR_OK; + +cleanup: + free(idna_host); + free(scheme); + free(path); + + return err; +} + + +/* exported interface, documented in nsurl.h */ +const char *nsurl_access_leaf(const nsurl *url) +{ + size_t path_len; + const char *path; + const char *leaf; + + assert(url != NULL); + + if (url->components.path == NULL) + return ""; + + path = lwc_string_data(url->components.path); + path_len = lwc_string_length(url->components.path); + + if (path_len == 0) + return ""; + + if (path_len == 1 && *path == '/') + return "/"; + + leaf = path + path_len; + + do { + leaf--; + } while ((leaf != path) && (*leaf != '/')); + + if (*leaf == '/') + leaf++; + + return leaf; +} + + +/* exported interface, documented in nsurl.h */ 
+size_t nsurl_length(const nsurl *url) +{ + assert(url != NULL); + + return url->length; +} + + +/* exported interface, documented in nsurl.h */ +uint32_t nsurl_hash(const nsurl *url) +{ + assert(url != NULL); + + return url->hash; +} + + +/* exported interface, documented in nsurl.h */ +nserror nsurl_join(const nsurl *base, const char *rel, nsurl **joined) +{ + struct url_markers m; + struct nsurl_components c; + size_t length; + char *buff; + char *buff_pos; + char *buff_start; + struct nsurl_component_lengths str_len = { 0, 0, 0, 0, 0, 0, 0, 0 }; + enum nsurl_string_flags str_flags = 0; + nserror error = 0; + enum { + NSURL_F_REL = 0, + NSURL_F_BASE_SCHEME = (1 << 0), + NSURL_F_BASE_AUTHORITY = (1 << 1), + NSURL_F_BASE_PATH = (1 << 2), + NSURL_F_MERGED_PATH = (1 << 3), + NSURL_F_BASE_QUERY = (1 << 4) + } joined_parts; + + assert(base != NULL); + assert(rel != NULL); + +#ifdef NSURL_DEBUG + LOG("base: \"%s\", rel: \"%s\"", nsurl_access(base), rel); +#endif + + /* Peg out the URL sections */ + nsurl__get_string_markers(rel, &m, true); + + /* Get the length of the longest section */ + length = nsurl__get_longest_section(&m); + + /* Initially assume that the joined URL can be formed entierly from + * the relative URL. + */ + joined_parts = NSURL_F_REL; + + /* Update joined_compnents to indicate any required parts from the + * base URL. + */ + if (m.scheme_end - m.start <= 0) { + /* The relative url has no scheme. + * Use base URL's scheme. */ + joined_parts |= NSURL_F_BASE_SCHEME; + + if (m.path - m.authority <= 0) { + /* The relative URL has no authority. + * Use base URL's authority. */ + joined_parts |= NSURL_F_BASE_AUTHORITY; + + if (m.query - m.path <= 0) { + /* The relative URL has no path. + * Use base URL's path. */ + joined_parts |= NSURL_F_BASE_PATH; + + if (m.fragment - m.query <= 0) { + /* The relative URL has no query. + * Use base URL's query. 
*/ + joined_parts |= NSURL_F_BASE_QUERY; + } + + } else if (*(rel + m.path) != '/') { + /* Relative URL has relative path */ + joined_parts |= NSURL_F_MERGED_PATH; + } + } + } + + /* Allocate enough memory to url escape the longest section, plus + * space for path merging (if required). + */ + if (joined_parts & NSURL_F_MERGED_PATH) { + /* Need to merge paths */ + length += (base->components.path != NULL) ? + lwc_string_length(base->components.path) : 0; + } + length *= 4; + /* Plus space for removing dots from path */ + length += (m.query - m.path) + ((base->components.path != NULL) ? + lwc_string_length(base->components.path) : 0); + + buff = malloc(length + 5); + if (buff == NULL) { + return NSERROR_NOMEM; + } + + buff_pos = buff; + + /* Form joined URL from base or rel components, as appropriate */ + + if (joined_parts & NSURL_F_BASE_SCHEME) { + c.scheme_type = base->components.scheme_type; + + c.scheme = nsurl__component_copy(base->components.scheme); + } else { + c.scheme_type = m.scheme_type; + + error = nsurl__create_from_section(rel, URL_SCHEME, &m, buff, &c); + if (error != NSERROR_OK) { + free(buff); + return error; + } + } + + if (joined_parts & NSURL_F_BASE_AUTHORITY) { + c.username = nsurl__component_copy(base->components.username); + c.password = nsurl__component_copy(base->components.password); + c.host = nsurl__component_copy(base->components.host); + c.port = nsurl__component_copy(base->components.port); + } else { + error = nsurl__create_from_section(rel, URL_CREDENTIALS, &m, + buff, &c); + if (error == NSERROR_OK) { + error = nsurl__create_from_section(rel, URL_HOST, &m, + buff, &c); + } + if (error != NSERROR_OK) { + free(buff); + return error; + } + } + + if (joined_parts & NSURL_F_BASE_PATH) { + c.path = nsurl__component_copy(base->components.path); + + } else if (joined_parts & NSURL_F_MERGED_PATH) { + struct url_markers m_path; + size_t new_length; + + if (base->components.host != NULL && + base->components.path == NULL) { + /* Append 
relative path to "/". */ + *(buff_pos++) = '/'; + memcpy(buff_pos, rel + m.path, m.query - m.path); + buff_pos += m.query - m.path; + + } else { + /* Append relative path to all but last segment of + * base path. */ + size_t path_end = lwc_string_length( + base->components.path); + const char *path = lwc_string_data( + base->components.path); + + while (*(path + path_end) != '/' && + path_end != 0) { + path_end--; + } + if (*(path + path_end) == '/') + path_end++; + + /* Copy the base part */ + memcpy(buff_pos, path, path_end); + buff_pos += path_end; + + /* Copy the relative part */ + memcpy(buff_pos, rel + m.path, m.query - m.path); + buff_pos += m.query - m.path; + } + + /* add termination to string */ + *buff_pos++ = '\0'; + + new_length = nsurl__remove_dot_segments(buff, buff_pos); + + m_path.path = 0; + m_path.query = new_length; + + buff_start = buff_pos + new_length; + error = nsurl__create_from_section(buff_pos, URL_PATH, &m_path, + buff_start, &c); + if (error != NSERROR_OK) { + free(buff); + return error; + } + + } else { + struct url_markers m_path; + size_t new_length; + + memcpy(buff_pos, rel + m.path, m.query - m.path); + buff_pos += m.query - m.path; + *(buff_pos++) = '\0'; + + new_length = nsurl__remove_dot_segments(buff, buff_pos); + + m_path.path = 0; + m_path.query = new_length; + + buff_start = buff_pos + new_length; + + error = nsurl__create_from_section(buff_pos, URL_PATH, &m_path, + buff_start, &c); + if (error != NSERROR_OK) { + free(buff); + return error; + } + } + + if (joined_parts & NSURL_F_BASE_QUERY) { + c.query = nsurl__component_copy(base->components.query); + } else { + error = nsurl__create_from_section(rel, URL_QUERY, &m, + buff, &c); + if (error != NSERROR_OK) { + free(buff); + return error; + } + } + + error = nsurl__create_from_section(rel, URL_FRAGMENT, &m, buff, &c); + + /* Free temporary buffer */ + free(buff); + + if (error != NSERROR_OK) { + return error; + } + + /* Get the string length and find which parts of url are 
present */ + nsurl__get_string_data(&c, NSURL_WITH_FRAGMENT, &length, + &str_len, &str_flags); + + /* Create NetSurf URL object */ + *joined = malloc(sizeof(nsurl) + length + 1); /* Add 1 for \0 */ + if (*joined == NULL) { + return NSERROR_NOMEM; + } + + (*joined)->components = c; + (*joined)->length = length; + + /* Fill out the url string */ + nsurl_get_string(&c, (*joined)->string, &str_len, str_flags); + + /* Get the nsurl's hash */ + nsurl_calc_hash(*joined); + + /* Give the URL a reference */ + (*joined)->count = 1; + + return NSERROR_OK; +} + + +/* exported interface, documented in nsurl.h */ +nserror nsurl_defragment(const nsurl *url, nsurl **no_frag) +{ + size_t length; + char *pos; + + assert(url != NULL); + + /* check for source url having no fragment already */ + if (url->components.fragment == NULL) { + *no_frag = (nsurl *)url; + + (*no_frag)->count++; + + return NSERROR_OK; + } + + /* Find the change in length from url to new_url */ + length = url->length; + if (url->components.fragment != NULL) { + length -= 1 + lwc_string_length(url->components.fragment); + } + + /* Create NetSurf URL object */ + *no_frag = malloc(sizeof(nsurl) + length + 1); /* Add 1 for \0 */ + if (*no_frag == NULL) { + return NSERROR_NOMEM; + } + + /* Copy components */ + (*no_frag)->components.scheme = + nsurl__component_copy(url->components.scheme); + (*no_frag)->components.username = + nsurl__component_copy(url->components.username); + (*no_frag)->components.password = + nsurl__component_copy(url->components.password); + (*no_frag)->components.host = + nsurl__component_copy(url->components.host); + (*no_frag)->components.port = + nsurl__component_copy(url->components.port); + (*no_frag)->components.path = + nsurl__component_copy(url->components.path); + (*no_frag)->components.query = + nsurl__component_copy(url->components.query); + (*no_frag)->components.fragment = NULL; + + (*no_frag)->components.scheme_type = url->components.scheme_type; + + (*no_frag)->length = length; + + 
/* Fill out the url string */ + pos = (*no_frag)->string; + memcpy(pos, url->string, length); + pos += length; + *pos = '\0'; + + /* Get the nsurl's hash */ + nsurl_calc_hash(*no_frag); + + /* Give the URL a reference */ + (*no_frag)->count = 1; + + return NSERROR_OK; +} + + +/* exported interface, documented in nsurl.h */ +nserror nsurl_refragment(const nsurl *url, lwc_string *frag, nsurl **new_url) +{ + int frag_len; + int base_len; + char *pos; + size_t len; + + assert(url != NULL); + assert(frag != NULL); + + /* Find the change in length from url to new_url */ + base_len = url->length; + if (url->components.fragment != NULL) { + base_len -= 1 + lwc_string_length(url->components.fragment); + } + frag_len = lwc_string_length(frag); + + /* Set new_url's length */ + len = base_len + 1 /* # */ + frag_len; + + /* Create NetSurf URL object */ + *new_url = malloc(sizeof(nsurl) + len + 1); /* Add 1 for \0 */ + if (*new_url == NULL) { + return NSERROR_NOMEM; + } + + (*new_url)->length = len; + + /* Set string */ + pos = (*new_url)->string; + memcpy(pos, url->string, base_len); + pos += base_len; + *pos = '#'; + memcpy(++pos, lwc_string_data(frag), frag_len); + pos += frag_len; + *pos = '\0'; + + /* Copy components */ + (*new_url)->components.scheme = + nsurl__component_copy(url->components.scheme); + (*new_url)->components.username = + nsurl__component_copy(url->components.username); + (*new_url)->components.password = + nsurl__component_copy(url->components.password); + (*new_url)->components.host = + nsurl__component_copy(url->components.host); + (*new_url)->components.port = + nsurl__component_copy(url->components.port); + (*new_url)->components.path = + nsurl__component_copy(url->components.path); + (*new_url)->components.query = + nsurl__component_copy(url->components.query); + (*new_url)->components.fragment = + lwc_string_ref(frag); + + (*new_url)->components.scheme_type = url->components.scheme_type; + + /* Get the nsurl's hash */ + nsurl_calc_hash(*new_url); + + 
/* Give the URL a reference */ + (*new_url)->count = 1; + + return NSERROR_OK; +} + + +/* exported interface, documented in nsurl.h */ +nserror nsurl_replace_query(const nsurl *url, const char *query, + nsurl **new_url) +{ + int query_len; /* Length of new query string, including '?' */ + int frag_len = 0; /* Length of fragment, including '#' */ + int base_len; /* Length of URL up to start of query */ + char *pos; + size_t len; + lwc_string *lwc_query; + + assert(url != NULL); + assert(query != NULL); + assert(query[0] == '?'); + + /* Get the length of the new query */ + query_len = strlen(query); + + /* Find the change in length from url to new_url */ + base_len = url->length; + if (url->components.query != NULL) { + base_len -= lwc_string_length(url->components.query); + } + if (url->components.fragment != NULL) { + frag_len = 1 + lwc_string_length(url->components.fragment); + base_len -= frag_len; + } + + /* Set new_url's length */ + len = base_len + query_len + frag_len; + + /* Create NetSurf URL object */ + *new_url = malloc(sizeof(nsurl) + len + 1); /* Add 1 for \0 */ + if (*new_url == NULL) { + return NSERROR_NOMEM; + } + + if (lwc_intern_string(query, query_len, &lwc_query) != lwc_error_ok) { + free(*new_url); + return NSERROR_NOMEM; + } + + (*new_url)->length = len; + + /* Set string */ + pos = (*new_url)->string; + memcpy(pos, url->string, base_len); + pos += base_len; + memcpy(pos, query, query_len); + pos += query_len; + if (url->components.fragment != NULL) { + const char *frag = lwc_string_data(url->components.fragment); + *pos = '#'; + memcpy(++pos, frag, frag_len - 1); + pos += frag_len - 1; + } + *pos = '\0'; + + /* Copy components */ + (*new_url)->components.scheme = + nsurl__component_copy(url->components.scheme); + (*new_url)->components.username = + nsurl__component_copy(url->components.username); + (*new_url)->components.password = + nsurl__component_copy(url->components.password); + (*new_url)->components.host = + 
nsurl__component_copy(url->components.host); + (*new_url)->components.port = + nsurl__component_copy(url->components.port); + (*new_url)->components.path = + nsurl__component_copy(url->components.path); + (*new_url)->components.query = lwc_query; + (*new_url)->components.fragment = + nsurl__component_copy(url->components.fragment); + + (*new_url)->components.scheme_type = url->components.scheme_type; + + /* Get the nsurl's hash */ + nsurl_calc_hash(*new_url); + + /* Give the URL a reference */ + (*new_url)->count = 1; + + return NSERROR_OK; +} + + +/* exported interface documented in utils/nsurl.h */ +nserror nsurl_nice(const nsurl *url, char **result, bool remove_extensions) +{ + const char *data; + size_t len; + size_t pos; + bool match; + char *name; + + assert(url != NULL); + + *result = 0; + + /* extract the last component of the path, if possible */ + if ((url->components.path != NULL) && + (lwc_string_length(url->components.path) != 0) && + (lwc_string_isequal(url->components.path, + corestring_lwc_slash_, &match) == lwc_error_ok) && + (match == false)) { + bool first = true; + bool keep_looking; + + /* Get hold of the string data we're examining */ + data = lwc_string_data(url->components.path); + len = lwc_string_length(url->components.path); + pos = len; + + do { + keep_looking = false; + pos--; + + /* Find last '/' with stuff after it */ + while (pos != 0) { + if (data[pos] == '/' && pos < len - 1) { + break; + } + pos--; + } + + if (pos == 0) { + break; + } + + if (first) { + if (strncasecmp("/default.", data + pos, + SLEN("/default.")) == 0) { + keep_looking = true; + + } else if (strncasecmp("/index.", + data + pos, + 6) == 0) { + keep_looking = true; + + } + first = false; + } + + } while (keep_looking); + + if (data[pos] == '/') + pos++; + + if (strncasecmp("default.", data + pos, 8) != 0 && + strncasecmp("index.", data + pos, 6) != 0) { + size_t end = pos; + while (data[end] != '\0' && data[end] != '/') { + end++; + } + if (end - pos != 0) { + name 
= malloc(end - pos + 1); + if (name == NULL) { + return NSERROR_NOMEM; + } + memcpy(name, data + pos, end - pos); + name[end - pos] = '\0'; + if (remove_extensions) { + /* strip any extenstion */ + char *dot = strchr(name, '.'); + if (dot && dot != name) { + *dot = '\0'; + } + } + *result = name; + return NSERROR_OK; + } + } + } + + if (url->components.host != NULL) { + name = strdup(lwc_string_data(url->components.host)); + + for (pos = 0; name[pos] != '\0'; pos++) { + if (name[pos] == '.') { + name[pos] = '_'; + } + } + + *result = name; + return NSERROR_OK; + } + + return NSERROR_NOT_FOUND; +} + + +/* exported interface, documented in nsurl.h */ +nserror nsurl_parent(const nsurl *url, nsurl **new_url) +{ + lwc_string *lwc_path; + size_t old_path_len, new_path_len; + size_t len; + const char* path = NULL; + char *pos; + + assert(url != NULL); + + old_path_len = (url->components.path == NULL) ? 0 : + lwc_string_length(url->components.path); + + /* Find new path length */ + if (old_path_len == 0) { + new_path_len = old_path_len; + } else { + path = lwc_string_data(url->components.path); + + new_path_len = old_path_len; + if (old_path_len > 1) { + /* Skip over any trailing / */ + if (path[new_path_len - 1] == '/') + new_path_len--; + + /* Work back to next / */ + while (new_path_len > 0 && + path[new_path_len - 1] != '/') + new_path_len--; + } + } + + /* Find the length of new_url */ + len = url->length; + if (url->components.query != NULL) { + len -= lwc_string_length(url->components.query); + } + if (url->components.fragment != NULL) { + len -= 1; /* # */ + len -= lwc_string_length(url->components.fragment); + } + len -= old_path_len - new_path_len; + + /* Create NetSurf URL object */ + *new_url = malloc(sizeof(nsurl) + len + 1); /* Add 1 for \0 */ + if (*new_url == NULL) { + return NSERROR_NOMEM; + } + + /* Make new path */ + if (old_path_len == 0) { + lwc_path = NULL; + } else if (old_path_len == new_path_len) { + lwc_path = lwc_string_ref(url->components.path); 
+ } else { + if (lwc_intern_string(path, old_path_len - new_path_len, + &lwc_path) != lwc_error_ok) { + free(*new_url); + return NSERROR_NOMEM; + } + } + + (*new_url)->length = len; + + /* Set string */ + pos = (*new_url)->string; + memcpy(pos, url->string, len); + pos += len; + *pos = '\0'; + + /* Copy components */ + (*new_url)->components.scheme = + nsurl__component_copy(url->components.scheme); + (*new_url)->components.username = + nsurl__component_copy(url->components.username); + (*new_url)->components.password = + nsurl__component_copy(url->components.password); + (*new_url)->components.host = + nsurl__component_copy(url->components.host); + (*new_url)->components.port = + nsurl__component_copy(url->components.port); + (*new_url)->components.path = lwc_path; + (*new_url)->components.query = NULL; + (*new_url)->components.fragment = NULL; + + (*new_url)->components.scheme_type = url->components.scheme_type; + + /* Get the nsurl's hash */ + nsurl_calc_hash(*new_url); + + /* Give the URL a reference */ + (*new_url)->count = 1; + + return NSERROR_OK; +} + -- cgit v1.2.3