From 95e5ede775fc842324b87d797ca00c7576b9a359 Mon Sep 17 00:00:00 2001 From: Michael Drake Date: Sun, 15 Jan 2017 15:09:36 +0000 Subject: nsurl: Split out URL parsing. --- utils/nsurl/Makefile | 3 +- utils/nsurl/nsurl.c | 1528 +--------------------------------------------- utils/nsurl/parse.c | 1602 +++++++++++++++++++++++++++++++++++++++++++++++++ utils/nsurl/private.h | 68 +++ 4 files changed, 1677 insertions(+), 1524 deletions(-) create mode 100644 utils/nsurl/parse.c (limited to 'utils') diff --git a/utils/nsurl/Makefile b/utils/nsurl/Makefile index 08656f300..71304b292 100644 --- a/utils/nsurl/Makefile +++ b/utils/nsurl/Makefile @@ -1,6 +1,7 @@ # nsurl utils sources S_NSURL := \ - nsurl.c + nsurl.c \ + parse.c S_NSURL := $(addprefix utils/nsurl/,$(S_NSURL)) \ No newline at end of file diff --git a/utils/nsurl/nsurl.c b/utils/nsurl/nsurl.c index bc77c58ea..b60eb07ca 100644 --- a/utils/nsurl/nsurl.c +++ b/utils/nsurl/nsurl.c @@ -50,71 +50,6 @@ #undef NSURL_DEBUG -/** Marker set, indicating positions of sections within a URL string */ -struct url_markers { - size_t start; /** start of URL */ - size_t scheme_end; - size_t authority; - - size_t colon_first; - size_t at; - size_t colon_last; - - size_t path; - size_t query; - size_t fragment; - - size_t end; /** end of URL */ - - enum nsurl_scheme_type scheme_type; -}; - - -/** Marker set, indicating positions of sections within a URL string */ -struct nsurl_component_lengths { - size_t scheme; - size_t username; - size_t password; - size_t host; - size_t port; - size_t path; - size_t query; - size_t fragment; -}; - - -/** Flags indicating which parts of a URL string are required for a nsurl */ -enum nsurl_string_flags { - NSURL_F_SCHEME = (1 << 0), - NSURL_F_SCHEME_PUNCTUATION = (1 << 1), - NSURL_F_AUTHORITY_PUNCTUATION = (1 << 2), - NSURL_F_USERNAME = (1 << 3), - NSURL_F_PASSWORD = (1 << 4), - NSURL_F_CREDENTIALS_PUNCTUATION = (1 << 5), - NSURL_F_HOST = (1 << 6), - NSURL_F_PORT = (1 << 7), - NSURL_F_AUTHORITY = 
(NSURL_F_USERNAME | - NSURL_F_PASSWORD | - NSURL_F_HOST | - NSURL_F_PORT), - NSURL_F_PATH = (1 << 8), - NSURL_F_QUERY = (1 << 9), - NSURL_F_FRAGMENT_PUNCTUATION = (1 << 10), - NSURL_F_FRAGMENT = (1 << 11) -}; - - -/** Sections of a URL */ -enum url_sections { - URL_SCHEME, - URL_CREDENTIALS, - URL_HOST, - URL_PATH, - URL_QUERY, - URL_FRAGMENT -}; - - #define nsurl__component_copy(c) (c == NULL) ? NULL : lwc_string_ref(c) #define nsurl__component_compare(c1, c2, match) \ @@ -125,1129 +60,6 @@ enum url_sections { *match = false; \ } -/** - * Return a hex digit for the given numerical value. - * - * \param digit the value to get the hex digit for. - * \return character in range 0-9A-F - */ -inline static char digit2uppercase_hex(unsigned char digit) { - assert(digit < 16); - return "0123456789ABCDEF"[digit]; -} - -/** - * determine if a character is unreserved - * - * \param c character to classify. - * \return true if the character is unreserved else false. - */ -static bool nsurl__is_unreserved(unsigned char c) -{ - /* From RFC3986 section 2.3 (unreserved characters) - * - * unreserved = ALPHA / DIGIT / "-" / "." 
/ "_" / "~" - * - */ - static const bool unreserved[256] = { - false, false, false, false, false, false, false, false, /* 00 */ - false, false, false, false, false, false, false, false, /* 08 */ - false, false, false, false, false, false, false, false, /* 10 */ - false, false, false, false, false, false, false, false, /* 18 */ - false, false, false, false, false, false, false, false, /* 20 */ - false, false, false, false, false, true, true, false, /* 28 */ - true, true, true, true, true, true, true, true, /* 30 */ - true, true, false, false, false, false, false, false, /* 38 */ - false, true, true, true, true, true, true, true, /* 40 */ - true, true, true, true, true, true, true, true, /* 48 */ - true, true, true, true, true, true, true, true, /* 50 */ - true, true, true, false, false, false, false, true, /* 58 */ - false, true, true, true, true, true, true, true, /* 60 */ - true, true, true, true, true, true, true, true, /* 68 */ - true, true, true, true, true, true, true, true, /* 70 */ - true, true, true, false, false, false, true, false, /* 78 */ - false, false, false, false, false, false, false, false, /* 80 */ - false, false, false, false, false, false, false, false, /* 88 */ - false, false, false, false, false, false, false, false, /* 90 */ - false, false, false, false, false, false, false, false, /* 98 */ - false, false, false, false, false, false, false, false, /* A0 */ - false, false, false, false, false, false, false, false, /* A8 */ - false, false, false, false, false, false, false, false, /* B0 */ - false, false, false, false, false, false, false, false, /* B8 */ - false, false, false, false, false, false, false, false, /* C0 */ - false, false, false, false, false, false, false, false, /* C8 */ - false, false, false, false, false, false, false, false, /* D0 */ - false, false, false, false, false, false, false, false, /* D8 */ - false, false, false, false, false, false, false, false, /* E0 */ - false, false, false, false, false, false, false, false, /* 
E8 */ - false, false, false, false, false, false, false, false, /* F0 */ - false, false, false, false, false, false, false, false /* F8 */ - }; - return unreserved[c]; -} - -/** - * determine if a character should be percent escaped. - * - * The ASCII codes which should not be percent escaped - * - * \param c character to classify. - * \return true if the character should not be escaped else false. - */ -static bool nsurl__is_no_escape(unsigned char c) -{ - static const bool no_escape[256] = { - false, false, false, false, false, false, false, false, /* 00 */ - false, false, false, false, false, false, false, false, /* 08 */ - false, false, false, false, false, false, false, false, /* 10 */ - false, false, false, false, false, false, false, false, /* 18 */ - false, true, false, true, true, false, true, true, /* 20 */ - true, true, true, true, true, true, true, true, /* 28 */ - true, true, true, true, true, true, true, true, /* 30 */ - true, true, true, true, false, true, false, true, /* 38 */ - true, true, true, true, true, true, true, true, /* 40 */ - true, true, true, true, true, true, true, true, /* 48 */ - true, true, true, true, true, true, true, true, /* 50 */ - true, true, true, true, false, true, false, true, /* 58 */ - false, true, true, true, true, true, true, true, /* 60 */ - true, true, true, true, true, true, true, true, /* 68 */ - true, true, true, true, true, true, true, true, /* 70 */ - true, true, true, false, true, false, true, false, /* 78 */ - false, false, false, false, false, false, false, false, /* 80 */ - false, false, false, false, false, false, false, false, /* 88 */ - false, false, false, false, false, false, false, false, /* 90 */ - false, false, false, false, false, false, false, false, /* 98 */ - false, false, false, false, false, false, false, false, /* A0 */ - false, false, false, false, false, false, false, false, /* A8 */ - false, false, false, false, false, false, false, false, /* B0 */ - false, false, false, false, false, false, 
false, false, /* B8 */ - false, false, false, false, false, false, false, false, /* C0 */ - false, false, false, false, false, false, false, false, /* C8 */ - false, false, false, false, false, false, false, false, /* D0 */ - false, false, false, false, false, false, false, false, /* D8 */ - false, false, false, false, false, false, false, false, /* E0 */ - false, false, false, false, false, false, false, false, /* E8 */ - false, false, false, false, false, false, false, false, /* F0 */ - false, false, false, false, false, false, false, false, /* F8 */ - }; - return no_escape[c]; -} - - -/** - * Obtains a set of markers delimiting sections in a URL string - * - * \param url_s URL string - * \param markers Updated to mark sections in the URL string - * \param joining True iff URL string is a relative URL for joining - */ -static void nsurl__get_string_markers(const char * const url_s, - struct url_markers *markers, bool joining) -{ - const char *pos = url_s; /** current position in url_s */ - bool is_http = false; - bool trailing_whitespace = false; - - /* Initialise marker set */ - struct url_markers marker = { 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, NSURL_SCHEME_OTHER }; - - /* Skip any leading whitespace in url_s */ - while (ascii_is_space(*pos)) - pos++; - - /* Record start point */ - marker.start = pos - url_s; - - marker.scheme_end = marker.authority = marker.colon_first = marker.at = - marker.colon_last = marker.path = marker.start; - - if (*pos == '\0') { - /* Nothing but whitespace, early exit */ - marker.query = marker.fragment = marker.end = marker.path; - *markers = marker; - return; - } - - /* Get scheme */ - if (ascii_is_alpha(*pos)) { - pos++; - - while (*pos != ':' && *pos != '\0') { - if (!ascii_is_alphanumerical(*pos) && (*pos != '+') && - (*pos != '-') && (*pos != '.')) { - /* This character is not valid in the - * scheme */ - break; - } - pos++; - } - - if (*pos == ':') { - /* This delimits the end of the scheme */ - size_t off; - - marker.scheme_end = 
pos - url_s; - - off = marker.scheme_end - marker.start; - - /* Detect http(s) and mailto for scheme specifc - * normalisation */ - if (off == SLEN("http") && - (((*(pos - off + 0) == 'h') || - (*(pos - off + 0) == 'H')) && - ((*(pos - off + 1) == 't') || - (*(pos - off + 1) == 'T')) && - ((*(pos - off + 2) == 't') || - (*(pos - off + 2) == 'T')) && - ((*(pos - off + 3) == 'p') || - (*(pos - off + 3) == 'P')))) { - marker.scheme_type = NSURL_SCHEME_HTTP; - is_http = true; - } else if (off == SLEN("https") && - (((*(pos - off + 0) == 'h') || - (*(pos - off + 0) == 'H')) && - ((*(pos - off + 1) == 't') || - (*(pos - off + 1) == 'T')) && - ((*(pos - off + 2) == 't') || - (*(pos - off + 2) == 'T')) && - ((*(pos - off + 3) == 'p') || - (*(pos - off + 3) == 'P')) && - ((*(pos - off + 4) == 's') || - (*(pos - off + 4) == 'S')))) { - marker.scheme_type = NSURL_SCHEME_HTTPS; - is_http = true; - } else if (off == SLEN("ftp") && - (((*(pos - off + 0) == 'f') || - (*(pos - off + 0) == 'F')) && - ((*(pos - off + 1) == 't') || - (*(pos - off + 1) == 'T')) && - ((*(pos - off + 2) == 'p') || - (*(pos - off + 2) == 'P')))) { - marker.scheme_type = NSURL_SCHEME_FTP; - } else if (off == SLEN("mailto") && - (((*(pos - off + 0) == 'm') || - (*(pos - off + 0) == 'M')) && - ((*(pos - off + 1) == 'a') || - (*(pos - off + 1) == 'A')) && - ((*(pos - off + 2) == 'i') || - (*(pos - off + 2) == 'I')) && - ((*(pos - off + 3) == 'l') || - (*(pos - off + 3) == 'L')) && - ((*(pos - off + 4) == 't') || - (*(pos - off + 4) == 'T')) && - ((*(pos - off + 5) == 'o') || - (*(pos - off + 5) == 'O')))) { - marker.scheme_type = NSURL_SCHEME_MAILTO; - } - - /* Skip over colon */ - pos++; - - /* Mark place as start of authority */ - marker.authority = marker.colon_first = marker.at = - marker.colon_last = marker.path = - pos - url_s; - - } else { - /* Not found a scheme */ - if (joining == false) { - /* Assuming no scheme == http */ - marker.scheme_type = NSURL_SCHEME_HTTP; - is_http = true; - } - } - } - - 
/* Get authority - * - * Two slashes always indicates the start of an authority. - * - * We are more relaxed in the case of http: - * a. when joining, one or more slashes indicates start of authority - * b. when not joining, we assume authority if no scheme was present - * and in the case of mailto: when we assume there is an authority. - */ - if ((*pos == '/' && *(pos + 1) == '/') || - (is_http && ((joining && *pos == '/') || - (joining == false && - marker.scheme_end != marker.start))) || - marker.scheme_type == NSURL_SCHEME_MAILTO) { - - /* Skip over leading slashes */ - if (*pos == '/') { - if (is_http == false) { - if (*pos == '/') pos++; - if (*pos == '/') pos++; - } else { - while (*pos == '/') - pos++; - } - - marker.authority = marker.colon_first = marker.at = - marker.colon_last = marker.path = - pos - url_s; - } - - /* Need to get (or complete) the authority */ - while (*pos != '\0') { - if (*pos == '/' || *pos == '?' || *pos == '#') { - /* End of the authority */ - break; - - } else if (marker.scheme_type != NSURL_SCHEME_MAILTO && - *pos == ':' && marker.colon_first == - marker.authority) { - /* could be username:password or host:port - * separator */ - marker.colon_first = pos - url_s; - - } else if (marker.scheme_type != NSURL_SCHEME_MAILTO && - *pos == ':' && marker.colon_first != - marker.authority) { - /* could be host:port separator */ - marker.colon_last = pos - url_s; - - } else if (*pos == '@' && marker.at == - marker.authority) { - /* Credentials @ host separator */ - marker.at = pos - url_s; - } - - pos++; - } - - marker.path = pos - url_s; - - } else if ((*pos == '\0' || *pos == '/') && - joining == false && is_http == true) { - marker.path = pos - url_s; - } - - /* Get path - * - * Needs to start with '/' if there's no authority - */ - if (*pos == '/' || ((marker.path == marker.authority) && - (*pos != '?') && (*pos != '#') && (*pos != '\0'))) { - while (*(++pos) != '\0') { - if (*pos == '?' 
|| *pos == '#') { - /* End of the path */ - break; - } - } - } - - marker.query = pos - url_s; - - /* Get query */ - if (*pos == '?') { - while (*(++pos) != '\0') { - if (*pos == '#') { - /* End of the query */ - break; - } - } - } - - marker.fragment = pos - url_s; - - /* Get fragment */ - if (*pos == '#') { - while (*(++pos) != '\0') - ; - } - - /* We got to the end of url_s. - * Need to skip back over trailing whitespace to find end of URL */ - pos--; - if (pos >= url_s && ascii_is_space(*pos)) { - trailing_whitespace = true; - while (pos >= url_s && ascii_is_space(*pos)) - pos--; - } - - marker.end = pos + 1 - url_s; - - if (trailing_whitespace == true) { - /* Ensure last url section doesn't pass end */ - if (marker.fragment > marker.end) - marker.fragment = marker.end; - if (marker.query > marker.end) - marker.query = marker.end; - if (marker.path > marker.end) - marker.path = marker.end; - if (marker.colon_last > marker.end) - marker.colon_last = marker.end; - if (marker.at > marker.end) - marker.at = marker.end; - if (marker.colon_last > marker.end) - marker.colon_last = marker.end; - if (marker.fragment > marker.end) - marker.fragment = marker.end; - } - -#ifdef NSURL_DEBUG - LOG("marker.start: %i", marker.start); - LOG("marker.scheme_end: %i", marker.scheme_end); - LOG("marker.authority: %i", marker.authority); - - LOG("marker.colon_first: %i", marker.colon_first); - LOG("marker.at: %i", marker.at); - LOG("marker.colon_last: %i", marker.colon_last); - - LOG("marker.path: %i", marker.path); - LOG("marker.query: %i", marker.query); - LOG("marker.fragment: %i", marker.fragment); - - LOG("marker.end: %i", marker.end); -#endif - - /* Got all the URL components pegged out now */ - *markers = marker; -} - - -/** - * Remove dot segments from a path, as per rfc 3986, 5.2.4 - * - * \param path path to remove dot segments from ('\0' terminated) - * \param output path with dot segments removed - * \return size of output - */ -static size_t 
nsurl__remove_dot_segments(char *path, char *output) -{ - char *path_pos = path; - char *output_pos = output; - - while (*path_pos != '\0') { -#ifdef NSURL_DEBUG - LOG(" in:%s", path_pos); - LOG("out:%.*s", output_pos - output, output); -#endif - if (*path_pos == '.') { - if (*(path_pos + 1) == '.' && - *(path_pos + 2) == '/') { - /* Found prefix of "../" */ - path_pos += SLEN("../"); - continue; - - } else if (*(path_pos + 1) == '/') { - /* Found prefix of "./" */ - path_pos += SLEN("./"); - continue; - } - } else if (*path_pos == '/' && *(path_pos + 1) == '.') { - if (*(path_pos + 2) == '/') { - /* Found prefix of "/./" */ - path_pos += SLEN("/."); - continue; - - } else if (*(path_pos + 2) == '\0') { - /* Found "/." at end of path */ - *(output_pos++) = '/'; - - /* End of input path */ - break; - - } else if (*(path_pos + 2) == '.') { - if (*(path_pos + 3) == '/') { - /* Found prefix of "/../" */ - path_pos += SLEN("/.."); - - if (output_pos > output) - output_pos--; - while (output_pos > output && - *output_pos != '/') - output_pos--; - - continue; - - } else if (*(path_pos + 3) == '\0') { - /* Found "/.." at end of path */ - - while (output_pos > output && - *(output_pos -1 ) !='/') - output_pos--; - - /* End of input path */ - break; - } - } - } else if (*path_pos == '.') { - if (*(path_pos + 1) == '\0') { - /* Found "." at end of path */ - - /* End of input path */ - break; - - } else if (*(path_pos + 1) == '.' && - *(path_pos + 2) == '\0') { - /* Found ".." 
at end of path */ - - /* End of input path */ - break; - } - } - /* Copy first character into output path */ - *output_pos++ = *path_pos++; - - /* Copy up to but not including next '/' */ - while ((*path_pos != '/') && (*path_pos != '\0')) - *output_pos++ = *path_pos++; - } - - return output_pos - output; -} - - -/** - * Get the length of the longest section - * - * \param m markers delimiting url sections in a string - * \return the length of the longest section - */ -static size_t nsurl__get_longest_section(struct url_markers *m) -{ - size_t length = m->scheme_end - m->start; /* scheme */ - - if (length < m->at - m->authority) /* credentials */ - length = m->at - m->authority; - - if (length < m->path - m->at) /* host */ - length = m->path - m->at; - - if (length < m->query - m->path) /* path */ - length = m->query - m->path; - - if (length < m->fragment - m->query) /* query */ - length = m->fragment - m->query; - - if (length < m->end - m->fragment) /* fragment */ - length = m->end - m->fragment; - - return length; -} - - -/** - * Converts two hexadecimal digits to a single number - * - * \param c1 most significant hex digit - * \param c2 least significant hex digit - * \return the total value of the two digit hex number, or -ve if input not hex - * - * For unescaping url encoded characters. 
- */ -static inline int nsurl__get_ascii_offset(char c1, char c2) -{ - int offset; - - /* Use 1st char as most significant hex digit */ - if (ascii_is_digit(c1)) - offset = 16 * (c1 - '0'); - else if (c1 >= 'a' && c1 <= 'f') - offset = 16 * (c1 - 'a' + 10); - else if (c1 >= 'A' && c1 <= 'F') - offset = 16 * (c1 - 'A' + 10); - else - /* Not valid hex */ - return -1; - - /* Use 2nd char as least significant hex digit and sum */ - if (ascii_is_digit(c2)) - offset += c2 - '0'; - else if (c2 >= 'a' && c2 <= 'f') - offset += c2 - 'a' + 10; - else if (c2 >= 'A' && c2 <= 'F') - offset += c2 - 'A' + 10; - else - /* Not valid hex */ - return -1; - - return offset; -} - - -/** - * Create the components of a NetSurf URL object for a section of a URL string - * - * \param url_s URL string - * \param section Sets which section of URL string is to be normalised - * \param pegs Set of markers delimiting the URL string's sections - * \param pos_norm A buffer large enough for the normalised string (*3 + 1) - * \param url A NetSurf URL object, to which components may be added - * \return NSERROR_OK on success, appropriate error otherwise - * - * The section of url_s is normalised appropriately. - */ -static nserror nsurl__create_from_section(const char * const url_s, - const enum url_sections section, - const struct url_markers *pegs, - char *pos_norm, - struct nsurl_components *url) -{ - nserror ret; - int ascii_offset; - int start = 0; - int end = 0; - const char *pos; - const char *pos_url_s; - char *norm_start = pos_norm; - char *host; - size_t copy_len; - size_t length; - size_t host_len; - enum { - NSURL_F_NO_PORT = (1 << 0) - } flags = 0; - - switch (section) { - case URL_SCHEME: - start = pegs->start; - end = pegs->scheme_end; - break; - - case URL_CREDENTIALS: - start = pegs->authority; - end = pegs->at; - break; - - case URL_HOST: - start = (pegs->at == pegs->authority && - *(url_s + pegs->at) != '@') ? 
- pegs->at : - pegs->at + 1; - end = pegs->path; - break; - - case URL_PATH: - start = pegs->path; - end = pegs->query; - break; - - case URL_QUERY: - start = pegs->query; - end = pegs->fragment; - break; - - case URL_FRAGMENT: - start = (*(url_s + pegs->fragment) != '#') ? - pegs->fragment : - pegs->fragment + 1; - end = pegs->end; - break; - } - - if (end < start) - end = start; - - length = end - start; - - /* Stage 1: Normalise the required section */ - - pos = pos_url_s = url_s + start; - copy_len = 0; - for (; pos < url_s + end; pos++) { - if (*pos == '%' && (pos + 2 < url_s + end)) { - /* Might be an escaped character needing unescaped */ - - /* Find which character which was escaped */ - ascii_offset = nsurl__get_ascii_offset(*(pos + 1), - *(pos + 2)); - - if (ascii_offset < 0) { - /* % with invalid hex digits. */ - copy_len++; - continue; - } - - if ((section != URL_SCHEME && section != URL_HOST) && - (nsurl__is_unreserved(ascii_offset) == false)) { - /* This character should be escaped after all, - * just let it get copied */ - copy_len += 3; - pos += 2; - continue; - } - - if (copy_len > 0) { - /* Copy up to here */ - memcpy(pos_norm, pos_url_s, copy_len); - pos_norm += copy_len; - copy_len = 0; - } - - /* Put the unescaped character in the normalised URL */ - *(pos_norm++) = (char)ascii_offset; - pos += 2; - pos_url_s = pos + 1; - - length -= 2; - - } else if ((section != URL_SCHEME && section != URL_HOST) && - (nsurl__is_no_escape(*pos) == false)) { - - /* This needs to be escaped */ - if (copy_len > 0) { - /* Copy up to here */ - memcpy(pos_norm, pos_url_s, copy_len); - pos_norm += copy_len; - copy_len = 0; - } - - /* escape */ - *(pos_norm++) = '%'; - *(pos_norm++) = digit2uppercase_hex( - ((unsigned char)*pos) >> 4); - *(pos_norm++) = digit2uppercase_hex( - ((unsigned char)*pos) & 0xf); - pos_url_s = pos + 1; - - length += 2; - - } else if ((section == URL_SCHEME || section == URL_HOST) && - ascii_is_alpha_upper(*pos)) { - /* Lower case this letter 
*/ - - if (copy_len > 0) { - /* Copy up to here */ - memcpy(pos_norm, pos_url_s, copy_len); - pos_norm += copy_len; - copy_len = 0; - } - /* Copy lower cased letter into normalised URL */ - *(pos_norm++) = ascii_to_lower(*pos); - pos_url_s = pos + 1; - - } else { - /* This character is safe in normalised URL */ - copy_len++; - } - } - - if (copy_len > 0) { - /* Copy up to here */ - memcpy(pos_norm, pos_url_s, copy_len); - pos_norm += copy_len; - } - - /* Mark end of section */ - (*pos_norm) = '\0'; - - /* Stage 2: Create the URL components for the required section */ - switch (section) { - case URL_SCHEME: - if (length == 0) { - /* No scheme, assuming http */ - url->scheme = lwc_string_ref(corestring_lwc_http); - } else { - /* Add scheme to URL */ - if (lwc_intern_string(norm_start, length, - &url->scheme) != lwc_error_ok) { - return NSERROR_NOMEM; - } - } - - break; - - case URL_CREDENTIALS: - url->username = NULL; - url->password = NULL; - - if (length != 0 && *norm_start != ':') { - char *sec_start = norm_start; - if (pegs->colon_first != pegs->authority && - pegs->at > pegs->colon_first + 1) { - /* there's a password */ - sec_start += pegs->colon_first - - pegs->authority + 1; - if (lwc_intern_string(sec_start, - pegs->at - pegs->colon_first -1, - &url->password) != - lwc_error_ok) { - return NSERROR_NOMEM; - } - - /* update start pos and length for username */ - sec_start = norm_start; - length -= pegs->at - pegs->colon_first; - } else if (pegs->colon_first != pegs->authority && - pegs->at == pegs->colon_first + 1) { - /* strip username colon */ - length--; - } - - /* Username */ - if (lwc_intern_string(sec_start, length, - &url->username) != lwc_error_ok) { - return NSERROR_NOMEM; - } - } - - break; - - case URL_HOST: - url->host = NULL; - url->port = NULL; - - if (length != 0) { - size_t colon = 0; - char *sec_start = norm_start; - if (pegs->at < pegs->colon_first && - pegs->colon_last == pegs->authority) { - /* There's one colon and it's after @ marker */ - 
colon = pegs->colon_first; - } else if (pegs->colon_last != pegs->authority) { - /* There's more than one colon */ - colon = pegs->colon_last; - } else { - /* There's no colon that could be a port - * separator */ - flags |= NSURL_F_NO_PORT; - } - - if (!(flags & NSURL_F_NO_PORT)) { - /* Determine whether colon is a port separator - */ - sec_start += colon - pegs->at; - while (++sec_start < norm_start + length) { - if (!ascii_is_digit(*sec_start)) { - /* Character after port isn't a - * digit; not a port separator - */ - flags |= NSURL_F_NO_PORT; - break; - } - } - } - - if (!(flags & NSURL_F_NO_PORT)) { - /* There's a port */ - size_t skip = (pegs->at == pegs->authority) ? - 1 : 0; - sec_start = norm_start + colon - pegs->at + - skip; - if (url->scheme != NULL && - url->scheme_type == - NSURL_SCHEME_HTTP && - length - - (colon - pegs->at + skip) == 2 && - *sec_start == '8' && - *(sec_start + 1) == '0') { - /* Scheme is http, and port is default - * (80) */ - flags |= NSURL_F_NO_PORT; - } - - if (length <= (colon - pegs->at + skip)) { - /* No space for a port after the colon - */ - flags |= NSURL_F_NO_PORT; - } - - /* Add non-redundant ports to NetSurf URL */ - sec_start = norm_start + colon - pegs->at + - skip; - if (!(flags & NSURL_F_NO_PORT) && - lwc_intern_string(sec_start, - length - - (colon - pegs->at + skip), - &url->port) != lwc_error_ok) { - return NSERROR_NOMEM; - } - - /* update length for host */ - skip = (pegs->at == pegs->authority) ? 
0 : 1; - length = colon - pegs->at - skip; - } - - /* host */ - /* Encode host according to IDNA2008 */ - ret = idna_encode(norm_start, length, &host, &host_len); - if (ret == NSERROR_OK) { - /* valid idna encoding */ - if (lwc_intern_string(host, host_len, - &url->host) != lwc_error_ok) { - return NSERROR_NOMEM; - } - free(host); - } else { - /* fall back to straight interning */ - if (lwc_intern_string(norm_start, length, - &url->host) != lwc_error_ok) { - return NSERROR_NOMEM; - } - } - } - - break; - - case URL_PATH: - if (length != 0) { - if (lwc_intern_string(norm_start, length, - &url->path) != lwc_error_ok) { - return NSERROR_NOMEM; - } - } else if (url->host != NULL && - url->scheme_type != NSURL_SCHEME_MAILTO) { - /* Set empty path to "/", if there's a host */ - if (lwc_intern_string("/", SLEN("/"), - &url->path) != lwc_error_ok) { - return NSERROR_NOMEM; - } - } else { - url->path = NULL; - } - - break; - - case URL_QUERY: - if (length != 0) { - if (lwc_intern_string(norm_start, length, - &url->query) != lwc_error_ok) { - return NSERROR_NOMEM; - } - } else { - url->query = NULL; - } - - break; - - case URL_FRAGMENT: - if (length != 0) { - if (lwc_intern_string(norm_start, length, - &url->fragment) != lwc_error_ok) { - return NSERROR_NOMEM; - } - } else { - url->fragment = NULL; - } - - break; - } - - return NSERROR_OK; -} - - -/** - * Get nsurl string info; total length, component lengths, & components present - * - * \param url NetSurf URL components - * \param parts Which parts of the URL are required in the string - * \param url_l Updated to total string length - * \param lengths Updated with individual component lengths - * \param pflags Updated to contain relevant string flags - */ -static void nsurl__get_string_data(const struct nsurl_components *url, - nsurl_component parts, size_t *url_l, - struct nsurl_component_lengths *lengths, - enum nsurl_string_flags *pflags) -{ - enum nsurl_string_flags flags = *pflags; - *url_l = 0; - - /* Intersection of 
required parts and available parts gives - * the output parts */ - if (url->scheme && parts & NSURL_SCHEME) { - flags |= NSURL_F_SCHEME; - - lengths->scheme = lwc_string_length(url->scheme); - *url_l += lengths->scheme; - } - - if (url->username && parts & NSURL_USERNAME) { - flags |= NSURL_F_USERNAME; - - lengths->username = lwc_string_length(url->username); - *url_l += lengths->username; - } - - if (url->password && parts & NSURL_PASSWORD) { - flags |= NSURL_F_PASSWORD; - - lengths->password = lwc_string_length(url->password); - *url_l += SLEN(":") + lengths->password; - } - - if (url->host && parts & NSURL_HOST) { - flags |= NSURL_F_HOST; - - lengths->host = lwc_string_length(url->host); - *url_l += lengths->host; - } - - if (url->port && parts & NSURL_PORT) { - flags |= NSURL_F_PORT; - - lengths->port = lwc_string_length(url->port); - *url_l += SLEN(":") + lengths->port; - } - - if (url->path && parts & NSURL_PATH) { - flags |= NSURL_F_PATH; - - lengths->path = lwc_string_length(url->path); - *url_l += lengths->path; - } - - if (url->query && parts & NSURL_QUERY) { - flags |= NSURL_F_QUERY; - - lengths->query = lwc_string_length(url->query); - *url_l += lengths->query; - } - - if (url->fragment && parts & NSURL_FRAGMENT) { - flags |= NSURL_F_FRAGMENT; - - lengths->fragment = lwc_string_length(url->fragment); - *url_l += lengths->fragment; - } - - /* Turn on any spanned punctuation */ - if ((flags & NSURL_F_SCHEME) && (parts > NSURL_SCHEME)) { - flags |= NSURL_F_SCHEME_PUNCTUATION; - - *url_l += SLEN(":"); - } - - if ((flags & NSURL_F_SCHEME) && (flags > NSURL_F_SCHEME) && - url->path && lwc_string_data(url->path)[0] == '/') { - flags |= NSURL_F_AUTHORITY_PUNCTUATION; - - *url_l += SLEN("//"); - } - - if ((flags & (NSURL_F_USERNAME | NSURL_F_PASSWORD)) && - flags & NSURL_F_HOST) { - flags |= NSURL_F_CREDENTIALS_PUNCTUATION; - - *url_l += SLEN("@"); - } - - if ((flags & ~NSURL_F_FRAGMENT) && (flags & NSURL_F_FRAGMENT)) { - flags |= NSURL_F_FRAGMENT_PUNCTUATION; - 
- *url_l += SLEN("#"); - } - - *pflags = flags; -} - - -/** - * Get nsurl string info; total length, component lengths, & components present - * - * \param url NetSurf URL components - * \param url_s Updated to contain the string - * \param l Individual component lengths - * \param flags String flags - */ -static void nsurl_get_string(const struct nsurl_components *url, char *url_s, - struct nsurl_component_lengths *l, - enum nsurl_string_flags flags) -{ - char *pos; - - /* Copy the required parts into the url string */ - pos = url_s; - - if (flags & NSURL_F_SCHEME) { - memcpy(pos, lwc_string_data(url->scheme), l->scheme); - pos += l->scheme; - } - - if (flags & NSURL_F_SCHEME_PUNCTUATION) { - *(pos++) = ':'; - } - - if (flags & NSURL_F_AUTHORITY_PUNCTUATION) { - *(pos++) = '/'; - *(pos++) = '/'; - } - - if (flags & NSURL_F_USERNAME) { - memcpy(pos, lwc_string_data(url->username), l->username); - pos += l->username; - } - - if (flags & NSURL_F_PASSWORD) { - *(pos++) = ':'; - memcpy(pos, lwc_string_data(url->password), l->password); - pos += l->password; - } - - if (flags & NSURL_F_CREDENTIALS_PUNCTUATION) { - *(pos++) = '@'; - } - - if (flags & NSURL_F_HOST) { - memcpy(pos, lwc_string_data(url->host), l->host); - pos += l->host; - } - - if (flags & NSURL_F_PORT) { - *(pos++) = ':'; - memcpy(pos, lwc_string_data(url->port), l->port); - pos += l->port; - } - - if (flags & NSURL_F_PATH) { - memcpy(pos, lwc_string_data(url->path), l->path); - pos += l->path; - } - - if (flags & NSURL_F_QUERY) { - memcpy(pos, lwc_string_data(url->query), l->query); - pos += l->query; - } - - if (flags & NSURL_F_FRAGMENT) { - if (flags & NSURL_F_FRAGMENT_PUNCTUATION) - *(pos++) = '#'; - memcpy(pos, lwc_string_data(url->fragment), l->fragment); - pos += l->fragment; - } - - *pos = '\0'; -} - - -/** - * Calculate hash value - * - * \param url NetSurf URL object to set hash value for - */ -static void nsurl_calc_hash(nsurl *url) -{ - uint32_t hash = 0; - - if (url->components.scheme) - hash 
^= lwc_string_hash_value(url->components.scheme); - - if (url->components.username) - hash ^= lwc_string_hash_value(url->components.username); - - if (url->components.password) - hash ^= lwc_string_hash_value(url->components.password); - - if (url->components.host) - hash ^= lwc_string_hash_value(url->components.host); - - if (url->components.port) - hash ^= lwc_string_hash_value(url->components.port); - - if (url->components.path) - hash ^= lwc_string_hash_value(url->components.path); - - if (url->components.query) - hash ^= lwc_string_hash_value(url->components.query); - - url->hash = hash; -} - /** * Destroy components @@ -1320,89 +132,6 @@ static void nsurl__dump(const nsurl *url) * NetSurf URL Public API * ******************************************************************************/ -/* exported interface, documented in nsurl.h */ -nserror nsurl_create(const char * const url_s, nsurl **url) -{ - struct url_markers m; - struct nsurl_components c; - size_t length; - char *buff; - struct nsurl_component_lengths str_len = { 0, 0, 0, 0, 0, 0, 0, 0 }; - enum nsurl_string_flags str_flags = 0; - nserror e = NSERROR_OK; - bool match; - - assert(url_s != NULL); - - /* Peg out the URL sections */ - nsurl__get_string_markers(url_s, &m, false); - - /* Get the length of the longest section */ - length = nsurl__get_longest_section(&m); - - /* Allocate enough memory to url escape the longest section */ - buff = malloc(length * 3 + 1); - if (buff == NULL) - return NSERROR_NOMEM; - - /* Set scheme type */ - c.scheme_type = m.scheme_type; - - /* Build NetSurf URL object from sections */ - e |= nsurl__create_from_section(url_s, URL_SCHEME, &m, buff, &c); - e |= nsurl__create_from_section(url_s, URL_CREDENTIALS, &m, buff, &c); - e |= nsurl__create_from_section(url_s, URL_HOST, &m, buff, &c); - e |= nsurl__create_from_section(url_s, URL_PATH, &m, buff, &c); - e |= nsurl__create_from_section(url_s, URL_QUERY, &m, buff, &c); - e |= nsurl__create_from_section(url_s, URL_FRAGMENT, 
&m, buff, &c); - - /* Finished with buffer */ - free(buff); - - if (e != NSERROR_OK) { - nsurl_destroy_components(&c); - return NSERROR_NOMEM; - } - - /* Validate URL */ - if ((lwc_string_isequal(c.scheme, corestring_lwc_http, - &match) == lwc_error_ok && match == true) || - (lwc_string_isequal(c.scheme, corestring_lwc_https, - &match) == lwc_error_ok && match == true)) { - /* http, https must have host */ - if (c.host == NULL) { - nsurl_destroy_components(&c); - return NSERROR_BAD_URL; - } - } - - /* Get the string length and find which parts of url are present */ - nsurl__get_string_data(&c, NSURL_WITH_FRAGMENT, &length, - &str_len, &str_flags); - - /* Create NetSurf URL object */ - *url = malloc(sizeof(nsurl) + length + 1); /* Add 1 for \0 */ - if (*url == NULL) { - nsurl_destroy_components(&c); - return NSERROR_NOMEM; - } - - (*url)->components = c; - (*url)->length = length; - - /* Fill out the url string */ - nsurl_get_string(&c, (*url)->string, &str_len, str_flags); - - /* Get the nsurl's hash */ - nsurl_calc_hash(*url); - - /* Give the URL a reference */ - (*url)->count = 1; - - return NSERROR_OK; -} - - /* exported interface, documented in nsurl.h */ nsurl *nsurl_ref(nsurl *url) { @@ -1539,7 +268,7 @@ nserror nsurl_get(const nsurl *url, nsurl_component parts, } /* Copy the required parts into the url string */ - nsurl_get_string(&(url->components), *url_s, &str_len, str_flags); + nsurl__get_string(&(url->components), *url_s, &str_len, str_flags); return NSERROR_OK; } @@ -1779,253 +508,6 @@ uint32_t nsurl_hash(const nsurl *url) } -/* exported interface, documented in nsurl.h */ -nserror nsurl_join(const nsurl *base, const char *rel, nsurl **joined) -{ - struct url_markers m; - struct nsurl_components c; - size_t length; - char *buff; - char *buff_pos; - char *buff_start; - struct nsurl_component_lengths str_len = { 0, 0, 0, 0, 0, 0, 0, 0 }; - enum nsurl_string_flags str_flags = 0; - nserror error = 0; - enum { - NSURL_F_REL = 0, - NSURL_F_BASE_SCHEME = (1 
<< 0), - NSURL_F_BASE_AUTHORITY = (1 << 1), - NSURL_F_BASE_PATH = (1 << 2), - NSURL_F_MERGED_PATH = (1 << 3), - NSURL_F_BASE_QUERY = (1 << 4) - } joined_parts; - - assert(base != NULL); - assert(rel != NULL); - -#ifdef NSURL_DEBUG - LOG("base: \"%s\", rel: \"%s\"", nsurl_access(base), rel); -#endif - - /* Peg out the URL sections */ - nsurl__get_string_markers(rel, &m, true); - - /* Get the length of the longest section */ - length = nsurl__get_longest_section(&m); - - /* Initially assume that the joined URL can be formed entierly from - * the relative URL. - */ - joined_parts = NSURL_F_REL; - - /* Update joined_compnents to indicate any required parts from the - * base URL. - */ - if (m.scheme_end - m.start <= 0) { - /* The relative url has no scheme. - * Use base URL's scheme. */ - joined_parts |= NSURL_F_BASE_SCHEME; - - if (m.path - m.authority <= 0) { - /* The relative URL has no authority. - * Use base URL's authority. */ - joined_parts |= NSURL_F_BASE_AUTHORITY; - - if (m.query - m.path <= 0) { - /* The relative URL has no path. - * Use base URL's path. */ - joined_parts |= NSURL_F_BASE_PATH; - - if (m.fragment - m.query <= 0) { - /* The relative URL has no query. - * Use base URL's query. */ - joined_parts |= NSURL_F_BASE_QUERY; - } - - } else if (*(rel + m.path) != '/') { - /* Relative URL has relative path */ - joined_parts |= NSURL_F_MERGED_PATH; - } - } - } - - /* Allocate enough memory to url escape the longest section, plus - * space for path merging (if required). - */ - if (joined_parts & NSURL_F_MERGED_PATH) { - /* Need to merge paths */ - length += (base->components.path != NULL) ? - lwc_string_length(base->components.path) : 0; - } - length *= 4; - /* Plus space for removing dots from path */ - length += (m.query - m.path) + ((base->components.path != NULL) ? 
- lwc_string_length(base->components.path) : 0); - - buff = malloc(length + 5); - if (buff == NULL) { - return NSERROR_NOMEM; - } - - buff_pos = buff; - - /* Form joined URL from base or rel components, as appropriate */ - - if (joined_parts & NSURL_F_BASE_SCHEME) { - c.scheme_type = base->components.scheme_type; - - c.scheme = nsurl__component_copy(base->components.scheme); - } else { - c.scheme_type = m.scheme_type; - - error = nsurl__create_from_section(rel, URL_SCHEME, &m, buff, &c); - if (error != NSERROR_OK) { - free(buff); - return error; - } - } - - if (joined_parts & NSURL_F_BASE_AUTHORITY) { - c.username = nsurl__component_copy(base->components.username); - c.password = nsurl__component_copy(base->components.password); - c.host = nsurl__component_copy(base->components.host); - c.port = nsurl__component_copy(base->components.port); - } else { - error = nsurl__create_from_section(rel, URL_CREDENTIALS, &m, - buff, &c); - if (error == NSERROR_OK) { - error = nsurl__create_from_section(rel, URL_HOST, &m, - buff, &c); - } - if (error != NSERROR_OK) { - free(buff); - return error; - } - } - - if (joined_parts & NSURL_F_BASE_PATH) { - c.path = nsurl__component_copy(base->components.path); - - } else if (joined_parts & NSURL_F_MERGED_PATH) { - struct url_markers m_path; - size_t new_length; - - if (base->components.host != NULL && - base->components.path == NULL) { - /* Append relative path to "/". */ - *(buff_pos++) = '/'; - memcpy(buff_pos, rel + m.path, m.query - m.path); - buff_pos += m.query - m.path; - - } else { - /* Append relative path to all but last segment of - * base path. 
*/ - size_t path_end = lwc_string_length( - base->components.path); - const char *path = lwc_string_data( - base->components.path); - - while (*(path + path_end) != '/' && - path_end != 0) { - path_end--; - } - if (*(path + path_end) == '/') - path_end++; - - /* Copy the base part */ - memcpy(buff_pos, path, path_end); - buff_pos += path_end; - - /* Copy the relative part */ - memcpy(buff_pos, rel + m.path, m.query - m.path); - buff_pos += m.query - m.path; - } - - /* add termination to string */ - *buff_pos++ = '\0'; - - new_length = nsurl__remove_dot_segments(buff, buff_pos); - - m_path.path = 0; - m_path.query = new_length; - - buff_start = buff_pos + new_length; - error = nsurl__create_from_section(buff_pos, URL_PATH, &m_path, - buff_start, &c); - if (error != NSERROR_OK) { - free(buff); - return error; - } - - } else { - struct url_markers m_path; - size_t new_length; - - memcpy(buff_pos, rel + m.path, m.query - m.path); - buff_pos += m.query - m.path; - *(buff_pos++) = '\0'; - - new_length = nsurl__remove_dot_segments(buff, buff_pos); - - m_path.path = 0; - m_path.query = new_length; - - buff_start = buff_pos + new_length; - - error = nsurl__create_from_section(buff_pos, URL_PATH, &m_path, - buff_start, &c); - if (error != NSERROR_OK) { - free(buff); - return error; - } - } - - if (joined_parts & NSURL_F_BASE_QUERY) { - c.query = nsurl__component_copy(base->components.query); - } else { - error = nsurl__create_from_section(rel, URL_QUERY, &m, - buff, &c); - if (error != NSERROR_OK) { - free(buff); - return error; - } - } - - error = nsurl__create_from_section(rel, URL_FRAGMENT, &m, buff, &c); - - /* Free temporary buffer */ - free(buff); - - if (error != NSERROR_OK) { - return error; - } - - /* Get the string length and find which parts of url are present */ - nsurl__get_string_data(&c, NSURL_WITH_FRAGMENT, &length, - &str_len, &str_flags); - - /* Create NetSurf URL object */ - *joined = malloc(sizeof(nsurl) + length + 1); /* Add 1 for \0 */ - if (*joined == 
NULL) { - return NSERROR_NOMEM; - } - - (*joined)->components = c; - (*joined)->length = length; - - /* Fill out the url string */ - nsurl_get_string(&c, (*joined)->string, &str_len, str_flags); - - /* Get the nsurl's hash */ - nsurl_calc_hash(*joined); - - /* Give the URL a reference */ - (*joined)->count = 1; - - return NSERROR_OK; -} - - /* exported interface, documented in nsurl.h */ nserror nsurl_defragment(const nsurl *url, nsurl **no_frag) { @@ -2083,7 +565,7 @@ nserror nsurl_defragment(const nsurl *url, nsurl **no_frag) *pos = '\0'; /* Get the nsurl's hash */ - nsurl_calc_hash(*no_frag); + nsurl__calc_hash(*no_frag); /* Give the URL a reference */ (*no_frag)->count = 1; @@ -2151,7 +633,7 @@ nserror nsurl_refragment(const nsurl *url, lwc_string *frag, nsurl **new_url) (*new_url)->components.scheme_type = url->components.scheme_type; /* Get the nsurl's hash */ - nsurl_calc_hash(*new_url); + nsurl__calc_hash(*new_url); /* Give the URL a reference */ (*new_url)->count = 1; @@ -2238,7 +720,7 @@ nserror nsurl_replace_query(const nsurl *url, const char *query, (*new_url)->components.scheme_type = url->components.scheme_type; /* Get the nsurl's hash */ - nsurl_calc_hash(*new_url); + nsurl__calc_hash(*new_url); /* Give the URL a reference */ (*new_url)->count = 1; @@ -2441,7 +923,7 @@ nserror nsurl_parent(const nsurl *url, nsurl **new_url) (*new_url)->components.scheme_type = url->components.scheme_type; /* Get the nsurl's hash */ - nsurl_calc_hash(*new_url); + nsurl__calc_hash(*new_url); /* Give the URL a reference */ (*new_url)->count = 1; diff --git a/utils/nsurl/parse.c b/utils/nsurl/parse.c new file mode 100644 index 000000000..0cdbbd4c3 --- /dev/null +++ b/utils/nsurl/parse.c @@ -0,0 +1,1602 @@ +/* + * Copyright 2011 Michael Drake + * + * This file is part of NetSurf, http://www.netsurf-browser.org/ + * + * NetSurf is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free 
Software Foundation; version 2 of the License. + * + * NetSurf is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ + +/** + * \file + * NetSurf URL handling implementation. + * + * This is the common implementation of all URL handling within the + * browser. This implementation is based upon RFC3986 although this has + * been superseded by https://url.spec.whatwg.org/ which is based on + * actual contemporary implementations. + * + * Care must be taken with character encodings within this module as + * the specifications work with specific ascii ranges and must not be + * affected by locale. Hence the c library character type functions + * are not used. + */ + +#include <assert.h> +#include <libwapcaplet/libwapcaplet.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> + +#include "utils/ascii.h" +#include "utils/corestrings.h" +#include "utils/errors.h" +#include "utils/idna.h" +#include "utils/log.h" +#include "utils/nsurl.h" +#include "utils/nsurl/private.h" +#include "utils/utils.h" + +/* Define to enable NSURL debugging */ +#undef NSURL_DEBUG + + +/** Marker set, indicating positions of sections within a URL string */ +struct url_markers { + size_t start; /** start of URL */ + size_t scheme_end; + size_t authority; + + size_t colon_first; + size_t at; + size_t colon_last; + + size_t path; + size_t query; + size_t fragment; + + size_t end; /** end of URL */ + + enum nsurl_scheme_type scheme_type; +}; + + +/** Sections of a URL */ +enum url_sections { + URL_SCHEME, + URL_CREDENTIALS, + URL_HOST, + URL_PATH, + URL_QUERY, + URL_FRAGMENT +}; + + +#define nsurl__component_copy(c) (c == NULL) ? 
NULL : lwc_string_ref(c) + +#define nsurl__component_compare(c1, c2, match) \ + if (c1 && c2 && lwc_error_ok == \ + lwc_string_isequal(c1, c2, match)) { \ + /* do nothing */ \ + } else if (c1 || c2) { \ + *match = false; \ + } + +/** + * Return a hex digit for the given numerical value. + * + * \param digit the value to get the hex digit for. + * \return character in range 0-9A-F + */ +inline static char digit2uppercase_hex(unsigned char digit) { + assert(digit < 16); + return "0123456789ABCDEF"[digit]; +} + +/** + * determine if a character is unreserved + * + * \param c character to classify. + * \return true if the character is unreserved else false. + */ +static bool nsurl__is_unreserved(unsigned char c) +{ + /* From RFC3986 section 2.3 (unreserved characters) + * + * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" + * + */ + static const bool unreserved[256] = { + false, false, false, false, false, false, false, false, /* 00 */ + false, false, false, false, false, false, false, false, /* 08 */ + false, false, false, false, false, false, false, false, /* 10 */ + false, false, false, false, false, false, false, false, /* 18 */ + false, false, false, false, false, false, false, false, /* 20 */ + false, false, false, false, false, true, true, false, /* 28 */ + true, true, true, true, true, true, true, true, /* 30 */ + true, true, false, false, false, false, false, false, /* 38 */ + false, true, true, true, true, true, true, true, /* 40 */ + true, true, true, true, true, true, true, true, /* 48 */ + true, true, true, true, true, true, true, true, /* 50 */ + true, true, true, false, false, false, false, true, /* 58 */ + false, true, true, true, true, true, true, true, /* 60 */ + true, true, true, true, true, true, true, true, /* 68 */ + true, true, true, true, true, true, true, true, /* 70 */ + true, true, true, false, false, false, true, false, /* 78 */ + false, false, false, false, false, false, false, false, /* 80 */ + false, false, false, false, false, false, 
false, false, /* 88 */ + false, false, false, false, false, false, false, false, /* 90 */ + false, false, false, false, false, false, false, false, /* 98 */ + false, false, false, false, false, false, false, false, /* A0 */ + false, false, false, false, false, false, false, false, /* A8 */ + false, false, false, false, false, false, false, false, /* B0 */ + false, false, false, false, false, false, false, false, /* B8 */ + false, false, false, false, false, false, false, false, /* C0 */ + false, false, false, false, false, false, false, false, /* C8 */ + false, false, false, false, false, false, false, false, /* D0 */ + false, false, false, false, false, false, false, false, /* D8 */ + false, false, false, false, false, false, false, false, /* E0 */ + false, false, false, false, false, false, false, false, /* E8 */ + false, false, false, false, false, false, false, false, /* F0 */ + false, false, false, false, false, false, false, false /* F8 */ + }; + return unreserved[c]; +} + +/** + * determine if a character should be percent escaped. + * + * The ASCII codes which should not be percent escaped + * + * \param c character to classify. + * \return true if the character should not be escaped else false. 
+ */ +static bool nsurl__is_no_escape(unsigned char c) +{ + static const bool no_escape[256] = { + false, false, false, false, false, false, false, false, /* 00 */ + false, false, false, false, false, false, false, false, /* 08 */ + false, false, false, false, false, false, false, false, /* 10 */ + false, false, false, false, false, false, false, false, /* 18 */ + false, true, false, true, true, false, true, true, /* 20 */ + true, true, true, true, true, true, true, true, /* 28 */ + true, true, true, true, true, true, true, true, /* 30 */ + true, true, true, true, false, true, false, true, /* 38 */ + true, true, true, true, true, true, true, true, /* 40 */ + true, true, true, true, true, true, true, true, /* 48 */ + true, true, true, true, true, true, true, true, /* 50 */ + true, true, true, true, false, true, false, true, /* 58 */ + false, true, true, true, true, true, true, true, /* 60 */ + true, true, true, true, true, true, true, true, /* 68 */ + true, true, true, true, true, true, true, true, /* 70 */ + true, true, true, false, true, false, true, false, /* 78 */ + false, false, false, false, false, false, false, false, /* 80 */ + false, false, false, false, false, false, false, false, /* 88 */ + false, false, false, false, false, false, false, false, /* 90 */ + false, false, false, false, false, false, false, false, /* 98 */ + false, false, false, false, false, false, false, false, /* A0 */ + false, false, false, false, false, false, false, false, /* A8 */ + false, false, false, false, false, false, false, false, /* B0 */ + false, false, false, false, false, false, false, false, /* B8 */ + false, false, false, false, false, false, false, false, /* C0 */ + false, false, false, false, false, false, false, false, /* C8 */ + false, false, false, false, false, false, false, false, /* D0 */ + false, false, false, false, false, false, false, false, /* D8 */ + false, false, false, false, false, false, false, false, /* E0 */ + false, false, false, false, false, false, 
false, false, /* E8 */ + false, false, false, false, false, false, false, false, /* F0 */ + false, false, false, false, false, false, false, false, /* F8 */ + }; + return no_escape[c]; +} + + +/** + * Obtains a set of markers delimiting sections in a URL string + * + * \param url_s URL string + * \param markers Updated to mark sections in the URL string + * \param joining True iff URL string is a relative URL for joining + */ +static void nsurl__get_string_markers(const char * const url_s, + struct url_markers *markers, bool joining) +{ + const char *pos = url_s; /** current position in url_s */ + bool is_http = false; + bool trailing_whitespace = false; + + /* Initialise marker set */ + struct url_markers marker = { 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, NSURL_SCHEME_OTHER }; + + /* Skip any leading whitespace in url_s */ + while (ascii_is_space(*pos)) + pos++; + + /* Record start point */ + marker.start = pos - url_s; + + marker.scheme_end = marker.authority = marker.colon_first = marker.at = + marker.colon_last = marker.path = marker.start; + + if (*pos == '\0') { + /* Nothing but whitespace, early exit */ + marker.query = marker.fragment = marker.end = marker.path; + *markers = marker; + return; + } + + /* Get scheme */ + if (ascii_is_alpha(*pos)) { + pos++; + + while (*pos != ':' && *pos != '\0') { + if (!ascii_is_alphanumerical(*pos) && (*pos != '+') && + (*pos != '-') && (*pos != '.')) { + /* This character is not valid in the + * scheme */ + break; + } + pos++; + } + + if (*pos == ':') { + /* This delimits the end of the scheme */ + size_t off; + + marker.scheme_end = pos - url_s; + + off = marker.scheme_end - marker.start; + + /* Detect http(s) and mailto for scheme specifc + * normalisation */ + if (off == SLEN("http") && + (((*(pos - off + 0) == 'h') || + (*(pos - off + 0) == 'H')) && + ((*(pos - off + 1) == 't') || + (*(pos - off + 1) == 'T')) && + ((*(pos - off + 2) == 't') || + (*(pos - off + 2) == 'T')) && + ((*(pos - off + 3) == 'p') || + (*(pos - off + 3) 
== 'P')))) { + marker.scheme_type = NSURL_SCHEME_HTTP; + is_http = true; + } else if (off == SLEN("https") && + (((*(pos - off + 0) == 'h') || + (*(pos - off + 0) == 'H')) && + ((*(pos - off + 1) == 't') || + (*(pos - off + 1) == 'T')) && + ((*(pos - off + 2) == 't') || + (*(pos - off + 2) == 'T')) && + ((*(pos - off + 3) == 'p') || + (*(pos - off + 3) == 'P')) && + ((*(pos - off + 4) == 's') || + (*(pos - off + 4) == 'S')))) { + marker.scheme_type = NSURL_SCHEME_HTTPS; + is_http = true; + } else if (off == SLEN("ftp") && + (((*(pos - off + 0) == 'f') || + (*(pos - off + 0) == 'F')) && + ((*(pos - off + 1) == 't') || + (*(pos - off + 1) == 'T')) && + ((*(pos - off + 2) == 'p') || + (*(pos - off + 2) == 'P')))) { + marker.scheme_type = NSURL_SCHEME_FTP; + } else if (off == SLEN("mailto") && + (((*(pos - off + 0) == 'm') || + (*(pos - off + 0) == 'M')) && + ((*(pos - off + 1) == 'a') || + (*(pos - off + 1) == 'A')) && + ((*(pos - off + 2) == 'i') || + (*(pos - off + 2) == 'I')) && + ((*(pos - off + 3) == 'l') || + (*(pos - off + 3) == 'L')) && + ((*(pos - off + 4) == 't') || + (*(pos - off + 4) == 'T')) && + ((*(pos - off + 5) == 'o') || + (*(pos - off + 5) == 'O')))) { + marker.scheme_type = NSURL_SCHEME_MAILTO; + } + + /* Skip over colon */ + pos++; + + /* Mark place as start of authority */ + marker.authority = marker.colon_first = marker.at = + marker.colon_last = marker.path = + pos - url_s; + + } else { + /* Not found a scheme */ + if (joining == false) { + /* Assuming no scheme == http */ + marker.scheme_type = NSURL_SCHEME_HTTP; + is_http = true; + } + } + } + + /* Get authority + * + * Two slashes always indicates the start of an authority. + * + * We are more relaxed in the case of http: + * a. when joining, one or more slashes indicates start of authority + * b. when not joining, we assume authority if no scheme was present + * and in the case of mailto: when we assume there is an authority. 
+ */ + if ((*pos == '/' && *(pos + 1) == '/') || + (is_http && ((joining && *pos == '/') || + (joining == false && + marker.scheme_end != marker.start))) || + marker.scheme_type == NSURL_SCHEME_MAILTO) { + + /* Skip over leading slashes */ + if (*pos == '/') { + if (is_http == false) { + if (*pos == '/') pos++; + if (*pos == '/') pos++; + } else { + while (*pos == '/') + pos++; + } + + marker.authority = marker.colon_first = marker.at = + marker.colon_last = marker.path = + pos - url_s; + } + + /* Need to get (or complete) the authority */ + while (*pos != '\0') { + if (*pos == '/' || *pos == '?' || *pos == '#') { + /* End of the authority */ + break; + + } else if (marker.scheme_type != NSURL_SCHEME_MAILTO && + *pos == ':' && marker.colon_first == + marker.authority) { + /* could be username:password or host:port + * separator */ + marker.colon_first = pos - url_s; + + } else if (marker.scheme_type != NSURL_SCHEME_MAILTO && + *pos == ':' && marker.colon_first != + marker.authority) { + /* could be host:port separator */ + marker.colon_last = pos - url_s; + + } else if (*pos == '@' && marker.at == + marker.authority) { + /* Credentials @ host separator */ + marker.at = pos - url_s; + } + + pos++; + } + + marker.path = pos - url_s; + + } else if ((*pos == '\0' || *pos == '/') && + joining == false && is_http == true) { + marker.path = pos - url_s; + } + + /* Get path + * + * Needs to start with '/' if there's no authority + */ + if (*pos == '/' || ((marker.path == marker.authority) && + (*pos != '?') && (*pos != '#') && (*pos != '\0'))) { + while (*(++pos) != '\0') { + if (*pos == '?' || *pos == '#') { + /* End of the path */ + break; + } + } + } + + marker.query = pos - url_s; + + /* Get query */ + if (*pos == '?') { + while (*(++pos) != '\0') { + if (*pos == '#') { + /* End of the query */ + break; + } + } + } + + marker.fragment = pos - url_s; + + /* Get fragment */ + if (*pos == '#') { + while (*(++pos) != '\0') + ; + } + + /* We got to the end of url_s. 
+ * Need to skip back over trailing whitespace to find end of URL */ + pos--; + if (pos >= url_s && ascii_is_space(*pos)) { + trailing_whitespace = true; + while (pos >= url_s && ascii_is_space(*pos)) + pos--; + } + + marker.end = pos + 1 - url_s; + + if (trailing_whitespace == true) { + /* Ensure last url section doesn't pass end */ + if (marker.fragment > marker.end) + marker.fragment = marker.end; + if (marker.query > marker.end) + marker.query = marker.end; + if (marker.path > marker.end) + marker.path = marker.end; + if (marker.colon_last > marker.end) + marker.colon_last = marker.end; + if (marker.at > marker.end) + marker.at = marker.end; + if (marker.colon_last > marker.end) + marker.colon_last = marker.end; + if (marker.fragment > marker.end) + marker.fragment = marker.end; + } + +#ifdef NSURL_DEBUG + LOG("marker.start: %i", marker.start); + LOG("marker.scheme_end: %i", marker.scheme_end); + LOG("marker.authority: %i", marker.authority); + + LOG("marker.colon_first: %i", marker.colon_first); + LOG("marker.at: %i", marker.at); + LOG("marker.colon_last: %i", marker.colon_last); + + LOG("marker.path: %i", marker.path); + LOG("marker.query: %i", marker.query); + LOG("marker.fragment: %i", marker.fragment); + + LOG("marker.end: %i", marker.end); +#endif + + /* Got all the URL components pegged out now */ + *markers = marker; +} + + +/** + * Remove dot segments from a path, as per rfc 3986, 5.2.4 + * + * \param path path to remove dot segments from ('\0' terminated) + * \param output path with dot segments removed + * \return size of output + */ +static size_t nsurl__remove_dot_segments(char *path, char *output) +{ + char *path_pos = path; + char *output_pos = output; + + while (*path_pos != '\0') { +#ifdef NSURL_DEBUG + LOG(" in:%s", path_pos); + LOG("out:%.*s", output_pos - output, output); +#endif + if (*path_pos == '.') { + if (*(path_pos + 1) == '.' 
&& + *(path_pos + 2) == '/') { + /* Found prefix of "../" */ + path_pos += SLEN("../"); + continue; + + } else if (*(path_pos + 1) == '/') { + /* Found prefix of "./" */ + path_pos += SLEN("./"); + continue; + } + } else if (*path_pos == '/' && *(path_pos + 1) == '.') { + if (*(path_pos + 2) == '/') { + /* Found prefix of "/./" */ + path_pos += SLEN("/."); + continue; + + } else if (*(path_pos + 2) == '\0') { + /* Found "/." at end of path */ + *(output_pos++) = '/'; + + /* End of input path */ + break; + + } else if (*(path_pos + 2) == '.') { + if (*(path_pos + 3) == '/') { + /* Found prefix of "/../" */ + path_pos += SLEN("/.."); + + if (output_pos > output) + output_pos--; + while (output_pos > output && + *output_pos != '/') + output_pos--; + + continue; + + } else if (*(path_pos + 3) == '\0') { + /* Found "/.." at end of path */ + + while (output_pos > output && + *(output_pos -1 ) !='/') + output_pos--; + + /* End of input path */ + break; + } + } + } else if (*path_pos == '.') { + if (*(path_pos + 1) == '\0') { + /* Found "." at end of path */ + + /* End of input path */ + break; + + } else if (*(path_pos + 1) == '.' && + *(path_pos + 2) == '\0') { + /* Found ".." 
at end of path */ + + /* End of input path */ + break; + } + } + /* Copy first character into output path */ + *output_pos++ = *path_pos++; + + /* Copy up to but not including next '/' */ + while ((*path_pos != '/') && (*path_pos != '\0')) + *output_pos++ = *path_pos++; + } + + return output_pos - output; +} + + +/** + * Get the length of the longest section + * + * \param m markers delimiting url sections in a string + * \return the length of the longest section + */ +static size_t nsurl__get_longest_section(struct url_markers *m) +{ + size_t length = m->scheme_end - m->start; /* scheme */ + + if (length < m->at - m->authority) /* credentials */ + length = m->at - m->authority; + + if (length < m->path - m->at) /* host */ + length = m->path - m->at; + + if (length < m->query - m->path) /* path */ + length = m->query - m->path; + + if (length < m->fragment - m->query) /* query */ + length = m->fragment - m->query; + + if (length < m->end - m->fragment) /* fragment */ + length = m->end - m->fragment; + + return length; +} + + +/** + * Converts two hexadecimal digits to a single number + * + * \param c1 most significant hex digit + * \param c2 least significant hex digit + * \return the total value of the two digit hex number, or -ve if input not hex + * + * For unescaping url encoded characters. 
+ */ +static inline int nsurl__get_ascii_offset(char c1, char c2) +{ + int offset; + + /* Use 1st char as most significant hex digit */ + if (ascii_is_digit(c1)) + offset = 16 * (c1 - '0'); + else if (c1 >= 'a' && c1 <= 'f') + offset = 16 * (c1 - 'a' + 10); + else if (c1 >= 'A' && c1 <= 'F') + offset = 16 * (c1 - 'A' + 10); + else + /* Not valid hex */ + return -1; + + /* Use 2nd char as least significant hex digit and sum */ + if (ascii_is_digit(c2)) + offset += c2 - '0'; + else if (c2 >= 'a' && c2 <= 'f') + offset += c2 - 'a' + 10; + else if (c2 >= 'A' && c2 <= 'F') + offset += c2 - 'A' + 10; + else + /* Not valid hex */ + return -1; + + return offset; +} + + +/** + * Create the components of a NetSurf URL object for a section of a URL string + * + * \param url_s URL string + * \param section Sets which section of URL string is to be normalised + * \param pegs Set of markers delimiting the URL string's sections + * \param pos_norm A buffer large enough for the normalised string (*3 + 1) + * \param url A NetSurf URL object, to which components may be added + * \return NSERROR_OK on success, appropriate error otherwise + * + * The section of url_s is normalised appropriately. + */ +static nserror nsurl__create_from_section(const char * const url_s, + const enum url_sections section, + const struct url_markers *pegs, + char *pos_norm, + struct nsurl_components *url) +{ + nserror ret; + int ascii_offset; + int start = 0; + int end = 0; + const char *pos; + const char *pos_url_s; + char *norm_start = pos_norm; + char *host; + size_t copy_len; + size_t length; + size_t host_len; + enum { + NSURL_F_NO_PORT = (1 << 0) + } flags = 0; + + switch (section) { + case URL_SCHEME: + start = pegs->start; + end = pegs->scheme_end; + break; + + case URL_CREDENTIALS: + start = pegs->authority; + end = pegs->at; + break; + + case URL_HOST: + start = (pegs->at == pegs->authority && + *(url_s + pegs->at) != '@') ? 
+ pegs->at : + pegs->at + 1; + end = pegs->path; + break; + + case URL_PATH: + start = pegs->path; + end = pegs->query; + break; + + case URL_QUERY: + start = pegs->query; + end = pegs->fragment; + break; + + case URL_FRAGMENT: + start = (*(url_s + pegs->fragment) != '#') ? + pegs->fragment : + pegs->fragment + 1; + end = pegs->end; + break; + } + + if (end < start) + end = start; + + length = end - start; + + /* Stage 1: Normalise the required section */ + + pos = pos_url_s = url_s + start; + copy_len = 0; + for (; pos < url_s + end; pos++) { + if (*pos == '%' && (pos + 2 < url_s + end)) { + /* Might be an escaped character needing unescaped */ + + /* Find which character which was escaped */ + ascii_offset = nsurl__get_ascii_offset(*(pos + 1), + *(pos + 2)); + + if (ascii_offset < 0) { + /* % with invalid hex digits. */ + copy_len++; + continue; + } + + if ((section != URL_SCHEME && section != URL_HOST) && + (nsurl__is_unreserved(ascii_offset) == false)) { + /* This character should be escaped after all, + * just let it get copied */ + copy_len += 3; + pos += 2; + continue; + } + + if (copy_len > 0) { + /* Copy up to here */ + memcpy(pos_norm, pos_url_s, copy_len); + pos_norm += copy_len; + copy_len = 0; + } + + /* Put the unescaped character in the normalised URL */ + *(pos_norm++) = (char)ascii_offset; + pos += 2; + pos_url_s = pos + 1; + + length -= 2; + + } else if ((section != URL_SCHEME && section != URL_HOST) && + (nsurl__is_no_escape(*pos) == false)) { + + /* This needs to be escaped */ + if (copy_len > 0) { + /* Copy up to here */ + memcpy(pos_norm, pos_url_s, copy_len); + pos_norm += copy_len; + copy_len = 0; + } + + /* escape */ + *(pos_norm++) = '%'; + *(pos_norm++) = digit2uppercase_hex( + ((unsigned char)*pos) >> 4); + *(pos_norm++) = digit2uppercase_hex( + ((unsigned char)*pos) & 0xf); + pos_url_s = pos + 1; + + length += 2; + + } else if ((section == URL_SCHEME || section == URL_HOST) && + ascii_is_alpha_upper(*pos)) { + /* Lower case this letter 
*/ + + if (copy_len > 0) { + /* Copy up to here */ + memcpy(pos_norm, pos_url_s, copy_len); + pos_norm += copy_len; + copy_len = 0; + } + /* Copy lower cased letter into normalised URL */ + *(pos_norm++) = ascii_to_lower(*pos); + pos_url_s = pos + 1; + + } else { + /* This character is safe in normalised URL */ + copy_len++; + } + } + + if (copy_len > 0) { + /* Copy up to here */ + memcpy(pos_norm, pos_url_s, copy_len); + pos_norm += copy_len; + } + + /* Mark end of section */ + (*pos_norm) = '\0'; + + /* Stage 2: Create the URL components for the required section */ + switch (section) { + case URL_SCHEME: + if (length == 0) { + /* No scheme, assuming http */ + url->scheme = lwc_string_ref(corestring_lwc_http); + } else { + /* Add scheme to URL */ + if (lwc_intern_string(norm_start, length, + &url->scheme) != lwc_error_ok) { + return NSERROR_NOMEM; + } + } + + break; + + case URL_CREDENTIALS: + url->username = NULL; + url->password = NULL; + + if (length != 0 && *norm_start != ':') { + char *sec_start = norm_start; + if (pegs->colon_first != pegs->authority && + pegs->at > pegs->colon_first + 1) { + /* there's a password */ + sec_start += pegs->colon_first - + pegs->authority + 1; + if (lwc_intern_string(sec_start, + pegs->at - pegs->colon_first -1, + &url->password) != + lwc_error_ok) { + return NSERROR_NOMEM; + } + + /* update start pos and length for username */ + sec_start = norm_start; + length -= pegs->at - pegs->colon_first; + } else if (pegs->colon_first != pegs->authority && + pegs->at == pegs->colon_first + 1) { + /* strip username colon */ + length--; + } + + /* Username */ + if (lwc_intern_string(sec_start, length, + &url->username) != lwc_error_ok) { + return NSERROR_NOMEM; + } + } + + break; + + case URL_HOST: + url->host = NULL; + url->port = NULL; + + if (length != 0) { + size_t colon = 0; + char *sec_start = norm_start; + if (pegs->at < pegs->colon_first && + pegs->colon_last == pegs->authority) { + /* There's one colon and it's after @ marker */ + 
colon = pegs->colon_first; + } else if (pegs->colon_last != pegs->authority) { + /* There's more than one colon */ + colon = pegs->colon_last; + } else { + /* There's no colon that could be a port + * separator */ + flags |= NSURL_F_NO_PORT; + } + + if (!(flags & NSURL_F_NO_PORT)) { + /* Determine whether colon is a port separator + */ + sec_start += colon - pegs->at; + while (++sec_start < norm_start + length) { + if (!ascii_is_digit(*sec_start)) { + /* Character after port isn't a + * digit; not a port separator + */ + flags |= NSURL_F_NO_PORT; + break; + } + } + } + + if (!(flags & NSURL_F_NO_PORT)) { + /* There's a port */ + size_t skip = (pegs->at == pegs->authority) ? + 1 : 0; + sec_start = norm_start + colon - pegs->at + + skip; + if (url->scheme != NULL && + url->scheme_type == + NSURL_SCHEME_HTTP && + length - + (colon - pegs->at + skip) == 2 && + *sec_start == '8' && + *(sec_start + 1) == '0') { + /* Scheme is http, and port is default + * (80) */ + flags |= NSURL_F_NO_PORT; + } + + if (length <= (colon - pegs->at + skip)) { + /* No space for a port after the colon + */ + flags |= NSURL_F_NO_PORT; + } + + /* Add non-redundant ports to NetSurf URL */ + sec_start = norm_start + colon - pegs->at + + skip; + if (!(flags & NSURL_F_NO_PORT) && + lwc_intern_string(sec_start, + length - + (colon - pegs->at + skip), + &url->port) != lwc_error_ok) { + return NSERROR_NOMEM; + } + + /* update length for host */ + skip = (pegs->at == pegs->authority) ? 
0 : 1; + length = colon - pegs->at - skip; + } + + /* host */ + /* Encode host according to IDNA2008 */ + ret = idna_encode(norm_start, length, &host, &host_len); + if (ret == NSERROR_OK) { + /* valid idna encoding */ + if (lwc_intern_string(host, host_len, + &url->host) != lwc_error_ok) { + return NSERROR_NOMEM; + } + free(host); + } else { + /* fall back to straight interning */ + if (lwc_intern_string(norm_start, length, + &url->host) != lwc_error_ok) { + return NSERROR_NOMEM; + } + } + } + + break; + + case URL_PATH: + if (length != 0) { + if (lwc_intern_string(norm_start, length, + &url->path) != lwc_error_ok) { + return NSERROR_NOMEM; + } + } else if (url->host != NULL && + url->scheme_type != NSURL_SCHEME_MAILTO) { + /* Set empty path to "/", if there's a host */ + if (lwc_intern_string("/", SLEN("/"), + &url->path) != lwc_error_ok) { + return NSERROR_NOMEM; + } + } else { + url->path = NULL; + } + + break; + + case URL_QUERY: + if (length != 0) { + if (lwc_intern_string(norm_start, length, + &url->query) != lwc_error_ok) { + return NSERROR_NOMEM; + } + } else { + url->query = NULL; + } + + break; + + case URL_FRAGMENT: + if (length != 0) { + if (lwc_intern_string(norm_start, length, + &url->fragment) != lwc_error_ok) { + return NSERROR_NOMEM; + } + } else { + url->fragment = NULL; + } + + break; + } + + return NSERROR_OK; +} + + +/* Exported function, documented in utils/nsurl/private.h */ +void nsurl__get_string_data(const struct nsurl_components *url, + nsurl_component parts, size_t *url_l, + struct nsurl_component_lengths *lengths, + enum nsurl_string_flags *pflags) +{ + enum nsurl_string_flags flags = *pflags; + *url_l = 0; + + /* Intersection of required parts and available parts gives + * the output parts */ + if (url->scheme && parts & NSURL_SCHEME) { + flags |= NSURL_F_SCHEME; + + lengths->scheme = lwc_string_length(url->scheme); + *url_l += lengths->scheme; + } + + if (url->username && parts & NSURL_USERNAME) { + flags |= NSURL_F_USERNAME; + + 
lengths->username = lwc_string_length(url->username); + *url_l += lengths->username; + } + + if (url->password && parts & NSURL_PASSWORD) { + flags |= NSURL_F_PASSWORD; + + lengths->password = lwc_string_length(url->password); + *url_l += SLEN(":") + lengths->password; + } + + if (url->host && parts & NSURL_HOST) { + flags |= NSURL_F_HOST; + + lengths->host = lwc_string_length(url->host); + *url_l += lengths->host; + } + + if (url->port && parts & NSURL_PORT) { + flags |= NSURL_F_PORT; + + lengths->port = lwc_string_length(url->port); + *url_l += SLEN(":") + lengths->port; + } + + if (url->path && parts & NSURL_PATH) { + flags |= NSURL_F_PATH; + + lengths->path = lwc_string_length(url->path); + *url_l += lengths->path; + } + + if (url->query && parts & NSURL_QUERY) { + flags |= NSURL_F_QUERY; + + lengths->query = lwc_string_length(url->query); + *url_l += lengths->query; + } + + if (url->fragment && parts & NSURL_FRAGMENT) { + flags |= NSURL_F_FRAGMENT; + + lengths->fragment = lwc_string_length(url->fragment); + *url_l += lengths->fragment; + } + + /* Turn on any spanned punctuation */ + if ((flags & NSURL_F_SCHEME) && (parts > NSURL_SCHEME)) { + flags |= NSURL_F_SCHEME_PUNCTUATION; + + *url_l += SLEN(":"); + } + + if ((flags & NSURL_F_SCHEME) && (flags > NSURL_F_SCHEME) && + url->path && lwc_string_data(url->path)[0] == '/') { + flags |= NSURL_F_AUTHORITY_PUNCTUATION; + + *url_l += SLEN("//"); + } + + if ((flags & (NSURL_F_USERNAME | NSURL_F_PASSWORD)) && + flags & NSURL_F_HOST) { + flags |= NSURL_F_CREDENTIALS_PUNCTUATION; + + *url_l += SLEN("@"); + } + + if ((flags & ~NSURL_F_FRAGMENT) && (flags & NSURL_F_FRAGMENT)) { + flags |= NSURL_F_FRAGMENT_PUNCTUATION; + + *url_l += SLEN("#"); + } + + *pflags = flags; +} + + +/* Exported function, documented in utils/nsurl/private.h */ +void nsurl__get_string(const struct nsurl_components *url, char *url_s, + struct nsurl_component_lengths *l, + enum nsurl_string_flags flags) +{ + char *pos; + + /* Copy the required parts 
into the url string */ + pos = url_s; + + if (flags & NSURL_F_SCHEME) { + memcpy(pos, lwc_string_data(url->scheme), l->scheme); + pos += l->scheme; + } + + if (flags & NSURL_F_SCHEME_PUNCTUATION) { + *(pos++) = ':'; + } + + if (flags & NSURL_F_AUTHORITY_PUNCTUATION) { + *(pos++) = '/'; + *(pos++) = '/'; + } + + if (flags & NSURL_F_USERNAME) { + memcpy(pos, lwc_string_data(url->username), l->username); + pos += l->username; + } + + if (flags & NSURL_F_PASSWORD) { + *(pos++) = ':'; + memcpy(pos, lwc_string_data(url->password), l->password); + pos += l->password; + } + + if (flags & NSURL_F_CREDENTIALS_PUNCTUATION) { + *(pos++) = '@'; + } + + if (flags & NSURL_F_HOST) { + memcpy(pos, lwc_string_data(url->host), l->host); + pos += l->host; + } + + if (flags & NSURL_F_PORT) { + *(pos++) = ':'; + memcpy(pos, lwc_string_data(url->port), l->port); + pos += l->port; + } + + if (flags & NSURL_F_PATH) { + memcpy(pos, lwc_string_data(url->path), l->path); + pos += l->path; + } + + if (flags & NSURL_F_QUERY) { + memcpy(pos, lwc_string_data(url->query), l->query); + pos += l->query; + } + + if (flags & NSURL_F_FRAGMENT) { + if (flags & NSURL_F_FRAGMENT_PUNCTUATION) + *(pos++) = '#'; + memcpy(pos, lwc_string_data(url->fragment), l->fragment); + pos += l->fragment; + } + + *pos = '\0'; +} + + +/** + * Calculate hash value + * + * \param url NetSurf URL object to set hash value for + */ +void nsurl__calc_hash(nsurl *url) +{ + uint32_t hash = 0; + + if (url->components.scheme) + hash ^= lwc_string_hash_value(url->components.scheme); + + if (url->components.username) + hash ^= lwc_string_hash_value(url->components.username); + + if (url->components.password) + hash ^= lwc_string_hash_value(url->components.password); + + if (url->components.host) + hash ^= lwc_string_hash_value(url->components.host); + + if (url->components.port) + hash ^= lwc_string_hash_value(url->components.port); + + if (url->components.path) + hash ^= lwc_string_hash_value(url->components.path); + + if 
(url->components.query) + hash ^= lwc_string_hash_value(url->components.query); + + url->hash = hash; +} + + +/** + * Destroy components + * + * \param c url components + */ +static void nsurl_destroy_components(struct nsurl_components *c) +{ + if (c->scheme) + lwc_string_unref(c->scheme); + + if (c->username) + lwc_string_unref(c->username); + + if (c->password) + lwc_string_unref(c->password); + + if (c->host) + lwc_string_unref(c->host); + + if (c->port) + lwc_string_unref(c->port); + + if (c->path) + lwc_string_unref(c->path); + + if (c->query) + lwc_string_unref(c->query); + + if (c->fragment) + lwc_string_unref(c->fragment); +} + + +#ifdef NSURL_DEBUG +/** + * Dump a NetSurf URL's internal components + * + * \param url The NetSurf URL to dump components of + */ +static void nsurl__dump(const nsurl *url) +{ + if (url->components.scheme) + LOG(" Scheme: %s", lwc_string_data(url->components.scheme)); + + if (url->components.username) + LOG("Username: %s", lwc_string_data(url->components.username)); + + if (url->components.password) + LOG("Password: %s", lwc_string_data(url->components.password)); + + if (url->components.host) + LOG(" Host: %s", lwc_string_data(url->components.host)); + + if (url->components.port) + LOG(" Port: %s", lwc_string_data(url->components.port)); + + if (url->components.path) + LOG(" Path: %s", lwc_string_data(url->components.path)); + + if (url->components.query) + LOG(" Query: %s", lwc_string_data(url->components.query)); + + if (url->components.fragment) + LOG("Fragment: %s", lwc_string_data(url->components.fragment)); +} +#endif + +/****************************************************************************** + * NetSurf URL Public API * + ******************************************************************************/ + +/* exported interface, documented in nsurl.h */ +nserror nsurl_create(const char * const url_s, nsurl **url) +{ + struct url_markers m; + struct nsurl_components c; + size_t length; + char *buff; + struct 
nsurl_component_lengths str_len = { 0, 0, 0, 0, 0, 0, 0, 0 }; + enum nsurl_string_flags str_flags = 0; + nserror e = NSERROR_OK; + bool match; + + assert(url_s != NULL); + + /* Peg out the URL sections */ + nsurl__get_string_markers(url_s, &m, false); + + /* Get the length of the longest section */ + length = nsurl__get_longest_section(&m); + + /* Allocate enough memory to url escape the longest section */ + buff = malloc(length * 3 + 1); + if (buff == NULL) + return NSERROR_NOMEM; + + /* Set scheme type */ + c.scheme_type = m.scheme_type; + + /* Build NetSurf URL object from sections */ + e |= nsurl__create_from_section(url_s, URL_SCHEME, &m, buff, &c); + e |= nsurl__create_from_section(url_s, URL_CREDENTIALS, &m, buff, &c); + e |= nsurl__create_from_section(url_s, URL_HOST, &m, buff, &c); + e |= nsurl__create_from_section(url_s, URL_PATH, &m, buff, &c); + e |= nsurl__create_from_section(url_s, URL_QUERY, &m, buff, &c); + e |= nsurl__create_from_section(url_s, URL_FRAGMENT, &m, buff, &c); + + /* Finished with buffer */ + free(buff); + + if (e != NSERROR_OK) { + nsurl_destroy_components(&c); + return NSERROR_NOMEM; + } + + /* Validate URL */ + if ((lwc_string_isequal(c.scheme, corestring_lwc_http, + &match) == lwc_error_ok && match == true) || + (lwc_string_isequal(c.scheme, corestring_lwc_https, + &match) == lwc_error_ok && match == true)) { + /* http, https must have host */ + if (c.host == NULL) { + nsurl_destroy_components(&c); + return NSERROR_BAD_URL; + } + } + + /* Get the string length and find which parts of url are present */ + nsurl__get_string_data(&c, NSURL_WITH_FRAGMENT, &length, + &str_len, &str_flags); + + /* Create NetSurf URL object */ + *url = malloc(sizeof(nsurl) + length + 1); /* Add 1 for \0 */ + if (*url == NULL) { + nsurl_destroy_components(&c); + return NSERROR_NOMEM; + } + + (*url)->components = c; + (*url)->length = length; + + /* Fill out the url string */ + nsurl__get_string(&c, (*url)->string, &str_len, str_flags); + + /* Get the 
nsurl's hash */ + nsurl__calc_hash(*url); + + /* Give the URL a reference */ + (*url)->count = 1; + + return NSERROR_OK; +} + + +/* exported interface, documented in nsurl.h */ +nserror nsurl_join(const nsurl *base, const char *rel, nsurl **joined) +{ + struct url_markers m; + struct nsurl_components c; + size_t length; + char *buff; + char *buff_pos; + char *buff_start; + struct nsurl_component_lengths str_len = { 0, 0, 0, 0, 0, 0, 0, 0 }; + enum nsurl_string_flags str_flags = 0; + nserror error = 0; + enum { + NSURL_F_REL = 0, + NSURL_F_BASE_SCHEME = (1 << 0), + NSURL_F_BASE_AUTHORITY = (1 << 1), + NSURL_F_BASE_PATH = (1 << 2), + NSURL_F_MERGED_PATH = (1 << 3), + NSURL_F_BASE_QUERY = (1 << 4) + } joined_parts; + + assert(base != NULL); + assert(rel != NULL); + +#ifdef NSURL_DEBUG + LOG("base: \"%s\", rel: \"%s\"", nsurl_access(base), rel); +#endif + + /* Peg out the URL sections */ + nsurl__get_string_markers(rel, &m, true); + + /* Get the length of the longest section */ + length = nsurl__get_longest_section(&m); + + /* Initially assume that the joined URL can be formed entirely from + * the relative URL. + */ + joined_parts = NSURL_F_REL; + + /* Update joined_parts to indicate any required parts from the + * base URL. + */ + if (m.scheme_end - m.start <= 0) { + /* The relative url has no scheme. + * Use base URL's scheme. */ + joined_parts |= NSURL_F_BASE_SCHEME; + + if (m.path - m.authority <= 0) { + /* The relative URL has no authority. + * Use base URL's authority. */ + joined_parts |= NSURL_F_BASE_AUTHORITY; + + if (m.query - m.path <= 0) { + /* The relative URL has no path. + * Use base URL's path. */ + joined_parts |= NSURL_F_BASE_PATH; + + if (m.fragment - m.query <= 0) { + /* The relative URL has no query. + * Use base URL's query. 
*/ + joined_parts |= NSURL_F_BASE_QUERY; + } + + } else if (*(rel + m.path) != '/') { + /* Relative URL has relative path */ + joined_parts |= NSURL_F_MERGED_PATH; + } + } + } + + /* Allocate enough memory to url escape the longest section, plus + * space for path merging (if required). + */ + if (joined_parts & NSURL_F_MERGED_PATH) { + /* Need to merge paths */ + length += (base->components.path != NULL) ? + lwc_string_length(base->components.path) : 0; + } + length *= 4; + /* Plus space for removing dots from path */ + length += (m.query - m.path) + ((base->components.path != NULL) ? + lwc_string_length(base->components.path) : 0); + + buff = malloc(length + 5); + if (buff == NULL) { + return NSERROR_NOMEM; + } + + buff_pos = buff; + + /* Form joined URL from base or rel components, as appropriate */ + + if (joined_parts & NSURL_F_BASE_SCHEME) { + c.scheme_type = base->components.scheme_type; + + c.scheme = nsurl__component_copy(base->components.scheme); + } else { + c.scheme_type = m.scheme_type; + + error = nsurl__create_from_section(rel, URL_SCHEME, &m, buff, &c); + if (error != NSERROR_OK) { + free(buff); + return error; + } + } + + if (joined_parts & NSURL_F_BASE_AUTHORITY) { + c.username = nsurl__component_copy(base->components.username); + c.password = nsurl__component_copy(base->components.password); + c.host = nsurl__component_copy(base->components.host); + c.port = nsurl__component_copy(base->components.port); + } else { + error = nsurl__create_from_section(rel, URL_CREDENTIALS, &m, + buff, &c); + if (error == NSERROR_OK) { + error = nsurl__create_from_section(rel, URL_HOST, &m, + buff, &c); + } + if (error != NSERROR_OK) { + free(buff); + return error; + } + } + + if (joined_parts & NSURL_F_BASE_PATH) { + c.path = nsurl__component_copy(base->components.path); + + } else if (joined_parts & NSURL_F_MERGED_PATH) { + struct url_markers m_path; + size_t new_length; + + if (base->components.host != NULL && + base->components.path == NULL) { + /* Append 
relative path to "/". */ + *(buff_pos++) = '/'; + memcpy(buff_pos, rel + m.path, m.query - m.path); + buff_pos += m.query - m.path; + + } else { + /* Append relative path to all but last segment of + * base path. */ + size_t path_end = lwc_string_length( + base->components.path); + const char *path = lwc_string_data( + base->components.path); + + while (*(path + path_end) != '/' && + path_end != 0) { + path_end--; + } + if (*(path + path_end) == '/') + path_end++; + + /* Copy the base part */ + memcpy(buff_pos, path, path_end); + buff_pos += path_end; + + /* Copy the relative part */ + memcpy(buff_pos, rel + m.path, m.query - m.path); + buff_pos += m.query - m.path; + } + + /* add termination to string */ + *buff_pos++ = '\0'; + + new_length = nsurl__remove_dot_segments(buff, buff_pos); + + m_path.path = 0; + m_path.query = new_length; + + buff_start = buff_pos + new_length; + error = nsurl__create_from_section(buff_pos, URL_PATH, &m_path, + buff_start, &c); + if (error != NSERROR_OK) { + free(buff); + return error; + } + + } else { + struct url_markers m_path; + size_t new_length; + + memcpy(buff_pos, rel + m.path, m.query - m.path); + buff_pos += m.query - m.path; + *(buff_pos++) = '\0'; + + new_length = nsurl__remove_dot_segments(buff, buff_pos); + + m_path.path = 0; + m_path.query = new_length; + + buff_start = buff_pos + new_length; + + error = nsurl__create_from_section(buff_pos, URL_PATH, &m_path, + buff_start, &c); + if (error != NSERROR_OK) { + free(buff); + return error; + } + } + + if (joined_parts & NSURL_F_BASE_QUERY) { + c.query = nsurl__component_copy(base->components.query); + } else { + error = nsurl__create_from_section(rel, URL_QUERY, &m, + buff, &c); + if (error != NSERROR_OK) { + free(buff); + return error; + } + } + + error = nsurl__create_from_section(rel, URL_FRAGMENT, &m, buff, &c); + + /* Free temporary buffer */ + free(buff); + + if (error != NSERROR_OK) { + return error; + } + + /* Get the string length and find which parts of url are 
present */ + nsurl__get_string_data(&c, NSURL_WITH_FRAGMENT, &length, + &str_len, &str_flags); + + /* Create NetSurf URL object */ + *joined = malloc(sizeof(nsurl) + length + 1); /* Add 1 for \0 */ + if (*joined == NULL) { + return NSERROR_NOMEM; + } + + (*joined)->components = c; + (*joined)->length = length; + + /* Fill out the url string */ + nsurl__get_string(&c, (*joined)->string, &str_len, str_flags); + + /* Get the nsurl's hash */ + nsurl__calc_hash(*joined); + + /* Give the URL a reference */ + (*joined)->count = 1; + + return NSERROR_OK; +} + diff --git a/utils/nsurl/private.h b/utils/nsurl/private.h index b8132c535..f8ba51f67 100644 --- a/utils/nsurl/private.h +++ b/utils/nsurl/private.h @@ -21,6 +21,7 @@ #include +#include "utils/nsurl.h" #include "utils/utils.h" /** A type for URL schemes */ @@ -72,4 +73,71 @@ struct nsurl { char string[FLEX_ARRAY_LEN_DECL]; /* Full URL as a string */ }; + +/** Lengths of the individual components of a URL string */ +struct nsurl_component_lengths { + size_t scheme; + size_t username; + size_t password; + size_t host; + size_t port; + size_t path; + size_t query; + size_t fragment; +}; + + +/** Flags indicating which parts of a URL string are required for a nsurl */ +enum nsurl_string_flags { + NSURL_F_SCHEME = (1 << 0), + NSURL_F_SCHEME_PUNCTUATION = (1 << 1), + NSURL_F_AUTHORITY_PUNCTUATION = (1 << 2), + NSURL_F_USERNAME = (1 << 3), + NSURL_F_PASSWORD = (1 << 4), + NSURL_F_CREDENTIALS_PUNCTUATION = (1 << 5), + NSURL_F_HOST = (1 << 6), + NSURL_F_PORT = (1 << 7), + NSURL_F_AUTHORITY = (NSURL_F_USERNAME | + NSURL_F_PASSWORD | + NSURL_F_HOST | + NSURL_F_PORT), + NSURL_F_PATH = (1 << 8), + NSURL_F_QUERY = (1 << 9), + NSURL_F_FRAGMENT_PUNCTUATION = (1 << 10), + NSURL_F_FRAGMENT = (1 << 11) +}; + +/** + * Write the URL string built from the components selected by the string flags + * + * \param url NetSurf URL components + * \param url_s Updated to contain the string + * \param l Individual component lengths + * 
\param flags String flags + */ +void nsurl__get_string(const struct nsurl_components *url, char *url_s, + struct nsurl_component_lengths *l, + enum nsurl_string_flags flags); + +/** + * Get nsurl string info; total length, component lengths, & components present + * + * \param url NetSurf URL components + * \param parts Which parts of the URL are required in the string + * \param url_l Updated to total string length + * \param lengths Updated with individual component lengths + * \param pflags Updated to contain relevant string flags + */ +void nsurl__get_string_data(const struct nsurl_components *url, + nsurl_component parts, size_t *url_l, + struct nsurl_component_lengths *lengths, + enum nsurl_string_flags *pflags); + +/** + * Calculate hash value + * + * \param url NetSurf URL object to set hash value for + */ +void nsurl__calc_hash(nsurl *url); + #endif -- cgit v1.2.3