From 95e5ede775fc842324b87d797ca00c7576b9a359 Mon Sep 17 00:00:00 2001 From: Michael Drake Date: Sun, 15 Jan 2017 15:09:36 +0000 Subject: nsurl: Split out URL parsing. --- utils/nsurl/nsurl.c | 1528 +-------------------------------------------------- 1 file changed, 5 insertions(+), 1523 deletions(-) (limited to 'utils/nsurl/nsurl.c') diff --git a/utils/nsurl/nsurl.c b/utils/nsurl/nsurl.c index bc77c58ea..b60eb07ca 100644 --- a/utils/nsurl/nsurl.c +++ b/utils/nsurl/nsurl.c @@ -50,71 +50,6 @@ #undef NSURL_DEBUG -/** Marker set, indicating positions of sections within a URL string */ -struct url_markers { - size_t start; /** start of URL */ - size_t scheme_end; - size_t authority; - - size_t colon_first; - size_t at; - size_t colon_last; - - size_t path; - size_t query; - size_t fragment; - - size_t end; /** end of URL */ - - enum nsurl_scheme_type scheme_type; -}; - - -/** Marker set, indicating positions of sections within a URL string */ -struct nsurl_component_lengths { - size_t scheme; - size_t username; - size_t password; - size_t host; - size_t port; - size_t path; - size_t query; - size_t fragment; -}; - - -/** Flags indicating which parts of a URL string are required for a nsurl */ -enum nsurl_string_flags { - NSURL_F_SCHEME = (1 << 0), - NSURL_F_SCHEME_PUNCTUATION = (1 << 1), - NSURL_F_AUTHORITY_PUNCTUATION = (1 << 2), - NSURL_F_USERNAME = (1 << 3), - NSURL_F_PASSWORD = (1 << 4), - NSURL_F_CREDENTIALS_PUNCTUATION = (1 << 5), - NSURL_F_HOST = (1 << 6), - NSURL_F_PORT = (1 << 7), - NSURL_F_AUTHORITY = (NSURL_F_USERNAME | - NSURL_F_PASSWORD | - NSURL_F_HOST | - NSURL_F_PORT), - NSURL_F_PATH = (1 << 8), - NSURL_F_QUERY = (1 << 9), - NSURL_F_FRAGMENT_PUNCTUATION = (1 << 10), - NSURL_F_FRAGMENT = (1 << 11) -}; - - -/** Sections of a URL */ -enum url_sections { - URL_SCHEME, - URL_CREDENTIALS, - URL_HOST, - URL_PATH, - URL_QUERY, - URL_FRAGMENT -}; - - #define nsurl__component_copy(c) (c == NULL) ? NULL : lwc_string_ref(c) #define nsurl__component_compare(c1, c2, match) \ @@ -125,1129 +60,6 @@ enum url_sections { *match = false; \ } -/** - * Return a hex digit for the given numerical value. - * - * \param digit the value to get the hex digit for. - * \return character in range 0-9A-F - */ -inline static char digit2uppercase_hex(unsigned char digit) { - assert(digit < 16); - return "0123456789ABCDEF"[digit]; -} - -/** - * determine if a character is unreserved - * - * \param c character to classify. - * \return true if the character is unreserved else false. - */ -static bool nsurl__is_unreserved(unsigned char c) -{ - /* From RFC3986 section 2.3 (unreserved characters) - * - * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" - * - */ - static const bool unreserved[256] = { - false, false, false, false, false, false, false, false, /* 00 */ - false, false, false, false, false, false, false, false, /* 08 */ - false, false, false, false, false, false, false, false, /* 10 */ - false, false, false, false, false, false, false, false, /* 18 */ - false, false, false, false, false, false, false, false, /* 20 */ - false, false, false, false, false, true, true, false, /* 28 */ - true, true, true, true, true, true, true, true, /* 30 */ - true, true, false, false, false, false, false, false, /* 38 */ - false, true, true, true, true, true, true, true, /* 40 */ - true, true, true, true, true, true, true, true, /* 48 */ - true, true, true, true, true, true, true, true, /* 50 */ - true, true, true, false, false, false, false, true, /* 58 */ - false, true, true, true, true, true, true, true, /* 60 */ - true, true, true, true, true, true, true, true, /* 68 */ - true, true, true, true, true, true, true, true, /* 70 */ - true, true, true, false, false, false, true, false, /* 78 */ - false, false, false, false, false, false, false, false, /* 80 */ - false, false, false, false, false, false, false, false, /* 88 */ - false, false, false, false, false, false, false, false, /* 90 */ - false, false, false, false, false, false, false, false, /* 98 */ - false, false, false, false, false, false, false, false, /* A0 */ - false, false, false, false, false, false, false, false, /* A8 */ - false, false, false, false, false, false, false, false, /* B0 */ - false, false, false, false, false, false, false, false, /* B8 */ - false, false, false, false, false, false, false, false, /* C0 */ - false, false, false, false, false, false, false, false, /* C8 */ - false, false, false, false, false, false, false, false, /* D0 */ - false, false, false, false, false, false, false, false, /* D8 */ - false, false, false, false, false, false, false, false, /* E0 */ - false, false, false, false, false, false, false, false, /* E8 */ - false, false, false, false, false, false, false, false, /* F0 */ - false, false, false, false, false, false, false, false /* F8 */ - }; - return unreserved[c]; -} - -/** - * determine if a character should be percent escaped. - * - * The ASCII codes which should not be percent escaped - * - * \param c character to classify. - * \return true if the character should not be escaped else false. - */ -static bool nsurl__is_no_escape(unsigned char c) -{ - static const bool no_escape[256] = { - false, false, false, false, false, false, false, false, /* 00 */ - false, false, false, false, false, false, false, false, /* 08 */ - false, false, false, false, false, false, false, false, /* 10 */ - false, false, false, false, false, false, false, false, /* 18 */ - false, true, false, true, true, false, true, true, /* 20 */ - true, true, true, true, true, true, true, true, /* 28 */ - true, true, true, true, true, true, true, true, /* 30 */ - true, true, true, true, false, true, false, true, /* 38 */ - true, true, true, true, true, true, true, true, /* 40 */ - true, true, true, true, true, true, true, true, /* 48 */ - true, true, true, true, true, true, true, true, /* 50 */ - true, true, true, true, false, true, false, true, /* 58 */ - false, true, true, true, true, true, true, true, /* 60 */ - true, true, true, true, true, true, true, true, /* 68 */ - true, true, true, true, true, true, true, true, /* 70 */ - true, true, true, false, true, false, true, false, /* 78 */ - false, false, false, false, false, false, false, false, /* 80 */ - false, false, false, false, false, false, false, false, /* 88 */ - false, false, false, false, false, false, false, false, /* 90 */ - false, false, false, false, false, false, false, false, /* 98 */ - false, false, false, false, false, false, false, false, /* A0 */ - false, false, false, false, false, false, false, false, /* A8 */ - false, false, false, false, false, false, false, false, /* B0 */ - false, false, false, false, false, false, false, false, /* B8 */ - false, false, false, false, false, false, false, false, /* C0 */ - false, false, false, false, false, false, false, false, /* C8 */ - false, false, false, false, false, false, false, false, /* D0 */ - false, false, false, false, false, false, false, false, /* D8 */ - false, false, false, false, false, false, false, false, /* E0 */ - false, false, false, false, false, false, false, false, /* E8 */ - false, false, false, false, false, false, false, false, /* F0 */ - false, false, false, false, false, false, false, false, /* F8 */ - }; - return no_escape[c]; -} - - -/** - * Obtains a set of markers delimiting sections in a URL string - * - * \param url_s URL string - * \param markers Updated to mark sections in the URL string - * \param joining True iff URL string is a relative URL for joining - */ -static void nsurl__get_string_markers(const char * const url_s, - struct url_markers *markers, bool joining) -{ - const char *pos = url_s; /** current position in url_s */ - bool is_http = false; - bool trailing_whitespace = false; - - /* Initialise marker set */ - struct url_markers marker = { 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, NSURL_SCHEME_OTHER }; - - /* Skip any leading whitespace in url_s */ - while (ascii_is_space(*pos)) - pos++; - - /* Record start point */ - marker.start = pos - url_s; - - marker.scheme_end = marker.authority = marker.colon_first = marker.at = - marker.colon_last = marker.path = marker.start; - - if (*pos == '\0') { - /* Nothing but whitespace, early exit */ - marker.query = marker.fragment = marker.end = marker.path; - *markers = marker; - return; - } - - /* Get scheme */ - if (ascii_is_alpha(*pos)) { - pos++; - - while (*pos != ':' && *pos != '\0') { - if (!ascii_is_alphanumerical(*pos) && (*pos != '+') && - (*pos != '-') && (*pos != '.')) { - /* This character is not valid in the - * scheme */ - break; - } - pos++; - } - - if (*pos == ':') { - /* This delimits the end of the scheme */ - size_t off; - - marker.scheme_end = pos - url_s; - - off = marker.scheme_end - marker.start; - - /* Detect http(s) and mailto for scheme specifc - * normalisation */ - if (off == SLEN("http") && - (((*(pos - off + 0) == 'h') || - (*(pos - off + 0) == 'H')) && - ((*(pos - off + 1) == 't') || - (*(pos - off + 1) == 'T')) && - ((*(pos - off + 2) == 't') || - (*(pos - off + 2) == 'T')) && - ((*(pos - off + 3) == 'p') || - (*(pos - off + 3) == 'P')))) { - marker.scheme_type = NSURL_SCHEME_HTTP; - is_http = true; - } else if (off == SLEN("https") && - (((*(pos - off + 0) == 'h') || - (*(pos - off + 0) == 'H')) && - ((*(pos - off + 1) == 't') || - (*(pos - off + 1) == 'T')) && - ((*(pos - off + 2) == 't') || - (*(pos - off + 2) == 'T')) && - ((*(pos - off + 3) == 'p') || - (*(pos - off + 3) == 'P')) && - ((*(pos - off + 4) == 's') || - (*(pos - off + 4) == 'S')))) { - marker.scheme_type = NSURL_SCHEME_HTTPS; - is_http = true; - } else if (off == SLEN("ftp") && - (((*(pos - off + 0) == 'f') || - (*(pos - off + 0) == 'F')) && - ((*(pos - off + 1) == 't') || - (*(pos - off + 1) == 'T')) && - ((*(pos - off + 2) == 'p') || - (*(pos - off + 2) == 'P')))) { - marker.scheme_type = NSURL_SCHEME_FTP; - } else if (off == SLEN("mailto") && - (((*(pos - off + 0) == 'm') || - (*(pos - off + 0) == 'M')) && - ((*(pos - off + 1) == 'a') || - (*(pos - off + 1) == 'A')) && - ((*(pos - off + 2) == 'i') || - (*(pos - off + 2) == 'I')) && - ((*(pos - off + 3) == 'l') || - (*(pos - off + 3) == 'L')) && - ((*(pos - off + 4) == 't') || - (*(pos - off + 4) == 'T')) && - ((*(pos - off + 5) == 'o') || - (*(pos - off + 5) == 'O')))) { - marker.scheme_type = NSURL_SCHEME_MAILTO; - } - - /* Skip over colon */ - pos++; - - /* Mark place as start of authority */ - marker.authority = marker.colon_first = marker.at = - marker.colon_last = marker.path = - pos - url_s; - - } else { - /* Not found a scheme */ - if (joining == false) { - /* Assuming no scheme == http */ - marker.scheme_type = NSURL_SCHEME_HTTP; - is_http = true; - } - } - } - - /* Get authority - * - * Two slashes always indicates the start of an authority. - * - * We are more relaxed in the case of http: - * a. when joining, one or more slashes indicates start of authority - * b. when not joining, we assume authority if no scheme was present - * and in the case of mailto: when we assume there is an authority. - */ - if ((*pos == '/' && *(pos + 1) == '/') || - (is_http && ((joining && *pos == '/') || - (joining == false && - marker.scheme_end != marker.start))) || - marker.scheme_type == NSURL_SCHEME_MAILTO) { - - /* Skip over leading slashes */ - if (*pos == '/') { - if (is_http == false) { - if (*pos == '/') pos++; - if (*pos == '/') pos++; - } else { - while (*pos == '/') - pos++; - } - - marker.authority = marker.colon_first = marker.at = - marker.colon_last = marker.path = - pos - url_s; - } - - /* Need to get (or complete) the authority */ - while (*pos != '\0') { - if (*pos == '/' || *pos == '?' || *pos == '#') { - /* End of the authority */ - break; - - } else if (marker.scheme_type != NSURL_SCHEME_MAILTO && - *pos == ':' && marker.colon_first == - marker.authority) { - /* could be username:password or host:port - * separator */ - marker.colon_first = pos - url_s; - - } else if (marker.scheme_type != NSURL_SCHEME_MAILTO && - *pos == ':' && marker.colon_first != - marker.authority) { - /* could be host:port separator */ - marker.colon_last = pos - url_s; - - } else if (*pos == '@' && marker.at == - marker.authority) { - /* Credentials @ host separator */ - marker.at = pos - url_s; - } - - pos++; - } - - marker.path = pos - url_s; - - } else if ((*pos == '\0' || *pos == '/') && - joining == false && is_http == true) { - marker.path = pos - url_s; - } - - /* Get path - * - * Needs to start with '/' if there's no authority - */ - if (*pos == '/' || ((marker.path == marker.authority) && - (*pos != '?') && (*pos != '#') && (*pos != '\0'))) { - while (*(++pos) != '\0') { - if (*pos == '?' || *pos == '#') { - /* End of the path */ - break; - } - } - } - - marker.query = pos - url_s; - - /* Get query */ - if (*pos == '?') { - while (*(++pos) != '\0') { - if (*pos == '#') { - /* End of the query */ - break; - } - } - } - - marker.fragment = pos - url_s; - - /* Get fragment */ - if (*pos == '#') { - while (*(++pos) != '\0') - ; - } - - /* We got to the end of url_s. - * Need to skip back over trailing whitespace to find end of URL */ - pos--; - if (pos >= url_s && ascii_is_space(*pos)) { - trailing_whitespace = true; - while (pos >= url_s && ascii_is_space(*pos)) - pos--; - } - - marker.end = pos + 1 - url_s; - - if (trailing_whitespace == true) { - /* Ensure last url section doesn't pass end */ - if (marker.fragment > marker.end) - marker.fragment = marker.end; - if (marker.query > marker.end) - marker.query = marker.end; - if (marker.path > marker.end) - marker.path = marker.end; - if (marker.colon_last > marker.end) - marker.colon_last = marker.end; - if (marker.at > marker.end) - marker.at = marker.end; - if (marker.colon_last > marker.end) - marker.colon_last = marker.end; - if (marker.fragment > marker.end) - marker.fragment = marker.end; - } - -#ifdef NSURL_DEBUG - LOG("marker.start: %i", marker.start); - LOG("marker.scheme_end: %i", marker.scheme_end); - LOG("marker.authority: %i", marker.authority); - - LOG("marker.colon_first: %i", marker.colon_first); - LOG("marker.at: %i", marker.at); - LOG("marker.colon_last: %i", marker.colon_last); - - LOG("marker.path: %i", marker.path); - LOG("marker.query: %i", marker.query); - LOG("marker.fragment: %i", marker.fragment); - - LOG("marker.end: %i", marker.end); -#endif - - /* Got all the URL components pegged out now */ - *markers = marker; -} - - -/** - * Remove dot segments from a path, as per rfc 3986, 5.2.4 - * - * \param path path to remove dot segments from ('\0' terminated) - * \param output path with dot segments removed - * \return size of output - */ -static size_t nsurl__remove_dot_segments(char *path, char *output) -{ - char *path_pos = path; - char *output_pos = output; - - while (*path_pos != '\0') { -#ifdef NSURL_DEBUG - LOG(" in:%s", path_pos); - LOG("out:%.*s", output_pos - output, output); -#endif - if (*path_pos == '.') { - if (*(path_pos + 1) == '.' && - *(path_pos + 2) == '/') { - /* Found prefix of "../" */ - path_pos += SLEN("../"); - continue; - - } else if (*(path_pos + 1) == '/') { - /* Found prefix of "./" */ - path_pos += SLEN("./"); - continue; - } - } else if (*path_pos == '/' && *(path_pos + 1) == '.') { - if (*(path_pos + 2) == '/') { - /* Found prefix of "/./" */ - path_pos += SLEN("/."); - continue; - - } else if (*(path_pos + 2) == '\0') { - /* Found "/." at end of path */ - *(output_pos++) = '/'; - - /* End of input path */ - break; - - } else if (*(path_pos + 2) == '.') { - if (*(path_pos + 3) == '/') { - /* Found prefix of "/../" */ - path_pos += SLEN("/.."); - - if (output_pos > output) - output_pos--; - while (output_pos > output && - *output_pos != '/') - output_pos--; - - continue; - - } else if (*(path_pos + 3) == '\0') { - /* Found "/.." at end of path */ - - while (output_pos > output && - *(output_pos -1 ) !='/') - output_pos--; - - /* End of input path */ - break; - } - } - } else if (*path_pos == '.') { - if (*(path_pos + 1) == '\0') { - /* Found "." at end of path */ - - /* End of input path */ - break; - - } else if (*(path_pos + 1) == '.' && - *(path_pos + 2) == '\0') { - /* Found ".." at end of path */ - - /* End of input path */ - break; - } - } - /* Copy first character into output path */ - *output_pos++ = *path_pos++; - - /* Copy up to but not including next '/' */ - while ((*path_pos != '/') && (*path_pos != '\0')) - *output_pos++ = *path_pos++; - } - - return output_pos - output; -} - - -/** - * Get the length of the longest section - * - * \param m markers delimiting url sections in a string - * \return the length of the longest section - */ -static size_t nsurl__get_longest_section(struct url_markers *m) -{ - size_t length = m->scheme_end - m->start; /* scheme */ - - if (length < m->at - m->authority) /* credentials */ - length = m->at - m->authority; - - if (length < m->path - m->at) /* host */ - length = m->path - m->at; - - if (length < m->query - m->path) /* path */ - length = m->query - m->path; - - if (length < m->fragment - m->query) /* query */ - length = m->fragment - m->query; - - if (length < m->end - m->fragment) /* fragment */ - length = m->end - m->fragment; - - return length; -} - - -/** - * Converts two hexadecimal digits to a single number - * - * \param c1 most significant hex digit - * \param c2 least significant hex digit - * \return the total value of the two digit hex number, or -ve if input not hex - * - * For unescaping url encoded characters. - */ -static inline int nsurl__get_ascii_offset(char c1, char c2) -{ - int offset; - - /* Use 1st char as most significant hex digit */ - if (ascii_is_digit(c1)) - offset = 16 * (c1 - '0'); - else if (c1 >= 'a' && c1 <= 'f') - offset = 16 * (c1 - 'a' + 10); - else if (c1 >= 'A' && c1 <= 'F') - offset = 16 * (c1 - 'A' + 10); - else - /* Not valid hex */ - return -1; - - /* Use 2nd char as least significant hex digit and sum */ - if (ascii_is_digit(c2)) - offset += c2 - '0'; - else if (c2 >= 'a' && c2 <= 'f') - offset += c2 - 'a' + 10; - else if (c2 >= 'A' && c2 <= 'F') - offset += c2 - 'A' + 10; - else - /* Not valid hex */ - return -1; - - return offset; -} - - -/** - * Create the components of a NetSurf URL object for a section of a URL string - * - * \param url_s URL string - * \param section Sets which section of URL string is to be normalised - * \param pegs Set of markers delimiting the URL string's sections - * \param pos_norm A buffer large enough for the normalised string (*3 + 1) - * \param url A NetSurf URL object, to which components may be added - * \return NSERROR_OK on success, appropriate error otherwise - * - * The section of url_s is normalised appropriately. - */ -static nserror nsurl__create_from_section(const char * const url_s, - const enum url_sections section, - const struct url_markers *pegs, - char *pos_norm, - struct nsurl_components *url) -{ - nserror ret; - int ascii_offset; - int start = 0; - int end = 0; - const char *pos; - const char *pos_url_s; - char *norm_start = pos_norm; - char *host; - size_t copy_len; - size_t length; - size_t host_len; - enum { - NSURL_F_NO_PORT = (1 << 0) - } flags = 0; - - switch (section) { - case URL_SCHEME: - start = pegs->start; - end = pegs->scheme_end; - break; - - case URL_CREDENTIALS: - start = pegs->authority; - end = pegs->at; - break; - - case URL_HOST: - start = (pegs->at == pegs->authority && - *(url_s + pegs->at) != '@') ? - pegs->at : - pegs->at + 1; - end = pegs->path; - break; - - case URL_PATH: - start = pegs->path; - end = pegs->query; - break; - - case URL_QUERY: - start = pegs->query; - end = pegs->fragment; - break; - - case URL_FRAGMENT: - start = (*(url_s + pegs->fragment) != '#') ? - pegs->fragment : - pegs->fragment + 1; - end = pegs->end; - break; - } - - if (end < start) - end = start; - - length = end - start; - - /* Stage 1: Normalise the required section */ - - pos = pos_url_s = url_s + start; - copy_len = 0; - for (; pos < url_s + end; pos++) { - if (*pos == '%' && (pos + 2 < url_s + end)) { - /* Might be an escaped character needing unescaped */ - - /* Find which character which was escaped */ - ascii_offset = nsurl__get_ascii_offset(*(pos + 1), - *(pos + 2)); - - if (ascii_offset < 0) { - /* % with invalid hex digits. */ - copy_len++; - continue; - } - - if ((section != URL_SCHEME && section != URL_HOST) && - (nsurl__is_unreserved(ascii_offset) == false)) { - /* This character should be escaped after all, - * just let it get copied */ - copy_len += 3; - pos += 2; - continue; - } - - if (copy_len > 0) { - /* Copy up to here */ - memcpy(pos_norm, pos_url_s, copy_len); - pos_norm += copy_len; - copy_len = 0; - } - - /* Put the unescaped character in the normalised URL */ - *(pos_norm++) = (char)ascii_offset; - pos += 2; - pos_url_s = pos + 1; - - length -= 2; - - } else if ((section != URL_SCHEME && section != URL_HOST) && - (nsurl__is_no_escape(*pos) == false)) { - - /* This needs to be escaped */ - if (copy_len > 0) { - /* Copy up to here */ - memcpy(pos_norm, pos_url_s, copy_len); - pos_norm += copy_len; - copy_len = 0; - } - - /* escape */ - *(pos_norm++) = '%'; - *(pos_norm++) = digit2uppercase_hex( - ((unsigned char)*pos) >> 4); - *(pos_norm++) = digit2uppercase_hex( - ((unsigned char)*pos) & 0xf); - pos_url_s = pos + 1; - - length += 2; - - } else if ((section == URL_SCHEME || section == URL_HOST) && - ascii_is_alpha_upper(*pos)) { - /* Lower case this letter */ - - if (copy_len > 0) { - /* Copy up to here */ - memcpy(pos_norm, pos_url_s, copy_len); - pos_norm += copy_len; - copy_len = 0; - } - /* Copy lower cased letter into normalised URL */ - *(pos_norm++) = ascii_to_lower(*pos); - pos_url_s = pos + 1; - - } else { - /* This character is safe in normalised URL */ - copy_len++; - } - } - - if (copy_len > 0) { - /* Copy up to here */ - memcpy(pos_norm, pos_url_s, copy_len); - pos_norm += copy_len; - } - - /* Mark end of section */ - (*pos_norm) = '\0'; - - /* Stage 2: Create the URL components for the required section */ - switch (section) { - case URL_SCHEME: - if (length == 0) { - /* No scheme, assuming http */ - url->scheme = lwc_string_ref(corestring_lwc_http); - } else { - /* Add scheme to URL */ - if (lwc_intern_string(norm_start, length, - &url->scheme) != lwc_error_ok) { - return NSERROR_NOMEM; - } - } - - break; - - case URL_CREDENTIALS: - url->username = NULL; - url->password = NULL; - - if (length != 0 && *norm_start != ':') { - char *sec_start = norm_start; - if (pegs->colon_first != pegs->authority && - pegs->at > pegs->colon_first + 1) { - /* there's a password */ - sec_start += pegs->colon_first - - pegs->authority + 1; - if (lwc_intern_string(sec_start, - pegs->at - pegs->colon_first -1, - &url->password) != - lwc_error_ok) { - return NSERROR_NOMEM; - } - - /* update start pos and length for username */ - sec_start = norm_start; - length -= pegs->at - pegs->colon_first; - } else if (pegs->colon_first != pegs->authority && - pegs->at == pegs->colon_first + 1) { - /* strip username colon */ - length--; - } - - /* Username */ - if (lwc_intern_string(sec_start, length, - &url->username) != lwc_error_ok) { - return NSERROR_NOMEM; - } - } - - break; - - case URL_HOST: - url->host = NULL; - url->port = NULL; - - if (length != 0) { - size_t colon = 0; - char *sec_start = norm_start; - if (pegs->at < pegs->colon_first && - pegs->colon_last == pegs->authority) { - /* There's one colon and it's after @ marker */ - colon = pegs->colon_first; - } else if (pegs->colon_last != pegs->authority) { - /* There's more than one colon */ - colon = pegs->colon_last; - } else { - /* There's no colon that could be a port - * separator */ - flags |= NSURL_F_NO_PORT; - } - - if (!(flags & NSURL_F_NO_PORT)) { - /* Determine whether colon is a port separator - */ - sec_start += colon - pegs->at; - while (++sec_start < norm_start + length) { - if (!ascii_is_digit(*sec_start)) { - /* Character after port isn't a - * digit; not a port separator - */ - flags |= NSURL_F_NO_PORT; - break; - } - } - } - - if (!(flags & NSURL_F_NO_PORT)) { - /* There's a port */ - size_t skip = (pegs->at == pegs->authority) ? - 1 : 0; - sec_start = norm_start + colon - pegs->at + - skip; - if (url->scheme != NULL && - url->scheme_type == - NSURL_SCHEME_HTTP && - length - - (colon - pegs->at + skip) == 2 && - *sec_start == '8' && - *(sec_start + 1) == '0') { - /* Scheme is http, and port is default - * (80) */ - flags |= NSURL_F_NO_PORT; - } - - if (length <= (colon - pegs->at + skip)) { - /* No space for a port after the colon - */ - flags |= NSURL_F_NO_PORT; - } - - /* Add non-redundant ports to NetSurf URL */ - sec_start = norm_start + colon - pegs->at + - skip; - if (!(flags & NSURL_F_NO_PORT) && - lwc_intern_string(sec_start, - length - - (colon - pegs->at + skip), - &url->port) != lwc_error_ok) { - return NSERROR_NOMEM; - } - - /* update length for host */ - skip = (pegs->at == pegs->authority) ? 0 : 1; - length = colon - pegs->at - skip; - } - - /* host */ - /* Encode host according to IDNA2008 */ - ret = idna_encode(norm_start, length, &host, &host_len); - if (ret == NSERROR_OK) { - /* valid idna encoding */ - if (lwc_intern_string(host, host_len, - &url->host) != lwc_error_ok) { - return NSERROR_NOMEM; - } - free(host); - } else { - /* fall back to straight interning */ - if (lwc_intern_string(norm_start, length, - &url->host) != lwc_error_ok) { - return NSERROR_NOMEM; - } - } - } - - break; - - case URL_PATH: - if (length != 0) { - if (lwc_intern_string(norm_start, length, - &url->path) != lwc_error_ok) { - return NSERROR_NOMEM; - } - } else if (url->host != NULL && - url->scheme_type != NSURL_SCHEME_MAILTO) { - /* Set empty path to "/", if there's a host */ - if (lwc_intern_string("/", SLEN("/"), - &url->path) != lwc_error_ok) { - return NSERROR_NOMEM; - } - } else { - url->path = NULL; - } - - break; - - case URL_QUERY: - if (length != 0) { - if (lwc_intern_string(norm_start, length, - &url->query) != lwc_error_ok) { - return NSERROR_NOMEM; - } - } else { - url->query = NULL; - } - - break; - - case URL_FRAGMENT: - if (length != 0) { - if (lwc_intern_string(norm_start, length, - &url->fragment) != lwc_error_ok) { - return NSERROR_NOMEM; - } - } else { - url->fragment = NULL; - } - - break; - } - - return NSERROR_OK; -} - - -/** - * Get nsurl string info; total length, component lengths, & components present - * - * \param url NetSurf URL components - * \param parts Which parts of the URL are required in the string - * \param url_l Updated to total string length - * \param lengths Updated with individual component lengths - * \param pflags Updated to contain relevant string flags - */ -static void nsurl__get_string_data(const struct nsurl_components *url, - nsurl_component parts, size_t *url_l, - struct nsurl_component_lengths *lengths, - enum nsurl_string_flags *pflags) -{ - enum nsurl_string_flags flags = *pflags; - *url_l = 0; - - /* Intersection of required parts and available parts gives - * the output parts */ - if (url->scheme && parts & NSURL_SCHEME) { - flags |= NSURL_F_SCHEME; - - lengths->scheme = lwc_string_length(url->scheme); - *url_l += lengths->scheme; - } - - if (url->username && parts & NSURL_USERNAME) { - flags |= NSURL_F_USERNAME; - - lengths->username = lwc_string_length(url->username); - *url_l += lengths->username; - } - - if (url->password && parts & NSURL_PASSWORD) { - flags |= NSURL_F_PASSWORD; - - lengths->password = lwc_string_length(url->password); - *url_l += SLEN(":") + lengths->password; - } - - if (url->host && parts & NSURL_HOST) { - flags |= NSURL_F_HOST; - - lengths->host = lwc_string_length(url->host); - *url_l += lengths->host; - } - - if (url->port && parts & NSURL_PORT) { - flags |= NSURL_F_PORT; - - lengths->port = lwc_string_length(url->port); - *url_l += SLEN(":") + lengths->port; - } - - if (url->path && parts & NSURL_PATH) { - flags |= NSURL_F_PATH; - - lengths->path = lwc_string_length(url->path); - *url_l += lengths->path; - } - - if (url->query && parts & NSURL_QUERY) { - flags |= NSURL_F_QUERY; - - lengths->query = lwc_string_length(url->query); - *url_l += lengths->query; - } - - if (url->fragment && parts & NSURL_FRAGMENT) { - flags |= NSURL_F_FRAGMENT; - - lengths->fragment = lwc_string_length(url->fragment); - *url_l += lengths->fragment; - } - - /* Turn on any spanned punctuation */ - if ((flags & NSURL_F_SCHEME) && (parts > NSURL_SCHEME)) { - flags |= NSURL_F_SCHEME_PUNCTUATION; - - *url_l += SLEN(":"); - } - - if ((flags & NSURL_F_SCHEME) && (flags > NSURL_F_SCHEME) && - url->path && lwc_string_data(url->path)[0] == '/') { - flags |= NSURL_F_AUTHORITY_PUNCTUATION; - - *url_l += SLEN("//"); - } - - if ((flags & (NSURL_F_USERNAME | NSURL_F_PASSWORD)) && - flags & NSURL_F_HOST) { - flags |= NSURL_F_CREDENTIALS_PUNCTUATION; - - *url_l += SLEN("@"); - } - - if ((flags & ~NSURL_F_FRAGMENT) && (flags & NSURL_F_FRAGMENT)) { - flags |= NSURL_F_FRAGMENT_PUNCTUATION; - - *url_l += SLEN("#"); - } - - *pflags = flags; -} - - -/** - * Get nsurl string info; total length, component lengths, & components present - * - * \param url NetSurf URL components - * \param url_s Updated to contain the string - * \param l Individual component lengths - * \param flags String flags - */ -static void nsurl_get_string(const struct nsurl_components *url, char *url_s, - struct nsurl_component_lengths *l, - enum nsurl_string_flags flags) -{ - char *pos; - - /* Copy the required parts into the url string */ - pos = url_s; - - if (flags & NSURL_F_SCHEME) { - memcpy(pos, lwc_string_data(url->scheme), l->scheme); - pos += l->scheme; - } - - if (flags & NSURL_F_SCHEME_PUNCTUATION) { - *(pos++) = ':'; - } - - if (flags & NSURL_F_AUTHORITY_PUNCTUATION) { - *(pos++) = '/'; - *(pos++) = '/'; - } - - if (flags & NSURL_F_USERNAME) { - memcpy(pos, lwc_string_data(url->username), l->username); - pos += l->username; - } - - if (flags & NSURL_F_PASSWORD) { - *(pos++) = ':'; - memcpy(pos, lwc_string_data(url->password), l->password); - pos += l->password; - } - - if (flags & NSURL_F_CREDENTIALS_PUNCTUATION) { - *(pos++) = '@'; - } - - if (flags & NSURL_F_HOST) { - memcpy(pos, lwc_string_data(url->host), l->host); - pos += l->host; - } - - if (flags & NSURL_F_PORT) { - *(pos++) = ':'; - memcpy(pos, lwc_string_data(url->port), l->port); - pos += l->port; - } - - if (flags & NSURL_F_PATH) { - memcpy(pos, lwc_string_data(url->path), l->path); - pos += l->path; - } - - if (flags & NSURL_F_QUERY) { - memcpy(pos, lwc_string_data(url->query), l->query); - pos += l->query; - } - - if (flags & NSURL_F_FRAGMENT) { - if (flags & NSURL_F_FRAGMENT_PUNCTUATION) - *(pos++) = '#'; - memcpy(pos, lwc_string_data(url->fragment), l->fragment); - pos += l->fragment; - } - - *pos = '\0'; -} - - -/** - * Calculate hash value - * - * \param url NetSurf URL object to set hash value for - */ -static void nsurl_calc_hash(nsurl *url) -{ - uint32_t hash = 0; - - if (url->components.scheme) - hash ^= lwc_string_hash_value(url->components.scheme); - - if (url->components.username) - hash ^= lwc_string_hash_value(url->components.username); - - if (url->components.password) - hash ^= lwc_string_hash_value(url->components.password); - - if (url->components.host) - hash ^= lwc_string_hash_value(url->components.host); - - if (url->components.port) - hash ^= lwc_string_hash_value(url->components.port); - - if (url->components.path) - hash ^= lwc_string_hash_value(url->components.path); - - if (url->components.query) - hash ^= lwc_string_hash_value(url->components.query); - - url->hash = hash; -} - /** * Destroy components @@ -1320,89 +132,6 @@ static void nsurl__dump(const nsurl *url) * NetSurf URL Public API * ******************************************************************************/ -/* exported interface, documented in nsurl.h */ -nserror nsurl_create(const char * const url_s, nsurl **url) -{ - struct url_markers m; - struct nsurl_components c; - size_t length; - char *buff; - struct nsurl_component_lengths str_len = { 0, 0, 0, 0, 0, 0, 0, 0 }; - enum nsurl_string_flags str_flags = 0; - nserror e = NSERROR_OK; - bool match; - - assert(url_s != NULL); - - /* Peg out the URL sections */ - nsurl__get_string_markers(url_s, &m, false); - - /* Get the length of the longest section */ - length = nsurl__get_longest_section(&m); - - /* Allocate enough memory to url escape the longest section */ - buff = malloc(length * 3 + 1); - if (buff == NULL) - return NSERROR_NOMEM; - - /* Set scheme type */ - c.scheme_type = m.scheme_type; - - /* Build NetSurf URL object from sections */ - e |= nsurl__create_from_section(url_s, URL_SCHEME, &m, buff, &c); - e |= nsurl__create_from_section(url_s, URL_CREDENTIALS, &m, buff, &c); - e |= nsurl__create_from_section(url_s, URL_HOST, &m, buff, &c); - e |= nsurl__create_from_section(url_s, URL_PATH, &m, buff, &c); - e |= nsurl__create_from_section(url_s, URL_QUERY, &m, buff, &c); - e |= nsurl__create_from_section(url_s, URL_FRAGMENT, &m, buff, &c); - - /* Finished with buffer */ - free(buff); - - if (e != NSERROR_OK) { - nsurl_destroy_components(&c); - return NSERROR_NOMEM; - } - - /* Validate URL */ - if ((lwc_string_isequal(c.scheme, corestring_lwc_http, - &match) == lwc_error_ok && match == true) || - (lwc_string_isequal(c.scheme, corestring_lwc_https, - &match) == lwc_error_ok && match == true)) { - /* http, https must have host */ - if (c.host == NULL) { - nsurl_destroy_components(&c); - return NSERROR_BAD_URL; - } - } - - /* Get the string length and find which parts of url are present */ - nsurl__get_string_data(&c, NSURL_WITH_FRAGMENT, &length, - &str_len, &str_flags); - - /* Create NetSurf URL object */ - *url = malloc(sizeof(nsurl) + length + 1); /* Add 1 for \0 */ - if (*url == NULL) { - nsurl_destroy_components(&c); - return NSERROR_NOMEM; - } - - (*url)->components = c; - (*url)->length = length; - - /* Fill out the url string */ - nsurl_get_string(&c, (*url)->string, &str_len, str_flags); - - /* Get the nsurl's hash */ - nsurl_calc_hash(*url); - - /* Give the URL a reference */ - (*url)->count = 1; - - return NSERROR_OK; -} - - /* exported interface, documented in nsurl.h */ nsurl *nsurl_ref(nsurl *url) { @@ -1539,7 +268,7 @@ nserror nsurl_get(const nsurl *url, nsurl_component parts, } /* Copy the required parts into the url string */ - nsurl_get_string(&(url->components), *url_s, &str_len, str_flags); + nsurl__get_string(&(url->components), *url_s, &str_len, str_flags); return NSERROR_OK; } @@ -1779,253 +508,6 @@ uint32_t nsurl_hash(const nsurl *url) } -/* exported interface, documented in nsurl.h */ -nserror nsurl_join(const nsurl *base, const char *rel, nsurl **joined) -{ - struct url_markers m; - struct nsurl_components c; - size_t length; - char *buff; - char *buff_pos; - char *buff_start; - struct nsurl_component_lengths str_len = { 0, 0, 0, 0, 0, 0, 0, 0 }; - enum nsurl_string_flags str_flags = 0; - nserror error = 0; - enum { - NSURL_F_REL = 0, - NSURL_F_BASE_SCHEME = (1 << 0), - NSURL_F_BASE_AUTHORITY = (1 << 1), - NSURL_F_BASE_PATH = (1 << 2), - NSURL_F_MERGED_PATH = (1 << 3), - NSURL_F_BASE_QUERY = (1 << 4) - } joined_parts; - - assert(base != NULL); - assert(rel != NULL); - -#ifdef NSURL_DEBUG - LOG("base: \"%s\", rel: \"%s\"", nsurl_access(base), rel); -#endif - - /* Peg out the URL sections */ - nsurl__get_string_markers(rel, &m, true); - - /* Get the length of the longest section */ - length = nsurl__get_longest_section(&m); - - /* Initially assume that the joined URL can be formed entierly from - * the relative URL. - */ - joined_parts = NSURL_F_REL; - - /* Update joined_compnents to indicate any required parts from the - * base URL. - */ - if (m.scheme_end - m.start <= 0) { - /* The relative url has no scheme. - * Use base URL's scheme. */ - joined_parts |= NSURL_F_BASE_SCHEME; - - if (m.path - m.authority <= 0) { - /* The relative URL has no authority. - * Use base URL's authority. */ - joined_parts |= NSURL_F_BASE_AUTHORITY; - - if (m.query - m.path <= 0) { - /* The relative URL has no path. - * Use base URL's path. */ - joined_parts |= NSURL_F_BASE_PATH; - - if (m.fragment - m.query <= 0) { - /* The relative URL has no query. - * Use base URL's query. */ - joined_parts |= NSURL_F_BASE_QUERY; - } - - } else if (*(rel + m.path) != '/') { - /* Relative URL has relative path */ - joined_parts |= NSURL_F_MERGED_PATH; - } - } - } - - /* Allocate enough memory to url escape the longest section, plus - * space for path merging (if required). - */ - if (joined_parts & NSURL_F_MERGED_PATH) { - /* Need to merge paths */ - length += (base->components.path != NULL) ? - lwc_string_length(base->components.path) : 0; - } - length *= 4; - /* Plus space for removing dots from path */ - length += (m.query - m.path) + ((base->components.path != NULL) ? - lwc_string_length(base->components.path) : 0); - - buff = malloc(length + 5); - if (buff == NULL) { - return NSERROR_NOMEM; - } - - buff_pos = buff; - - /* Form joined URL from base or rel components, as appropriate */ - - if (joined_parts & NSURL_F_BASE_SCHEME) { - c.scheme_type = base->components.scheme_type; - - c.scheme = nsurl__component_copy(base->components.scheme); - } else { - c.scheme_type = m.scheme_type; - - error = nsurl__create_from_section(rel, URL_SCHEME, &m, buff, &c); - if (error != NSERROR_OK) { - free(buff); - return error; - } - } - - if (joined_parts & NSURL_F_BASE_AUTHORITY) { - c.username = nsurl__component_copy(base->components.username); - c.password = nsurl__component_copy(base->components.password); - c.host = nsurl__component_copy(base->components.host); - c.port = nsurl__component_copy(base->components.port); - } else { - error = nsurl__create_from_section(rel, URL_CREDENTIALS, &m, - buff, &c); - if (error == NSERROR_OK) { - error = nsurl__create_from_section(rel, URL_HOST, &m, - buff, &c); - } - if (error != NSERROR_OK) { - free(buff); - return error; - } - } - - if (joined_parts & NSURL_F_BASE_PATH) { - c.path = nsurl__component_copy(base->components.path); - - } else if (joined_parts & NSURL_F_MERGED_PATH) { - struct url_markers m_path; - size_t new_length; - - if (base->components.host != NULL && - base->components.path == NULL) { - /* Append relative path to "/". */ - *(buff_pos++) = '/'; - memcpy(buff_pos, rel + m.path, m.query - m.path); - buff_pos += m.query - m.path; - - } else { - /* Append relative path to all but last segment of - * base path. */ - size_t path_end = lwc_string_length( - base->components.path); - const char *path = lwc_string_data( - base->components.path); - - while (*(path + path_end) != '/' && - path_end != 0) { - path_end--; - } - if (*(path + path_end) == '/') - path_end++; - - /* Copy the base part */ - memcpy(buff_pos, path, path_end); - buff_pos += path_end; - - /* Copy the relative part */ - memcpy(buff_pos, rel + m.path, m.query - m.path); - buff_pos += m.query - m.path; - } - - /* add termination to string */ - *buff_pos++ = '\0'; - - new_length = nsurl__remove_dot_segments(buff, buff_pos); - - m_path.path = 0; - m_path.query = new_length; - - buff_start = buff_pos + new_length; - error = nsurl__create_from_section(buff_pos, URL_PATH, &m_path, - buff_start, &c); - if (error != NSERROR_OK) { - free(buff); - return error; - } - - } else { - struct url_markers m_path; - size_t new_length; - - memcpy(buff_pos, rel + m.path, m.query - m.path); - buff_pos += m.query - m.path; - *(buff_pos++) = '\0'; - - new_length = nsurl__remove_dot_segments(buff, buff_pos); - - m_path.path = 0; - m_path.query = new_length; - - buff_start = buff_pos + new_length; - - error = nsurl__create_from_section(buff_pos, URL_PATH, &m_path, - buff_start, &c); - if (error != NSERROR_OK) { - free(buff); - return error; - } - } - - if (joined_parts & NSURL_F_BASE_QUERY) { - c.query = nsurl__component_copy(base->components.query); - } else { - error = nsurl__create_from_section(rel, URL_QUERY, &m, - buff, &c); - if (error != NSERROR_OK) { - free(buff); - return error; - } - } - - error = nsurl__create_from_section(rel, URL_FRAGMENT, &m, buff, &c); - - /* Free temporary buffer */ - free(buff); - - if (error != NSERROR_OK) { - return error; - } - - /* Get the string length and find which parts of url are present */ - nsurl__get_string_data(&c, NSURL_WITH_FRAGMENT, &length, - &str_len, &str_flags); - - /* Create NetSurf URL object */ - *joined = malloc(sizeof(nsurl) + length + 1); /* Add 1 for \0 */ - if (*joined == NULL) { - return NSERROR_NOMEM; - } - - (*joined)->components = c; - (*joined)->length = length; - - /* Fill out the url string */ - nsurl_get_string(&c, (*joined)->string, &str_len, str_flags); - - /* Get the nsurl's hash */ - nsurl_calc_hash(*joined); - - /* Give the URL a reference */ - (*joined)->count = 1; - - return NSERROR_OK; -} - - /* exported interface, documented in nsurl.h */ nserror nsurl_defragment(const nsurl *url, nsurl **no_frag) { @@ -2083,7 +565,7 @@ nserror nsurl_defragment(const nsurl *url, nsurl **no_frag) *pos = '\0'; /* Get the nsurl's hash */ - nsurl_calc_hash(*no_frag); + nsurl__calc_hash(*no_frag); /* Give the URL a reference */ (*no_frag)->count = 1; @@ -2151,7 +633,7 @@ nserror nsurl_refragment(const nsurl *url, lwc_string *frag, nsurl **new_url) (*new_url)->components.scheme_type = url->components.scheme_type; /* Get the nsurl's hash */ - nsurl_calc_hash(*new_url); + nsurl__calc_hash(*new_url); /* Give the URL a reference */ (*new_url)->count = 1; @@ -2238,7 +720,7 @@ nserror nsurl_replace_query(const nsurl *url, const char *query, (*new_url)->components.scheme_type = url->components.scheme_type; /* Get the nsurl's hash */ - nsurl_calc_hash(*new_url); + nsurl__calc_hash(*new_url); /* Give the URL a reference */ (*new_url)->count = 1; @@ -2441,7 +923,7 @@ nserror nsurl_parent(const nsurl *url, nsurl **new_url) (*new_url)->components.scheme_type = url->components.scheme_type; /* Get the nsurl's hash */ - nsurl_calc_hash(*new_url); + nsurl__calc_hash(*new_url); /* Give the URL a reference */ (*new_url)->count = 1; -- cgit v1.2.3