| author | Indrajith K L | 2022-12-03 17:00:20 +0530 |
|---|---|---|
| committer | Indrajith K L | 2022-12-03 17:00:20 +0530 |
| commit | f5c4671bfbad96bf346bd7e9a21fc4317b4959df (patch) | |
| tree | 2764fc62da58f2ba8da7ed341643fc359873142f /v_windows/v/vlib/net/urllib | |
Diffstat (limited to 'v_windows/v/vlib/net/urllib')

| Mode | File | Lines |
|---|---|---|
| -rw-r--r-- | v_windows/v/vlib/net/urllib/urllib.v | 1095 |
| -rw-r--r-- | v_windows/v/vlib/net/urllib/urllib_test.v | 51 |
| -rw-r--r-- | v_windows/v/vlib/net/urllib/values.v | 87 |

3 files changed, 1233 insertions, 0 deletions
diff --git a/v_windows/v/vlib/net/urllib/urllib.v b/v_windows/v/vlib/net/urllib/urllib.v new file mode 100644 index 0000000..3b02ef6 --- /dev/null +++ b/v_windows/v/vlib/net/urllib/urllib.v @@ -0,0 +1,1095 @@ +// urllib parses URLs and implements query escaping. +// See RFC 3986. This module generally follows RFC 3986, except where +// it deviates for compatibility reasons. +// Based off: https://github.com/golang/go/blob/master/src/net/url/url.go +// Last commit: https://github.com/golang/go/commit/fe2ed5054176935d4adcf13e891715ccf2ee3cce +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. +module urllib + +import strings + +enum EncodingMode { + encode_path + encode_path_segment + encode_host + encode_zone + encode_user_password + encode_query_component + encode_fragment +} + +const ( + err_msg_escape = 'unescape: invalid URL escape' + err_msg_parse = 'parse: failed parsing url' +) + +fn error_msg(message string, val string) string { + mut msg := 'net.urllib.$message' + if val != '' { + msg = '$msg ($val)' + } + return msg +} + +// Return true if the specified character should be escaped when +// appearing in a URL string, according to RFC 3986. +// +// Please be informed that for now should_escape does not check all +// reserved characters correctly. See golang.org/issue/5684. +fn should_escape(c byte, mode EncodingMode) bool { + // §2.3 Unreserved characters (alphanum) + if (`a` <= c && c <= `z`) || (`A` <= c && c <= `Z`) || (`0` <= c && c <= `9`) { + return false + } + if mode == .encode_host || mode == .encode_zone { + // §3.2.2 host allows + // sub-delims = `!` / `$` / `&` / ``` / `(` / `)` / `*` / `+` / `,` / `;` / `=` + // as part of reg-name. + // We add : because we include :port as part of host. + // We add [ ] because we include [ipv6]:port as part of host. + // We add < > because they`re the only characters left that + // we could possibly allow, and parse will reject them if we + // escape them (because hosts can`t use %-encoding for + // ASCII bytes). + if c in [`!`, `$`, `&`, `\\`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `:`, `[`, `]`, `<`, `>`, + `"`, + ] { + return false + } + } + match c { + `-`, `_`, `.`, `~` { + // §2.3 Unreserved characters (mark) + return false + } + `$`, `&`, `+`, `,`, `/`, `:`, `;`, `=`, `?`, `@` { + // §2.2 Reserved characters (reserved) + // Different sections of the URL allow a few of + // the reserved characters to appear unescaped. + match mode { + .encode_path { + // §3.3 + // The RFC allows : @ & = + $ but saves / ; , for assigning + // meaning to individual path segments. This package + // only manipulates the path as a whole, so we allow those + // last three as well. That leaves only ? to escape. + return c == `?` + } + .encode_path_segment { + // §3.3 + // The RFC allows : @ & = + $ but saves / ; , for assigning + // meaning to individual path segments. + return c == `/` || c == `;` || c == `,` || c == `?` + } + .encode_user_password { + // §3.2.1 + // The RFC allows `;`, `:`, `&`, `=`, `+`, `$`, and `,` in + // userinfo, so we must escape only `@`, `/`, and `?`. + // The parsing of userinfo treats `:` as special so we must escape + // that too. + return c == `@` || c == `/` || c == `?` || c == `:` + } + .encode_query_component { + // §3.4 + // The RFC reserves (so we must escape) everything. + return true + } + .encode_fragment { + // §4.1 + // The RFC text is silent but the grammar allows + // everything, so escape nothing. 
+ return false + } + else {} + } + } + else {} + } + if mode == .encode_fragment { + // RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are + // included in reserved from RFC 2396 §2.2. The remaining sub-delims do not + // need to be escaped. To minimize potential breakage, we apply two restrictions: + // (1) we always escape sub-delims outside of the fragment, and (2) we always + // escape single quote to avoid breaking callers that had previously assumed that + // single quotes would be escaped. See issue #19917. + match c { + `!`, `(`, `)`, `*` { return false } + else {} + } + } + // Everything else must be escaped. + return true +} + +// query_unescape does the inverse transformation of query_escape, +// converting each 3-byte encoded substring of the form '%AB' into the +// hex-decoded byte 0xAB. +// It returns an error if any % is not followed by two hexadecimal +// digits. +pub fn query_unescape(s string) ?string { + return unescape(s, .encode_query_component) +} + +// path_unescape does the inverse transformation of path_escape, +// converting each 3-byte encoded substring of the form '%AB' into the +// hex-decoded byte 0xAB. It returns an error if any % is not followed +// by two hexadecimal digits. +// +// path_unescape is identical to query_unescape except that it does not +// unescape '+' to ' ' (space). +pub fn path_unescape(s string) ?string { + return unescape(s, .encode_path_segment) +} + +// unescape unescapes a string; the mode specifies +// which section of the URL string is being unescaped. +fn unescape(s_ string, mode EncodingMode) ?string { + mut s := s_ + // Count %, check that they're well-formed. + mut n := 0 + mut has_plus := false + for i := 0; i < s.len; { + x := s[i] + match x { + `%` { + if s == '' { + break + } + n++ + if i + 2 >= s.len || !ishex(s[i + 1]) || !ishex(s[i + 2]) { + if mode == .encode_query_component && i + 1 < s.len { + s = s[..i] + '%25' + s[(i + 1)..] + i += 4 // skip the %25 and the next character + continue + } + s = s[i..] + if s.len > 3 { + s = s[..3] + } + return error(error_msg(urllib.err_msg_escape, s)) + } + // Per https://tools.ietf.org/html/rfc3986#page-21 + // in the host component %-encoding can only be used + // for non-ASCII bytes. + // But https://tools.ietf.org/html/rfc6874#section-2 + // introduces %25 being allowed to escape a percent sign + // in IPv6 scoped-address literals. Yay. + if i + 3 >= s.len && mode == .encode_host && unhex(s[i + 1]) < 8 + && s[i..i + 3] != '%25' { + return error(error_msg(urllib.err_msg_escape, s[i..i + 3])) + } + if mode == .encode_zone { + // RFC 6874 says basically 'anything goes' for zone identifiers + // and that even non-ASCII can be redundantly escaped, + // but it seems prudent to restrict %-escaped bytes here to those + // that are valid host name bytes in their unescaped form. + // That is, you can use escaping in the zone identifier but not + // to introduce bytes you couldn't just write directly. + // But Windows puts spaces here! Yay. 
+ if i + 3 >= s.len { + return error(error_msg('unescape: invalid escape sequence', '')) + } + v := ((unhex(s[i + 1]) << byte(4)) | unhex(s[i + 2])) + if s[i..i + 3] != '%25' && v != ` ` && should_escape(v, .encode_host) { + error(error_msg(urllib.err_msg_escape, s[i..i + 3])) + } + } + i += 3 + } + `+` { + has_plus = mode == .encode_query_component + i++ + } + else { + if (mode == .encode_host || mode == .encode_zone) && s[i] < 0x80 + && should_escape(s[i], mode) { + error(error_msg('unescape: invalid character in host name', s[i..i + 1])) + } + i++ + } + } + } + if n == 0 && !has_plus { + return s + } + if s.len < 2 * n { + return error(error_msg('unescape: invalid escape sequence', '')) + } + mut t := strings.new_builder(s.len - 2 * n) + for i := 0; i < s.len; i++ { + x := s[i] + match x { + `%` { + if i + 2 >= s.len { + return error(error_msg('unescape: invalid escape sequence', '')) + } + t.write_string(((unhex(s[i + 1]) << byte(4)) | unhex(s[i + 2])).ascii_str()) + i += 2 + } + `+` { + if mode == .encode_query_component { + t.write_string(' ') + } else { + t.write_string('+') + } + } + else { + t.write_string(s[i].ascii_str()) + } + } + } + return t.str() +} + +// query_escape escapes the string so it can be safely placed +// inside a URL query. +pub fn query_escape(s string) string { + return escape(s, .encode_query_component) +} + +// path_escape escapes the string so it can be safely placed inside a URL path segment, +// replacing special characters (including /) with %XX sequences as needed. +pub fn path_escape(s string) string { + return escape(s, .encode_path_segment) +} + +fn escape(s string, mode EncodingMode) string { + mut space_count := 0 + mut hex_count := 0 + mut c := byte(0) + for i in 0 .. s.len { + c = s[i] + if should_escape(c, mode) { + if c == ` ` && mode == .encode_query_component { + space_count++ + } else { + hex_count++ + } + } + } + if space_count == 0 && hex_count == 0 { + return s + } + buf := []byte{len: (64)} + mut t := []byte{} + required := s.len + 2 * hex_count + if required <= buf.len { + t = buf[..required] + } else { + t = []byte{len: required} + } + if hex_count == 0 { + copy(t, s.bytes()) + for i in 0 .. s.len { + if s[i] == ` ` { + t[i] = `+` + } + } + return t.bytestr() + } + upperhex := '0123456789ABCDEF' + mut j := 0 + for i in 0 .. s.len { + c1 := s[i] + if c1 == ` ` && mode == .encode_query_component { + t[j] = `+` + j++ + } else if should_escape(c1, mode) { + t[j] = `%` + t[j + 1] = upperhex[c1 >> 4] + t[j + 2] = upperhex[c1 & 15] + j += 3 + } else { + t[j] = s[i] + j++ + } + } + return t.bytestr() +} + +// A URL represents a parsed URL (technically, a URI reference). +// +// The general form represented is: +// +// [scheme:][//[userinfo@]host][/]path[?query][#fragment] +// +// URLs that do not start with a slash after the scheme are interpreted as: +// +// scheme:opaque[?query][#fragment] +// +// Note that the path field is stored in decoded form: /%47%6f%2f becomes /Go/. +// A consequence is that it is impossible to tell which slashes in the path were +// slashes in the raw URL and which were %2f. This distinction is rarely important, +// but when it is, the code should use raw_path, an optional field which only gets +// set if the default encoding is different from path. +// +// URL's String method uses the escaped_path method to obtain the path. See the +// escaped_path method for more details. 
+pub struct URL { +pub mut: + scheme string + opaque string // encoded opaque data + user &Userinfo // username and password information + host string // host or host:port + path string // path (relative paths may omit leading slash) + raw_path string // encoded path hint (see escaped_path method) + force_query bool // append a query ('?') even if raw_query is empty + raw_query string // encoded query values, without '?' + fragment string // fragment for references, without '#' +} + +// user returns a Userinfo containing the provided username +// and no password set. +pub fn user(username string) &Userinfo { + return &Userinfo{ + username: username + password: '' + password_set: false + } +} + +// user_password returns a Userinfo containing the provided username +// and password. +// +// This functionality should only be used with legacy web sites. +// RFC 2396 warns that interpreting Userinfo this way +// ``is NOT RECOMMENDED, because the passing of authentication +// information in clear text (such as URI) has proven to be a +// security risk in almost every case where it has been used.'' +fn user_password(username string, password string) &Userinfo { + return &Userinfo{username, password, true} +} + +// The Userinfo type is an immutable encapsulation of username and +// password details for a URL. An existing Userinfo value is guaranteed +// to have a username set (potentially empty, as allowed by RFC 2396), +// and optionally a password. +struct Userinfo { +pub: + username string + password string + password_set bool +} + +fn (u &Userinfo) empty() bool { + return isnil(u) || (u.username == '' && u.password == '') +} + +// string returns the encoded userinfo information in the standard form +// of 'username[:password]'. +fn (u &Userinfo) str() string { + if u.empty() { + return '' + } + mut s := escape(u.username, .encode_user_password) + if u.password_set { + s += ':' + escape(u.password, .encode_user_password) + } + return s +} + +// Maybe rawurl is of the form scheme:path. +// (scheme must be [a-zA-Z][a-zA-Z0-9+-.]*) +// If so, return [scheme, path]; else return ['', rawurl] +fn split_by_scheme(rawurl string) ?[]string { + for i in 0 .. rawurl.len { + c := rawurl[i] + if (`a` <= c && c <= `z`) || (`A` <= c && c <= `Z`) { + // do nothing + } else if (`0` <= c && c <= `9`) || (c == `+` || c == `-` || c == `.`) { + if i == 0 { + return ['', rawurl] + } + } else if c == `:` { + if i == 0 { + return error(error_msg('split_by_scheme: missing protocol scheme', '')) + } + return [rawurl[..i], rawurl[i + 1..]] + } else { + // we have encountered an invalid character, + // so there is no valid scheme + return ['', rawurl] + } + } + return ['', rawurl] +} + +fn get_scheme(rawurl string) ?string { + split := split_by_scheme(rawurl) or { return err.msg } + return split[0] +} + +// split slices s into two substrings separated by the first occurence of +// sep. If cutc is true then sep is included with the second substring. +// If sep does not occur in s then s and the empty string is returned. +fn split(s string, sep byte, cutc bool) (string, string) { + i := s.index_byte(sep) + if i < 0 { + return s, '' + } + if cutc { + return s[..i], s[i + 1..] + } + return s[..i], s[i..] +} + +// parse parses rawurl into a URL structure. +// +// The rawurl may be relative (a path, without a host) or absolute +// (starting with a scheme). Trying to parse a hostname and path +// without a scheme is invalid but may not necessarily return an +// error, due to parsing ambiguities. 
+pub fn parse(rawurl string) ?URL { + // Cut off #frag + u, frag := split(rawurl, `#`, true) + mut url := parse_url(u, false) or { return error(error_msg(urllib.err_msg_parse, u)) } + if frag == '' { + return url + } + f := unescape(frag, .encode_fragment) or { return error(error_msg(urllib.err_msg_parse, + u)) } + url.fragment = f + return url +} + +// parse_request_uri parses rawurl into a URL structure. It assumes that +// rawurl was received in an HTTP request, so the rawurl is interpreted +// only as an absolute URI or an absolute path. +// The string rawurl is assumed not to have a #fragment suffix. +// (Web browsers strip #fragment before sending the URL to a web server.) +fn parse_request_uri(rawurl string) ?URL { + return parse_url(rawurl, true) +} + +// parse_url parses a URL from a string in one of two contexts. If +// via_request is true, the URL is assumed to have arrived via an HTTP request, +// in which case only absolute URLs or path-absolute relative URLs are allowed. +// If via_request is false, all forms of relative URLs are allowed. +[manualfree] +fn parse_url(rawurl string, via_request bool) ?URL { + if string_contains_ctl_byte(rawurl) { + return error(error_msg('parse_url: invalid control character in URL', rawurl)) + } + if rawurl == '' && via_request { + return error(error_msg('parse_url: empty URL', rawurl)) + } + mut url := URL{ + user: 0 + } + if rawurl == '*' { + url.path = '*' + return url + } + // Split off possible leading 'http:', 'mailto:', etc. + // Cannot contain escaped characters. + p := split_by_scheme(rawurl) ? + url.scheme = p[0] + mut rest := p[1] + url.scheme = url.scheme.to_lower() + // if rest.ends_with('?') && strings.count(rest, '?') == 1 { + if rest.ends_with('?') && !rest[..1].contains('?') { + url.force_query = true + rest = rest[..rest.len - 1] + } else { + r, raw_query := split(rest, `?`, true) + rest = r + url.raw_query = raw_query + } + if !rest.starts_with('/') { + if url.scheme != '' { + // We consider rootless paths per RFC 3986 as opaque. + url.opaque = rest + return url + } + if via_request { + return error(error_msg('parse_url: invalid URI for request', '')) + } + // Avoid confusion with malformed schemes, like cache_object:foo/bar. + // See golang.org/issue/16822. + // + // RFC 3986, §3.3: + // In addition, a URI reference (Section 4.1) may be a relative-path reference, + // in which case the first path segment cannot contain a colon (':') character. + colon := rest.index(':') or { return error('there should be a : in the URL') } + slash := rest.index('/') or { return error('there should be a / in the URL') } + if colon >= 0 && (slash < 0 || colon < slash) { + // First path segment has colon. Not allowed in relative URL. + return error(error_msg('parse_url: first path segment in URL cannot contain colon', + '')) + } + } + if ((url.scheme != '' || !via_request) && !rest.starts_with('///')) && rest.starts_with('//') { + authority, r := split(rest[2..], `/`, false) + rest = r + a := parse_authority(authority) ? + url.user = a.user + url.host = a.host + } + // Set path and, optionally, raw_path. + // raw_path is a hint of the encoding of path. We don't want to set it if + // the default escaping of path is equivalent, to help make sure that people + // don't rely on it in general. + url.set_path(rest) ? 
+ return url +} + +struct ParseAuthorityRes { + user &Userinfo + host string +} + +fn parse_authority(authority string) ?ParseAuthorityRes { + i := authority.last_index('@') or { -1 } + mut host := '' + mut zuser := user('') + if i < 0 { + h := parse_host(authority) ? + host = h + } else { + h := parse_host(authority[i + 1..]) ? + host = h + } + if i < 0 { + return ParseAuthorityRes{ + host: host + user: zuser + } + } + mut userinfo := authority[..i] + if !valid_userinfo(userinfo) { + return error(error_msg('parse_authority: invalid userinfo', '')) + } + if !userinfo.contains(':') { + u := unescape(userinfo, .encode_user_password) ? + userinfo = u + zuser = user(userinfo) + } else { + mut username, mut password := split(userinfo, `:`, true) + u := unescape(username, .encode_user_password) ? + username = u + p := unescape(password, .encode_user_password) ? + password = p + zuser = user_password(username, password) + } + return ParseAuthorityRes{ + user: zuser + host: host + } +} + +// parse_host parses host as an authority without user +// information. That is, as host[:port]. +fn parse_host(host string) ?string { + if host.starts_with('[') { + // parse an IP-Literal in RFC 3986 and RFC 6874. + // E.g., '[fe80::1]', '[fe80::1%25en0]', '[fe80::1]:80'. + mut i := host.last_index(']') or { + return error(error_msg("parse_host: missing ']' in host", '')) + } + mut colon_port := host[i + 1..] + if !valid_optional_port(colon_port) { + return error(error_msg('parse_host: invalid port $colon_port after host ', + '')) + } + // RFC 6874 defines that %25 (%-encoded percent) introduces + // the zone identifier, and the zone identifier can use basically + // any %-encoding it likes. That's different from the host, which + // can only %-encode non-ASCII bytes. + // We do impose some restrictions on the zone, to avoid stupidity + // like newlines. + if zone := host[..i].index('%25') { + host1 := unescape(host[..zone], .encode_host) or { return err.msg } + host2 := unescape(host[zone..i], .encode_zone) or { return err.msg } + host3 := unescape(host[i..], .encode_host) or { return err.msg } + return host1 + host2 + host3 + } + if idx := host.last_index(':') { + colon_port = host[idx..] + if !valid_optional_port(colon_port) { + return error(error_msg('parse_host: invalid port $colon_port after host ', + '')) + } + } + } + h := unescape(host, .encode_host) or { return err.msg } + return h + // host = h + // return host +} + +// set_path sets the path and raw_path fields of the URL based on the provided +// escaped path p. It maintains the invariant that raw_path is only specified +// when it differs from the default encoding of the path. +// For example: +// - set_path('/foo/bar') will set path='/foo/bar' and raw_path='' +// - set_path('/foo%2fbar') will set path='/foo/bar' and raw_path='/foo%2fbar' +// set_path will return an error only if the provided path contains an invalid +// escaping. +pub fn (mut u URL) set_path(p string) ?bool { + path := unescape(p, .encode_path) ? + u.path = path + escp := escape(path, .encode_path) + if p == escp { + // Default encoding is fine. + u.raw_path = '' + } else { + u.raw_path = p + } + return true +} + +// escaped_path returns the escaped form of u.path. +// In general there are multiple possible escaped forms of any path. +// escaped_path returns u.raw_path when it is a valid escaping of u.path. +// Otherwise escaped_path ignores u.raw_path and computes an escaped +// form on its own. 
+// The String and request_uri methods use escaped_path to construct +// their results. +// In general, code should call escaped_path instead of +// reading u.raw_path directly. +pub fn (u &URL) escaped_path() string { + if u.raw_path != '' && valid_encoded_path(u.raw_path) { + unescape(u.raw_path, .encode_path) or { return '' } + return u.raw_path + } + if u.path == '*' { + return '*' // don't escape (Issue 11202) + } + return escape(u.path, .encode_path) +} + +// valid_encoded_path reports whether s is a valid encoded path. +// It must not contain any bytes that require escaping during path encoding. +fn valid_encoded_path(s string) bool { + for i in 0 .. s.len { + // RFC 3986, Appendix A. + // pchar = unreserved / pct-encoded / sub-delims / ':' / '@'. + // should_escape is not quite compliant with the RFC, + // so we check the sub-delims ourselves and let + // should_escape handle the others. + x := s[i] + match x { + `!`, `$`, `&`, `\\`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `:`, `@` { + // ok + } + `[`, `]` { + // ok - not specified in RFC 3986 but left alone by modern browsers + } + `%` { + // ok - percent encoded, will decode + } + else { + if should_escape(s[i], .encode_path) { + return false + } + } + } + } + return true +} + +// valid_optional_port reports whether port is either an empty string +// or matches /^:\d*$/ +fn valid_optional_port(port string) bool { + if port == '' { + return true + } + if port[0] != `:` { + return false + } + for b in port[1..] { + if b < `0` || b > `9` { + return false + } + } + return true +} + +// str reassembles the URL into a valid URL string. +// The general form of the result is one of: +// +// scheme:opaque?query#fragment +// scheme://userinfo@host/path?query#fragment +// +// If u.opaque is non-empty, String uses the first form; +// otherwise it uses the second form. +// Any non-ASCII characters in host are escaped. +// To obtain the path, String uses u.escaped_path(). +// +// In the second form, the following rules apply: +// - if u.scheme is empty, scheme: is omitted. +// - if u.user is nil, userinfo@ is omitted. +// - if u.host is empty, host/ is omitted. +// - if u.scheme and u.host are empty and u.user is nil, +// the entire scheme://userinfo@host/ is omitted. +// - if u.host is non-empty and u.path begins with a /, +// the form host/path does not add its own /. +// - if u.raw_query is empty, ?query is omitted. +// - if u.fragment is empty, #fragment is omitted. +pub fn (u URL) str() string { + mut buf := strings.new_builder(200) + if u.scheme != '' { + buf.write_string(u.scheme) + buf.write_string(':') + } + if u.opaque != '' { + buf.write_string(u.opaque) + } else { + if u.scheme != '' || u.host != '' || !u.user.empty() { + if u.host != '' || u.path != '' || !u.user.empty() { + buf.write_string('//') + } + if !u.user.empty() { + buf.write_string(u.user.str()) + buf.write_string('@') + } + if u.host != '' { + buf.write_string(escape(u.host, .encode_host)) + } + } + path := u.escaped_path() + if path != '' && path[0] != `/` && u.host != '' { + buf.write_string('/') + } + if buf.len == 0 { + // RFC 3986 §4.2 + // A path segment that contains a colon character (e.g., 'this:that') + // cannot be used as the first segment of a relative-path reference, as + // it would be mistaken for a scheme name. Such a segment must be + // preceded by a dot-segment (e.g., './this:that') to make a relative- + // path reference. 
+ i := path.index_byte(`:`) + if i > -1 { + // TODO remove this when autofree handles tmp + // expressions like this + if i > -1 && path[..i].index_byte(`/`) == -1 { + buf.write_string('./') + } + } + } + buf.write_string(path) + } + if u.force_query || u.raw_query != '' { + buf.write_string('?') + buf.write_string(u.raw_query) + } + if u.fragment != '' { + buf.write_string('#') + buf.write_string(escape(u.fragment, .encode_fragment)) + } + return buf.str() +} + +// Values maps a string key to a list of values. +// It is typically used for query parameters and form values. +// Unlike in the http.Header map, the keys in a Values map +// are case-sensitive. +// parseQuery parses the URL-encoded query string and returns +// a map listing the values specified for each key. +// parseQuery always returns a non-nil map containing all the +// valid query parameters found; err describes the first decoding error +// encountered, if any. +// +// Query is expected to be a list of key=value settings separated by +// ampersands or semicolons. A setting without an equals sign is +// interpreted as a key set to an empty value. +pub fn parse_query(query string) ?Values { + mut m := new_values() + parse_query_values(mut m, query) ? + return m +} + +// parse_query_silent is the same as parse_query +// but any errors will be silent +fn parse_query_silent(query string) Values { + mut m := new_values() + parse_query_values(mut m, query) or {} + return m +} + +fn parse_query_values(mut m Values, query string) ?bool { + mut had_error := false + mut q := query + for q != '' { + mut key := q + mut i := key.index_any('&;') + if i >= 0 { + q = key[i + 1..] + key = key[..i] + } else { + q = '' + } + if key == '' { + continue + } + mut value := '' + if idx := key.index('=') { + i = idx + value = key[i + 1..] + key = key[..i] + } + k := query_unescape(key) or { + had_error = true + continue + } + key = k + v := query_unescape(value) or { + had_error = true + continue + } + value = v + m.add(key, value) + } + if had_error { + return error(error_msg('parse_query_values: failed parsing query string', '')) + } + return true +} + +// encode encodes the values into ``URL encoded'' form +// ('bar=baz&foo=quux') sorted by key. +pub fn (v Values) encode() string { + if v.len == 0 { + return '' + } + mut buf := strings.new_builder(200) + mut keys := []string{} + for k, _ in v.data { + keys << k + } + keys.sort() + for k in keys { + vs := v.data[k] + key_kscaped := query_escape(k) + for _, val in vs.data { + if buf.len > 0 { + buf.write_string('&') + } + buf.write_string(key_kscaped) + buf.write_string('=') + buf.write_string(query_escape(val)) + } + } + return buf.str() +} + +// resolve_path applies special path segments from refs and applies +// them to base, per RFC 3986. +fn resolve_path(base string, ref string) string { + mut full := '' + if ref == '' { + full = base + } else if ref[0] != `/` { + i := base.last_index('/') or { -1 } + full = base[..i + 1] + ref + } else { + full = ref + } + if full == '' { + return '' + } + mut dst := []string{} + src := full.split('/') + for _, elem in src { + match elem { + '.' { + // drop + } + '..' { + if dst.len > 0 { + dst = dst[..dst.len - 1] + } + } + else { + dst << elem + } + } + } + last := src[src.len - 1] + if last == '.' || last == '..' { + // Add final slash to the joined path. + dst << '' + } + return '/' + dst.join('/').trim_left('/') +} + +// is_abs reports whether the URL is absolute. +// Absolute means that it has a non-empty scheme. 
+pub fn (u &URL) is_abs() bool { + return u.scheme != '' +} + +// parse parses a URL in the context of the receiver. The provided URL +// may be relative or absolute. parse returns nil, err on parse +// failure, otherwise its return value is the same as resolve_reference. +pub fn (u &URL) parse(ref string) ?URL { + refurl := parse(ref) ? + return u.resolve_reference(refurl) +} + +// resolve_reference resolves a URI reference to an absolute URI from +// an absolute base URI u, per RFC 3986 Section 5.2. The URI reference +// may be relative or absolute. resolve_reference always returns a new +// URL instance, even if the returned URL is identical to either the +// base or reference. If ref is an absolute URL, then resolve_reference +// ignores base and returns a copy of ref. +pub fn (u &URL) resolve_reference(ref &URL) ?URL { + mut url := *ref + if ref.scheme == '' { + url.scheme = u.scheme + } + if ref.scheme != '' || ref.host != '' || !ref.user.empty() { + // The 'absoluteURI' or 'net_path' cases. + // We can ignore the error from set_path since we know we provided a + // validly-escaped path. + url.set_path(resolve_path(ref.escaped_path(), '')) ? + return url + } + if ref.opaque != '' { + url.user = user('') + url.host = '' + url.path = '' + return url + } + if ref.path == '' && ref.raw_query == '' { + url.raw_query = u.raw_query + if ref.fragment == '' { + url.fragment = u.fragment + } + } + // The 'abs_path' or 'rel_path' cases. + url.host = u.host + url.user = u.user + url.set_path(resolve_path(u.escaped_path(), ref.escaped_path())) ? + return url +} + +// query parses raw_query and returns the corresponding values. +// It silently discards malformed value pairs. +// To check errors use parseQuery. +pub fn (u &URL) query() Values { + v := parse_query_silent(u.raw_query) + return v +} + +// request_uri returns the encoded path?query or opaque?query +// string that would be used in an HTTP request for u. +pub fn (u &URL) request_uri() string { + mut result := u.opaque + if result == '' { + result = u.escaped_path() + if result == '' { + result = '/' + } + } else { + if result.starts_with('//') { + result = u.scheme + ':' + result + } + } + if u.force_query || u.raw_query != '' { + result += '?' + u.raw_query + } + return result +} + +// hostname returns u.host, stripping any valid port number if present. +// +// If the result is enclosed in square brackets, as literal IPv6 addresses are, +// the square brackets are removed from the result. +pub fn (u &URL) hostname() string { + host, _ := split_host_port(u.host) + return host +} + +// port returns the port part of u.host, without the leading colon. +// If u.host doesn't contain a port, port returns an empty string. +pub fn (u &URL) port() string { + _, port := split_host_port(u.host) + return port +} + +// split_host_port separates host and port. If the port is not valid, it returns +// the entire input as host, and it doesn't check the validity of the host. +// Per RFC 3986, it requires ports to be numeric. +fn split_host_port(hostport string) (string, string) { + mut host := hostport + mut port := '' + colon := host.last_index_byte(`:`) + if colon != -1 { + if valid_optional_port(host[colon..]) { + port = host[colon + 1..] 
+ host = host[..colon] + } + } + if host.starts_with('[') && host.ends_with(']') { + host = host[1..host.len - 1] + } + return host, port +} + +// valid_userinfo reports whether s is a valid userinfo string per RFC 3986 +// Section 3.2.1: +// userinfo = *( unreserved / pct-encoded / sub-delims / ':' ) +// unreserved = ALPHA / DIGIT / '-' / '.' / '_' / '~' +// sub-delims = '!' / '$' / '&' / ''' / '(' / ')' +// / '*' / '+' / ',' / ';' / '=' +// +// It doesn't validate pct-encoded. The caller does that via fn unescape. +pub fn valid_userinfo(s string) bool { + for r in s { + if `A` <= r && r <= `Z` { + continue + } + if `a` <= r && r <= `z` { + continue + } + if `0` <= r && r <= `9` { + continue + } + match r { + `-`, `.`, `_`, `:`, `~`, `!`, `$`, `&`, `\\`, `(`, `)`, `*`, `+`, `,`, `;`, `=`, `%`, + `@` { + continue + } + else { + return false + } + } + } + return true +} + +// string_contains_ctl_byte reports whether s contains any ASCII control character. +fn string_contains_ctl_byte(s string) bool { + for i in 0 .. s.len { + b := s[i] + if b < ` ` || b == 0x7f { + return true + } + } + return false +} + +pub fn ishex(c byte) bool { + if `0` <= c && c <= `9` { + return true + } else if `a` <= c && c <= `f` { + return true + } else if `A` <= c && c <= `F` { + return true + } + return false +} + +fn unhex(c byte) byte { + if `0` <= c && c <= `9` { + return c - `0` + } else if `a` <= c && c <= `f` { + return c - `a` + 10 + } else if `A` <= c && c <= `F` { + return c - `A` + 10 + } + return 0 +} diff --git a/v_windows/v/vlib/net/urllib/urllib_test.v b/v_windows/v/vlib/net/urllib/urllib_test.v new file mode 100644 index 0000000..0870c81 --- /dev/null +++ b/v_windows/v/vlib/net/urllib/urllib_test.v @@ -0,0 +1,51 @@ +// Copyright (c) 2019-2021 Alexander Medvednikov. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +import net.urllib + +fn test_net_urllib() { + test_query := 'Hellö Wörld@vlang' + assert urllib.query_escape(test_query) == 'Hell%C3%B6+W%C3%B6rld%40vlang' + + test_url := 'https://joe:pass@www.mydomain.com:8080/som/url?param1=test1¶m2=test2&foo=bar#testfragment' + u := urllib.parse(test_url) or { + assert false + return + } + assert u.scheme == 'https' && u.hostname() == 'www.mydomain.com' && u.port() == '8080' + && u.path == '/som/url' && u.fragment == 'testfragment' && u.user.username == 'joe' + && u.user.password == 'pass' +} + +fn test_str() { + url := urllib.parse('https://en.wikipedia.org/wiki/Brazil_(1985_film)') or { + panic('unable to parse URL') + } + assert url.str() == 'https://en.wikipedia.org/wiki/Brazil_(1985_film)' +} + +fn test_escape_unescape() { + original := 'те ст: т\\%' + escaped := urllib.query_escape(original) + assert escaped == '%D1%82%D0%B5+%D1%81%D1%82%3A+%D1%82%5C%25' + unescaped := urllib.query_unescape(escaped) or { + assert false + return + } + assert unescaped == original +} + +fn test_parse_query() ? { + q1 := urllib.parse_query('format=%22%25l%3A+%25c+%25t%22') ? + q2 := urllib.parse_query('format="%l:+%c+%t"') ? + // dump(q1) + // dump(q2) + assert q1.data['format'].data == ['"%l: %c %t"'] + assert q2.data['format'].data == ['"%l: %c %t"'] +} + +fn test_parse_missing_host() ? { + // issue #10311 + url := urllib.parse('http:///') ? 
+ assert url.str() == 'http://///' +} diff --git a/v_windows/v/vlib/net/urllib/values.v b/v_windows/v/vlib/net/urllib/values.v new file mode 100644 index 0000000..ee5c329 --- /dev/null +++ b/v_windows/v/vlib/net/urllib/values.v @@ -0,0 +1,87 @@ +// Copyright (c) 2019-2021 Alexander Medvednikov. All rights reserved. +// Use of this source code is governed by an MIT license +// that can be found in the LICENSE file. +module urllib + +struct Value { +pub mut: + data []string +} + +struct Values { +pub mut: + data map[string]Value + len int +} + +// new_values returns a new Values struct for creating +// urlencoded query string parameters. it can also be to +// post form data with application/x-www-form-urlencoded. +// values.encode() will return the encoded data +pub fn new_values() Values { + return Values{ + data: map[string]Value{} + } +} + +// Currently you will need to use all()[key].data +// once map[string][]string is implemented +// this will be fixed +pub fn (v &Value) all() []string { + return v.data +} + +// get gets the first value associated with the given key. +// If there are no values associated with the key, get returns +// a empty string. +pub fn (v &Values) get(key string) string { + if v.data.len == 0 { + return '' + } + vs := v.data[key] + if vs.data.len == 0 { + return '' + } + return vs.data[0] +} + +// get_all gets the all the values associated with the given key. +// If there are no values associated with the key, get returns +// a empty []string. +pub fn (v &Values) get_all(key string) []string { + if v.data.len == 0 { + return [] + } + vs := v.data[key] + if vs.data.len == 0 { + return [] + } + return vs.data +} + +// set sets the key to value. It replaces any existing +// values. +pub fn (mut v Values) set(key string, value string) { + mut a := v.data[key] + a.data = [value] + v.data[key] = a + v.len = v.data.len +} + +// add adds the value to key. It appends to any existing +// values associated with key. +pub fn (mut v Values) add(key string, value string) { + mut a := v.data[key] + if a.data.len == 0 { + a.data = [] + } + a.data << value + v.data[key] = a + v.len = v.data.len +} + +// del deletes the values associated with key. +pub fn (mut v Values) del(key string) { + v.data.delete(key) + v.len = v.data.len +} |
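
For orientation, below is a minimal usage sketch of the module added by this commit, based on the API exercised in urllib_test.v above (parse, hostname, port, query_escape, query_unescape). The URL, the file name example.v, and the outputs in the comments are illustrative assumptions, not part of the commit; it assumes a V toolchain of roughly the same vintage as this snapshot (`v run example.v`).

```v
import net.urllib

fn main() {
	// parse an absolute URL and inspect its components
	u := urllib.parse('https://joe:pass@example.com:8080/docs/index?lang=v#intro') or {
		panic('failed to parse URL')
	}
	println(u.scheme) // https
	println(u.hostname()) // example.com
	println(u.port()) // 8080
	println(u.path) // /docs/index
	println(u.raw_query) // lang=v
	println(u.fragment) // intro
	println(u.user.username) // joe

	// escape/unescape a query component: space becomes '+', non-ASCII is %-encoded
	esc := urllib.query_escape('hello wörld')
	println(esc) // hello+w%C3%B6rld
	println(urllib.query_unescape(esc) or { '' }) // hello wörld
}
```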
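The Values type from values.v can be sketched on its own as well: build query parameters with new_values/add/encode and parse them back with parse_query. This is likewise an illustrative sketch, not code from the commit; the expected outputs assume the key-sorted encoding produced by encode() above.

```v
import net.urllib

fn main() {
	// accumulate query parameters, including repeated keys
	mut vals := urllib.new_values()
	vals.add('lang', 'V')
	vals.add('q', 'url parsing')
	vals.add('q', 'escaping')
	encoded := vals.encode()
	println(encoded) // lang=V&q=url+parsing&q=escaping

	// round-trip the encoded string back into Values
	decoded := urllib.parse_query(encoded) or { panic('failed to parse query') }
	println(decoded.get('q')) // url parsing
	println(decoded.get_all('q')) // ['url parsing', 'escaping']
}
```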