Mercurial repository: guerler / springsuite
planemo/lib/python3.7/site-packages/future/backports/urllib/parse.py @ 0:d30785e31577 (draft)
"planemo upload commit 6eee67778febed82ddd413c3ca40b3183a3898f1"

author:     guerler
date:       Fri, 31 Jul 2020 00:18:57 -0400
comparison: -1:000000000000 -> 0:d30785e31577 (file added; no parents, no children)

1 """ | |
2 Ported using Python-Future from the Python 3.3 standard library. | |
3 | |
4 Parse (absolute and relative) URLs. | |
5 | |
6 urlparse module is based upon the following RFC specifications. | |
7 | |
8 RFC 3986 (STD66): "Uniform Resource Identifiers" by T. Berners-Lee, R. Fielding | |
9 and L. Masinter, January 2005. | |
10 | |
11 RFC 2732 : "Format for Literal IPv6 Addresses in URL's by R.Hinden, B.Carpenter | |
12 and L.Masinter, December 1999. | |
13 | |
14 RFC 2396: "Uniform Resource Identifiers (URI)": Generic Syntax by T. | |
15 Berners-Lee, R. Fielding, and L. Masinter, August 1998. | |
16 | |
17 RFC 2368: "The mailto URL scheme", by P.Hoffman , L Masinter, J. Zawinski, July 1998. | |
18 | |
19 RFC 1808: "Relative Uniform Resource Locators", by R. Fielding, UC Irvine, June | |
20 1995. | |
21 | |
22 RFC 1738: "Uniform Resource Locators (URL)" by T. Berners-Lee, L. Masinter, M. | |
23 McCahill, December 1994 | |
24 | |
25 RFC 3986 is considered the current standard and any future changes to | |
26 urlparse module should conform with it. The urlparse module is | |
27 currently not entirely compliant with this RFC due to defacto | |
28 scenarios for parsing, and for backward compatibility purposes, some | |
29 parsing quirks from older RFCs are retained. The testcases in | |
30 test_urlparse.py provides a good indicator of parsing behavior. | |
31 """ | |
from __future__ import absolute_import, division, unicode_literals
from future.builtins import bytes, chr, dict, int, range, str
from future.utils import raise_with_traceback

import re
import sys
import collections

__all__ = ["urlparse", "urlunparse", "urljoin", "urldefrag",
           "urlsplit", "urlunsplit", "urlencode", "parse_qs",
           "parse_qsl", "quote", "quote_plus", "quote_from_bytes",
           "unquote", "unquote_plus", "unquote_to_bytes"]

# A classification of schemes ('' means apply by default)
uses_relative = ['ftp', 'http', 'gopher', 'nntp', 'imap',
                 'wais', 'file', 'https', 'shttp', 'mms',
                 'prospero', 'rtsp', 'rtspu', '', 'sftp',
                 'svn', 'svn+ssh']
uses_netloc = ['ftp', 'http', 'gopher', 'nntp', 'telnet',
               'imap', 'wais', 'file', 'mms', 'https', 'shttp',
               'snews', 'prospero', 'rtsp', 'rtspu', 'rsync', '',
               'svn', 'svn+ssh', 'sftp', 'nfs', 'git', 'git+ssh']
uses_params = ['ftp', 'hdl', 'prospero', 'http', 'imap',
               'https', 'shttp', 'rtsp', 'rtspu', 'sip', 'sips',
               'mms', '', 'sftp', 'tel']

# These are not actually used anymore, but should stay for backwards
# compatibility.  (They are undocumented, but have a public-looking name.)
non_hierarchical = ['gopher', 'hdl', 'mailto', 'news',
                    'telnet', 'wais', 'imap', 'snews', 'sip', 'sips']
uses_query = ['http', 'wais', 'imap', 'https', 'shttp', 'mms',
              'gopher', 'rtsp', 'rtspu', 'sip', 'sips', '']
uses_fragment = ['ftp', 'hdl', 'http', 'gopher', 'news',
                 'nntp', 'wais', 'https', 'shttp', 'snews',
                 'file', 'prospero', '']

# Characters valid in scheme names
scheme_chars = ('abcdefghijklmnopqrstuvwxyz'
                'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                '0123456789'
                '+-.')

# XXX: Consider replacing with functools.lru_cache
MAX_CACHE_SIZE = 20
_parse_cache = {}

def clear_cache():
    """Clear the parse cache and the quoters cache."""
    _parse_cache.clear()
    _safe_quoters.clear()


# Helpers for bytes handling
# For 3.2, we deliberately require applications that
# handle improperly quoted URLs to do their own
# decoding and encoding. If valid use cases are
# presented, we may relax this by using latin-1
# decoding internally for 3.3
_implicit_encoding = 'ascii'
_implicit_errors = 'strict'

def _noop(obj):
    return obj

def _encode_result(obj, encoding=_implicit_encoding,
                   errors=_implicit_errors):
    return obj.encode(encoding, errors)

def _decode_args(args, encoding=_implicit_encoding,
                 errors=_implicit_errors):
    return tuple(x.decode(encoding, errors) if x else '' for x in args)

def _coerce_args(*args):
    # Invokes decode if necessary to create str args
    # and returns the coerced inputs along with
    # an appropriate result coercion function
    #  - noop for str inputs
    #  - encoding function otherwise
    str_input = isinstance(args[0], str)
    for arg in args[1:]:
        # We special-case the empty string to support the
        # "scheme=''" default argument to some functions
        if arg and isinstance(arg, str) != str_input:
            raise TypeError("Cannot mix str and non-str arguments")
    if str_input:
        return args + (_noop,)
    return _decode_args(args) + (_encode_result,)

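# --- Editor's illustrative sketch (not part of the upstream file). ----------
# Assuming this module is importable as future.backports.urllib.parse and
# mirrors stdlib urllib.parse, the coercion helper above is what lets the
# public functions accept either str or bytes, but never a mix of both
# (outputs shown as on Python 3):
#
#     >>> from future.backports.urllib.parse import urlsplit
#     >>> urlsplit('http://example.com/a').path
#     '/a'
#     >>> urlsplit(b'http://example.com/a').path == b'/a'
#     True
#     >>> urlsplit(b'http://example.com/a', scheme='http')
#     Traceback (most recent call last):
#         ...
#     TypeError: Cannot mix str and non-str arguments
# -----------------------------------------------------------------------------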
# Result objects are more helpful than simple tuples
class _ResultMixinStr(object):
    """Standard approach to encoding parsed results from str to bytes"""
    __slots__ = ()

    def encode(self, encoding='ascii', errors='strict'):
        return self._encoded_counterpart(*(x.encode(encoding, errors) for x in self))


class _ResultMixinBytes(object):
    """Standard approach to decoding parsed results from bytes to str"""
    __slots__ = ()

    def decode(self, encoding='ascii', errors='strict'):
        return self._decoded_counterpart(*(x.decode(encoding, errors) for x in self))


class _NetlocResultMixinBase(object):
    """Shared methods for the parsed result objects containing a netloc element"""
    __slots__ = ()

    @property
    def username(self):
        return self._userinfo[0]

    @property
    def password(self):
        return self._userinfo[1]

    @property
    def hostname(self):
        hostname = self._hostinfo[0]
        if not hostname:
            hostname = None
        elif hostname is not None:
            hostname = hostname.lower()
        return hostname

    @property
    def port(self):
        port = self._hostinfo[1]
        if port is not None:
            port = int(port, 10)
            # Return None on an illegal port
            if not (0 <= port <= 65535):
                return None
        return port


class _NetlocResultMixinStr(_NetlocResultMixinBase, _ResultMixinStr):
    __slots__ = ()

    @property
    def _userinfo(self):
        netloc = self.netloc
        userinfo, have_info, hostinfo = netloc.rpartition('@')
        if have_info:
            username, have_password, password = userinfo.partition(':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition('@')
        _, have_open_br, bracketed = hostinfo.partition('[')
        if have_open_br:
            hostname, _, port = bracketed.partition(']')
            _, have_port, port = port.partition(':')
        else:
            hostname, have_port, port = hostinfo.partition(':')
        if not have_port:
            port = None
        return hostname, port


class _NetlocResultMixinBytes(_NetlocResultMixinBase, _ResultMixinBytes):
    __slots__ = ()

    @property
    def _userinfo(self):
        netloc = self.netloc
        userinfo, have_info, hostinfo = netloc.rpartition(b'@')
        if have_info:
            username, have_password, password = userinfo.partition(b':')
            if not have_password:
                password = None
        else:
            username = password = None
        return username, password

    @property
    def _hostinfo(self):
        netloc = self.netloc
        _, _, hostinfo = netloc.rpartition(b'@')
        _, have_open_br, bracketed = hostinfo.partition(b'[')
        if have_open_br:
            hostname, _, port = bracketed.partition(b']')
            _, have_port, port = port.partition(b':')
        else:
            hostname, have_port, port = hostinfo.partition(b':')
        if not have_port:
            port = None
        return hostname, port

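# --- Editor's illustrative sketch (not part of the upstream file). ----------
# Assuming the module is importable as future.backports.urllib.parse, the
# mixin properties above expose the pieces of the netloc (outputs shown as on
# Python 3):
#
#     >>> from future.backports.urllib.parse import urlsplit
#     >>> r = urlsplit('http://User:Pass@[::1]:8080/index.html')
#     >>> r.username, r.password
#     ('User', 'Pass')
#     >>> r.hostname          # lower-cased, IPv6 brackets stripped
#     '::1'
#     >>> r.port
#     8080
#     >>> urlsplit('http://example.com:99999/').port is None   # out of range
#     True
# -----------------------------------------------------------------------------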

from collections import namedtuple

_DefragResultBase = namedtuple('DefragResult', 'url fragment')
_SplitResultBase = namedtuple('SplitResult', 'scheme netloc path query fragment')
_ParseResultBase = namedtuple('ParseResult', 'scheme netloc path params query fragment')

# For backwards compatibility, alias _NetlocResultMixinStr
# ResultBase is no longer part of the documented API, but it is
# retained since deprecating it isn't worth the hassle
ResultBase = _NetlocResultMixinStr

# Structured result objects for string data
class DefragResult(_DefragResultBase, _ResultMixinStr):
    __slots__ = ()
    def geturl(self):
        if self.fragment:
            return self.url + '#' + self.fragment
        else:
            return self.url

class SplitResult(_SplitResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        return urlunsplit(self)

class ParseResult(_ParseResultBase, _NetlocResultMixinStr):
    __slots__ = ()
    def geturl(self):
        return urlunparse(self)

# Structured result objects for bytes data
class DefragResultBytes(_DefragResultBase, _ResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        if self.fragment:
            return self.url + b'#' + self.fragment
        else:
            return self.url

class SplitResultBytes(_SplitResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        return urlunsplit(self)

class ParseResultBytes(_ParseResultBase, _NetlocResultMixinBytes):
    __slots__ = ()
    def geturl(self):
        return urlunparse(self)

# Set up the encode/decode result pairs
def _fix_result_transcoding():
    _result_pairs = (
        (DefragResult, DefragResultBytes),
        (SplitResult, SplitResultBytes),
        (ParseResult, ParseResultBytes),
    )
    for _decoded, _encoded in _result_pairs:
        _decoded._encoded_counterpart = _encoded
        _encoded._decoded_counterpart = _decoded

_fix_result_transcoding()
del _fix_result_transcoding

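# --- Editor's illustrative sketch (not part of the upstream file). ----------
# The counterpart links set up above make the str and bytes result types
# convertible into each other; assuming the module is importable as
# future.backports.urllib.parse (outputs as on Python 3):
#
#     >>> from future.backports.urllib.parse import urlsplit
#     >>> r = urlsplit('http://example.com/a?x=1')
#     >>> rb = r.encode()                 # SplitResult -> SplitResultBytes
#     >>> rb.geturl() == b'http://example.com/a?x=1'
#     True
#     >>> rb.decode().geturl()            # ... and back again
#     'http://example.com/a?x=1'
# -----------------------------------------------------------------------------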
def urlparse(url, scheme='', allow_fragments=True):
    """Parse a URL into 6 components:
    <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
    Return a 6-tuple: (scheme, netloc, path, params, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    splitresult = urlsplit(url, scheme, allow_fragments)
    scheme, netloc, url, query, fragment = splitresult
    if scheme in uses_params and ';' in url:
        url, params = _splitparams(url)
    else:
        params = ''
    result = ParseResult(scheme, netloc, url, params, query, fragment)
    return _coerce_result(result)

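# --- Editor's illustrative sketch (not part of the upstream file). ----------
# urlparse() as defined above splits off the ';params' part of the last path
# segment, which urlsplit() leaves in the path; assuming the module is
# importable as future.backports.urllib.parse (outputs as on Python 3):
#
#     >>> from future.backports.urllib.parse import urlparse
#     >>> p = urlparse('http://netloc/path;param?query=arg#frag')
#     >>> p.scheme, p.netloc, p.path, p.params, p.query, p.fragment
#     ('http', 'netloc', '/path', 'param', 'query=arg', 'frag')
#     >>> p.geturl()
#     'http://netloc/path;param?query=arg#frag'
# -----------------------------------------------------------------------------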
def _splitparams(url):
    if '/' in url:
        i = url.find(';', url.rfind('/'))
        if i < 0:
            return url, ''
    else:
        i = url.find(';')
    return url[:i], url[i+1:]

def _splitnetloc(url, start=0):
    delim = len(url)   # position of end of domain part of url, default is end
    for c in '/?#':    # look for delimiters; the order is NOT important
        wdelim = url.find(c, start)        # find first of this delim
        if wdelim >= 0:                    # if found
            delim = min(delim, wdelim)     # use earliest delim position
    return url[start:delim], url[delim:]   # return (domain, rest)

def urlsplit(url, scheme='', allow_fragments=True):
    """Parse a URL into 5 components:
    <scheme>://<netloc>/<path>?<query>#<fragment>
    Return a 5-tuple: (scheme, netloc, path, query, fragment).
    Note that we don't break the components up in smaller bits
    (e.g. netloc is a single string) and we don't expand % escapes."""
    url, scheme, _coerce_result = _coerce_args(url, scheme)
    allow_fragments = bool(allow_fragments)
    key = url, scheme, allow_fragments, type(url), type(scheme)
    cached = _parse_cache.get(key, None)
    if cached:
        return _coerce_result(cached)
    if len(_parse_cache) >= MAX_CACHE_SIZE:  # avoid runaway growth
        clear_cache()
    netloc = query = fragment = ''
    i = url.find(':')
    if i > 0:
        if url[:i] == 'http':  # optimize the common case
            scheme = url[:i].lower()
            url = url[i+1:]
            if url[:2] == '//':
                netloc, url = _splitnetloc(url, 2)
                if (('[' in netloc and ']' not in netloc) or
                        (']' in netloc and '[' not in netloc)):
                    raise ValueError("Invalid IPv6 URL")
            if allow_fragments and '#' in url:
                url, fragment = url.split('#', 1)
            if '?' in url:
                url, query = url.split('?', 1)
            v = SplitResult(scheme, netloc, url, query, fragment)
            _parse_cache[key] = v
            return _coerce_result(v)
        for c in url[:i]:
            if c not in scheme_chars:
                break
        else:
            # make sure "url" is not actually a port number (in which case
            # "scheme" is really part of the path)
            rest = url[i+1:]
            if not rest or any(c not in '0123456789' for c in rest):
                # not a port number
                scheme, url = url[:i].lower(), rest

    if url[:2] == '//':
        netloc, url = _splitnetloc(url, 2)
        if (('[' in netloc and ']' not in netloc) or
                (']' in netloc and '[' not in netloc)):
            raise ValueError("Invalid IPv6 URL")
    if allow_fragments and '#' in url:
        url, fragment = url.split('#', 1)
    if '?' in url:
        url, query = url.split('?', 1)
    v = SplitResult(scheme, netloc, url, query, fragment)
    _parse_cache[key] = v
    return _coerce_result(v)

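# --- Editor's illustrative sketch (not part of the upstream file). ----------
# Two details of urlsplit() worth showing: the 'scheme' argument is only a
# default for scheme-less URLs, and allow_fragments=False keeps '#' in the
# path; assuming the module is importable as future.backports.urllib.parse:
#
#     >>> from future.backports.urllib.parse import urlsplit
#     >>> urlsplit('//www.cwi.nl:80/%7Eguido/Python.html', scheme='http').geturl()
#     'http://www.cwi.nl:80/%7Eguido/Python.html'
#     >>> urlsplit('http://example.com/a#b', allow_fragments=False).path
#     '/a#b'
#     >>> urlsplit('http://example.com/a#b', allow_fragments=False).fragment
#     ''
# -----------------------------------------------------------------------------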
def urlunparse(components):
    """Put a parsed URL back together again.  This may result in a
    slightly different, but equivalent URL, if the URL that was parsed
    originally had redundant delimiters, e.g. a ? with an empty query
    (the draft states that these are equivalent)."""
    scheme, netloc, url, params, query, fragment, _coerce_result = (
        _coerce_args(*components))
    if params:
        url = "%s;%s" % (url, params)
    return _coerce_result(urlunsplit((scheme, netloc, url, query, fragment)))

def urlunsplit(components):
    """Combine the elements of a tuple as returned by urlsplit() into a
    complete URL as a string. The data argument can be any five-item iterable.
    This may result in a slightly different, but equivalent URL, if the URL that
    was parsed originally had unnecessary delimiters (for example, a ? with an
    empty query; the RFC states that these are equivalent)."""
    scheme, netloc, url, query, fragment, _coerce_result = (
        _coerce_args(*components))
    if netloc or (scheme and scheme in uses_netloc and url[:2] != '//'):
        if url and url[:1] != '/': url = '/' + url
        url = '//' + (netloc or '') + url
    if scheme:
        url = scheme + ':' + url
    if query:
        url = url + '?' + query
    if fragment:
        url = url + '#' + fragment
    return _coerce_result(url)

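# --- Editor's illustrative sketch (not part of the upstream file). ----------
# Round-tripping through urlsplit()/urlunsplit() gives back an equivalent
# URL, though redundant delimiters such as a trailing '?' are dropped;
# assuming the module is importable as future.backports.urllib.parse:
#
#     >>> from future.backports.urllib.parse import urlsplit, urlunsplit
#     >>> urlunsplit(urlsplit('http://example.com/a?'))
#     'http://example.com/a'
#     >>> urlunsplit(('https', 'host', 'path', 'q=1', 'frag'))
#     'https://host/path?q=1#frag'
# -----------------------------------------------------------------------------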
def urljoin(base, url, allow_fragments=True):
    """Join a base URL and a possibly relative URL to form an absolute
    interpretation of the latter."""
    if not base:
        return url
    if not url:
        return base
    base, url, _coerce_result = _coerce_args(base, url)
    bscheme, bnetloc, bpath, bparams, bquery, bfragment = \
            urlparse(base, '', allow_fragments)
    scheme, netloc, path, params, query, fragment = \
            urlparse(url, bscheme, allow_fragments)
    if scheme != bscheme or scheme not in uses_relative:
        return _coerce_result(url)
    if scheme in uses_netloc:
        if netloc:
            return _coerce_result(urlunparse((scheme, netloc, path,
                                              params, query, fragment)))
        netloc = bnetloc
    if path[:1] == '/':
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))
    if not path and not params:
        path = bpath
        params = bparams
        if not query:
            query = bquery
        return _coerce_result(urlunparse((scheme, netloc, path,
                                          params, query, fragment)))
    segments = bpath.split('/')[:-1] + path.split('/')
    # XXX The stuff below is bogus in various ways...
    if segments[-1] == '.':
        segments[-1] = ''
    while '.' in segments:
        segments.remove('.')
    while 1:
        i = 1
        n = len(segments) - 1
        while i < n:
            if (segments[i] == '..'
                    and segments[i-1] not in ('', '..')):
                del segments[i-1:i+1]
                break
            i = i+1
        else:
            break
    if segments == ['', '..']:
        segments[-1] = ''
    elif len(segments) >= 2 and segments[-1] == '..':
        segments[-2:] = ['']
    return _coerce_result(urlunparse((scheme, netloc, '/'.join(segments),
                                      params, query, fragment)))

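# --- Editor's illustrative sketch (not part of the upstream file). ----------
# urljoin() resolves a relative reference against a base URL, collapsing
# '.' and '..' segments; an absolute second argument simply wins.  Assuming
# the module is importable as future.backports.urllib.parse:
#
#     >>> from future.backports.urllib.parse import urljoin
#     >>> urljoin('http://www.cwi.nl/%7Eguido/Python.html', 'FAQ.html')
#     'http://www.cwi.nl/%7Eguido/FAQ.html'
#     >>> urljoin('http://a/b/c/d', '../../g')
#     'http://a/g'
#     >>> urljoin('http://a/b/c/d', 'http://other/x')
#     'http://other/x'
# -----------------------------------------------------------------------------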
def urldefrag(url):
    """Removes any existing fragment from URL.

    Returns a tuple of the defragmented URL and the fragment.  If
    the URL contained no fragments, the second element is the
    empty string.
    """
    url, _coerce_result = _coerce_args(url)
    if '#' in url:
        s, n, p, a, q, frag = urlparse(url)
        defrag = urlunparse((s, n, p, a, q, ''))
    else:
        frag = ''
        defrag = url
    return _coerce_result(DefragResult(defrag, frag))

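# --- Editor's illustrative sketch (not part of the upstream file). ----------
# urldefrag() returns the URL without its fragment plus the fragment itself
# (an empty string when there is none); assuming the module is importable as
# future.backports.urllib.parse:
#
#     >>> from future.backports.urllib.parse import urldefrag
#     >>> url, frag = urldefrag('http://example.com/spam.html#section2')
#     >>> url
#     'http://example.com/spam.html'
#     >>> frag
#     'section2'
#     >>> urldefrag('http://example.com/spam.html')[1]
#     ''
# -----------------------------------------------------------------------------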
_hexdig = '0123456789ABCDEFabcdef'
_hextobyte = dict(((a + b).encode(), bytes([int(a + b, 16)]))
                  for a in _hexdig for b in _hexdig)

def unquote_to_bytes(string):
    """unquote_to_bytes('abc%20def') -> b'abc def'."""
    # Note: strings are encoded as UTF-8. This is only an issue if it contains
    # unescaped non-ASCII characters, which URIs should not.
    if not string:
        # Is it a string-like object?
        string.split
        return bytes(b'')
    if isinstance(string, str):
        string = string.encode('utf-8')
    ### For Python-Future:
    # It is already a byte-string object, but force it to be newbytes here on
    # Py2:
    string = bytes(string)
    ###
    bits = string.split(b'%')
    if len(bits) == 1:
        return string
    res = [bits[0]]
    append = res.append
    for item in bits[1:]:
        try:
            append(_hextobyte[item[:2]])
            append(item[2:])
        except KeyError:
            append(b'%')
            append(item)
    return bytes(b'').join(res)

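# --- Editor's illustrative sketch (not part of the upstream file). ----------
# unquote_to_bytes() decodes %xx escapes to raw bytes and leaves invalid
# escapes untouched; assuming the module is importable as
# future.backports.urllib.parse:
#
#     >>> from future.backports.urllib.parse import unquote_to_bytes
#     >>> unquote_to_bytes('abc%20def') == b'abc def'
#     True
#     >>> unquote_to_bytes('%e2%82%ac') == b'\xe2\x82\xac'   # raw UTF-8, not decoded
#     True
#     >>> unquote_to_bytes('%zz') == b'%zz'                  # invalid escape passes through
#     True
# -----------------------------------------------------------------------------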
_asciire = re.compile('([\x00-\x7f]+)')

def unquote(string, encoding='utf-8', errors='replace'):
    """Replace %xx escapes by their single-character equivalent. The optional
    encoding and errors parameters specify how to decode percent-encoded
    sequences into Unicode characters, as accepted by the bytes.decode()
    method.
    By default, percent-encoded sequences are decoded with UTF-8, and invalid
    sequences are replaced by a placeholder character.

    unquote('abc%20def') -> 'abc def'.
    """
    if '%' not in string:
        string.split
        return string
    if encoding is None:
        encoding = 'utf-8'
    if errors is None:
        errors = 'replace'
    bits = _asciire.split(string)
    res = [bits[0]]
    append = res.append
    for i in range(1, len(bits), 2):
        append(unquote_to_bytes(bits[i]).decode(encoding, errors))
        append(bits[i + 1])
    return ''.join(res)

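# --- Editor's illustrative sketch (not part of the upstream file). ----------
# unquote() decodes %xx escapes as UTF-8 by default and, with the default
# errors='replace', substitutes U+FFFD for undecodable bytes; assuming the
# module is importable as future.backports.urllib.parse:
#
#     >>> from future.backports.urllib.parse import unquote
#     >>> unquote('abc%20def')
#     'abc def'
#     >>> unquote('%C3%A9t%C3%A9')
#     'été'
#     >>> unquote('%e9t%e9') == '\ufffdt\ufffd'   # latin-1 bytes are not valid UTF-8
#     True
# -----------------------------------------------------------------------------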
def parse_qs(qs, keep_blank_values=False, strict_parsing=False,
             encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.
        A true value indicates that blanks should be retained as
        blank strings.  The default false value indicates that
        blank values are to be ignored and treated as if they were
        not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored.
        If true, errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.
    """
    parsed_result = {}
    pairs = parse_qsl(qs, keep_blank_values, strict_parsing,
                      encoding=encoding, errors=errors)
    for name, value in pairs:
        if name in parsed_result:
            parsed_result[name].append(value)
        else:
            parsed_result[name] = [value]
    return parsed_result

def parse_qsl(qs, keep_blank_values=False, strict_parsing=False,
              encoding='utf-8', errors='replace'):
    """Parse a query given as a string argument.

    Arguments:

    qs: percent-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        percent-encoded queries should be treated as blank strings.  A
        true value indicates that blanks should be retained as blank
        strings.  The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.  If
        false (the default), errors are silently ignored.  If true,
        errors raise a ValueError exception.

    encoding and errors: specify how to decode percent-encoded sequences
        into Unicode characters, as accepted by the bytes.decode() method.

    Returns a list, as G-d intended.
    """
    qs, _coerce_result = _coerce_args(qs)
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        if not name_value and not strict_parsing:
            continue
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                raise ValueError("bad query field: %r" % (name_value,))
            # Handle case of a control-name with no equal sign
            if keep_blank_values:
                nv.append('')
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            name = nv[0].replace('+', ' ')
            name = unquote(name, encoding=encoding, errors=errors)
            name = _coerce_result(name)
            value = nv[1].replace('+', ' ')
            value = unquote(value, encoding=encoding, errors=errors)
            value = _coerce_result(value)
            r.append((name, value))
    return r

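# --- Editor's illustrative sketch (not part of the upstream file). ----------
# parse_qsl() keeps duplicate fields in order, and blank values are dropped
# unless keep_blank_values is true; parse_qs() groups values per name.
# Assuming the module is importable as future.backports.urllib.parse:
#
#     >>> from future.backports.urllib.parse import parse_qs, parse_qsl
#     >>> parse_qsl('key=val1&key=val2&empty=')
#     [('key', 'val1'), ('key', 'val2')]
#     >>> parse_qsl('key=val1&key=val2&empty=', keep_blank_values=True)
#     [('key', 'val1'), ('key', 'val2'), ('empty', '')]
#     >>> parse_qs('key=val1&key=val2') == {'key': ['val1', 'val2']}
#     True
# -----------------------------------------------------------------------------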
def unquote_plus(string, encoding='utf-8', errors='replace'):
    """Like unquote(), but also replace plus signs by spaces, as required for
    unquoting HTML form values.

    unquote_plus('%7e/abc+def') -> '~/abc def'
    """
    string = string.replace('+', ' ')
    return unquote(string, encoding, errors)

_ALWAYS_SAFE = frozenset(bytes(b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
                               b'abcdefghijklmnopqrstuvwxyz'
                               b'0123456789'
                               b'_.-'))
_ALWAYS_SAFE_BYTES = bytes(_ALWAYS_SAFE)
_safe_quoters = {}

class Quoter(collections.defaultdict):
    """A mapping from bytes (in range(0,256)) to strings.

    String values are percent-encoded byte values, unless the key < 128, and
    in the "safe" set (either the specified safe set, or default set).
    """
    # Keeps a cache internally, using defaultdict, for efficiency (lookups
    # of cached keys don't call Python code at all).
    def __init__(self, safe):
        """safe: bytes object."""
        self.safe = _ALWAYS_SAFE.union(bytes(safe))

    def __repr__(self):
        # Without this, will just display as a defaultdict
        return "<Quoter %r>" % dict(self)

    def __missing__(self, b):
        # Handle a cache miss. Store quoted string in cache and return.
        res = chr(b) if b in self.safe else '%{0:02X}'.format(b)
        self[b] = res
        return res

def quote(string, safe='/', encoding=None, errors=None):
    """quote('abc def') -> 'abc%20def'

    Each part of a URL, e.g. the path info, the query, etc., has a
    different set of reserved characters that must be quoted.

    RFC 2396 Uniform Resource Identifiers (URI): Generic Syntax lists
    the following reserved characters.

    reserved    = ";" | "/" | "?" | ":" | "@" | "&" | "=" | "+" |
                  "$" | ","

    Each of these characters is reserved in some component of a URL,
    but not necessarily in all of them.

    By default, the quote function is intended for quoting the path
    section of a URL.  Thus, it will not encode '/'.  This character
    is reserved, but in typical usage the quote function is being
    called on a path where the existing slash characters are used as
    reserved characters.

    string and safe may be either str or bytes objects. encoding must
    not be specified if string is a str.

    The optional encoding and errors parameters specify how to deal with
    non-ASCII characters, as accepted by the str.encode method.
    By default, encoding='utf-8' (characters are encoded with UTF-8), and
    errors='strict' (unsupported characters raise a UnicodeEncodeError).
    """
    if isinstance(string, str):
        if not string:
            return string
        if encoding is None:
            encoding = 'utf-8'
        if errors is None:
            errors = 'strict'
        string = string.encode(encoding, errors)
    else:
        if encoding is not None:
            raise TypeError("quote() doesn't support 'encoding' for bytes")
        if errors is not None:
            raise TypeError("quote() doesn't support 'errors' for bytes")
    return quote_from_bytes(string, safe)

def quote_plus(string, safe='', encoding=None, errors=None):
    """Like quote(), but also replace ' ' with '+', as required for quoting
    HTML form values.  Plus signs in the original string are escaped unless
    they are included in safe.  It also does not have safe default to '/'.
    """
    # Check if ' ' in string, where string may either be a str or bytes.  If
    # there are no spaces, the regular quote will produce the right answer.
    if ((isinstance(string, str) and ' ' not in string) or
            (isinstance(string, bytes) and b' ' not in string)):
        return quote(string, safe, encoding, errors)
    if isinstance(safe, str):
        space = str(' ')
    else:
        space = bytes(b' ')
    string = quote(string, safe + space, encoding, errors)
    return string.replace(' ', '+')

def quote_from_bytes(bs, safe='/'):
    """Like quote(), but accepts a bytes object rather than a str, and does
    not perform string-to-bytes encoding.  It always returns an ASCII string.
    quote_from_bytes(b'abc def\x3f') -> 'abc%20def%3F'
    """
    if not isinstance(bs, (bytes, bytearray)):
        raise TypeError("quote_from_bytes() expected bytes")
    if not bs:
        return str('')
    ### For Python-Future:
    bs = bytes(bs)
    ###
    if isinstance(safe, str):
        # Normalize 'safe' by converting to bytes and removing non-ASCII chars
        safe = str(safe).encode('ascii', 'ignore')
    else:
        ### For Python-Future:
        safe = bytes(safe)
        ###
        safe = bytes([c for c in safe if c < 128])
    if not bs.rstrip(_ALWAYS_SAFE_BYTES + safe):
        return bs.decode()
    try:
        quoter = _safe_quoters[safe]
    except KeyError:
        _safe_quoters[safe] = quoter = Quoter(safe).__getitem__
    return str('').join([quoter(char) for char in bs])

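# --- Editor's illustrative sketch (not part of the upstream file). ----------
# quote() keeps '/' safe by default and percent-encodes UTF-8 bytes in upper
# case; quote_plus() is the form-encoding variant ('+' for spaces, no safe
# '/').  Assuming the module is importable as future.backports.urllib.parse:
#
#     >>> from future.backports.urllib.parse import quote, quote_plus, quote_from_bytes
#     >>> quote('/path with spaces/é')
#     '/path%20with%20spaces/%C3%A9'
#     >>> quote_plus('key=a value&x')
#     'key%3Da+value%26x'
#     >>> quote_from_bytes(b'abc def\x3f')
#     'abc%20def%3F'
# -----------------------------------------------------------------------------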
def urlencode(query, doseq=False, safe='', encoding=None, errors=None):
    """Encode a sequence of two-element tuples or a dictionary into a URL query string.

    If any values in the query arg are sequences and doseq is true, each
    sequence element is converted to a separate parameter.

    If the query arg is a sequence of two-element tuples, the order of the
    parameters in the output will match the order of parameters in the
    input.

    The query arg may be either a string or a bytes type.  When the query arg
    is a string, the safe, encoding and errors parameters are sent to
    quote_plus for encoding.
    """

    if hasattr(query, "items"):
        query = query.items()
    else:
        # It's a bother at times that strings and string-like objects are
        # sequences.
        try:
            # non-sequence items should not work with len()
            # non-empty strings will fail this
            if len(query) and not isinstance(query[0], tuple):
                raise TypeError
            # Zero-length sequences of all types will get here and succeed,
            # but that's a minor nit.  Since the original implementation
            # allowed empty dicts that type of behavior probably should be
            # preserved for consistency
        except TypeError:
            ty, va, tb = sys.exc_info()
            raise_with_traceback(TypeError("not a valid non-string sequence "
                                           "or mapping object"), tb)

    l = []
    if not doseq:
        for k, v in query:
            if isinstance(k, bytes):
                k = quote_plus(k, safe)
            else:
                k = quote_plus(str(k), safe, encoding, errors)

            if isinstance(v, bytes):
                v = quote_plus(v, safe)
            else:
                v = quote_plus(str(v), safe, encoding, errors)
            l.append(k + '=' + v)
    else:
        for k, v in query:
            if isinstance(k, bytes):
                k = quote_plus(k, safe)
            else:
                k = quote_plus(str(k), safe, encoding, errors)

            if isinstance(v, bytes):
                v = quote_plus(v, safe)
                l.append(k + '=' + v)
            elif isinstance(v, str):
                v = quote_plus(v, safe, encoding, errors)
                l.append(k + '=' + v)
            else:
                try:
                    # Is this a sufficient test for sequence-ness?
                    x = len(v)
                except TypeError:
                    # not a sequence
                    v = quote_plus(str(v), safe, encoding, errors)
                    l.append(k + '=' + v)
                else:
                    # loop over the sequence
                    for elt in v:
                        if isinstance(elt, bytes):
                            elt = quote_plus(elt, safe)
                        else:
                            elt = quote_plus(str(elt), safe, encoding, errors)
                        l.append(k + '=' + elt)
    return str('&').join(l)

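# --- Editor's illustrative sketch (not part of the upstream file). ----------
# urlencode() builds a query string via quote_plus(); with doseq=True a
# sequence value becomes one key=value pair per element, otherwise it is
# str()'d as a whole.  Assuming the module is importable as
# future.backports.urllib.parse:
#
#     >>> from future.backports.urllib.parse import urlencode
#     >>> urlencode([('name', 'Guido van Rossum'), ('lang', 'python')])
#     'name=Guido+van+Rossum&lang=python'
#     >>> urlencode([('key', ['v1', 'v2'])], doseq=True)
#     'key=v1&key=v2'
#     >>> urlencode([('key', ['v1', 'v2'])])
#     'key=%5B%27v1%27%2C+%27v2%27%5D'
# -----------------------------------------------------------------------------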
# Utilities to parse URLs (most of these return None for missing parts):
# unwrap('<URL:type://host/path>') --> 'type://host/path'
# splittype('type:opaquestring') --> 'type', 'opaquestring'
# splithost('//host[:port]/path') --> 'host[:port]', '/path'
# splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'
# splitpasswd('user:passwd') -> 'user', 'passwd'
# splitport('host:port') --> 'host', 'port'
# splitquery('/path?query') --> '/path', 'query'
# splittag('/path#tag') --> '/path', 'tag'
# splitattr('/path;attr1=value1;attr2=value2;...') ->
#   '/path', ['attr1=value1', 'attr2=value2', ...]
# splitvalue('attr=value') --> 'attr', 'value'
# urllib.parse.unquote('abc%20def') -> 'abc def'
# quote('abc def') -> 'abc%20def'

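# --- Editor's illustrative sketch (not part of the upstream file). ----------
# The legacy split* helpers listed above peel one piece off a URL at a time
# and return None for the missing part; assuming the module is importable as
# future.backports.urllib.parse:
#
#     >>> from future.backports.urllib.parse import splittype, splithost, splitport
#     >>> splittype('http://www.example.com:8042/over/there?name=ferret')
#     ('http', '//www.example.com:8042/over/there?name=ferret')
#     >>> splithost('//www.example.com:8042/over/there?name=ferret')
#     ('www.example.com:8042', '/over/there?name=ferret')
#     >>> splitport('www.example.com:8042')
#     ('www.example.com', '8042')
#     >>> splitport('www.example.com')
#     ('www.example.com', None)
# -----------------------------------------------------------------------------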
def to_bytes(url):
    """to_bytes(u"URL") --> 'URL'."""
    # Most URL schemes require ASCII. If that changes, the conversion
    # can be relaxed.
    # XXX get rid of to_bytes()
    if isinstance(url, str):
        try:
            url = url.encode("ASCII").decode()
        except UnicodeError:
            raise UnicodeError("URL " + repr(url) +
                               " contains non-ASCII characters")
    return url

def unwrap(url):
    """unwrap('<URL:type://host/path>') --> 'type://host/path'."""
    url = str(url).strip()
    if url[:1] == '<' and url[-1:] == '>':
        url = url[1:-1].strip()
    if url[:4] == 'URL:': url = url[4:].strip()
    return url

_typeprog = None
def splittype(url):
    """splittype('type:opaquestring') --> 'type', 'opaquestring'."""
    global _typeprog
    if _typeprog is None:
        import re
        _typeprog = re.compile('^([^/:]+):')

    match = _typeprog.match(url)
    if match:
        scheme = match.group(1)
        return scheme.lower(), url[len(scheme) + 1:]
    return None, url

_hostprog = None
def splithost(url):
    """splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
    global _hostprog
    if _hostprog is None:
        import re
        _hostprog = re.compile('^//([^/?]*)(.*)$')

    match = _hostprog.match(url)
    if match:
        host_port = match.group(1)
        path = match.group(2)
        if path and not path.startswith('/'):
            path = '/' + path
        return host_port, path
    return None, url

_userprog = None
def splituser(host):
    """splituser('user[:passwd]@host[:port]') --> 'user[:passwd]', 'host[:port]'."""
    global _userprog
    if _userprog is None:
        import re
        _userprog = re.compile('^(.*)@(.*)$')

    match = _userprog.match(host)
    if match: return match.group(1, 2)
    return None, host

_passwdprog = None
def splitpasswd(user):
    """splitpasswd('user:passwd') -> 'user', 'passwd'."""
    global _passwdprog
    if _passwdprog is None:
        import re
        _passwdprog = re.compile('^([^:]*):(.*)$', re.S)

    match = _passwdprog.match(user)
    if match: return match.group(1, 2)
    return user, None

# splittag('/path#tag') --> '/path', 'tag'
_portprog = None
def splitport(host):
    """splitport('host:port') --> 'host', 'port'."""
    global _portprog
    if _portprog is None:
        import re
        _portprog = re.compile('^(.*):([0-9]+)$')

    match = _portprog.match(host)
    if match: return match.group(1, 2)
    return host, None

_nportprog = None
def splitnport(host, defport=-1):
    """Split host and port, returning numeric port.
    Return given default port if no ':' found; defaults to -1.
    Return numerical port if a valid number is found after ':'.
    Return None if ':' but not a valid number."""
    global _nportprog
    if _nportprog is None:
        import re
        _nportprog = re.compile('^(.*):(.*)$')

    match = _nportprog.match(host)
    if match:
        host, port = match.group(1, 2)
        try:
            if not port: raise ValueError("no digits")
            nport = int(port)
        except ValueError:
            nport = None
        return host, nport
    return host, defport

_queryprog = None
def splitquery(url):
    """splitquery('/path?query') --> '/path', 'query'."""
    global _queryprog
    if _queryprog is None:
        import re
        _queryprog = re.compile('^(.*)\?([^?]*)$')

    match = _queryprog.match(url)
    if match: return match.group(1, 2)
    return url, None

_tagprog = None
def splittag(url):
    """splittag('/path#tag') --> '/path', 'tag'."""
    global _tagprog
    if _tagprog is None:
        import re
        _tagprog = re.compile('^(.*)#([^#]*)$')

    match = _tagprog.match(url)
    if match: return match.group(1, 2)
    return url, None

def splitattr(url):
    """splitattr('/path;attr1=value1;attr2=value2;...') ->
        '/path', ['attr1=value1', 'attr2=value2', ...]."""
    words = url.split(';')
    return words[0], words[1:]

_valueprog = None
def splitvalue(attr):
    """splitvalue('attr=value') --> 'attr', 'value'."""
    global _valueprog
    if _valueprog is None:
        import re
        _valueprog = re.compile('^([^=]*)=(.*)$')

    match = _valueprog.match(attr)
    if match: return match.group(1, 2)
    return attr, None