comparison env/lib/python3.9/site-packages/pip/_internal/index/collector.py @ 0:4f3585e2f14b draft default tip

"planemo upload commit 60cee0fc7c0cda8592644e1aad72851dec82c959"
author shellac
date Mon, 22 Mar 2021 18:12:50 +0000
1 """
2 The main purpose of this module is to expose LinkCollector.collect_links().
3 """
4
5 import cgi
6 import functools
7 import itertools
8 import logging
9 import mimetypes
10 import os
11 import re
12 import urllib.parse
13 import urllib.request
14 from collections import OrderedDict
15
16 from pip._vendor import html5lib, requests
17 from pip._vendor.distlib.compat import unescape
18 from pip._vendor.requests.exceptions import RetryError, SSLError
19
20 from pip._internal.exceptions import NetworkConnectionError
21 from pip._internal.models.link import Link
22 from pip._internal.models.search_scope import SearchScope
23 from pip._internal.network.utils import raise_for_status
24 from pip._internal.utils.filetypes import is_archive_file
25 from pip._internal.utils.misc import pairwise, redact_auth_from_url
26 from pip._internal.utils.typing import MYPY_CHECK_RUNNING
27 from pip._internal.utils.urls import path_to_url, url_to_path
28 from pip._internal.vcs import is_url, vcs
29
30 if MYPY_CHECK_RUNNING:
31 import xml.etree.ElementTree
32 from optparse import Values
33 from typing import (
34 Callable,
35 Iterable,
36 List,
37 MutableMapping,
38 Optional,
39 Sequence,
40 Tuple,
41 Union,
42 )
43
44 from pip._vendor.requests import Response
45
46 from pip._internal.network.session import PipSession
47
48 HTMLElement = xml.etree.ElementTree.Element
49 ResponseHeaders = MutableMapping[str, str]
50
51
52 logger = logging.getLogger(__name__)
53
54
def _match_vcs_scheme(url):
    # type: (str) -> Optional[str]
    """Look for VCS schemes in the URL.

    Returns the matched VCS scheme, or None if there's no match.
    """
    for scheme in vcs.schemes:
        if url.lower().startswith(scheme) and url[len(scheme)] in '+:':
            return scheme
    return None


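# Illustration of _match_vcs_scheme (hypothetical URLs, kept as comments so
# nothing runs at import time): 'git+https://github.com/pypa/pip.git' starts
# with the registered scheme 'git' followed by '+', so 'git' is returned,
# whereas 'https://pypi.org/simple/pip/' matches no VCS scheme and None is
# returned.

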
class _NotHTML(Exception):
    def __init__(self, content_type, request_desc):
        # type: (str, str) -> None
        super().__init__(content_type, request_desc)
        self.content_type = content_type
        self.request_desc = request_desc


def _ensure_html_header(response):
    # type: (Response) -> None
    """Check the Content-Type header to ensure the response contains HTML.

    Raises `_NotHTML` if the content type is not text/html.
    """
    content_type = response.headers.get("Content-Type", "")
    if not content_type.lower().startswith("text/html"):
        raise _NotHTML(content_type, response.request.method)


class _NotHTTP(Exception):
    pass


def _ensure_html_response(url, session):
    # type: (str, PipSession) -> None
    """Send a HEAD request to the URL, and ensure the response contains HTML.

    Raises `_NotHTTP` if the URL is not available for a HEAD request, or
    `_NotHTML` if the content type is not text/html.
    """
    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
    if scheme not in {'http', 'https'}:
        raise _NotHTTP()

    resp = session.head(url, allow_redirects=True)
    raise_for_status(resp)

    _ensure_html_header(resp)


def _get_html_response(url, session):
    # type: (str, PipSession) -> Response
    """Access an HTML page with GET, and return the response.

    This consists of three parts:

    1. If the URL looks suspiciously like an archive, send a HEAD first to
       check the Content-Type is HTML, to avoid downloading a large file.
       Raise `_NotHTTP` if the content type cannot be determined, or
       `_NotHTML` if it is not HTML.
    2. Actually perform the request. Raise HTTP exceptions on network failures.
    3. Check the Content-Type header to make sure we got HTML, and raise
       `_NotHTML` otherwise.
    """
    if is_archive_file(Link(url).filename):
        _ensure_html_response(url, session=session)

    logger.debug('Getting page %s', redact_auth_from_url(url))

    resp = session.get(
        url,
        headers={
            "Accept": "text/html",
            # We don't want to blindly return cached data for
            # /simple/, because authors generally expect that
            # twine upload && pip install will function, but if
            # they've done a pip install in the last ~10 minutes
            # it won't. Thus, by setting this to zero we will not
            # blindly use any cached data. The benefit of using
            # max-age=0 instead of no-cache is that we still
            # support conditional requests, so we still minimize
            # traffic sent in cases where the page hasn't changed
            # at all; we just always incur the round trip for the
            # conditional GET now instead of only once per
            # 10 minutes.
            # For more information, please see pypa/pip#5670.
            "Cache-Control": "max-age=0",
        },
    )
    raise_for_status(resp)

    # The check for archives above only works if the URL ends with
    # something that looks like an archive. However, that is not a
    # requirement of a URL. Unless we issue a HEAD request on every
    # URL we cannot know ahead of time for sure if something is HTML
    # or not. However, we can check after we've downloaded it.
    _ensure_html_header(resp)

    return resp


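# Sketch of the flow above for two kinds of URL (illustrative example URLs,
# not part of the original module):
#   * 'https://files.example.com/pkg-1.0.tar.gz' looks like an archive, so a
#     HEAD request is issued first and a non-HTTP or non-HTML target is
#     rejected (_NotHTTP/_NotHTML) before any large download happens.
#   * 'https://pypi.org/simple/pkg/' does not look like an archive, so it is
#     fetched directly with GET and its Content-Type checked afterwards.

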
def _get_encoding_from_headers(headers):
    # type: (ResponseHeaders) -> Optional[str]
    """Determine if we have any encoding information in our headers.
    """
    if headers and "Content-Type" in headers:
        content_type, params = cgi.parse_header(headers["Content-Type"])
        if "charset" in params:
            return params['charset']
    return None


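# Example (illustrative): a header mapping of
# {'Content-Type': 'text/html; charset=utf-8'} yields 'utf-8', while a bare
# 'text/html' (or a missing Content-Type header) yields None.

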
def _determine_base_url(document, page_url):
    # type: (HTMLElement, str) -> str
    """Determine the HTML document's base URL.

    This looks for a ``<base>`` tag in the HTML document. If present, its href
    attribute denotes the base URL of anchor tags in the document. If there is
    no such tag (or if it does not have a valid href attribute), the HTML
    file's URL is used as the base URL.

    :param document: An HTML document representation. The current
        implementation expects the result of ``html5lib.parse()``.
    :param page_url: The URL of the HTML document.
    """
    for base in document.findall(".//base"):
        href = base.get("href")
        if href is not None:
            return href
    return page_url


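# Example (illustrative, hypothetical mirror URLs): a page served from
# 'https://mirror.example.com/simple/pip/' that contains
# '<base href="https://files.example.com/">' has its anchors resolved against
# that href; without such a tag, the page's own URL is used as the base.

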
def _clean_url_path_part(part):
    # type: (str) -> str
    """
    Clean a "part" of a URL path (i.e. after splitting on "@" characters).
    """
    # We unquote prior to quoting to make sure nothing is double quoted.
    return urllib.parse.quote(urllib.parse.unquote(part))


def _clean_file_url_path(part):
    # type: (str) -> str
    """
    Clean the first part of a URL path that corresponds to a local
    filesystem path (i.e. the first part after splitting on "@" characters).
    """
    # We unquote prior to quoting to make sure nothing is double quoted.
    # Also, on Windows the path part might contain a drive letter which
    # should not be quoted. On Linux where drive letters do not
    # exist, the colon should be quoted. We rely on urllib.request
    # to do the right thing here.
    return urllib.request.pathname2url(urllib.request.url2pathname(part))


# The reserved characters handled specially below are "@" and "%2F"
# (the percent-encoded form of "/").
_reserved_chars_re = re.compile('(@|%2F)', re.IGNORECASE)


def _clean_url_path(path, is_local_path):
    # type: (str, bool) -> str
    """
    Clean the path portion of a URL.
    """
    if is_local_path:
        clean_func = _clean_file_url_path
    else:
        clean_func = _clean_url_path_part

    # Split on the reserved characters prior to cleaning so that
    # revision strings in VCS URLs are properly preserved.
    parts = _reserved_chars_re.split(path)

    cleaned_parts = []
    for to_clean, reserved in pairwise(itertools.chain(parts, [''])):
        cleaned_parts.append(clean_func(to_clean))
        # Normalize %xx escapes (e.g. %2f -> %2F)
        cleaned_parts.append(reserved.upper())

    return ''.join(cleaned_parts)


def _clean_link(url):
    # type: (str) -> str
    """
    Make sure a link is fully quoted.
    For example, if ' ' occurs in the URL, it will be replaced with "%20",
    and without double-quoting other characters.
    """
    # Split the URL into parts according to the general structure
    # `scheme://netloc/path;parameters?query#fragment`.
    result = urllib.parse.urlparse(url)
    # If the netloc is empty, then the URL refers to a local filesystem path.
    is_local_path = not result.netloc
    path = _clean_url_path(result.path, is_local_path=is_local_path)
    return urllib.parse.urlunparse(result._replace(path=path))


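# Examples of _clean_link (illustrative):
#   _clean_link('https://example.com/some dir/pkg-1.0.tar.gz')
#       -> 'https://example.com/some%20dir/pkg-1.0.tar.gz'
#   _clean_link('git+https://example.com/repo.git@v1.0')
#       -> unchanged; the '@' before the revision is deliberately preserved
#          rather than being encoded as '%40'.

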
def _create_link_from_element(
    anchor,    # type: HTMLElement
    page_url,  # type: str
    base_url,  # type: str
):
    # type: (...) -> Optional[Link]
    """
    Convert an anchor element in a simple repository page to a Link.
    """
    href = anchor.get("href")
    if not href:
        return None

    url = _clean_link(urllib.parse.urljoin(base_url, href))
    pyrequire = anchor.get('data-requires-python')
    pyrequire = unescape(pyrequire) if pyrequire else None

    yanked_reason = anchor.get('data-yanked')
    if yanked_reason:
        # This is a unicode string in Python 2 (and 3).
        yanked_reason = unescape(yanked_reason)

    link = Link(
        url,
        comes_from=page_url,
        requires_python=pyrequire,
        yanked_reason=yanked_reason,
    )

    return link


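# Illustration (hypothetical anchor from a simple-repository page): an element
# parsed from
#   <a href="pkg-1.0.tar.gz" data-requires-python="&gt;=3.6">
# becomes a Link whose URL is urljoin(base_url, href) passed through
# _clean_link() and whose requires_python is the unescaped '>=3.6'; an anchor
# with no href yields None.

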
class CacheablePageContent:
    def __init__(self, page):
        # type: (HTMLPage) -> None
        assert page.cache_link_parsing
        self.page = page

    def __eq__(self, other):
        # type: (object) -> bool
        return (isinstance(other, type(self)) and
                self.page.url == other.page.url)

    def __hash__(self):
        # type: () -> int
        return hash(self.page.url)


def with_cached_html_pages(
    fn,  # type: Callable[[HTMLPage], Iterable[Link]]
):
    # type: (...) -> Callable[[HTMLPage], List[Link]]
    """
    Given a function that parses an Iterable[Link] from an HTMLPage, cache the
    function's result (keyed by CacheablePageContent), unless the HTMLPage
    `page` has `page.cache_link_parsing == False`.
    """

    @functools.lru_cache(maxsize=None)
    def wrapper(cacheable_page):
        # type: (CacheablePageContent) -> List[Link]
        return list(fn(cacheable_page.page))

    @functools.wraps(fn)
    def wrapper_wrapper(page):
        # type: (HTMLPage) -> List[Link]
        if page.cache_link_parsing:
            return wrapper(CacheablePageContent(page))
        return list(fn(page))

    return wrapper_wrapper


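# Note on the caching above: the lru_cache in with_cached_html_pages is keyed
# by CacheablePageContent, whose __eq__/__hash__ only consider page.url, so
# repeated calls to the decorated function for the same URL (with
# cache_link_parsing=True) reuse a single parsed result.

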
@with_cached_html_pages
def parse_links(page):
    # type: (HTMLPage) -> Iterable[Link]
    """
    Parse an HTML document, and yield its anchor elements as Link objects.
    """
    document = html5lib.parse(
        page.content,
        transport_encoding=page.encoding,
        namespaceHTMLElements=False,
    )

    url = page.url
    base_url = _determine_base_url(document, url)
    for anchor in document.findall(".//a"):
        link = _create_link_from_element(
            anchor,
            page_url=url,
            base_url=base_url,
        )
        if link is None:
            continue
        yield link


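# Rough usage sketch of parse_links (illustrative; the page content and URL
# are made up):
#   page = HTMLPage(b'<a href="pkg-1.0.tar.gz">pkg 1.0</a>',
#                   encoding=None, url='https://index.example.com/pkg/')
#   parse_links(page)
#   # -> [Link('https://index.example.com/pkg/pkg-1.0.tar.gz')]

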
class HTMLPage:
    """Represents one page, along with its URL"""

    def __init__(
        self,
        content,                  # type: bytes
        encoding,                 # type: Optional[str]
        url,                      # type: str
        cache_link_parsing=True,  # type: bool
    ):
        # type: (...) -> None
        """
        :param encoding: the encoding to decode the given content.
        :param url: the URL from which the HTML was downloaded.
        :param cache_link_parsing: whether links parsed from this page's url
                                   should be cached. PyPI index urls should
                                   have this set to False, for example.
        """
        self.content = content
        self.encoding = encoding
        self.url = url
        self.cache_link_parsing = cache_link_parsing

    def __str__(self):
        # type: () -> str
        return redact_auth_from_url(self.url)


def _handle_get_page_fail(
    link,  # type: Link
    reason,  # type: Union[str, Exception]
    meth=None  # type: Optional[Callable[..., None]]
):
    # type: (...) -> None
    if meth is None:
        meth = logger.debug
    meth("Could not fetch URL %s: %s - skipping", link, reason)


def _make_html_page(response, cache_link_parsing=True):
    # type: (Response, bool) -> HTMLPage
    encoding = _get_encoding_from_headers(response.headers)
    return HTMLPage(
        response.content,
        encoding=encoding,
        url=response.url,
        cache_link_parsing=cache_link_parsing)


def _get_html_page(link, session=None):
    # type: (Link, Optional[PipSession]) -> Optional[HTMLPage]
    if session is None:
        raise TypeError(
            "_get_html_page() missing 1 required keyword argument: 'session'"
        )

    url = link.url.split('#', 1)[0]

    # Check for VCS schemes that do not support lookup as web pages.
    vcs_scheme = _match_vcs_scheme(url)
    if vcs_scheme:
        logger.warning('Cannot look at %s URL %s because it does not support '
                       'lookup as web pages.', vcs_scheme, link)
        return None

    # Tack index.html onto file:// URLs that point to directories
    scheme, _, path, _, _, _ = urllib.parse.urlparse(url)
    if (scheme == 'file' and os.path.isdir(urllib.request.url2pathname(path))):
        # add trailing slash if not present so urljoin doesn't trim
        # final segment
        if not url.endswith('/'):
            url += '/'
        url = urllib.parse.urljoin(url, 'index.html')
        logger.debug(' file: URL is directory, getting %s', url)

    try:
        resp = _get_html_response(url, session=session)
    except _NotHTTP:
        logger.warning(
            'Skipping page %s because it looks like an archive, and cannot '
            'be checked by an HTTP HEAD request.', link,
        )
    except _NotHTML as exc:
        logger.warning(
            'Skipping page %s because the %s request got Content-Type: %s. '
            'The only supported Content-Type is text/html.',
            link, exc.request_desc, exc.content_type,
        )
    except NetworkConnectionError as exc:
        _handle_get_page_fail(link, exc)
    except RetryError as exc:
        _handle_get_page_fail(link, exc)
    except SSLError as exc:
        reason = "There was a problem confirming the ssl certificate: "
        reason += str(exc)
        _handle_get_page_fail(link, reason, meth=logger.info)
    except requests.ConnectionError as exc:
        _handle_get_page_fail(link, f"connection error: {exc}")
    except requests.Timeout:
        _handle_get_page_fail(link, "timed out")
    else:
        return _make_html_page(resp,
                               cache_link_parsing=link.cache_link_parsing)
    return None


def _remove_duplicate_links(links):
    # type: (Iterable[Link]) -> List[Link]
    """
    Return a list of links, with duplicates removed and ordering preserved.
    """
    # We preserve the ordering when removing duplicates because we can.
    return list(OrderedDict.fromkeys(links))


def group_locations(locations, expand_dir=False):
    # type: (Sequence[str], bool) -> Tuple[List[str], List[str]]
    """
    Divide a list of locations into two groups: "files" (archives) and "urls."

    :return: A pair of lists (files, urls).
    """
    files = []
    urls = []

    # puts the url for the given file path into the appropriate list
    def sort_path(path):
        # type: (str) -> None
        url = path_to_url(path)
        if mimetypes.guess_type(url, strict=False)[0] == 'text/html':
            urls.append(url)
        else:
            files.append(url)

    for url in locations:

        is_local_path = os.path.exists(url)
        is_file_url = url.startswith('file:')

        if is_local_path or is_file_url:
            if is_local_path:
                path = url
            else:
                path = url_to_path(url)
            if os.path.isdir(path):
                if expand_dir:
                    path = os.path.realpath(path)
                    for item in os.listdir(path):
                        sort_path(os.path.join(path, item))
                elif is_file_url:
                    urls.append(url)
                else:
                    logger.warning(
                        "Path '%s' is ignored: it is a directory.", path,
                    )
            elif os.path.isfile(path):
                sort_path(path)
            else:
                logger.warning(
                    "Url '%s' is ignored: it is neither a file "
                    "nor a directory.", url,
                )
        elif is_url(url):
            # Only add url with clear scheme
            urls.append(url)
        else:
            logger.warning(
                "Url '%s' is ignored. It is either a non-existing "
                "path or lacks a specific scheme.", url,
            )

    return files, urls


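# Example (illustrative; assumes the local path actually exists): given
#   ['/srv/wheels/pkg-1.0-py3-none-any.whl', 'https://pypi.org/simple/pkg/']
# the wheel path lands in "files" (converted to a file:// URL) and the index
# URL lands in "urls"; an existing directory is only expanded into its
# entries when expand_dir=True, as collect_links() does for find_links.

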
class CollectedLinks:

    """
    Encapsulates the return value of a call to LinkCollector.collect_links().

    The return value includes both URLs to project pages containing package
    links, as well as individual package Link objects collected from other
    sources.

    This info is stored separately as:

    (1) links from the configured file locations,
    (2) links from the configured find_links, and
    (3) urls to HTML project pages, as described by the PEP 503 simple
        repository API.
    """

    def __init__(
        self,
        files,         # type: List[Link]
        find_links,    # type: List[Link]
        project_urls,  # type: List[Link]
    ):
        # type: (...) -> None
        """
        :param files: Links from file locations.
        :param find_links: Links from find_links.
        :param project_urls: URLs to HTML project pages, as described by
            the PEP 503 simple repository API.
        """
        self.files = files
        self.find_links = find_links
        self.project_urls = project_urls


class LinkCollector:

    """
    Responsible for collecting Link objects from all configured locations,
    making network requests as needed.

    The class's main method is its collect_links() method.
    """

    def __init__(
        self,
        session,       # type: PipSession
        search_scope,  # type: SearchScope
    ):
        # type: (...) -> None
        self.search_scope = search_scope
        self.session = session

    @classmethod
    def create(cls, session, options, suppress_no_index=False):
        # type: (PipSession, Values, bool) -> LinkCollector
        """
        :param session: The Session to use to make requests.
        :param suppress_no_index: Whether to ignore the --no-index option
            when constructing the SearchScope object.
        """
        index_urls = [options.index_url] + options.extra_index_urls
        if options.no_index and not suppress_no_index:
            logger.debug(
                'Ignoring indexes: %s',
                ','.join(redact_auth_from_url(url) for url in index_urls),
            )
            index_urls = []

        # Make sure find_links is a list before passing to create().
        find_links = options.find_links or []

        search_scope = SearchScope.create(
            find_links=find_links, index_urls=index_urls,
        )
        link_collector = LinkCollector(
            session=session, search_scope=search_scope,
        )
        return link_collector

    @property
    def find_links(self):
        # type: () -> List[str]
        return self.search_scope.find_links

    def fetch_page(self, location):
        # type: (Link) -> Optional[HTMLPage]
        """
        Fetch an HTML page containing package links.
        """
        return _get_html_page(location, session=self.session)

    def collect_links(self, project_name):
        # type: (str) -> CollectedLinks
        """Find all available links for the given project name.

        :return: All the Link objects (unfiltered), as a CollectedLinks object.
        """
        search_scope = self.search_scope
        index_locations = search_scope.get_index_urls_locations(project_name)
        index_file_loc, index_url_loc = group_locations(index_locations)
        fl_file_loc, fl_url_loc = group_locations(
            self.find_links, expand_dir=True,
        )

        file_links = [
            Link(url) for url in itertools.chain(index_file_loc, fl_file_loc)
        ]

        # We trust every directly linked archive in find_links
        find_link_links = [Link(url, '-f') for url in self.find_links]

        # We trust every url that the user has given us whether it was given
        # via --index-url or --find-links.
        # We want to filter out anything that does not have a secure origin.
        url_locations = [
            link for link in itertools.chain(
                # Mark PyPI indices as "cache_link_parsing == False" -- this
                # will avoid caching the result of parsing the page for links.
                (Link(url, cache_link_parsing=False) for url in index_url_loc),
                (Link(url) for url in fl_url_loc),
            )
            if self.session.is_secure_origin(link)
        ]

        url_locations = _remove_duplicate_links(url_locations)
        lines = [
            '{} location(s) to search for versions of {}:'.format(
                len(url_locations), project_name,
            ),
        ]
        for link in url_locations:
            lines.append(f'* {link}')
        logger.debug('\n'.join(lines))

        return CollectedLinks(
            files=file_links,
            find_links=find_link_links,
            project_urls=url_locations,
        )
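

# Rough end-to-end sketch of how this module is typically driven
# (illustrative only -- pip._internal is not a stable public API, and the
# exact PipSession construction may differ):
#
#   from pip._internal.network.session import PipSession
#
#   session = PipSession()
#   search_scope = SearchScope.create(
#       find_links=[], index_urls=['https://pypi.org/simple'],
#   )
#   collector = LinkCollector(session=session, search_scope=search_scope)
#   collected = collector.collect_links('pip')
#   # collected.project_urls now holds the secure project-page URLs, e.g.
#   # Link('https://pypi.org/simple/pip/'), ready to be fetched with
#   # collector.fetch_page() and parsed with parse_links().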