The main purpose of this module is to expose LinkCollector.collect_sources().
from optparse import Values
from .sources import CandidatesFromPage, LinkSource, build_source
from typing import Protocol
ResponseHeaders = MutableMapping[str, str]
56 """Look for VCS schemes in the URL.
58 Returns the matched VCS scheme, or None if there's no match.
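# A minimal, self-contained sketch of the VCS-scheme check. The scheme tuple
# below is an illustrative assumption (pip's real list comes from its VCS
# registry), and the _sketch suffix marks hypothetical names.
from typing import Optional

_VCS_SCHEMES = ("bzr", "git", "hg", "svn")  # assumption: illustrative subset

def _match_vcs_scheme_sketch(url: str) -> Optional[str]:
    for scheme in _VCS_SCHEMES:
        # Match URLs such as "git+https://..." or "git://..."
        if url.lower().startswith(scheme) and url[len(scheme)] in "+:":
            return scheme
    return None

assert _match_vcs_scheme_sketch("git+https://example.org/repo.git") == "git"
assert _match_vcs_scheme_sketch("https://example.org/simple/") is None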
def __init__(self, content_type: str, request_desc: str) -> None:
Check the Content-Type header to ensure the response contains a Simple API response.

Raises `_NotAPIContent` if the content type is not a valid content type.
86 "application/vnd.pypi.simple.v1+html",
87 "application/vnd.pypi.simple.v1+json",
Send a HEAD request to the URL, and ensure the response contains a Simple API response.

Raises `_NotHTTP` if the URL is not available for a HEAD request, or
`_NotAPIContent` if the content type is not a valid content type.
if scheme not in {"http", "https"}:
    raise _NotHTTP()

resp = session.head(url, allow_redirects=True)
raise_for_status(resp)
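# The HEAD pre-check end to end, sketched with `requests` in place of pip's
# PipSession. `_NotHTTPSketch` mirrors the `_NotHTTP` named in the docstring,
# and `resp.raise_for_status()` stands in for pip's raise_for_status helper.
import urllib.parse
import requests

class _NotHTTPSketch(Exception):
    pass

def _ensure_api_response_sketch(url: str, session: requests.Session) -> None:
    if urllib.parse.urlsplit(url).scheme not in {"http", "https"}:
        raise _NotHTTPSketch()
    resp = session.head(url, allow_redirects=True)
    resp.raise_for_status()
    _ensure_api_header_sketch(resp)  # the Content-Type check sketched earlier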
118 """Access an Simple API response with GET, and return the response.
This consists of three parts:
1. If the URL looks suspiciously like an archive, send a HEAD first to
   check the Content-Type is HTML or Simple API, to avoid downloading a
   large file. Raise `_NotHTTP` if the content type cannot be determined, or
   `_NotAPIContent` if it is not HTML or a Simple API.
2. Actually perform the request. Raise HTTP exceptions on network failures.
3. Check the Content-Type header to make sure we got a Simple API response,
   and raise `_NotAPIContent` otherwise.
if is_archive_file(Link(url).filename):
    _ensure_api_response(url, session=session)

logger.debug("Getting page %s", redact_auth_from_url(url))
140 "application/vnd.pypi.simple.v1+json",
141 "application/vnd.pypi.simple.v1+html; q=0.1",
158 "Cache-Control":
"max-age=0",
raise_for_status(resp)
172 "Fetched page %s as %s",
173 redact_auth_from_url(url),
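# A sketch of the GET with the Accept values shown above, using `requests`.
# The q-values prefer the JSON Simple API over HTML, and "Cache-Control:
# max-age=0" forces revalidation while still allowing conditional requests.
# The URL is illustrative.
import requests

resp = requests.get(
    "https://pypi.org/simple/pip/",
    headers={
        "Accept": ", ".join(
            [
                "application/vnd.pypi.simple.v1+json",
                "application/vnd.pypi.simple.v1+html; q=0.1",
            ]
        ),
        "Cache-Control": "max-age=0",
    },
)
resp.raise_for_status()
print(resp.headers.get("Content-Type"))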
181 """Determine if we have any encoding information in our headers."""
if headers and "Content-Type" in headers:
    m["content-type"] = headers["Content-Type"]
return hash(self.page.url)
def __call__(self, page: "IndexContent") -> Iterable[Link]:
Given a function that parses an Iterable[Link] from an IndexContent, cache the
function's result (keyed by CacheablePageContent), unless the IndexContent
`page` has `page.cache_link_parsing == False`.
@functools.lru_cache(maxsize=None)
def wrapper(cacheable_page: CacheablePageContent) -> List[Link]:
    return list(fn(cacheable_page.page))

def wrapper_wrapper(page: "IndexContent") -> List[Link]:
    if page.cache_link_parsing:
        return wrapper(CacheablePageContent(page))
    return list(fn(page))

return wrapper_wrapper
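# A small demo of the cache behavior, using the decorator defined above with
# a stub page type. `FakePage` is illustrative and carries only the
# attributes the decorator and CacheablePageContent read.
class FakePage:
    def __init__(self, url: str, cache_link_parsing: bool) -> None:
        self.url = url
        self.cache_link_parsing = cache_link_parsing

calls = []

@with_cached_index_content
def spy(page):
    calls.append(page.url)
    return []

page = FakePage("https://example.org/simple/foo/", cache_link_parsing=True)
spy(page)
spy(page)
assert len(calls) == 1  # the second call was served from the lru_cache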
@with_cached_index_content
def parse_links(page: "IndexContent") -> Iterable[Link]:
    """
    Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
    """
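# A sketch of the HTML branch of link extraction, built on the HTMLLinkParser
# defined below. pip's real parse_links also understands the JSON Simple API
# and per-anchor metadata, which this sketch omits; the function name is
# hypothetical and it yields plain URL strings rather than Link objects.
from urllib.parse import urljoin

def iter_anchor_links_sketch(html: str, page_url: str):
    parser = HTMLLinkParser(url=page_url)
    parser.feed(html)
    base_url = parser.base_url or page_url
    for anchor in parser.anchors:
        href = anchor.get("href")
        if href:
            yield urljoin(base_url, href)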
258 """Represents one response (or page), along with its URL"""
encoding: Optional[str],
cache_link_parsing: bool = True,
:param encoding: the encoding to decode the given content.
:param url: the URL from which the HTML was downloaded.
:param cache_link_parsing: whether links parsed from this page's url
                           should be cached. PyPI index urls should
                           have this set to False, for example.
return redact_auth_from_url(self.url)
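# Constructing an IndexContent by hand, matching the __init__ signature
# summarized at the end of this section. The URL and body are illustrative.
page = IndexContent(
    b"<html><body><a href='pkg-1.0.tar.gz'>pkg-1.0.tar.gz</a></body></html>",
    "text/html",
    encoding="utf-8",
    url="https://index.example.org/simple/pkg/",
    cache_link_parsing=False,
)
print(page)  # __str__ redacts any credentials embedded in the URL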
HTMLParser that keeps the first base HREF and a list of all anchor
elements' attributes.
self.anchors: List[Dict[str, Optional[str]]] = []
if tag == "base" and self.base_url is None:
    href = self.get_href(attrs)
    if href is not None:
        self.base_url = href
elif tag == "a":
    self.anchors.append(dict(attrs))
def get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]:
for name, value in attrs:
    if name == "href":
        return value
return None
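# Exercising the parser above on a small document. The HTML is illustrative;
# html.parser lowercases tag and attribute names, so the anchor dict keys
# come out lowercase.
html = """
<html><head><base href="https://files.example.org/"></head>
<body><a href="pkg-1.0.tar.gz" data-requires-python=">=3.7">pkg-1.0.tar.gz</a></body>
</html>
"""
parser = HTMLLinkParser(url="https://index.example.org/simple/pkg/")
parser.feed(html)
print(parser.base_url)  # https://files.example.org/
print(parser.anchors)   # [{'href': 'pkg-1.0.tar.gz', 'data-requires-python': '>=3.7'}]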
reason: Union[str, Exception],
meth: Optional[Callable[..., None]] = None,

meth("Could not fetch URL %s: %s - skipping", link, reason)
response: Response, cache_link_parsing: bool = True

cache_link_parsing=cache_link_parsing,
343 "Cannot look at %s URL %s because it does not support lookup as web pages.",
logger.debug(" file: URL is directory, getting %s", url)
367 "Skipping page %s because it looks like an archive, and cannot "
368 "be checked by a HTTP HEAD request.",
except _NotAPIContent as exc:
    "Skipping page %s because the %s request got Content-Type: %s. "
    "The only supported Content-Types are application/vnd.pypi.simple.v1+json, "
    "application/vnd.pypi.simple.v1+html, and text/html",
except NetworkConnectionError as exc:
except RetryError as exc:
except SSLError as exc:
    reason = "There was a problem confirming the ssl certificate: "
find_links: Sequence[Optional[LinkSource]]
index_urls: Sequence[Optional[LinkSource]]
Responsible for collecting Link objects from all configured locations,
making network requests as needed.

The class's main method is its collect_sources() method.
search_scope: SearchScope,
suppress_no_index: bool = False,
) -> "LinkCollector":
:param session: The Session to use to make requests.
:param suppress_no_index: Whether to ignore the --no-index option
    when constructing the SearchScope object.
434 "Ignoring indexes: %s",
435 ",".join(redact_auth_from_url(url)
for url
in index_urls),
find_links=find_links,
index_urls=index_urls,

search_scope=search_scope,

return link_collector
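# A sketch of wiring LinkCollector.create() from synthetic options. This uses
# pip's internal, unstable API, so the import paths are assumptions about the
# surrounding pip version; the attribute names on `options` mirror the ones
# create() reads (index_url, extra_index_urls, no_index, find_links).
from optparse import Values
from pip._internal.network.session import PipSession
from pip._internal.index.collector import LinkCollector

options = Values()
options.index_url = "https://pypi.org/simple/"
options.extra_index_urls = []
options.no_index = False
options.find_links = []

with PipSession() as session:
    collector = LinkCollector.create(session, options=options)
    print(collector.find_links)  # -> []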
Fetch an HTML page containing package links.
candidates_from_page: CandidatesFromPage,
) -> CollectedSources:
candidates_from_page=candidates_from_page,
page_validator=self.session.is_secure_origin,
cache_link_parsing=False,

for loc in self.search_scope.get_index_urls_locations(project_name)
candidates_from_page=candidates_from_page,
page_validator=self.session.is_secure_origin,
cache_link_parsing=True,
if s is not None and s.link is not None
f"{len(lines)} location(s) to search "
f"for versions of {project_name}:"
find_links=list(find_links_sources),
index_urls=list(index_url_sources),
CacheablePageContent.__init__(self, page: "IndexContent") -> None
CacheablePageContent.__eq__(self, other: object) -> bool
HTMLLinkParser.handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None
HTMLLinkParser.__init__(self, url: str) -> None
HTMLLinkParser.get_href(self, attrs: List[Tuple[str, Optional[str]]]) -> Optional[str]
IndexContent.__init__(self, content: bytes, content_type: str, encoding: Optional[str], url: str, cache_link_parsing: bool = True) -> None
LinkCollector.fetch_response(self, location: Link) -> Optional[IndexContent]
LinkCollector.__init__(self, session: PipSession, search_scope: SearchScope) -> None
LinkCollector.find_links(self) -> List[str]
LinkCollector.create(cls, session: PipSession, options: Values, suppress_no_index: bool = False) -> "LinkCollector"
LinkCollector.collect_sources(self, project_name: str, candidates_from_page: CandidatesFromPage) -> CollectedSources
ParseLinks.__call__(self, page: "IndexContent") -> Iterable[Link]
_NotAPIContent.__init__(self, content_type: str, request_desc: str) -> None
_handle_get_simple_fail(link: Link, reason: Union[str, Exception], meth: Optional[Callable[..., None]] = None) -> None
_ensure_api_response(url: str, session: PipSession) -> None
_get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]
_make_index_content(response: Response, cache_link_parsing: bool = True) -> IndexContent
_ensure_api_header(response: Response) -> None
_match_vcs_scheme(url: str) -> Optional[str]
_get_simple_response(url: str, session: PipSession) -> Response
with_cached_index_content(fn: ParseLinks) -> ParseLinks
_get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]