Let us walk on the 3-isogeny graph
Loading...
Searching...
No Matches
pip._internal.index.collector Namespace Reference

Data Structures

class  _NotAPIContent
 
class  _NotHTTP
 
class  CacheablePageContent
 
class  CollectedSources
 
class  HTMLLinkParser
 
class  IndexContent
 
class  LinkCollector
 
class  ParseLinks
 

Functions

Optional[str] _match_vcs_scheme (str url)
 
None _ensure_api_header (Response response)
 
None _ensure_api_response (str url, PipSession session)
 
Response _get_simple_response (str url, PipSession session)
 
Optional[str] _get_encoding_from_headers (ResponseHeaders headers)
 
ParseLinks with_cached_index_content (ParseLinks fn)
 
Iterable[Link] parse_links ("IndexContent" page)
 
None _handle_get_simple_fail (Link link, Union[str, Exception] reason, Optional[Callable[..., None]] meth=None)
 
IndexContent _make_index_content (Response response, bool cache_link_parsing=True)
 
Optional["IndexContent"] _get_index_content (Link link, *, PipSession session)
 

Variables

 Protocol = object
 
 logger = logging.getLogger(__name__)
 
 ResponseHeaders = MutableMapping[str, str]
 

Detailed Description

The main purpose of this module is to expose LinkCollector.collect_sources().

Function Documentation

◆ _ensure_api_header()

None _ensure_api_header ( Response  response)
protected
Check the Content-Type header to ensure the response contains a Simple
API Response.

Raises `_NotAPIContent` if the content type is not a valid content-type.

Definition at line 73 of file collector.py.

73def _ensure_api_header(response: Response) -> None:
74 """
75 Check the Content-Type header to ensure the response contains a Simple
76 API Response.
77
78 Raises `_NotAPIContent` if the content type is not a valid content-type.
79 """
80 content_type = response.headers.get("Content-Type", "Unknown")
81
82 content_type_l = content_type.lower()
83 if content_type_l.startswith(
84 (
85 "text/html",
86 "application/vnd.pypi.simple.v1+html",
87 "application/vnd.pypi.simple.v1+json",
88 )
89 ):
90 return
91
92 raise _NotAPIContent(content_type, response.request.method)
93
94
for i

References i.

Referenced by pip._internal.index.collector._get_simple_response().

Here is the caller graph for this function:

◆ _ensure_api_response()

None _ensure_api_response ( str  url,
PipSession  session 
)
protected
Send a HEAD request to the URL, and ensure the response contains a simple
API Response.

Raises `_NotHTTP` if the URL is not available for a HEAD request, or
`_NotAPIContent` if the content type is not a valid content type.

Definition at line 99 of file collector.py.

99def _ensure_api_response(url: str, session: PipSession) -> None:
100 """
101 Send a HEAD request to the URL, and ensure the response contains a simple
102 API Response.
103
104 Raises `_NotHTTP` if the URL is not available for a HEAD request, or
105 `_NotAPIContent` if the content type is not a valid content type.
106 """
107 scheme, netloc, path, query, fragment = urllib.parse.urlsplit(url)
108 if scheme not in {"http", "https"}:
109 raise _NotHTTP()
110
111 resp = session.head(url, allow_redirects=True)
112 raise_for_status(resp)
113
114 _ensure_api_header(resp)
115
116

Referenced by pip._internal.index.collector._get_simple_response().

Here is the caller graph for this function:

◆ _get_encoding_from_headers()

Optional[str] _get_encoding_from_headers ( ResponseHeaders  headers)
protected
Determine if we have any encoding information in our headers.

Definition at line 180 of file collector.py.

180def _get_encoding_from_headers(headers: ResponseHeaders) -> Optional[str]:
181 """Determine if we have any encoding information in our headers."""
182 if headers and "Content-Type" in headers:
183 m = email.message.Message()
184 m["content-type"] = headers["Content-Type"]
185 charset = m.get_param("charset")
186 if charset:
187 return str(charset)
188 return None
189
190

References i.

Referenced by pip._internal.index.collector._make_index_content().

Here is the caller graph for this function:

◆ _get_index_content()

Optional["IndexContent"] _get_index_content ( Link  link,
*, PipSession  session 
)
protected

Definition at line 336 of file collector.py.

336def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]:
337 url = link.url.split("#", 1)[0]
338
339 # Check for VCS schemes that do not support lookup as web pages.
340 vcs_scheme = _match_vcs_scheme(url)
341 if vcs_scheme:
342 logger.warning(
343 "Cannot look at %s URL %s because it does not support lookup as web pages.",
344 vcs_scheme,
345 link,
346 )
347 return None
348
349 # Tack index.html onto file:// URLs that point to directories
350 scheme, _, path, _, _, _ = urllib.parse.urlparse(url)
351 if scheme == "file" and os.path.isdir(urllib.request.url2pathname(path)):
352 # add trailing slash if not present so urljoin doesn't trim
353 # final segment
354 if not url.endswith("/"):
355 url += "/"
356 # TODO: In the future, it would be nice if pip supported PEP 691
357 # style responses in the file:// URLs, however there's no
358 # standard file extension for application/vnd.pypi.simple.v1+json
359 # so we'll need to come up with something on our own.
360 url = urllib.parse.urljoin(url, "index.html")
361 logger.debug(" file: URL is directory, getting %s", url)
362
363 try:
364 resp = _get_simple_response(url, session=session)
365 except _NotHTTP:
366 logger.warning(
367 "Skipping page %s because it looks like an archive, and cannot "
368 "be checked by a HTTP HEAD request.",
369 link,
370 )
371 except _NotAPIContent as exc:
372 logger.warning(
373 "Skipping page %s because the %s request got Content-Type: %s. "
374 "The only supported Content-Types are application/vnd.pypi.simple.v1+json, "
375 "application/vnd.pypi.simple.v1+html, and text/html",
376 link,
377 exc.request_desc,
378 exc.content_type,
379 )
380 except NetworkConnectionError as exc:
381 _handle_get_simple_fail(link, exc)
382 except RetryError as exc:
383 _handle_get_simple_fail(link, exc)
384 except SSLError as exc:
385 reason = "There was a problem confirming the ssl certificate: "
386 reason += str(exc)
387 _handle_get_simple_fail(link, reason, meth=logger.info)
388 except requests.ConnectionError as exc:
389 _handle_get_simple_fail(link, f"connection error: {exc}")
390 except requests.Timeout:
391 _handle_get_simple_fail(link, "timed out")
392 else:
393 return _make_index_content(resp, cache_link_parsing=link.cache_link_parsing)
394 return None
395
396

References pip._internal.index.collector._get_simple_response(), pip._internal.index.collector._handle_get_simple_fail(), pip._internal.index.collector._make_index_content(), pip._internal.index.collector._match_vcs_scheme(), and i.

Referenced by LinkCollector.fetch_response().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ _get_simple_response()

Response _get_simple_response ( str  url,
PipSession  session 
)
protected
Access a Simple API response with GET, and return the response.

This consists of three parts:

1. If the URL looks suspiciously like an archive, send a HEAD first to
   check the Content-Type is HTML or Simple API, to avoid downloading a
   large file. Raise `_NotHTTP` if the content type cannot be determined, or
   `_NotAPIContent` if it is not HTML or a Simple API.
2. Actually perform the request. Raise HTTP exceptions on network failures.
3. Check the Content-Type header to make sure we got a Simple API response,
   and raise `_NotAPIContent` otherwise.

Definition at line 117 of file collector.py.

117def _get_simple_response(url: str, session: PipSession) -> Response:
118 """Access an Simple API response with GET, and return the response.
119
120 This consists of three parts:
121
122 1. If the URL looks suspiciously like an archive, send a HEAD first to
123 check the Content-Type is HTML or Simple API, to avoid downloading a
124 large file. Raise `_NotHTTP` if the content type cannot be determined, or
125 `_NotAPIContent` if it is not HTML or a Simple API.
126 2. Actually perform the request. Raise HTTP exceptions on network failures.
127 3. Check the Content-Type header to make sure we got a Simple API response,
128 and raise `_NotAPIContent` otherwise.
129 """
130 if is_archive_file(Link(url).filename):
131 _ensure_api_response(url, session=session)
132
133 logger.debug("Getting page %s", redact_auth_from_url(url))
134
135 resp = session.get(
136 url,
137 headers={
138 "Accept": ", ".join(
139 [
140 "application/vnd.pypi.simple.v1+json",
141 "application/vnd.pypi.simple.v1+html; q=0.1",
142 "text/html; q=0.01",
143 ]
144 ),
145 # We don't want to blindly returned cached data for
146 # /simple/, because authors generally expecting that
147 # twine upload && pip install will function, but if
148 # they've done a pip install in the last ~10 minutes
149 # it won't. Thus by setting this to zero we will not
150 # blindly use any cached data, however the benefit of
151 # using max-age=0 instead of no-cache, is that we will
152 # still support conditional requests, so we will still
153 # minimize traffic sent in cases where the page hasn't
154 # changed at all, we will just always incur the round
155 # trip for the conditional GET now instead of only
156 # once per 10 minutes.
157 # For more information, please see pypa/pip#5670.
158 "Cache-Control": "max-age=0",
159 },
160 )
161 raise_for_status(resp)
162
163 # The check for archives above only works if the url ends with
164 # something that looks like an archive. However that is not a
165 # requirement of an url. Unless we issue a HEAD request on every
166 # url we cannot know ahead of time for sure if something is a
167 # Simple API response or not. However we can check after we've
168 # downloaded it.
169 _ensure_api_header(resp)
170
171 logger.debug(
172 "Fetched page %s as %s",
173 redact_auth_from_url(url),
174 resp.headers.get("Content-Type", "Unknown"),
175 )
176
177 return resp
178
179

References pip._internal.index.collector._ensure_api_header(), pip._internal.index.collector._ensure_api_response(), and i.

Referenced by pip._internal.index.collector._get_index_content().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ _handle_get_simple_fail()

None _handle_get_simple_fail ( Link  link,
Union[str, Exception]  reason,
Optional[Callable[..., None]]   meth = None 
)
protected

Definition at line 313 of file collector.py.

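313def _handle_get_simple_fail(
314 link: Link,
315 reason: Union[str, Exception],
316 meth: Optional[Callable[..., None]] = None,
317) -> None: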
318 if meth is None:
319 meth = logger.debug
320 meth("Could not fetch URL %s: %s - skipping", link, reason)
321
322

References i.

Referenced by pip._internal.index.collector._get_index_content().

Here is the caller graph for this function:

◆ _make_index_content()

IndexContent _make_index_content ( Response  response,
bool   cache_link_parsing = True 
)
protected

Definition at line 323 of file collector.py.

323def _make_index_content(
324 response: Response, cache_link_parsing: bool = True
325) -> IndexContent:
326 encoding = _get_encoding_from_headers(response.headers)
327 return IndexContent(
328 response.content,
329 response.headers["Content-Type"],
330 encoding=encoding,
331 url=response.url,
332 cache_link_parsing=cache_link_parsing,
333 )
334
335

References pip._internal.index.collector._get_encoding_from_headers(), and i.

Referenced by pip._internal.index.collector._get_index_content().

Here is the call graph for this function:
Here is the caller graph for this function:

◆ _match_vcs_scheme()

Optional[str] _match_vcs_scheme ( str  url)
protected
Look for VCS schemes in the URL.

Returns the matched VCS scheme, or None if there's no match.

Definition at line 55 of file collector.py.

55def _match_vcs_scheme(url: str) -> Optional[str]:
56 """Look for VCS schemes in the URL.
57
58 Returns the matched VCS scheme, or None if there's no match.
59 """
60 for scheme in vcs.schemes:
61 if url.lower().startswith(scheme) and url[len(scheme)] in "+:":
62 return scheme
63 return None
64
65

References i.

Referenced by pip._internal.index.collector._get_index_content().

Here is the caller graph for this function:

◆ parse_links()

Iterable[Link] parse_links ( "IndexContent"  page)
Parse a Simple API's Index Content, and yield its anchor elements as Link objects.

Definition at line 229 of file collector.py.

229def parse_links(page: "IndexContent") -> Iterable[Link]:
230 """
231 Parse a Simple API's Index Content, and yield its anchor elements as Link objects.
232 """
233
234 content_type_l = page.content_type.lower()
235 if content_type_l.startswith("application/vnd.pypi.simple.v1+json"):
236 data = json.loads(page.content)
237 for file in data.get("files", []):
238 link = Link.from_json(file, page.url)
239 if link is None:
240 continue
241 yield link
242 return
243
244 parser = HTMLLinkParser(page.url)
245 encoding = page.encoding or "utf-8"
246 parser.feed(page.content.decode(encoding))
247
248 url = page.url
249 base_url = parser.base_url or url
250 for anchor in parser.anchors:
251 link = Link.from_element(anchor, page_url=url, base_url=base_url)
252 if link is None:
253 continue
254 yield link
255
256

References i.

◆ with_cached_index_content()

ParseLinks with_cached_index_content ( ParseLinks  fn)
Given a function that parses an Iterable[Link] from an IndexContent, cache the
function's result (keyed by CacheablePageContent), unless the IndexContent
`page` has `page.cache_link_parsing == False`.

Definition at line 208 of file collector.py.

208def with_cached_index_content(fn: ParseLinks) -> ParseLinks:
209 """
210 Given a function that parses an Iterable[Link] from an IndexContent, cache the
211 function's result (keyed by CacheablePageContent), unless the IndexContent
212 `page` has `page.cache_link_parsing == False`.
213 """
214
215 @functools.lru_cache(maxsize=None)
216 def wrapper(cacheable_page: CacheablePageContent) -> List[Link]:
217 return list(fn(cacheable_page.page))
218
219 @functools.wraps(fn)
220 def wrapper_wrapper(page: "IndexContent") -> List[Link]:
221 if page.cache_link_parsing:
222 return wrapper(CacheablePageContent(page))
223 return list(fn(page))
224
225 return wrapper_wrapper
226
227
228@with_cached_index_content

References i.

Variable Documentation

◆ logger

logger = logging.getLogger(__name__)

Definition at line 50 of file collector.py.

◆ Protocol

Protocol = object

Definition at line 48 of file collector.py.

◆ ResponseHeaders

ResponseHeaders = MutableMapping[str, str]

Definition at line 52 of file collector.py.