locators.py
# -*- coding: utf-8 -*-
#
# Copyright (C) 2012-2015 Vinay Sajip.
# Licensed to the Python Software Foundation under a contributor agreement.
# See LICENSE.txt and CONTRIBUTORS.txt.
#

import gzip
from io import BytesIO
import json
import logging
import os
import posixpath
import re
try:
    import threading
except ImportError:  # pragma: no cover
    import dummy_threading as threading
import zlib

from . import DistlibException
from .compat import (urljoin, urlparse, urlunparse, url2pathname, pathname2url,
                     queue, quote, unescape, build_opener,
                     HTTPRedirectHandler as BaseRedirectHandler, text_type,
                     Request, HTTPError, URLError)
from .database import Distribution, DistributionPath, make_dist
from .metadata import Metadata, MetadataInvalidError
from .util import (cached_property, ensure_slash, split_filename, get_project_data,
                   parse_requirement, parse_name_and_version, ServerProxy,
                   normalize_name)
from .version import get_scheme, UnsupportedVersionError
from .wheel import Wheel, is_compatible

logger = logging.getLogger(__name__)

HASHER_HASH = re.compile(r'^(\w+)=([a-f0-9]+)')
CHARSET = re.compile(r';\s*charset\s*=\s*(.*)\s*$', re.I)
HTML_CONTENT_TYPE = re.compile('text/html|application/x(ht)?ml')
DEFAULT_INDEX = 'https://pypi.org/pypi'

42 """
43 Return all distribution names known by an index.
44 :param url: The URL of the index.
45 :return: A list of all known distribution names.
46 """
47 if url is None:
48 url = DEFAULT_INDEX
49 client = ServerProxy(url, timeout=3.0)
50 try:
52 finally:
53 client('close')()
54
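# Example usage (an illustrative sketch, not part of the original module).
# Note that PyPI has deprecated much of its XML-RPC API, so this call may be
# slow or rejected by the live index:
#
#     from distlib.locators import get_all_distribution_names
#     names = get_all_distribution_names()  # queries DEFAULT_INDEX
#     print(len(names))
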
class RedirectHandler(BaseRedirectHandler):
    """
    A class to work around a bug in some Python 3.2.x releases.
    """
    # There's a bug in the base version for some 3.2.x
    # (e.g. 3.2.2 on Ubuntu Oneiric). If a Location header
    # returns e.g. /abc, it bails because it says the scheme ''
    # is bogus, when actually it should use the request's
    # URL for the scheme. See Python issue #13696.
    def http_error_302(self, req, fp, code, msg, headers):
        # Some servers (incorrectly) return multiple Location headers
        # (so probably same goes for URI). Use first header.
        newurl = None
        for key in ('location', 'uri'):
            if key in headers:
                newurl = headers[key]
                break
        if newurl is None:  # pragma: no cover
            return
        urlparts = urlparse(newurl)
        if urlparts.scheme == '':
            newurl = urljoin(req.get_full_url(), newurl)
            if hasattr(headers, 'replace_header'):
                headers.replace_header(key, newurl)
            else:
                headers[key] = newurl
        return BaseRedirectHandler.http_error_302(self, req, fp, code, msg,
                                                  headers)

    http_error_301 = http_error_303 = http_error_307 = http_error_302

class Locator(object):
    """
    A base class for locators - things that locate distributions.
    """
    source_extensions = ('.tar.gz', '.tar.bz2', '.tar', '.zip', '.tgz', '.tbz')
    binary_extensions = ('.egg', '.exe', '.whl')
    excluded_extensions = ('.pdf',)

    # A list of tags indicating which wheels you want to match. The default
    # value of None matches against the tags compatible with the running
    # Python. If you want to match other values, set wheel_tags on a locator
    # instance to a list of tuples (pyver, abi, arch) which you want to match.
    wheel_tags = None

    downloadable_extensions = source_extensions + ('.whl',)

    def __init__(self, scheme='default'):
        """
        Initialise an instance.
        :param scheme: Because locators look for most recent versions, they
                       need to know the version scheme to use. This specifies
                       the current PEP-recommended scheme - use ``'legacy'``
                       if you need to support existing distributions on PyPI.
        """
        self._cache = {}
        self.scheme = scheme
        # Because of bugs in some of the handlers on some of the platforms,
        # we use our own opener rather than just using urlopen.
        self.opener = build_opener(RedirectHandler())
        # If get_project() is called from locate(), the matcher instance
        # is set from the requirement passed to locate(). See issue #18 for
        # why this can be useful to know.
        self.matcher = None
        self.errors = queue.Queue()

    def get_errors(self):
        """
        Return any errors which have occurred.
        """
        result = []
        while not self.errors.empty():  # pragma: no cover
            try:
                e = self.errors.get(False)
                result.append(e)
            except self.errors.Empty:
                continue
            self.errors.task_done()
        return result

    def clear_errors(self):
        """
        Clear any errors which may have been logged.
        """
        # Just get the errors and throw them away
        self.get_errors()

    def clear_cache(self):
        self._cache.clear()

    def _get_scheme(self):
        return self._scheme

    def _set_scheme(self, value):
        self._scheme = value

    scheme = property(_get_scheme, _set_scheme)

    def _get_project(self, name):
        """
        For a given project, get a dictionary mapping available versions to Distribution
        instances.

        This should be implemented in subclasses.

        If called from a locate() request, self.matcher will be set to a
        matcher for the requirement to satisfy, otherwise it will be None.
        """
        raise NotImplementedError('Please implement in the subclass')

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        raise NotImplementedError('Please implement in the subclass')

    def get_project(self, name):
        """
        For a given project, get a dictionary mapping available versions to Distribution
        instances.

        This calls _get_project to do all the work, and just implements a caching layer on top.
        """
        if self._cache is None:  # pragma: no cover
            result = self._get_project(name)
        elif name in self._cache:
            result = self._cache[name]
        else:
            self.clear_errors()
            result = self._get_project(name)
            self._cache[name] = result
        return result

    def score_url(self, url):
        """
        Give an url a score which can be used to choose preferred URLs
        for a given project release.
        """
        t = urlparse(url)
        basename = posixpath.basename(t.path)
        compatible = True
        is_wheel = basename.endswith('.whl')
        is_downloadable = basename.endswith(self.downloadable_extensions)
        if is_wheel:
            compatible = is_compatible(Wheel(basename), self.wheel_tags)
        return (t.scheme == 'https', 'pypi.org' in t.netloc,
                is_downloadable, is_wheel, compatible, basename)

    def prefer_url(self, url1, url2):
        """
        Choose one of two URLs where both are candidates for distribution
        archives for the same version of a distribution (for example,
        .tar.gz vs. zip).

        The current implementation favours https:// URLs over http://, archives
        from PyPI over those from other locations, wheel compatibility (if a
        wheel) and then the archive name.
        """
        result = url2
        if url1:
            s1 = self.score_url(url1)
            s2 = self.score_url(url2)
            if s1 > s2:
                result = url1
            if result != url2:
                logger.debug('Not replacing %r with %r', url1, url2)
            else:
                logger.debug('Replacing %r with %r', url1, url2)
        return result

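    # A quick illustration (added note, not part of the original module) of
    # how the tuple from score_url() drives prefer_url(): tuples compare
    # element by element, so https beats http, pypi.org beats other hosts,
    # and so on, with the basename as the final tie-breaker. URLs are made up.
    #
    #     loc = Locator()
    #     s1 = loc.score_url('https://pypi.org/packages/foo-1.0.tar.gz')
    #     s2 = loc.score_url('http://example.com/foo-1.0.zip')
    #     assert s1 > s2
    #     assert loc.prefer_url('https://pypi.org/packages/foo-1.0.tar.gz',
    #                           'http://example.com/foo-1.0.zip'
    #                           ) == 'https://pypi.org/packages/foo-1.0.tar.gz'
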
    def split_filename(self, filename, project_name):
        """
        Attempt to split a filename into project name, version and Python version.
        """
        return split_filename(filename, project_name)

    def convert_url_to_download_info(self, url, project_name):
        """
        See if a URL is a candidate for a download URL for a project (the URL
        has typically been scraped from an HTML page).

        If it is, a dictionary is returned with keys "name", "version",
        "filename" and "url"; otherwise, None is returned.
        """
        def same_project(name1, name2):
            return normalize_name(name1) == normalize_name(name2)

        result = None
        scheme, netloc, path, params, query, frag = urlparse(url)
        if frag.lower().startswith('egg='):  # pragma: no cover
            logger.debug('%s: version hint in fragment: %r',
                         project_name, frag)
        m = HASHER_HASH.match(frag)
        if m:
            algo, digest = m.groups()
        else:
            algo, digest = None, None
        origpath = path
        if path and path[-1] == '/':  # pragma: no cover
            path = path[:-1]
        if path.endswith('.whl'):
            try:
                wheel = Wheel(path)
                if not is_compatible(wheel, self.wheel_tags):
                    logger.debug('Wheel not compatible: %s', path)
                else:
                    if project_name is None:
                        include = True
                    else:
                        include = same_project(wheel.name, project_name)
                    if include:
                        result = {
                            'name': wheel.name,
                            'version': wheel.version,
                            'filename': wheel.filename,
                            'url': urlunparse((scheme, netloc, origpath,
                                               params, query, '')),
                            'python-version': ', '.join(
                                ['.'.join(list(v[2:])) for v in wheel.pyver]),
                        }
            except Exception as e:  # pragma: no cover
                logger.warning('invalid path for wheel: %s', path)
        elif not path.endswith(self.downloadable_extensions):  # pragma: no cover
            logger.debug('Not downloadable: %s', path)
        else:  # downloadable extension
            path = filename = posixpath.basename(path)
            for ext in self.downloadable_extensions:
                if path.endswith(ext):
                    path = path[:-len(ext)]
                    t = self.split_filename(path, project_name)
                    if not t:  # pragma: no cover
                        logger.debug('No match for project/version: %s', path)
                    else:
                        name, version, pyver = t
                        if not project_name or same_project(project_name, name):
                            result = {
                                'name': name,
                                'version': version,
                                'filename': filename,
                                'url': urlunparse((scheme, netloc, origpath,
                                                   params, query, '')),
                                #'packagetype': 'sdist',
                            }
                            if pyver:  # pragma: no cover
                                result['python-version'] = pyver
                    break
        if result and algo:
            result['%s_digest' % algo] = digest
        return result

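    # An illustrative sketch (not part of the original module) of the dict
    # produced by convert_url_to_download_info() for a made-up sdist URL:
    #
    #     loc = Locator()
    #     info = loc.convert_url_to_download_info(
    #         'https://example.com/packages/foo-1.0.tar.gz', 'foo')
    #     # info == {'name': 'foo', 'version': '1.0',
    #     #          'filename': 'foo-1.0.tar.gz',
    #     #          'url': 'https://example.com/packages/foo-1.0.tar.gz'}
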
    def _get_digest(self, info):
        """
        Get a digest from a dictionary by looking at a "digests" dictionary
        or keys of the form 'algo_digest'.

        Returns a 2-tuple (algo, digest) if found, else None. Currently
        looks only for SHA256, then MD5.
        """
        result = None
        if 'digests' in info:
            digests = info['digests']
            for algo in ('sha256', 'md5'):
                if algo in digests:
                    result = (algo, digests[algo])
                    break
        if not result:
            for algo in ('sha256', 'md5'):
                key = '%s_digest' % algo
                if key in info:
                    result = (algo, info[key])
                    break
        return result

    def _update_version_data(self, result, info):
        """
        Update a result dictionary (the final result from _get_project) with a
        dictionary for a specific version, which typically holds information
        gleaned from a filename or URL for an archive for the distribution.
        """
        name = info.pop('name')
        version = info.pop('version')
        if version in result:
            dist = result[version]
            md = dist.metadata
        else:
            dist = make_dist(name, version, scheme=self.scheme)
            md = dist.metadata
        dist.digest = digest = self._get_digest(info)
        url = info['url']
        result['digests'][url] = digest
        if md.source_url != info['url']:
            md.source_url = self.prefer_url(md.source_url, url)
        result['urls'].setdefault(version, set()).add(url)
        dist.locator = self
        result[version] = dist

    def locate(self, requirement, prereleases=False):
        """
        Find the most recent distribution which matches the given
        requirement.

        :param requirement: A requirement of the form 'foo (1.0)' or perhaps
                            'foo (>= 1.0, < 2.0, != 1.3)'
        :param prereleases: If ``True``, allow pre-release versions
                            to be located. Otherwise, pre-release versions
                            are not returned.
        :return: A :class:`Distribution` instance, or ``None`` if no such
                 distribution could be located.
        """
        result = None
        r = parse_requirement(requirement)
        if r is None:  # pragma: no cover
            raise DistlibException('Not a valid requirement: %r' % requirement)
        scheme = get_scheme(self.scheme)
        self.matcher = matcher = scheme.matcher(r.requirement)
        logger.debug('matcher: %s (%s)', matcher, type(matcher).__name__)
        versions = self.get_project(r.name)
        if len(versions) > 2:  # urls and digests keys are present
            # sometimes, versions are invalid
            slist = []
            vcls = matcher.version_class
            for k in versions:
                if k in ('urls', 'digests'):
                    continue
                try:
                    if not matcher.match(k):
                        pass  # logger.debug('%s did not match %r', matcher, k)
                    else:
                        if prereleases or not vcls(k).is_prerelease:
                            slist.append(k)
                        # else:
                        #     logger.debug('skipping pre-release '
                        #                  'version %s of %s', k, matcher.name)
                except Exception:  # pragma: no cover
                    logger.warning('error matching %s with %r', matcher, k)
                    pass  # slist.append(k)
            if len(slist) > 1:
                slist = sorted(slist, key=scheme.key)
            if slist:
                logger.debug('sorted list: %s', slist)
                version = slist[-1]
                result = versions[version]
        if result:
            if r.extras:
                result.extras = r.extras
            result.download_urls = versions.get('urls', {}).get(version, set())
            d = {}
            sd = versions.get('digests', {})
            for url in result.download_urls:
                if url in sd:  # pragma: no cover
                    d[url] = sd[url]
            result.digests = d
        self.matcher = None
        return result

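# For orientation (added note, not in the original module): the dictionary
# returned by get_project()/_get_project() maps version strings to
# Distribution instances, plus two bookkeeping keys consulted by locate():
#
#     {
#         '1.0': <Distribution foo 1.0>,
#         '2.0': <Distribution foo 2.0>,
#         'urls': {'1.0': set([...]), '2.0': set([...])},
#         'digests': {'https://.../foo-1.0.tar.gz': ('sha256', '...')},
#     }
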

class PyPIRPCLocator(Locator):
    """
    This locator uses XML-RPC to locate distributions. It therefore
    cannot be used with simple mirrors (that only mirror file content).
    """
    def __init__(self, url, **kwargs):
        """
        Initialise an instance.

        :param url: The URL to use for XML-RPC.
        :param kwargs: Passed to the superclass constructor.
        """
        super(PyPIRPCLocator, self).__init__(**kwargs)
        self.base_url = url
        self.client = ServerProxy(url, timeout=3.0)

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        return set(self.client.list_packages())

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        versions = self.client.package_releases(name, True)
        for v in versions:
            urls = self.client.release_urls(name, v)
            data = self.client.release_data(name, v)
            metadata = Metadata(scheme=self.scheme)
            metadata.name = data['name']
            metadata.version = data['version']
            metadata.license = data.get('license')
            metadata.keywords = data.get('keywords', [])
            metadata.summary = data.get('summary')
            dist = Distribution(metadata)
            if urls:
                info = urls[0]
                metadata.source_url = info['url']
                dist.digest = self._get_digest(info)
                dist.locator = self
                result[v] = dist
                for info in urls:
                    url = info['url']
                    digest = self._get_digest(info)
                    result['urls'].setdefault(v, set()).add(url)
                    result['digests'][url] = digest
        return result

class PyPIJSONLocator(Locator):
    """
    This locator uses PyPI's JSON interface. It's very limited in functionality
    and probably not worth using.
    """
    def __init__(self, url, **kwargs):
        super(PyPIJSONLocator, self).__init__(**kwargs)
        self.base_url = ensure_slash(url)

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        raise NotImplementedError('Not available from this locator')

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        url = urljoin(self.base_url, '%s/json' % quote(name))
        try:
            resp = self.opener.open(url)
            data = resp.read().decode()  # for now
            d = json.loads(data)
            md = Metadata(scheme=self.scheme)
            data = d['info']
            md.name = data['name']
            md.version = data['version']
            md.license = data.get('license')
            md.keywords = data.get('keywords', [])
            md.summary = data.get('summary')
            dist = Distribution(md)
            dist.locator = self
            urls = d['urls']
            result[md.version] = dist
            for info in d['urls']:
                url = info['url']
                dist.download_urls.add(url)
                dist.digests[url] = self._get_digest(info)
                result['urls'].setdefault(md.version, set()).add(url)
                result['digests'][url] = self._get_digest(info)
            # Now get other releases
            for version, infos in d['releases'].items():
                if version == md.version:
                    continue  # already done
                omd = Metadata(scheme=self.scheme)
                omd.name = md.name
                omd.version = version
                odist = Distribution(omd)
                odist.locator = self
                result[version] = odist
                for info in infos:
                    url = info['url']
                    odist.download_urls.add(url)
                    odist.digests[url] = self._get_digest(info)
                    result['urls'].setdefault(version, set()).add(url)
                    result['digests'][url] = self._get_digest(info)
#            for info in urls:
#                md.source_url = info['url']
#                dist.digest = self._get_digest(info)
#                dist.locator = self
#            for info in urls:
#                url = info['url']
#                result['urls'].setdefault(md.version, set()).add(url)
#                result['digests'][url] = self._get_digest(info)
        except Exception as e:
            self.errors.put(text_type(e))
            logger.exception('JSON fetch failed: %s', e)
        return result

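# Example usage (an illustrative sketch; the endpoint simply follows the
# '%s/json' URL pattern used in _get_project above):
#
#     loc = PyPIJSONLocator('https://pypi.org/pypi/')
#     versions = loc.get_project('pip')   # version -> Distribution
#     dist = loc.locate('pip (>= 20.0)')
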
class Page(object):
    """
    This class represents a scraped HTML page.
    """
    # The following slightly hairy-looking regex just looks for the contents of
    # an anchor link, which has an attribute "href" either immediately preceded
    # or immediately followed by a "rel" attribute. The attribute values can be
    # declared with double quotes, single quotes or no quotes - which leads to
    # the length of the expression.
    _href = re.compile("""
(rel\\s*=\\s*(?:"(?P<rel1>[^"]*)"|'(?P<rel2>[^']*)'|(?P<rel3>[^>\\s\n]*))\\s+)?
href\\s*=\\s*(?:"(?P<url1>[^"]*)"|'(?P<url2>[^']*)'|(?P<url3>[^>\\s\n]*))
(\\s+rel\\s*=\\s*(?:"(?P<rel4>[^"]*)"|'(?P<rel5>[^']*)'|(?P<rel6>[^>\\s\n]*)))?
""", re.I | re.S | re.X)
    _base = re.compile(r"""<base\s+href\s*=\s*['"]?([^'">]+)""", re.I | re.S)

    def __init__(self, data, url):
        """
        Initialise an instance with the Unicode page contents and the URL they
        came from.
        """
        self.data = data
        self.base_url = self.url = url
        m = self._base.search(self.data)
        if m:
            self.base_url = m.group(1)

    _clean_re = re.compile(r'[^a-z0-9$&+,/:;=?@.#%_\\|-]', re.I)

    @cached_property
    def links(self):
        """
        Return the URLs of all the links on a page together with information
        about their "rel" attribute, for determining which ones to treat as
        downloads and which ones to queue for further scraping.
        """
        def clean(url):
            "Tidy up an URL."
            scheme, netloc, path, params, query, frag = urlparse(url)
            return urlunparse((scheme, netloc, quote(path),
                               params, query, frag))

        result = set()
        for match in self._href.finditer(self.data):
            d = match.groupdict('')
            rel = (d['rel1'] or d['rel2'] or d['rel3'] or
                   d['rel4'] or d['rel5'] or d['rel6'])
            url = d['url1'] or d['url2'] or d['url3']
            url = urljoin(self.base_url, url)
            url = unescape(url)
            url = self._clean_re.sub(lambda m: '%%%2x' % ord(m.group(0)), url)
            result.add((url, rel))
        # We sort the result, hoping to bring the most recent versions
        # to the front
        result = sorted(result, key=lambda t: t[0], reverse=True)
        return result

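# Example (an illustrative sketch, not part of the original module):
# extracting links from an HTML fragment.
#
#     html = '<a rel="download" href="https://example.com/foo-1.0.tar.gz">x</a>'
#     page = Page(html, 'https://example.com/simple/foo/')
#     # page.links == [('https://example.com/foo-1.0.tar.gz', 'download')]
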

class SimpleScrapingLocator(Locator):
    """
    A locator which scrapes HTML pages to locate downloads for a distribution.
    This runs multiple threads to do the I/O; performance is at least as good
    as pip's PackageFinder, which works in an analogous fashion.
    """

    # These are used to deal with various Content-Encoding schemes.
    decoders = {
        'deflate': zlib.decompress,
        'gzip': lambda b: gzip.GzipFile(fileobj=BytesIO(b)).read(),
        'none': lambda b: b,
    }

    def __init__(self, url, timeout=None, num_workers=10, **kwargs):
        """
        Initialise an instance.
        :param url: The root URL to use for scraping.
        :param timeout: The timeout, in seconds, to be applied to requests.
                        This defaults to ``None`` (no timeout specified).
        :param num_workers: The number of worker threads you want to do I/O.
                            This defaults to 10.
        :param kwargs: Passed to the superclass.
        """
        super(SimpleScrapingLocator, self).__init__(**kwargs)
        self.base_url = ensure_slash(url)
        self.timeout = timeout
        self._page_cache = {}
        self._seen = set()
        self._to_fetch = queue.Queue()
        self._bad_hosts = set()
        self.skip_externals = False
        self.num_workers = num_workers
        self._lock = threading.RLock()
        # See issue #45: we need to be resilient when the locator is used
        # in a thread, e.g. with concurrent.futures. We can't use self._lock
        # as it is for coordinating our internal threads - the ones created
        # in _prepare_threads.
        self._gplock = threading.RLock()
        self.platform_check = False  # See issue #112

    def _prepare_threads(self):
        """
        Threads are created only when get_project is called, and terminate
        before it returns. They are there primarily to parallelise I/O (i.e.
        fetching web pages).
        """
        self._threads = []
        for i in range(self.num_workers):
            t = threading.Thread(target=self._fetch)
            t.daemon = True
            t.start()
            self._threads.append(t)

    def _wait_threads(self):
        """
        Tell all the threads to terminate (by sending a sentinel value) and
        wait for them to do so.
        """
        # Note that you need two loops, since you can't say which
        # thread will get each sentinel
        for t in self._threads:
            self._to_fetch.put(None)  # sentinel
        for t in self._threads:
            t.join()
        self._threads = []

    def _get_project(self, name):
        result = {'urls': {}, 'digests': {}}
        with self._gplock:
            self.result = result
            self.project_name = name
            url = urljoin(self.base_url, '%s/' % quote(name))
            self._seen.clear()
            self._page_cache.clear()
            self._prepare_threads()
            try:
                logger.debug('Queueing %s', url)
                self._to_fetch.put(url)
                self._to_fetch.join()
            finally:
                self._wait_threads()
            del self.result
        return result

    platform_dependent = re.compile(r'\b(linux_(i\d86|x86_64|arm\w+)|'
                                    r'win(32|_amd64)|macosx_?\d+)\b', re.I)

    def _is_platform_dependent(self, url):
        """
        Does an URL refer to a platform-specific download?
        """
        return self.platform_dependent.search(url)

    def _process_download(self, url):
        """
        See if an URL is a suitable download for a project.

        If it is, register information in the result dictionary (for
        _get_project) about the specific version it's for.

        Note that the return value isn't actually used other than as a boolean
        value.
        """
        if self.platform_check and self._is_platform_dependent(url):
            info = None
        else:
            info = self.convert_url_to_download_info(url, self.project_name)
        logger.debug('process_download: %s -> %s', url, info)
        if info:
            with self._lock:  # needed because self.result is shared
                self._update_version_data(self.result, info)
        return info

    def _should_queue(self, link, referrer, rel):
        """
        Determine whether a link URL from a referring page and with a
        particular "rel" attribute should be queued for scraping.
        """
        scheme, netloc, path, _, _, _ = urlparse(link)
        if path.endswith(self.source_extensions + self.binary_extensions +
                         self.excluded_extensions):
            result = False
        elif self.skip_externals and not link.startswith(self.base_url):
            result = False
        elif not referrer.startswith(self.base_url):
            result = False
        elif rel not in ('homepage', 'download'):
            result = False
        elif scheme not in ('http', 'https', 'ftp'):
            result = False
        elif self._is_platform_dependent(link):
            result = False
        else:
            host = netloc.split(':', 1)[0]
            if host.lower() == 'localhost':
                result = False
            else:
                result = True
        logger.debug('should_queue: %s (%s) from %s -> %s', link, rel,
                     referrer, result)
        return result

    def _fetch(self):
        """
        Get a URL to fetch from the work queue, get the HTML page, examine its
        links for download candidates and candidates for further scraping.

        This is a handy method to run in a thread.
        """
        while True:
            url = self._to_fetch.get()
            try:
                if url:
                    page = self.get_page(url)
                    if page is None:  # e.g. after an error
                        continue
                    for link, rel in page.links:
                        if link not in self._seen:
                            try:
                                self._seen.add(link)
                                if (not self._process_download(link) and
                                        self._should_queue(link, url, rel)):
                                    logger.debug('Queueing %s from %s', link, url)
                                    self._to_fetch.put(link)
                            except MetadataInvalidError:  # e.g. invalid versions
                                pass
            except Exception as e:  # pragma: no cover
                self.errors.put(text_type(e))
            finally:
                # always do this, to avoid hangs :-)
                self._to_fetch.task_done()
            if not url:
                # logger.debug('Sentinel seen, quitting.')
                break

    def get_page(self, url):
        """
        Get the HTML for an URL, possibly from an in-memory cache.

        XXX TODO Note: this cache is never actually cleared. It's assumed that
        the data won't get stale over the lifetime of a locator instance (not
        necessarily true for the default_locator).
        """
        # http://peak.telecommunity.com/DevCenter/EasyInstall#package-index-api
        scheme, netloc, path, _, _, _ = urlparse(url)
        if scheme == 'file' and os.path.isdir(url2pathname(path)):
            url = urljoin(ensure_slash(url), 'index.html')

        if url in self._page_cache:
            result = self._page_cache[url]
            logger.debug('Returning %s from cache: %s', url, result)
        else:
            host = netloc.split(':', 1)[0]
            result = None
            if host in self._bad_hosts:
                logger.debug('Skipping %s due to bad host %s', url, host)
            else:
                req = Request(url, headers={'Accept-encoding': 'identity'})
                try:
                    logger.debug('Fetching %s', url)
                    resp = self.opener.open(req, timeout=self.timeout)
                    logger.debug('Fetched %s', url)
                    headers = resp.info()
                    content_type = headers.get('Content-Type', '')
                    if HTML_CONTENT_TYPE.match(content_type):
                        final_url = resp.geturl()
                        data = resp.read()
                        encoding = headers.get('Content-Encoding')
                        if encoding:
                            decoder = self.decoders[encoding]  # fail if not found
                            data = decoder(data)
                        encoding = 'utf-8'
                        m = CHARSET.search(content_type)
                        if m:
                            encoding = m.group(1)
                        try:
                            data = data.decode(encoding)
                        except UnicodeError:  # pragma: no cover
                            data = data.decode('latin-1')  # fallback
                        result = Page(data, final_url)
                        self._page_cache[final_url] = result
                except HTTPError as e:
                    if e.code != 404:
                        logger.exception('Fetch failed: %s: %s', url, e)
                except URLError as e:  # pragma: no cover
                    logger.exception('Fetch failed: %s: %s', url, e)
                    with self._lock:
                        self._bad_hosts.add(host)
                except Exception as e:  # pragma: no cover
                    logger.exception('Fetch failed: %s: %s', url, e)
                finally:
                    self._page_cache[url] = result  # even if None (failure)
        return result

    _distname_re = re.compile('<a href=[^>]*>([^<]+)<')

    def get_distribution_names(self):
        """
        Return all the distribution names known to this locator.
        """
        result = set()
        page = self.get_page(self.base_url)
        if not page:
            raise DistlibException('Unable to get %s' % self.base_url)
        for match in self._distname_re.finditer(page.data):
            result.add(match.group(1))
        return result

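# Example usage (an illustrative sketch): scraping a simple index. This
# performs real network I/O using the worker threads described above.
#
#     loc = SimpleScrapingLocator('https://pypi.org/simple/', timeout=3.0,
#                                 scheme='legacy')
#     dist = loc.locate('requests (>= 2.0)')
#     if dist is not None:
#         print(dist.name_and_version, dist.source_url)
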
836 """
837 This class locates distributions in a directory tree.
838 """
839
840 def __init__(self, path, **kwargs):
841 """
842 Initialise an instance.
843 :param path: The root of the directory tree to search.
844 :param kwargs: Passed to the superclass constructor,
845 except for:
846 * recursive - if True (the default), subdirectories are
847 recursed into. If False, only the top-level directory
848 is searched,
849 """
850 self.recursive = kwargs.pop('recursive', True)
851 super(DirectoryLocator, self).__init__(**kwargs)
852 path = os.path.abspath(path)
853 if not os.path.isdir(path): # pragma: no cover
854 raise DistlibException('Not a directory: %r' % path)
855 self.base_dir = path
856
857 def should_include(self, filename, parent):
858 """
859 Should a filename be considered as a candidate for a distribution
860 archive? As well as the filename, the directory which contains it
861 is provided, though not used by the current implementation.
862 """
864
865 def _get_project(self, name):
866 result = {'urls': {}, 'digests': {}}
867 for root, dirs, files in os.walk(self.base_dir):
868 for fn in files:
869 if self.should_include(fn, root):
870 fn = os.path.join(root, fn)
871 url = urlunparse(('file', '',
872 pathname2url(os.path.abspath(fn)),
873 '', '', ''))
874 info = self.convert_url_to_download_info(url, name)
875 if info:
876 self._update_version_data(result, info)
877 if not self.recursive:
878 break
879 return result
880
882 """
883 Return all the distribution names known to this locator.
884 """
885 result = set()
886 for root, dirs, files in os.walk(self.base_dir):
887 for fn in files:
888 if self.should_include(fn, root):
889 fn = os.path.join(root, fn)
890 url = urlunparse(('file', '',
891 pathname2url(os.path.abspath(fn)),
892 '', '', ''))
893 info = self.convert_url_to_download_info(url, None)
894 if info:
895 result.add(info['name'])
896 if not self.recursive:
897 break
898 return result
899
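# Example usage (an illustrative sketch; '/path/to/archives' is a placeholder):
#
#     loc = DirectoryLocator('/path/to/archives', recursive=False)
#     names = loc.get_distribution_names()
#     dist = loc.locate('foo (== 1.0)')
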
901 """
902 This locator uses special extended metadata (not available on PyPI) and is
903 the basis of performant dependency resolution in distlib. Other locators
904 require archive downloads before dependencies can be determined! As you
905 might imagine, that can be slow.
906 """
908 """
909 Return all the distribution names known to this locator.
910 """
911 raise NotImplementedError('Not available from this locator')
912
913 def _get_project(self, name):
914 result = {'urls': {}, 'digests': {}}
915 data = get_project_data(name)
916 if data:
917 for info in data.get('files', []):
918 if info['ptype'] != 'sdist' or info['pyversion'] != 'source':
919 continue
920 # We don't store summary in project metadata as it makes
921 # the data bigger for no benefit during dependency
922 # resolution
923 dist = make_dist(data['name'], info['version'],
924 summary=data.get('summary',
925 'Placeholder for summary'),
926 scheme=self.scheme)
927 md = dist.metadata
928 md.source_url = info['url']
929 # TODO SHA256 digest
930 if 'digest' in info and info['digest']:
931 dist.digest = ('md5', info['digest'])
932 md.dependencies = info.get('requirements', {})
933 dist.exports = info.get('exports', {})
934 result[dist.version] = dist
935 result['urls'].setdefault(dist.version, set()).add(info['url'])
936 return result
937
939 """
940 This locator finds installed distributions in a path. It can be useful for
941 adding to an :class:`AggregatingLocator`.
942 """
943 def __init__(self, distpath, **kwargs):
944 """
945 Initialise an instance.
946
947 :param distpath: A :class:`DistributionPath` instance to search.
948 """
949 super(DistPathLocator, self).__init__(**kwargs)
950 assert isinstance(distpath, DistributionPath)
951 self.distpath = distpath
952
953 def _get_project(self, name):
954 dist = self.distpath.get_distribution(name)
955 if dist is None:
956 result = {'urls': {}, 'digests': {}}
957 else:
958 result = {
959 dist.version: dist,
960 'urls': {dist.version: set([dist.source_url])},
961 'digests': {dist.version: set([None])}
962 }
963 return result
964
965
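# Example usage (an illustrative sketch): treating installed distributions as
# locatable candidates, e.g. as the first element of an AggregatingLocator.
#
#     from distlib.database import DistributionPath
#     loc = DistPathLocator(DistributionPath())  # searches sys.path
#     versions = loc.get_project('pip')          # version -> Distribution
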
967 """
968 This class allows you to chain and/or merge a list of locators.
969 """
970 def __init__(self, *locators, **kwargs):
971 """
972 Initialise an instance.
973
974 :param locators: The list of locators to search.
975 :param kwargs: Passed to the superclass constructor,
976 except for:
977 * merge - if False (the default), the first successful
978 search from any of the locators is returned. If True,
979 the results from all locators are merged (this can be
980 slow).
981 """
982 self.merge = kwargs.pop('merge', False)
983 self.locators = locators
984 super(AggregatingLocator, self).__init__(**kwargs)
985
986 def clear_cache(self):
987 super(AggregatingLocator, self).clear_cache()
988 for locator in self.locators:
990
991 def _set_scheme(self, value):
992 self._scheme_scheme = value
993 for locator in self.locators:
994 locator.scheme = value
995
996 scheme = property(Locator.scheme.fget, _set_scheme)
997
998 def _get_project(self, name):
999 result = {}
1000 for locator in self.locators:
1001 d = locator.get_project(name)
1002 if d:
1003 if self.merge:
1004 files = result.get('urls', {})
1005 digests = result.get('digests', {})
1006 # next line could overwrite result['urls'], result['digests']
1007 result.update(d)
1008 df = result.get('urls')
1009 if files and df:
1010 for k, v in files.items():
1011 if k in df:
1012 df[k] |= v
1013 else:
1014 df[k] = v
1015 dd = result.get('digests')
1016 if digests and dd:
1017 dd.update(digests)
1018 else:
1019 # See issue #18. If any dists are found and we're looking
1020 # for specific constraints, we only return something if
1021 # a match is found. For example, if a DirectoryLocator
1022 # returns just foo (1.0) while we're looking for
1023 # foo (>= 2.0), we'll pretend there was nothing there so
1024 # that subsequent locators can be queried. Otherwise we
1025 # would just return foo (1.0) which would then lead to a
1026 # failure to find foo (>= 2.0), because other locators
1027 # weren't searched. Note that this only matters when
1028 # merge=False.
1029 if self.matcher is None:
1030 found = True
1031 else:
1032 found = False
1033 for k in d:
1034 if self.matcher.match(k):
1035 found = True
1036 break
1037 if found:
1038 result = d
1039 break
1040 return result
1041
1043 """
1044 Return all the distribution names known to this locator.
1045 """
1046 result = set()
1047 for locator in self.locators:
1048 try:
1050 except NotImplementedError:
1051 pass
1052 return result
1053
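# Example usage (an illustrative sketch; '/path/to/archives' is a placeholder):
# chain a local directory with a remote index. With merge=False (the default),
# the first locator that yields a usable match wins.
#
#     loc = AggregatingLocator(
#         DirectoryLocator('/path/to/archives'),
#         SimpleScrapingLocator('https://pypi.org/simple/', timeout=3.0),
#         scheme='legacy')
#     dist = loc.locate('foo (>= 1.0)')
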

# We use a legacy scheme simply because most of the dists on PyPI use legacy
# versions which don't conform to PEP 440.
default_locator = AggregatingLocator(
    # JSONLocator(),  # don't use as PEP 426 is withdrawn
    SimpleScrapingLocator('https://pypi.org/simple/',
                          timeout=3.0),
    scheme='legacy')

locate = default_locator.locate

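# Example usage (an illustrative sketch) of the module-level convenience
# bound above:
#
#     from distlib.locators import locate
#     dist = locate('requests (>= 2.0)')
#     if dist is not None:
#         print(dist.name_and_version, dist.digest)
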
class DependencyFinder(object):
    """
    Locate dependencies for distributions.
    """

    def __init__(self, locator=None):
        """
        Initialise an instance, using the specified locator
        to locate distributions.
        """
        self.locator = locator or default_locator
        self.scheme = get_scheme(self.locator.scheme)

    def add_distribution(self, dist):
        """
        Add a distribution to the finder. This will update internal information
        about who provides what.
        :param dist: The distribution to add.
        """
        logger.debug('adding distribution %s', dist)
        name = dist.key
        self.dists_by_name[name] = dist
        self.dists[(name, dist.version)] = dist
        for p in dist.provides:
            name, version = parse_name_and_version(p)
            logger.debug('Add to provided: %s, %s, %s', name, version, dist)
            self.provided.setdefault(name, set()).add((version, dist))

    def remove_distribution(self, dist):
        """
        Remove a distribution from the finder. This will update internal
        information about who provides what.
        :param dist: The distribution to remove.
        """
        logger.debug('removing distribution %s', dist)
        name = dist.key
        del self.dists_by_name[name]
        del self.dists[(name, dist.version)]
        for p in dist.provides:
            name, version = parse_name_and_version(p)
            logger.debug('Remove from provided: %s, %s, %s', name, version, dist)
            s = self.provided[name]
            s.remove((version, dist))
            if not s:
                del self.provided[name]

    def get_matcher(self, reqt):
        """
        Get a version matcher for a requirement.
        :param reqt: The requirement
        :type reqt: str
        :return: A version matcher (an instance of
                 :class:`distlib.version.Matcher`).
        """
        try:
            matcher = self.scheme.matcher(reqt)
        except UnsupportedVersionError:  # pragma: no cover
            # XXX compat-mode if cannot read the version
            name = reqt.split()[0]
            matcher = self.scheme.matcher(name)
        return matcher

    def find_providers(self, reqt):
        """
        Find the distributions which can fulfill a requirement.

        :param reqt: The requirement.
        :type reqt: str
        :return: A set of distributions which can fulfill the requirement.
        """
        matcher = self.get_matcher(reqt)
        name = matcher.key  # case-insensitive
        result = set()
        provided = self.provided
        if name in provided:
            for version, provider in provided[name]:
                try:
                    match = matcher.match(version)
                except UnsupportedVersionError:
                    match = False

                if match:
                    result.add(provider)
                    break
        return result

    def try_to_replace(self, provider, other, problems):
        """
        Attempt to replace one provider with another. This is typically used
        when resolving dependencies from multiple sources, e.g. A requires
        (B >= 1.0) while C requires (B >= 1.1).

        For successful replacement, ``provider`` must meet all the requirements
        which ``other`` fulfills.

        :param provider: The provider we are trying to replace with.
        :param other: The provider we're trying to replace.
        :param problems: If False is returned, this will contain what
                         problems prevented replacement. This is currently
                         a tuple of the literal string 'cantreplace',
                         ``provider``, ``other`` and the set of requirements
                         that ``provider`` couldn't fulfill.
        :return: True if we can replace ``other`` with ``provider``, else
                 False.
        """
        rlist = self.reqts[other]
        unmatched = set()
        for s in rlist:
            matcher = self.get_matcher(s)
            if not matcher.match(provider.version):
                unmatched.add(s)
        if unmatched:
            # can't replace other with provider
            problems.add(('cantreplace', provider, other,
                          frozenset(unmatched)))
            result = False
        else:
            # can replace other with provider
            self.remove_distribution(other)
            del self.reqts[other]
            for s in rlist:
                self.reqts.setdefault(provider, set()).add(s)
            self.add_distribution(provider)
            result = True
        return result

    def find(self, requirement, meta_extras=None, prereleases=False):
        """
        Find a distribution and all distributions it depends on.

        :param requirement: The requirement specifying the distribution to
                            find, or a Distribution instance.
        :param meta_extras: A list of meta extras such as :test:, :build: and
                            so on.
        :param prereleases: If ``True``, allow pre-release versions to be
                            returned - otherwise, don't return prereleases
                            unless they're all that's available.

        Return a set of :class:`Distribution` instances and a set of
        problems.

        The distributions returned should be such that they have the
        :attr:`required` attribute set to ``True`` if they were
        from the ``requirement`` passed to ``find()``, and they have the
        :attr:`build_time_dependency` attribute set to ``True`` unless they
        are post-installation dependencies of the ``requirement``.

        Each problem is a tuple consisting of the string
        ``'unsatisfied'`` and the requirement which couldn't be satisfied
        by any distribution known to the locator.
        """

        self.provided = {}
        self.dists = {}
        self.dists_by_name = {}
        self.reqts = {}

        meta_extras = set(meta_extras or [])
        if ':*:' in meta_extras:
            meta_extras.remove(':*:')
            # :meta: and :run: are implicitly included
            meta_extras |= set([':test:', ':build:', ':dev:'])

        if isinstance(requirement, Distribution):
            dist = odist = requirement
            logger.debug('passed %s as requirement', odist)
        else:
            dist = odist = self.locator.locate(requirement,
                                               prereleases=prereleases)
            if dist is None:
                raise DistlibException('Unable to locate %r' % requirement)
            logger.debug('located %s', odist)
        dist.requested = True
        problems = set()
        todo = set([dist])
        install_dists = set([odist])
        while todo:
            dist = todo.pop()
            name = dist.key  # case-insensitive
            if name not in self.dists_by_name:
                self.add_distribution(dist)
            else:
                #import pdb; pdb.set_trace()
                other = self.dists_by_name[name]
                if other != dist:
                    self.try_to_replace(dist, other, problems)

            ireqts = dist.run_requires | dist.meta_requires
            sreqts = dist.build_requires
            ereqts = set()
            if meta_extras and dist in install_dists:
                for key in ('test', 'build', 'dev'):
                    e = ':%s:' % key
                    if e in meta_extras:
                        ereqts |= getattr(dist, '%s_requires' % key)
            all_reqts = ireqts | sreqts | ereqts
            for r in all_reqts:
                providers = self.find_providers(r)
                if not providers:
                    logger.debug('No providers found for %r', r)
                    provider = self.locator.locate(r, prereleases=prereleases)
                    # If no provider is found and we didn't consider
                    # prereleases, consider them now.
                    if provider is None and not prereleases:
                        provider = self.locator.locate(r, prereleases=True)
                    if provider is None:
                        logger.debug('Cannot satisfy %r', r)
                        problems.add(('unsatisfied', r))
                    else:
                        n, v = provider.key, provider.version
                        if (n, v) not in self.dists:
                            todo.add(provider)
                        providers.add(provider)
                        if r in ireqts and dist in install_dists:
                            install_dists.add(provider)
                            logger.debug('Adding %s to install_dists',
                                         provider.name_and_version)
                for p in providers:
                    name = p.key
                    if name not in self.dists_by_name:
                        self.reqts.setdefault(p, set()).add(r)
                    else:
                        other = self.dists_by_name[name]
                        if other != p:
                            # see if other can be replaced by p
                            self.try_to_replace(p, other, problems)

        dists = set(self.dists.values())
        for dist in dists:
            dist.build_time_dependency = dist not in install_dists
            if dist.build_time_dependency:
                logger.debug('%s is a build-time dependency only.',
                             dist.name_and_version)
        logger.debug('find done for %s', odist)
        return dists, problems
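
# Example usage (an illustrative sketch): resolve a requirement and its
# dependency closure with the default locator. This performs network I/O.
#
#     finder = DependencyFinder()
#     dists, problems = finder.find('requests (>= 2.0)')
#     for d in dists:
#         print(d.name_and_version, d.build_time_dependency)
#     for problem in problems:
#         print('problem:', problem)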