From ad93d54bf3aa83ccafb2159c40573fcb252f8bfc Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Sat, 20 Jul 2024 14:13:50 -0400 Subject: [PATCH 1/5] cache "concrete" dists by Distribution instead of InstallRequirement --- news/12863.trivial.rst | 1 + src/pip/_internal/distributions/__init__.py | 5 + src/pip/_internal/distributions/base.py | 19 +++- src/pip/_internal/distributions/installed.py | 14 ++- src/pip/_internal/distributions/sdist.py | 20 +++- src/pip/_internal/distributions/wheel.py | 4 +- src/pip/_internal/metadata/base.py | 21 ++++ .../_internal/metadata/importlib/_dists.py | 19 +++- src/pip/_internal/metadata/importlib/_envs.py | 8 +- src/pip/_internal/metadata/pkg_resources.py | 15 ++- src/pip/_internal/operations/prepare.py | 47 +++++---- src/pip/_internal/req/req_install.py | 96 ++++++++++++++----- .../resolution/resolvelib/resolver.py | 2 +- .../metadata/test_metadata_pkg_resources.py | 1 + tests/unit/test_req.py | 31 +++++- 15 files changed, 231 insertions(+), 72 deletions(-) create mode 100644 news/12863.trivial.rst diff --git a/news/12863.trivial.rst b/news/12863.trivial.rst new file mode 100644 index 00000000000..dc36a82a0df --- /dev/null +++ b/news/12863.trivial.rst @@ -0,0 +1 @@ +Cache "concrete" dists by ``Distribution`` instead of ``InstallRequirement``. diff --git a/src/pip/_internal/distributions/__init__.py b/src/pip/_internal/distributions/__init__.py index 9a89a838b9a..f6089daf40d 100644 --- a/src/pip/_internal/distributions/__init__.py +++ b/src/pip/_internal/distributions/__init__.py @@ -1,4 +1,5 @@ from pip._internal.distributions.base import AbstractDistribution +from pip._internal.distributions.installed import InstalledDistribution from pip._internal.distributions.sdist import SourceDistribution from pip._internal.distributions.wheel import WheelDistribution from pip._internal.req.req_install import InstallRequirement @@ -8,6 +9,10 @@ def make_distribution_for_install_requirement( install_req: InstallRequirement, ) -> AbstractDistribution: """Returns a Distribution for the given InstallRequirement""" + # Only pre-installed requirements will have a .satisfied_by dist. + if install_req.satisfied_by: + return InstalledDistribution(install_req) + # Editable requirements will always be source distributions. They use the # legacy logic until we create a modern standard for them. if install_req.editable: diff --git a/src/pip/_internal/distributions/base.py b/src/pip/_internal/distributions/base.py index 6e4d0c91a90..0a132b88f98 100644 --- a/src/pip/_internal/distributions/base.py +++ b/src/pip/_internal/distributions/base.py @@ -37,11 +37,17 @@ def build_tracker_id(self) -> Optional[str]: If None, then this dist has no work to do in the build tracker, and ``.prepare_distribution_metadata()`` will not be called.""" - raise NotImplementedError() + ... @abc.abstractmethod def get_metadata_distribution(self) -> BaseDistribution: - raise NotImplementedError() + """Generate a concrete ``BaseDistribution`` instance for this artifact. + + The implementation should also cache the result with + ``self.req.cache_concrete_dist()`` so the distribution is available to other + users of the ``InstallRequirement``. This method is not called within the build + tracker context, so it should not identify any new setup requirements.""" + ... 
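+    # A sketch of the intended call sequence for these two hooks, mirroring +    # ``pip._internal.operations.prepare._get_prepared_distribution()``: +    # +    #     abstract_dist = make_distribution_for_install_requirement(req) +    #     tracker_id = abstract_dist.build_tracker_id +    #     if tracker_id is not None: +    #         with build_tracker.track(req, tracker_id): +    #             abstract_dist.prepare_distribution_metadata( +    #                 finder, build_isolation, check_build_deps +    #             ) +    #     return abstract_dist.get_metadata_distribution() +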
@abc.abstractmethod def prepare_distribution_metadata( @@ -50,4 +56,11 @@ def prepare_distribution_metadata( build_isolation: bool, check_build_deps: bool, ) -> None: - raise NotImplementedError() + """Generate the information necessary to extract metadata from the artifact. + + This method will be executed within the context of ``BuildTracker#track()``, so + it needs to fully identify any setup requirements so they can be added to the + same active set of tracked builds, while ``.get_metadata_distribution()`` takes + care of generating and caching the ``BaseDistribution`` to expose to the rest of + the resolve.""" + ... diff --git a/src/pip/_internal/distributions/installed.py b/src/pip/_internal/distributions/installed.py index ab8d53be740..83e99fca9ca 100644 --- a/src/pip/_internal/distributions/installed.py +++ b/src/pip/_internal/distributions/installed.py @@ -1,9 +1,11 @@ -from typing import Optional +from typing import TYPE_CHECKING, Optional from pip._internal.distributions.base import AbstractDistribution -from pip._internal.index.package_finder import PackageFinder from pip._internal.metadata import BaseDistribution +if TYPE_CHECKING: + from pip._internal.index.package_finder import PackageFinder + class InstalledDistribution(AbstractDistribution): """Represents an installed package. @@ -17,12 +19,14 @@ def build_tracker_id(self) -> Optional[str]: return None def get_metadata_distribution(self) -> BaseDistribution: - assert self.req.satisfied_by is not None, "not actually installed" - return self.req.satisfied_by + dist = self.req.satisfied_by + assert dist is not None, "not actually installed" + self.req.cache_concrete_dist(dist) + return dist def prepare_distribution_metadata( self, - finder: PackageFinder, + finder: "PackageFinder", build_isolation: bool, check_build_deps: bool, ) -> None: diff --git a/src/pip/_internal/distributions/sdist.py b/src/pip/_internal/distributions/sdist.py index 28ea5cea16c..3e4a3e8d371 100644 --- a/src/pip/_internal/distributions/sdist.py +++ b/src/pip/_internal/distributions/sdist.py @@ -1,10 +1,10 @@ import logging -from typing import TYPE_CHECKING, Iterable, Optional, Set, Tuple +from typing import TYPE_CHECKING, Iterable, Set, Tuple from pip._internal.build_env import BuildEnvironment from pip._internal.distributions.base import AbstractDistribution from pip._internal.exceptions import InstallationError -from pip._internal.metadata import BaseDistribution +from pip._internal.metadata import BaseDistribution, get_directory_distribution from pip._internal.utils.subprocess import runner_with_spinner_message if TYPE_CHECKING: @@ -21,13 +21,19 @@ class SourceDistribution(AbstractDistribution): """ @property - def build_tracker_id(self) -> Optional[str]: + def build_tracker_id(self) -> str: """Identify this requirement uniquely by its link.""" assert self.req.link return self.req.link.url_without_fragment def get_metadata_distribution(self) -> BaseDistribution: - return self.req.get_dist() + assert ( + self.req.metadata_directory + ), "Set as part of .prepare_distribution_metadata()" + dist = get_directory_distribution(self.req.metadata_directory) + self.req.cache_concrete_dist(dist) + self.req.validate_sdist_metadata() + return dist def prepare_distribution_metadata( self, @@ -66,7 +72,11 @@ def prepare_distribution_metadata( self._raise_conflicts("the backend dependencies", conflicting) if missing: self._raise_missing_reqs(missing) - self.req.prepare_metadata() + + # NB: we must still call .cache_concrete_dist() and .validate_sdist_metadata() + # 
before the InstallRequirement itself has been updated with the metadata from + # this directory! + self.req.prepare_metadata_directory() def _prepare_build_backend(self, finder: "PackageFinder") -> None: # Isolate in a BuildEnvironment and install the build-time diff --git a/src/pip/_internal/distributions/wheel.py b/src/pip/_internal/distributions/wheel.py index bfadd39dcb7..e75c7910379 100644 --- a/src/pip/_internal/distributions/wheel.py +++ b/src/pip/_internal/distributions/wheel.py @@ -31,7 +31,9 @@ def get_metadata_distribution(self) -> BaseDistribution: assert self.req.local_file_path, "Set as part of preparation during download" assert self.req.name, "Wheels are never unnamed" wheel = FilesystemWheel(self.req.local_file_path) - return get_wheel_distribution(wheel, canonicalize_name(self.req.name)) + dist = get_wheel_distribution(wheel, canonicalize_name(self.req.name)) + self.req.cache_concrete_dist(dist) + return dist def prepare_distribution_metadata( self, diff --git a/src/pip/_internal/metadata/base.py b/src/pip/_internal/metadata/base.py index 9eabcdb278b..7e8c5b6f177 100644 --- a/src/pip/_internal/metadata/base.py +++ b/src/pip/_internal/metadata/base.py @@ -97,6 +97,15 @@ class RequiresEntry(NamedTuple): class BaseDistribution(Protocol): + @property + def is_concrete(self) -> bool: + """Whether the distribution really exists somewhere on disk. + + If this is false, it has been synthesized from metadata, e.g. via + ``.from_metadata_file_contents()``, or ``.from_wheel()`` against + a ``MemoryWheel``.""" + raise NotImplementedError() + @classmethod def from_directory(cls, directory: str) -> "BaseDistribution": """Load the distribution from a metadata directory. @@ -667,6 +676,10 @@ def iter_installed_distributions( class Wheel(Protocol): location: str + @property + def is_concrete(self) -> bool: + raise NotImplementedError() + def as_zipfile(self) -> zipfile.ZipFile: raise NotImplementedError() @@ -675,6 +688,10 @@ class FilesystemWheel(Wheel): def __init__(self, location: str) -> None: self.location = location + @property + def is_concrete(self) -> bool: + return True + def as_zipfile(self) -> zipfile.ZipFile: return zipfile.ZipFile(self.location, allowZip64=True) @@ -684,5 +701,9 @@ def __init__(self, location: str, stream: IO[bytes]) -> None: self.location = location self.stream = stream + @property + def is_concrete(self) -> bool: + return False + def as_zipfile(self) -> zipfile.ZipFile: return zipfile.ZipFile(self.stream, allowZip64=True) diff --git a/src/pip/_internal/metadata/importlib/_dists.py b/src/pip/_internal/metadata/importlib/_dists.py index 36cd326232e..63b5ff1a012 100644 --- a/src/pip/_internal/metadata/importlib/_dists.py +++ b/src/pip/_internal/metadata/importlib/_dists.py @@ -102,16 +102,22 @@ def __init__( dist: importlib.metadata.Distribution, info_location: Optional[BasePath], installed_location: Optional[BasePath], + concrete: bool, ) -> None: self._dist = dist self._info_location = info_location self._installed_location = installed_location + self._concrete = concrete + + @property + def is_concrete(self) -> bool: + return self._concrete @classmethod def from_directory(cls, directory: str) -> BaseDistribution: info_location = pathlib.Path(directory) dist = importlib.metadata.Distribution.at(info_location) - return cls(dist, info_location, info_location.parent) + return cls(dist, info_location, info_location.parent, concrete=True) @classmethod def from_metadata_file_contents( @@ -128,7 +134,7 @@ def from_metadata_file_contents( 
metadata_path.write_bytes(metadata_contents) # Construct dist pointing to the newly created directory. dist = importlib.metadata.Distribution.at(metadata_path.parent) - return cls(dist, metadata_path.parent, None) + return cls(dist, metadata_path.parent, None, concrete=False) @classmethod def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: @@ -137,7 +143,14 @@ def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: dist = WheelDistribution.from_zipfile(zf, name, wheel.location) except zipfile.BadZipFile as e: raise InvalidWheel(wheel.location, name) from e - return cls(dist, dist.info_location, pathlib.PurePosixPath(wheel.location)) + except UnsupportedWheel as e: + raise UnsupportedWheel(f"{name} has an invalid wheel, {e}") + return cls( + dist, + dist.info_location, + pathlib.PurePosixPath(wheel.location), + concrete=wheel.is_concrete, + ) @property def location(self) -> Optional[str]: diff --git a/src/pip/_internal/metadata/importlib/_envs.py b/src/pip/_internal/metadata/importlib/_envs.py index 70cb7a6009a..b7355d4935c 100644 --- a/src/pip/_internal/metadata/importlib/_envs.py +++ b/src/pip/_internal/metadata/importlib/_envs.py @@ -80,7 +80,7 @@ def find(self, location: str) -> Iterator[BaseDistribution]: installed_location: Optional[BasePath] = None else: installed_location = info_location.parent - yield Distribution(dist, info_location, installed_location) + yield Distribution(dist, info_location, installed_location, concrete=True) def find_linked(self, location: str) -> Iterator[BaseDistribution]: """Read location in egg-link files and return distributions in there. @@ -104,7 +104,7 @@ def find_linked(self, location: str) -> Iterator[BaseDistribution]: continue target_location = str(path.joinpath(target_rel)) for dist, info_location in self._find_impl(target_location): - yield Distribution(dist, info_location, path) + yield Distribution(dist, info_location, path, concrete=True) def _find_eggs_in_dir(self, location: str) -> Iterator[BaseDistribution]: from pip._vendor.pkg_resources import find_distributions @@ -116,7 +116,7 @@ def _find_eggs_in_dir(self, location: str) -> Iterator[BaseDistribution]: if not entry.name.endswith(".egg"): continue for dist in find_distributions(entry.path): - yield legacy.Distribution(dist) + yield legacy.Distribution(dist, concrete=True) def _find_eggs_in_zip(self, location: str) -> Iterator[BaseDistribution]: from pip._vendor.pkg_resources import find_eggs_in_zip @@ -128,7 +128,7 @@ def _find_eggs_in_zip(self, location: str) -> Iterator[BaseDistribution]: except zipimport.ZipImportError: return for dist in find_eggs_in_zip(importer, location): - yield legacy.Distribution(dist) + yield legacy.Distribution(dist, concrete=True) def find_eggs(self, location: str) -> Iterator[BaseDistribution]: """Find eggs in a location. diff --git a/src/pip/_internal/metadata/pkg_resources.py b/src/pip/_internal/metadata/pkg_resources.py index 4ea84f93a6f..428b1f4a7dd 100644 --- a/src/pip/_internal/metadata/pkg_resources.py +++ b/src/pip/_internal/metadata/pkg_resources.py @@ -81,8 +81,9 @@ def run_script(self, script_name: str, namespace: str) -> None: class Distribution(BaseDistribution): - def __init__(self, dist: pkg_resources.Distribution) -> None: + def __init__(self, dist: pkg_resources.Distribution, concrete: bool) -> None: self._dist = dist + self._concrete = concrete # This is populated lazily, to avoid loading metadata for all possible # distributions eagerly. 
self.__extra_mapping: Optional[Mapping[NormalizedName, str]] = None @@ -96,6 +97,10 @@ def _extra_mapping(self) -> Mapping[NormalizedName, str]: return self.__extra_mapping + @property + def is_concrete(self) -> bool: + return self._concrete + @classmethod def from_directory(cls, directory: str) -> BaseDistribution: dist_dir = directory.rstrip(os.sep) @@ -114,7 +119,7 @@ def from_directory(cls, directory: str) -> BaseDistribution: dist_name = os.path.splitext(dist_dir_name)[0].split("-")[0] dist = dist_cls(base_dir, project_name=dist_name, metadata=metadata) - return cls(dist) + return cls(dist, concrete=True) @classmethod def from_metadata_file_contents( @@ -131,7 +136,7 @@ def from_metadata_file_contents( metadata=InMemoryMetadata(metadata_dict, filename), project_name=project_name, ) - return cls(dist) + return cls(dist, concrete=False) @classmethod def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: @@ -152,7 +157,7 @@ def from_wheel(cls, wheel: Wheel, name: str) -> BaseDistribution: metadata=InMemoryMetadata(metadata_dict, wheel.location), project_name=name, ) - return cls(dist) + return cls(dist, concrete=wheel.is_concrete) @property def location(self) -> Optional[str]: @@ -264,7 +269,7 @@ def from_paths(cls, paths: Optional[List[str]]) -> BaseEnvironment: def _iter_distributions(self) -> Iterator[BaseDistribution]: for dist in self._ws: - yield Distribution(dist) + yield Distribution(dist, concrete=True) def _search_distribution(self, name: str) -> Optional[BaseDistribution]: """Find a distribution matching the ``name`` in the environment. diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index e6aa3447200..7ad301a3c06 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -14,7 +14,6 @@ from pip._vendor.packaging.utils import canonicalize_name from pip._internal.distributions import make_distribution_for_install_requirement -from pip._internal.distributions.installed import InstalledDistribution from pip._internal.exceptions import ( DirectoryUrlHashUnsupported, HashMismatch, @@ -190,6 +189,8 @@ def _check_download_dir( ) -> Optional[str]: """Check download_dir for previously downloaded file with correct hash If a correct file is found return its path else None + + If a file is found at the given path, but with an invalid hash, the file is deleted. """ download_path = os.path.join(download_dir, link.filename) @@ -520,7 +521,9 @@ def prepare_linked_requirement( # The file is not available, attempt to fetch only metadata metadata_dist = self._fetch_metadata_only(req) if metadata_dist is not None: - req.needs_more_preparation = True + # These reqs now have the dependency information from the downloaded + # metadata, without having downloaded the actual dist at all. + req.cache_virtual_metadata_only_dist(metadata_dist) return metadata_dist # None of the optimizations worked, fully prepare the requirement @@ -530,27 +533,27 @@ def prepare_linked_requirements_more( self, reqs: Iterable[InstallRequirement], parallel_builds: bool = False ) -> None: """Prepare linked requirements more, if needed.""" - reqs = [req for req in reqs if req.needs_more_preparation] + partially_downloaded_reqs: List[InstallRequirement] = [] for req in reqs: + if req.is_concrete: + continue + # Determine if any of these requirements were already downloaded. 
if self.download_dir is not None and req.link.is_wheel: hashes = self._get_linked_req_hashes(req) - file_path = _check_download_dir(req.link, self.download_dir, hashes) + # If the file is there, but doesn't match the hash, delete it and print + # a warning. We will be downloading it again via + # partially_downloaded_reqs. + file_path = _check_download_dir( + req.link, self.download_dir, hashes, warn_on_hash_mismatch=True + ) if file_path is not None: + # If the hash does match, then we still need to generate a concrete + # dist, but we don't have to download the wheel again. self._downloaded[req.link.url] = file_path - req.needs_more_preparation = False - # Prepare requirements we found were already downloaded for some - # reason. The other downloads will be completed separately. - partially_downloaded_reqs: List[InstallRequirement] = [] - for req in reqs: - if req.needs_more_preparation: - partially_downloaded_reqs.append(req) - else: - self._prepare_linked_requirement(req, parallel_builds) + partially_downloaded_reqs.append(req) - # TODO: separate this part out from RequirementPreparer when the v1 - # resolver can be removed! self._complete_partial_requirements( partially_downloaded_reqs, parallel_builds=parallel_builds, @@ -651,6 +654,7 @@ def _prepare_linked_requirement( def save_linked_requirement(self, req: InstallRequirement) -> None: assert self.download_dir is not None assert req.link is not None + assert req.is_concrete link = req.link if link.is_vcs or (link.is_existing_dir() and req.editable): # Make a .zip of the source_dir we already created. @@ -705,6 +709,8 @@ def prepare_editable_requirement( req.check_if_exists(self.use_user_site) + # This should already have been populated by the preparation of the source dist. + assert req.is_concrete return dist def prepare_installed_requirement( @@ -729,4 +735,13 @@ def prepare_installed_requirement( "completely repeatable environment, install into an " "empty virtualenv." 
) - return InstalledDistribution(req).get_metadata_distribution() + dist = _get_prepared_distribution( + req, + self.build_tracker, + self.finder, + self.build_isolation, + self.check_build_deps, + ) + + assert req.is_concrete + return dist diff --git a/src/pip/_internal/req/req_install.py b/src/pip/_internal/req/req_install.py index 834bc513356..2710142f26a 100644 --- a/src/pip/_internal/req/req_install.py +++ b/src/pip/_internal/req/req_install.py @@ -7,7 +7,16 @@ import zipfile from optparse import Values from pathlib import Path -from typing import Any, Collection, Dict, Iterable, List, Optional, Sequence, Union +from typing import ( + TYPE_CHECKING, + Collection, + Dict, + Iterable, + List, + Optional, + Sequence, + Union, +) from pip._vendor.packaging.markers import Marker from pip._vendor.packaging.requirements import Requirement @@ -23,10 +32,7 @@ from pip._internal.metadata import ( BaseDistribution, get_default_environment, - get_directory_distribution, - get_wheel_distribution, ) -from pip._internal.metadata.base import FilesystemWheel from pip._internal.models.direct_url import DirectUrl from pip._internal.models.link import Link from pip._internal.operations.build.metadata import generate_metadata @@ -59,6 +65,9 @@ from pip._internal.utils.virtualenv import running_under_virtualenv from pip._internal.vcs import vcs +if TYPE_CHECKING: + import email.message + logger = logging.getLogger(__name__) @@ -150,6 +159,7 @@ def __init__( self.hash_options = hash_options if hash_options else {} self.config_settings = config_settings # Set to True after successful preparation of this requirement + # TODO: this is only used in the legacy resolver: remove this! self.prepared = False # User supplied requirement are explicitly requested for installation # by the user via CLI arguments or requirements files, as opposed to, @@ -191,8 +201,11 @@ def __init__( ) self.use_pep517 = True - # This requirement needs more preparation before it can be built - self.needs_more_preparation = False + # When a dist is computed for this requirement, cache it here so it's visible + # everywhere within pip and isn't computed more than once. This may be + # a "virtual" dist without a physical location on the filesystem, or + # a "concrete" dist which has been fully downloaded. + self._dist: Optional[BaseDistribution] = None # This requirement needs to be unpacked before it can be installed. self._archive_source: Optional[Path] = None @@ -550,11 +563,11 @@ def isolated_editable_sanity_check(self) -> None: f"Consider using a build backend that supports PEP 660." ) - def prepare_metadata(self) -> None: + def prepare_metadata_directory(self) -> None: """Ensure that project metadata is available. - Under PEP 517 and PEP 660, call the backend hook to prepare the metadata. - Under legacy processing, call setup.py egg-info. + Under PEP 517 and PEP 660, call the backend hook to prepare the metadata + directory. Under legacy processing, call setup.py egg-info. """ assert self.source_dir, f"No source dir for {self}" details = self.name or f"from {self.link}" @@ -586,6 +599,8 @@ def prepare_metadata(self) -> None: details=details, ) + def validate_sdist_metadata(self) -> None: + """Ensure that we have a dist, and ensure it corresponds to expectations.""" # Act on the newly generated metadata, based on the name and version. 
if not self.name: self._set_requirement() @@ -595,25 +610,54 @@ def prepare_metadata(self) -> None: self.assert_source_matches_version() @property - def metadata(self) -> Any: - if not hasattr(self, "_metadata"): - self._metadata = self.get_dist().metadata - - return self._metadata + def metadata(self) -> "email.message.Message": + return self.get_dist().metadata def get_dist(self) -> BaseDistribution: - if self.metadata_directory: - return get_directory_distribution(self.metadata_directory) - elif self.local_file_path and self.is_wheel: - assert self.req is not None - return get_wheel_distribution( - FilesystemWheel(self.local_file_path), - canonicalize_name(self.req.name), - ) - raise AssertionError( - f"InstallRequirement {self} has no metadata directory and no wheel: " - f"can't make a distribution." - ) + """Retrieve the dist resolved from this requirement. + + :raises AssertionError: if the resolver has not yet been executed. + """ + if self._dist is None: + raise AssertionError(f"{self!r} has no dist associated.") + return self._dist + + def cache_virtual_metadata_only_dist(self, dist: BaseDistribution) -> None: + """Associate a "virtual" metadata-only dist to this requirement. + + This dist cannot be installed, but it can be used to complete the resolve + process. + + :raises AssertionError: if a dist has already been associated. + :raises AssertionError: if the provided dist is "concrete", i.e. exists + somewhere on the filesystem. + """ + assert self._dist is None, self + assert not dist.is_concrete, dist + self._dist = dist + + def cache_concrete_dist(self, dist: BaseDistribution) -> None: + """Associate a "concrete" dist to this requirement. + + A concrete dist exists somewhere on the filesystem and can be installed. + + :raises AssertionError: if a concrete dist has already been associated. + :raises AssertionError: if the provided dist is not concrete. + """ + if self._dist is not None: + # If we set a dist twice for the same requirement, we must be hydrating + # a concrete dist for what was previously virtual. This will occur in the + # case of `install --dry-run` when PEP 658 metadata is available. + + # TODO(#12186): avoid setting dist twice! 
+ # assert not self._dist.is_concrete + pass + assert dist.is_concrete + self._dist = dist + + @property + def is_concrete(self) -> bool: + return self._dist is not None and self._dist.is_concrete def assert_source_matches_version(self) -> None: assert self.source_dir, f"No source dir for {self}" diff --git a/src/pip/_internal/resolution/resolvelib/resolver.py b/src/pip/_internal/resolution/resolvelib/resolver.py index c12beef0b2a..c22d1707b5e 100644 --- a/src/pip/_internal/resolution/resolvelib/resolver.py +++ b/src/pip/_internal/resolution/resolvelib/resolver.py @@ -179,7 +179,7 @@ def resolve( self.factory.preparer.prepare_linked_requirements_more(reqs) for req in reqs: req.prepared = True - req.needs_more_preparation = False + assert req.is_concrete return req_set def get_installation_order( diff --git a/tests/unit/metadata/test_metadata_pkg_resources.py b/tests/unit/metadata/test_metadata_pkg_resources.py index 6044c14e4ca..195442cbe1e 100644 --- a/tests/unit/metadata/test_metadata_pkg_resources.py +++ b/tests/unit/metadata/test_metadata_pkg_resources.py @@ -104,6 +104,7 @@ def test_wheel_metadata_works() -> None: metadata=InMemoryMetadata({"METADATA": metadata.as_bytes()}, ""), project_name=name, ), + concrete=False, ) assert name == dist.canonical_name == dist.raw_name diff --git a/tests/unit/test_req.py b/tests/unit/test_req.py index 8a95c058706..5757850c6b3 100644 --- a/tests/unit/test_req.py +++ b/tests/unit/test_req.py @@ -23,6 +23,7 @@ PreviousBuildDirError, ) from pip._internal.index.package_finder import PackageFinder +from pip._internal.metadata import get_metadata_distribution from pip._internal.models.direct_url import ArchiveInfo, DirectUrl, DirInfo, VcsInfo from pip._internal.models.link import Link from pip._internal.network.session import PipSession @@ -144,7 +145,11 @@ def test_no_reuse_existing_build_dir(self, data: TestData) -> None: ): resolver.resolve(reqset.all_requirements, True) - def test_environment_marker_extras(self, data: TestData) -> None: + def test_environment_marker_extras( + self, + data: TestData, + monkeypatch: pytest.MonkeyPatch, + ) -> None: """ Test that the environment marker extras are used with non-wheel installs. 
@@ -154,6 +159,13 @@ def test_environment_marker_extras(self, data: TestData) -> None: os.fspath(data.packages.joinpath("LocalEnvironMarker")), ) req.user_supplied = True + + def cache_concrete_dist(self, dist): # type: ignore[no-untyped-def] + self._dist = dist + + monkeypatch.setattr( + req, "cache_concrete_dist", partial(cache_concrete_dist, req) + ) reqset.add_unnamed_requirement(req) finder = make_test_finder(find_links=[data.find_links]) with self._basic_resolver(finder) as resolver: @@ -499,12 +511,23 @@ def test_download_info_local_dir(self, data: TestData) -> None: assert req.download_info.url.startswith("file://") assert isinstance(req.download_info.info, DirInfo) - def test_download_info_local_editable_dir(self, data: TestData) -> None: + def test_download_info_local_editable_dir( + self, + data: TestData, + monkeypatch: pytest.MonkeyPatch, + ) -> None: """Test that download_info is set for requirements from a local editable dir.""" finder = make_test_finder() with self._basic_resolver(finder) as resolver: ireq_url = data.packages.joinpath("FSPkg").as_uri() ireq = get_processed_req_from_line(f"-e {ireq_url}#egg=FSPkg") + + def cache_concrete_dist(self, dist): # type: ignore[no-untyped-def] + self._dist = dist + + monkeypatch.setattr( + ireq, "cache_concrete_dist", partial(cache_concrete_dist, ireq) + ) reqset = resolver.resolve([ireq], True) assert len(reqset.all_requirements) == 1 req = reqset.all_requirements[0] @@ -909,7 +932,9 @@ def test_mismatched_versions(caplog: pytest.LogCaptureFixture) -> None: metadata = email.message.Message() metadata["name"] = "simplewheel" metadata["version"] = "1.0" - req._metadata = metadata + req._dist = get_metadata_distribution( + bytes(metadata), "simplewheel-1.0.whl", "simplewheel" + ) req.assert_source_matches_version() assert caplog.records[-1].message == ( From 52133baf6c9d41f6264f64e1e71e93116ac75b4b Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 22 Jul 2024 19:09:04 -0400 Subject: [PATCH 2/5] refactor much of RequirementPreparer and add lengthy docs --- news/12871.trivial.rst | 1 + src/pip/_internal/distributions/__init__.py | 9 +- src/pip/_internal/operations/prepare.py | 324 +++++++++++--------- 3 files changed, 181 insertions(+), 153 deletions(-) create mode 100644 news/12871.trivial.rst diff --git a/news/12871.trivial.rst b/news/12871.trivial.rst new file mode 100644 index 00000000000..186e2bcb3c4 --- /dev/null +++ b/news/12871.trivial.rst @@ -0,0 +1 @@ +Refactor much of ``RequirementPreparer`` to avoid duplicated code paths for metadata-only requirements. diff --git a/src/pip/_internal/distributions/__init__.py b/src/pip/_internal/distributions/__init__.py index f6089daf40d..a870b227d1e 100644 --- a/src/pip/_internal/distributions/__init__.py +++ b/src/pip/_internal/distributions/__init__.py @@ -8,7 +8,14 @@ def make_distribution_for_install_requirement( install_req: InstallRequirement, ) -> AbstractDistribution: - """Returns a Distribution for the given InstallRequirement""" + """Returns an AbstractDistribution for the given InstallRequirement. + + As AbstractDistribution only covers installable artifacts, this method may only be + invoked at the conclusion of a resolve, when the RequirementPreparer has downloaded + the file corresponding to the resolved dist. Commands which intend to consume + metadata-only resolves without downloading should not call this method or + consume AbstractDistribution objects. 
+ """ # Only pre-installed requirements will have a .satisfied_by dist. if install_req.satisfied_by: return InstalledDistribution(install_req) diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index 7ad301a3c06..17e0c9c28da 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -9,7 +9,7 @@ import shutil from dataclasses import dataclass from pathlib import Path -from typing import Dict, Iterable, List, Optional +from typing import Dict, Iterable, Optional from pip._vendor.packaging.utils import canonicalize_name @@ -63,7 +63,17 @@ def _get_prepared_distribution( build_isolation: bool, check_build_deps: bool, ) -> BaseDistribution: - """Prepare a distribution for installation.""" + """Prepare a distribution for installation. + + This method will only be called by the preparer at the end of the resolve, and only + for commands which need installable artifacts (not just resolved metadata). If the + dist was previously metadata-only, the preparer must have downloaded the file + corresponding to the dist and set ``req.local_file_path``. + + This method will execute ``req.cache_concrete_dist()``, so that after invocation, + ``req.is_concrete`` will be True, because ``req.get_dist()`` will return a concrete + ``Distribution``. + """ abstract_dist = make_distribution_for_install_requirement(req) tracker_id = abstract_dist.build_tracker_id if tracker_id is not None: @@ -305,11 +315,7 @@ def _ensure_link_req_src_dir( self, req: InstallRequirement, parallel_builds: bool ) -> None: """Ensure source_dir of a linked InstallRequirement.""" - # Since source_dir is only set for editable requirements. - if req.link.is_wheel: - # We don't need to unpack wheels, so no need for a source - # directory. - return + assert not req.link.is_wheel assert req.source_dir is None if req.link.is_existing_dir(): # build local directories in-tree @@ -356,6 +362,47 @@ def _get_linked_req_hashes(self, req: InstallRequirement) -> Hashes: # showing the user what the hash should be. return req.hashes(trust_internet=False) or MissingHashes() + def _rewrite_link_and_hashes_for_cached_wheel( + self, req: InstallRequirement, hashes: Hashes + ) -> Optional[Hashes]: + """Check the hash of the requirement's source eagerly and rewrite its link. + + ``req.link`` is unconditionally rewritten to the cached wheel source so that the + requirement corresponds to where it was actually downloaded from instead of the + local cache entry. + + :returns: None if the source hash validated successfully. + """ + assert hashes + assert req.is_wheel_from_cache + assert req.download_info is not None + assert req.link.is_wheel + assert req.link.is_file + + # TODO: is it possible to avoid providing a "wrong" req.link in the first place + # in the resolver, instead of having to patch it up afterwards? + req.link = req.cached_wheel_source_link + + # We need to verify hashes, and we have found the requirement in the cache + # of locally built wheels. + if ( + isinstance(req.download_info.info, ArchiveInfo) + and req.download_info.info.hashes + and hashes.has_one_of(req.download_info.info.hashes) + ): + # At this point we know the requirement was built from a hashable source + # artifact, and we verified that the cache entry's hash of the original + # artifact matches one of the hashes we expect. We don't verify hashes + # against the cached wheel, because the wheel is not the original. 
+ return None + + logger.warning( + "The hashes of the source archive found in cache entry " + "don't match, ignoring cached built wheel " + "and re-downloading source." + ) + return hashes + def _fetch_metadata_only( self, req: InstallRequirement, @@ -447,7 +494,7 @@ def _fetch_metadata_using_lazy_wheel( def _complete_partial_requirements( self, - partially_downloaded_reqs: Iterable[InstallRequirement], + metadata_only_reqs: Iterable[InstallRequirement], parallel_builds: bool = False, ) -> None: """Download any requirements which were only fetched by metadata.""" @@ -458,10 +505,9 @@ def _complete_partial_requirements( # Map each link to the requirement that owns it. This allows us to set # `req.local_file_path` on the appropriate requirement after passing # all the links at once into BatchDownloader. - links_to_fully_download: Dict[Link, InstallRequirement] = {} - for req in partially_downloaded_reqs: - assert req.link - links_to_fully_download[req.link] = req + links_to_fully_download: Dict[Link, InstallRequirement] = { + req.link: req for req in metadata_only_reqs + } batch_download = self._batch_download( links_to_fully_download.keys(), @@ -486,9 +532,33 @@ def _complete_partial_requirements( # This step is necessary to ensure all lazy wheels are processed # successfully by the 'download', 'wheel', and 'install' commands. - for req in partially_downloaded_reqs: + for req in metadata_only_reqs: self._prepare_linked_requirement(req, parallel_builds) + def _check_download_path(self, req: InstallRequirement) -> None: + """Check if the relevant file is already available in the download directory. + + If so, check its hash, and delete the file if the hash doesn't match.""" + if self.download_dir is None: + return + if not req.link.is_wheel: + return + + hashes = self._get_linked_req_hashes(req) + if file_path := _check_download_dir( + req.link, + self.download_dir, + hashes, + # When a locally built wheel has been found in cache, we don't warn + # about re-downloading when the already downloaded wheel hash does + # not match. This is because the hash must be checked against the + # original link, not the cached link. In that case the already + # downloaded file will be removed and re-fetched from cache (which + # implies a hash check against the cache entry's origin.json). + warn_on_hash_mismatch=not req.is_wheel_from_cache, + ): + self._downloaded[req.link.url] = file_path + def prepare_linked_requirement( self, req: InstallRequirement, parallel_builds: bool = False ) -> BaseDistribution: @@ -496,35 +566,15 @@ def prepare_linked_requirement( assert req.link self._log_preparing_link(req) with indent_log(): - # Check if the relevant file is already available - # in the download directory - file_path = None - if self.download_dir is not None and req.link.is_wheel: - hashes = self._get_linked_req_hashes(req) - file_path = _check_download_dir( - req.link, - self.download_dir, - hashes, - # When a locally built wheel has been found in cache, we don't warn - # about re-downloading when the already downloaded wheel hash does - # not match. This is because the hash must be checked against the - # original link, not the cached link. It that case the already - # downloaded file will be removed and re-fetched from cache (which - # implies a hash check against the cache entry's origin.json). - warn_on_hash_mismatch=not req.is_wheel_from_cache, - ) + # See if the file is already downloaded, and check its hash if so.
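+            # (A hit records the file in self._downloaded, which +            # _ensure_local_file_path() consults later instead of re-downloading.)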
+ self._check_download_path(req) - if file_path is not None: - # The file is already available, so mark it as downloaded - self._downloaded[req.link.url] = file_path - else: - # The file is not available, attempt to fetch only metadata - metadata_dist = self._fetch_metadata_only(req) - if metadata_dist is not None: - # These reqs now have the dependency information from the downloaded - # metadata, without having downloaded the actual dist at all. - req.cache_virtual_metadata_only_dist(metadata_dist) - return metadata_dist + # First try to fetch only metadata. + if metadata_dist := self._fetch_metadata_only(req): + # These reqs now have the dependency information from the downloaded + # metadata, without having downloaded the actual dist at all. + req.cache_virtual_metadata_only_dist(metadata_dist) + return metadata_dist # None of the optimizations worked, fully prepare the requirement return self._prepare_linked_requirement(req, parallel_builds) @@ -533,73 +583,37 @@ def prepare_linked_requirements_more( self, reqs: Iterable[InstallRequirement], parallel_builds: bool = False ) -> None: """Prepare linked requirements more, if needed.""" - partially_downloaded_reqs: List[InstallRequirement] = [] - for req in reqs: - if req.is_concrete: - continue - - # Determine if any of these requirements were already downloaded. - if self.download_dir is not None and req.link.is_wheel: - hashes = self._get_linked_req_hashes(req) - # If the file is there, but doesn't match the hash, delete it and print - # a warning. We will be downloading it again via - # partially_downloaded_reqs. - file_path = _check_download_dir( - req.link, self.download_dir, hashes, warn_on_hash_mismatch=True - ) - if file_path is not None: - # If the hash does match, then we still need to generate a concrete - # dist, but we don't have to download the wheel again. - self._downloaded[req.link.url] = file_path - - partially_downloaded_reqs.append(req) - + metadata_only_reqs = [req for req in reqs if not req.is_concrete] self._complete_partial_requirements( - partially_downloaded_reqs, + metadata_only_reqs, parallel_builds=parallel_builds, ) - def _prepare_linked_requirement( - self, req: InstallRequirement, parallel_builds: bool - ) -> BaseDistribution: - assert req.link - link = req.link - - hashes = self._get_linked_req_hashes(req) - - if hashes and req.is_wheel_from_cache: - assert req.download_info is not None - assert link.is_wheel - assert link.is_file - # We need to verify hashes, and we have found the requirement in the cache - # of locally built wheels. - if ( - isinstance(req.download_info.info, ArchiveInfo) - and req.download_info.info.hashes - and hashes.has_one_of(req.download_info.info.hashes) - ): - # At this point we know the requirement was built from a hashable source - # artifact, and we verified that the cache entry's hash of the original - # artifact matches one of the hashes we expect. We don't verify hashes - # against the cached wheel, because the wheel is not the original. - hashes = None - else: - logger.warning( - "The hashes of the source archive found in cache entry " - "don't match, ignoring cached built wheel " - "and re-downloading source." 
- ) - req.link = req.cached_wheel_source_link - link = req.link + def _ensure_local_file_path( + self, req: InstallRequirement, hashes: Optional[Hashes] + ) -> None: + """Ensure that ``req.link`` is downloaded locally, matches the expected hash, + and that ``req.local_file_path`` is set to the download location.""" + if req.link.is_existing_dir(): + return - self._ensure_link_req_src_dir(req, parallel_builds) + # NB: req.local_file_path may be set already, if it was: + # (1) sourced from a local file (such as a local .tar.gz path), + # (2) also in the wheel cache (e.g. built from an sdist). + # We will overwrite it if so, since the local file path will still point to the + # .tar.gz source instead of the wheel cache entry. - if link.is_existing_dir(): - local_file = None - elif link.url not in self._downloaded: + local_file: Optional[File] = None + # The file may have already been downloaded in batch if it was + # a metadata-only requirement, or if it was already in the download directory. + if file_path := self._downloaded.get(req.link.url, None): + if hashes: + hashes.check_against_path(file_path) + local_file = File(file_path, content_type=None) + else: try: local_file = unpack_url( - link, + req.link, req.source_dir, self._download, self.verbosity, @@ -609,39 +623,13 @@ def _prepare_linked_requirement( except NetworkConnectionError as exc: raise InstallationError( f"Could not install requirement {req} because of HTTP " - f"error {exc} for URL {link}" - ) - else: - file_path = self._downloaded[link.url] - if hashes: - hashes.check_against_path(file_path) - local_file = File(file_path, content_type=None) + f"error {exc} for URL {req.link}" + ) from exc - # If download_info is set, we got it from the wheel cache. - if req.download_info is None: - # Editables don't go through this function (see - # prepare_editable_requirement). - assert not req.editable - req.download_info = direct_url_from_link(link, req.source_dir) - # Make sure we have a hash in download_info. If we got it as part of the - # URL, it will have been verified and we can rely on it. Otherwise we - # compute it from the downloaded file. - # FIXME: https://github.com/pypa/pip/issues/11943 - if ( - isinstance(req.download_info.info, ArchiveInfo) - and not req.download_info.info.hashes - and local_file - ): - hash = hash_file(local_file.path)[0].hexdigest() - # We populate info.hash for backward compatibility. - # This will automatically populate info.hashes. - req.download_info.info.hash = f"sha256={hash}" - - # For use in later processing, - # preserve the file path on the requirement. 
- if local_file: + if local_file is not None: req.local_file_path = local_file.path + def _prepare_and_finalize_dist(self, req: InstallRequirement) -> BaseDistribution: dist = _get_prepared_distribution( req, self.build_tracker, @@ -649,8 +637,60 @@ def _prepare_linked_requirement( self.build_isolation, self.check_build_deps, ) + assert dist.is_concrete + assert req.is_concrete + assert req.get_dist() is dist return dist + def _prepare_linked_requirement( + self, req: InstallRequirement, parallel_builds: bool + ) -> BaseDistribution: + """Ensure the dist pointing to the fully-resolved requirement is downloaded and + installable.""" + assert req.link, "this requirement must have a download link to fully prepare" + + hashes: Optional[Hashes] = self._get_linked_req_hashes(req) + + if hashes and req.is_wheel_from_cache: + hashes = self._rewrite_link_and_hashes_for_cached_wheel(req, hashes) + + # req.source_dir is only set for editable requirements. We don't need to unpack + # wheels, so no need for a source directory. + if not req.link.is_wheel: + self._ensure_link_req_src_dir(req, parallel_builds) + + # Ensure the dist is downloaded, check its hash, and unpack it into the source + # directory (if applicable). + self._ensure_local_file_path(req, hashes) + + # Set req.download_info for --report output. + if req.download_info is None: + # If download_info is set, we got it from the wheel cache. + self._ensure_download_info(req) + + # Build (if necessary) and prepare the distribution for installation. + return self._prepare_and_finalize_dist(req) + + def _ensure_download_info(self, req: InstallRequirement) -> None: + """Ensure that ``req.download_info`` is set, for uses such as --report.""" + assert req.download_info is None + # Editables don't go through this function (see prepare_editable_requirement). + assert not req.editable + req.download_info = direct_url_from_link(req.link, req.source_dir) + # Make sure we have a hash in download_info. If we got it as part of the + # URL, it will have been verified and we can rely on it. Otherwise we + # compute it from the downloaded file. + # FIXME: https://github.com/pypa/pip/issues/11943 + if ( + isinstance(req.download_info.info, ArchiveInfo) + and not req.download_info.info.hashes + and req.local_file_path + ): + hash = hash_file(req.local_file_path)[0].hexdigest() + # We populate info.hash for backward compatibility. + # This will automatically populate info.hashes. + req.download_info.info.hash = f"sha256={hash}" + def save_linked_requirement(self, req: InstallRequirement) -> None: assert self.download_dir is not None assert req.link is not None @@ -698,20 +738,9 @@ def prepare_editable_requirement( req.update_editable() assert req.source_dir req.download_info = direct_url_for_editable(req.unpacked_source_directory) - - dist = _get_prepared_distribution( - req, - self.build_tracker, - self.finder, - self.build_isolation, - self.check_build_deps, - ) - req.check_if_exists(self.use_user_site) - # This should already have been populated by the preparation of the source dist. - assert req.is_concrete - return dist + return self._prepare_and_finalize_dist(req) def prepare_installed_requirement( self, @@ -735,13 +764,4 @@ def prepare_installed_requirement( "completely repeatable environment, install into an " "empty virtualenv." 
) - dist = _get_prepared_distribution( - req, - self.build_tracker, - self.finder, - self.build_isolation, - self.check_build_deps, - ) - - assert req.is_concrete - return dist + return self._prepare_and_finalize_dist(req) From e9849f851ef9b5ee50058738d981c9de6360f236 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Mon, 22 Jul 2024 18:40:08 -0400 Subject: [PATCH 3/5] use .metadata distribution info when possible When performing `install --dry-run` and PEP 658 .metadata files are available to guide the resolve, do not download the associated wheels. Rather use the distribution information directly from the .metadata files when reporting the results on the CLI and in the --report file. - describe the new --dry-run behavior - finalize linked requirements immediately after resolve - introduce is_concrete - funnel InstalledDistribution through _get_prepared_distribution() too - add test for new install --dry-run functionality (no downloading) --- news/12186.bugfix.rst | 1 + src/pip/_internal/commands/download.py | 5 +- src/pip/_internal/commands/install.py | 7 +- src/pip/_internal/commands/wheel.py | 5 +- src/pip/_internal/operations/check.py | 5 +- src/pip/_internal/operations/prepare.py | 59 ++++- src/pip/_internal/req/req_install.py | 5 +- .../resolution/resolvelib/resolver.py | 5 - tests/conftest.py | 37 ++- tests/functional/test_download.py | 3 +- tests/functional/test_install_metadata.py | 236 ++++++++++++++++++ 11 files changed, 342 insertions(+), 26 deletions(-) create mode 100644 news/12186.bugfix.rst create mode 100644 tests/functional/test_install_metadata.py diff --git a/news/12186.bugfix.rst b/news/12186.bugfix.rst new file mode 100644 index 00000000000..b51d84a0cb2 --- /dev/null +++ b/news/12186.bugfix.rst @@ -0,0 +1 @@ +Avoid downloading any dists in ``install --dry-run`` if PEP 658 ``.metadata`` files or lazy wheels are available. diff --git a/src/pip/_internal/commands/download.py b/src/pip/_internal/commands/download.py index 917bbb91d83..5528c1866a0 100644 --- a/src/pip/_internal/commands/download.py +++ b/src/pip/_internal/commands/download.py @@ -130,6 +130,9 @@ def run(self, options: Values, args: List[str]) -> int: self.trace_basic_info(finder) requirement_set = resolver.resolve(reqs, check_supported_wheels=True) + preparer.finalize_linked_requirements( + requirement_set.requirements.values(), require_dist_files=True + ) downloaded: List[str] = [] for req in requirement_set.requirements.values(): @@ -138,8 +141,6 @@ def run(self, options: Values, args: List[str]) -> int: preparer.save_linked_requirement(req) downloaded.append(req.name) - preparer.prepare_linked_requirements_more(requirement_set.requirements.values()) - if downloaded: write_output("Successfully downloaded %s", " ".join(downloaded)) diff --git a/src/pip/_internal/commands/install.py b/src/pip/_internal/commands/install.py index ad45a2f2a57..90d73842ffc 100644 --- a/src/pip/_internal/commands/install.py +++ b/src/pip/_internal/commands/install.py @@ -85,7 +85,8 @@ def add_options(self) -> None: help=( "Don't actually install anything, just print what would be. " "Can be used in combination with --ignore-installed " - "to 'resolve' the requirements." + "to 'resolve' the requirements. If package metadata is available " + "or cached, --dry-run also avoids downloading the dependency at all." 
), ) self.cmd_opts.add_option( @@ -379,6 +380,10 @@ def run(self, options: Values, args: List[str]) -> int: requirement_set = resolver.resolve( reqs, check_supported_wheels=not options.target_dir ) + preparer.finalize_linked_requirements( + requirement_set.requirements.values(), + require_dist_files=not options.dry_run, + ) if options.json_report_file: report = InstallationReport(requirement_set.requirements_to_install) diff --git a/src/pip/_internal/commands/wheel.py b/src/pip/_internal/commands/wheel.py index 278719f4e0c..f3a81db6602 100644 --- a/src/pip/_internal/commands/wheel.py +++ b/src/pip/_internal/commands/wheel.py @@ -145,6 +145,9 @@ def run(self, options: Values, args: List[str]) -> int: self.trace_basic_info(finder) requirement_set = resolver.resolve(reqs, check_supported_wheels=True) + preparer.finalize_linked_requirements( + requirement_set.requirements.values(), require_dist_files=True + ) reqs_to_build: List[InstallRequirement] = [] for req in requirement_set.requirements.values(): @@ -153,8 +156,6 @@ def run(self, options: Values, args: List[str]) -> int: elif should_build_for_wheel_command(req): reqs_to_build.append(req) - preparer.prepare_linked_requirements_more(requirement_set.requirements.values()) - # build wheels build_successes, build_failures = build( reqs_to_build, diff --git a/src/pip/_internal/operations/check.py b/src/pip/_internal/operations/check.py index 4b6fbc4c375..4391d3e0121 100644 --- a/src/pip/_internal/operations/check.py +++ b/src/pip/_internal/operations/check.py @@ -23,7 +23,6 @@ from pip._vendor.packaging.utils import NormalizedName, canonicalize_name from pip._vendor.packaging.version import Version -from pip._internal.distributions import make_distribution_for_install_requirement from pip._internal.metadata import get_default_environment from pip._internal.metadata.base import BaseDistribution from pip._internal.req.req_install import InstallRequirement @@ -154,8 +153,8 @@ def _simulate_installation_of( # Modify it as installing requirement_set would (assuming no errors) for inst_req in to_install: - abstract_dist = make_distribution_for_install_requirement(inst_req) - dist = abstract_dist.get_metadata_distribution() + assert inst_req.is_concrete + dist = inst_req.get_dist() name = dist.canonical_name package_set[name] = PackageDetails(dist.version, list(dist.iter_dependencies())) diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index 17e0c9c28da..340e4a37f9d 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -579,16 +579,71 @@ def prepare_linked_requirement( # None of the optimizations worked, fully prepare the requirement return self._prepare_linked_requirement(req, parallel_builds) - def prepare_linked_requirements_more( + def _extract_download_info(self, reqs: Iterable[InstallRequirement]) -> None: + """ + `pip install --report` extracts the download info from each requirement for its + JSON output, so we need to make sure every requirement has this before finishing + the resolve. But .download_info will only be populated by the point this method + is called for requirements already found in the wheel cache, so we need to + synthesize it for uncached results. Luckily, a DirectUrl can be parsed directly + from a url without any other context. However, this also means the download info + will only contain a hash if the link itself declares the hash. 
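+ +        For example, a requirement resolved from a link carrying a +        ``#sha256=...`` fragment will have that hash recorded in its synthesized +        download info, while one resolved from a bare URL will have no hash at all.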
+ """ + for req in reqs: + if req.download_info is None: + self._ensure_download_info(req) + + def _force_fully_prepared( + self, reqs: Iterable[InstallRequirement], assert_has_dist_files: bool + ) -> None: + """ + The legacy resolver seems to prepare requirements differently that can leave + them half-done in certain code paths. I'm not quite sure how it's doing things, + but at least we can do this to make sure they do things right. + """ + for req in reqs: + req.prepared = True + if assert_has_dist_files: + assert req.is_concrete + + def _ensure_dist_files( self, reqs: Iterable[InstallRequirement], parallel_builds: bool = False ) -> None: - """Prepare linked requirements more, if needed.""" + """Download any metadata-only linked requirements.""" metadata_only_reqs = [req for req in reqs if not req.is_concrete] self._complete_partial_requirements( metadata_only_reqs, parallel_builds=parallel_builds, ) + def finalize_linked_requirements( + self, + reqs: Iterable[InstallRequirement], + require_dist_files: bool, + parallel_builds: bool = False, + ) -> None: + """Prepare linked requirements more, if needed. + + Neighboring .metadata files as per PEP 658 or lazy wheels via fast-deps will be + preferred to extract metadata from any concrete requirement (one that has been + mapped to a Link) without downloading the underlying wheel or sdist. When ``pip + install --dry-run`` is called, we want to avoid ever downloading the underlying + dist, but we still need to provide all of the results that pip commands expect + from the typical resolve process. + + Those expectations vary, but one distinction lies in whether the command needs + an actual physical dist somewhere on the filesystem, or just the metadata about + it from the resolver (as in ``pip install --report``). If the command requires + actual physical filesystem locations for the resolved dists, it must call this + method with ``require_dist_files=True`` to fully download anything + that remains. + """ + if require_dist_files: + self._ensure_dist_files(reqs, parallel_builds=parallel_builds) + else: + self._extract_download_info(reqs) + self._force_fully_prepared(reqs, assert_has_dist_files=require_dist_files) + def _ensure_local_file_path( self, req: InstallRequirement, hashes: Optional[Hashes] ) -> None: diff --git a/src/pip/_internal/req/req_install.py b/src/pip/_internal/req/req_install.py index 2710142f26a..a5800c9e3bd 100644 --- a/src/pip/_internal/req/req_install.py +++ b/src/pip/_internal/req/req_install.py @@ -648,10 +648,7 @@ def cache_concrete_dist(self, dist: BaseDistribution) -> None: # If we set a dist twice for the same requirement, we must be hydrating # a concrete dist for what was previously virtual. This will occur in the # case of `install --dry-run` when PEP 658 metadata is available. - - # TODO(#12186): avoid setting dist twice! 
- # assert not self._dist.is_concrete - pass + assert not self._dist.is_concrete assert dist.is_concrete self._dist = dist diff --git a/src/pip/_internal/resolution/resolvelib/resolver.py b/src/pip/_internal/resolution/resolvelib/resolver.py index c22d1707b5e..4880a5b9802 100644 --- a/src/pip/_internal/resolution/resolvelib/resolver.py +++ b/src/pip/_internal/resolution/resolvelib/resolver.py @@ -175,11 +175,6 @@ def resolve( req_set.add_named_requirement(ireq) - reqs = req_set.all_requirements - self.factory.preparer.prepare_linked_requirements_more(reqs) - for req in reqs: - req.prepared = True - assert req.is_concrete return req_set def get_installation_order( diff --git a/tests/conftest.py b/tests/conftest.py index da4ab5b9dfb..3a168c484a3 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -748,6 +748,9 @@ class FakePackage: requires_dist: Tuple[str, ...] = () # This will override the Name specified in the actual dist's METADATA. metadata_name: Optional[str] = None + # Whether to delete the file this points to, which causes any attempt to fetch this + # package to fail unless it is processed as a metadata-only dist. + delete_linked_file: bool = False def metadata_filename(self) -> str: """This is specified by PEP 658.""" @@ -837,6 +840,27 @@ def fake_packages() -> Dict[str, List[FakePackage]]: ("simple==1.0",), ), ], + "complex-dist": [ + FakePackage( + "complex-dist", + "0.1", + "complex_dist-0.1-py2.py3-none-any.whl", + MetadataKind.Unhashed, + # Validate that the wheel isn't fetched if metadata is available and + # --dry-run is on, when the metadata presents no hash itself. + delete_linked_file=True, + ), + ], + "corruptwheel": [ + FakePackage( + "corruptwheel", + "1.0", + "corruptwheel-1.0-py2.py3-none-any.whl", + # Validate that the wheel isn't fetched if metadata is available and + # --dry-run is on, when the metadata *does* present a hash. + MetadataKind.Sha256, + ), + ], "has-script": [ # Ensure we check PEP 658 metadata hashing errors for wheel files. FakePackage( @@ -922,10 +946,10 @@ def html_index_for_packages( f' {package_link.filename}
'  # noqa: E501
             )
             # (3.2) Copy over the corresponding file in `shared_data.packages`.
-            shutil.copy(
-                shared_data.packages / package_link.filename,
-                pkg_subdir / package_link.filename,
-            )
+            cached_file = shared_data.packages / package_link.filename
+            new_file = pkg_subdir / package_link.filename
+            if not package_link.delete_linked_file:
+                shutil.copy(cached_file, new_file)
             # (3.3) Write a metadata file, if applicable.
             if package_link.metadata != MetadataKind.NoFile:
                 with open(pkg_subdir / package_link.metadata_filename(), "wb") as f:
@@ -980,7 +1004,8 @@ def html_index_with_onetime_server(
     """Serve files from a generated pypi index, erroring if a file is downloaded more
     than once.

-    Provide `-i http://localhost:8000` to pip invocations to point them at this server.
+    Provide `-i http://localhost:<port>` to pip invocations to point them at
+    this server.
     """

     class InDirectoryServer(http.server.ThreadingHTTPServer):
@@ -995,7 +1020,7 @@ def finish_request(self: "Self", request: Any, client_address: Any) -> None:
     class Handler(OneTimeDownloadHandler):
         _seen_paths: ClassVar[Set[str]] = set()

-    with InDirectoryServer(("", 8000), Handler) as httpd:
+    with InDirectoryServer(("", 0), Handler) as httpd:
         server_thread = threading.Thread(target=httpd.serve_forever)
         server_thread.start()

diff --git a/tests/functional/test_download.py b/tests/functional/test_download.py
index d469e71c360..53c815ae98b 100644
--- a/tests/functional/test_download.py
+++ b/tests/functional/test_download.py
@@ -1273,6 +1273,7 @@ def download_server_html_index(
 ) -> Callable[..., Tuple[TestPipResult, Path]]:
     """Execute `pip download` against a generated PyPI index."""
     download_dir = tmpdir / "download_dir"
+    _, server_port = html_index_with_onetime_server.server_address

     def run_for_generated_index(
         args: List[str],
@@ -1287,7 +1288,7 @@ def run_for_generated_index(
             "-d",
             str(download_dir),
             "-i",
-            "http://localhost:8000",
+            f"http://localhost:{server_port}",
             *args,
         ]
         result = script.pip(*pip_args, allow_error=allow_error)
diff --git a/tests/functional/test_install_metadata.py b/tests/functional/test_install_metadata.py
new file mode 100644
index 00000000000..a7c4dd51807
--- /dev/null
+++ b/tests/functional/test_install_metadata.py
@@ -0,0 +1,236 @@
+import json
+import re
+from pathlib import Path
+from typing import Any, Callable, Dict, Iterator, List, Tuple
+
+import pytest
+from pip._vendor.packaging.requirements import Requirement
+
+from pip._internal.models.direct_url import DirectUrl
+from pip._internal.utils.urls import path_to_url
+from tests.lib import (
+    PipTestEnvironment,
+    TestPipResult,
+)
+
+
+@pytest.fixture
+def install_with_generated_html_index(
+    script: PipTestEnvironment,
+    html_index_for_packages: Path,
+    tmpdir: Path,
+) -> Callable[..., Tuple[TestPipResult, Dict[str, Any]]]:
+    """Execute `pip install` against a generated PyPI index."""
+    output_file = tmpdir / "output_file.json"
+
+    def run_for_generated_index(
+        args: List[str],
+        *,
+        dry_run: bool = True,
+        allow_error: bool = False,
+    ) -> Tuple[TestPipResult, Dict[str, Any]]:
+        """
+        Produce a PyPI directory structure pointing to the specified packages, then
+        execute `pip install --report ... -i ...` pointing to our generated index.
+ """ + pip_args = [ + "install", + *(("--dry-run",) if dry_run else ()), + "--ignore-installed", + "--report", + str(output_file), + "-i", + path_to_url(str(html_index_for_packages)), + *args, + ] + result = script.pip(*pip_args, allow_error=allow_error) + try: + with open(output_file, "rb") as f: + report = json.load(f) + except FileNotFoundError: + if allow_error: + report = {} + else: + raise + return (result, report) + + return run_for_generated_index + + +def iter_dists(report: Dict[str, Any]) -> Iterator[Tuple[Requirement, DirectUrl]]: + """Parse a (req,url) tuple from each installed dist in the --report json.""" + for inst in report["install"]: + metadata = inst["metadata"] + name = metadata["name"] + version = metadata["version"] + req = Requirement(f"{name}=={version}") + direct_url = DirectUrl.from_dict(inst["download_info"]) + yield (req, direct_url) + + +@pytest.mark.parametrize( + "requirement_to_install, expected_outputs", + [ + ("simple2==1.0", ["simple2==1.0", "simple==1.0"]), + ("simple==2.0", ["simple==2.0"]), + ( + "colander", + ["colander==0.9.9", "translationstring==1.1"], + ), + ( + "compilewheel", + ["compilewheel==1.0", "simple==1.0"], + ), + ], +) +def test_install_with_metadata( + install_with_generated_html_index: Callable[ + ..., Tuple[TestPipResult, Dict[str, Any]] + ], + requirement_to_install: str, + expected_outputs: List[str], +) -> None: + """Verify that if a data-dist-info-metadata attribute is present, then it is used + instead of the actual dist's METADATA.""" + _, report = install_with_generated_html_index( + [requirement_to_install], + ) + installed = sorted(str(r) for r, _ in iter_dists(report)) + assert installed == expected_outputs + + +@pytest.mark.parametrize( + "requirement_to_install, real_hash", + [ + ( + "simple==3.0", + "95e0f200b6302989bcf2cead9465cf229168295ea330ca30d1ffeab5c0fed996", + ), + ( + "has-script", + "16ba92d7f6f992f6de5ecb7d58c914675cf21f57f8e674fb29dcb4f4c9507e5b", + ), + ], +) +def test_incorrect_metadata_hash( + install_with_generated_html_index: Callable[ + ..., Tuple[TestPipResult, Dict[str, Any]] + ], + requirement_to_install: str, + real_hash: str, +) -> None: + """Verify that if a hash for data-dist-info-metadata is provided, it must match the + actual hash of the metadata file.""" + result, _ = install_with_generated_html_index( + [requirement_to_install], + allow_error=True, + ) + assert result.returncode != 0 + expected_msg = f"""\ + Expected sha256 wrong-hash + Got {real_hash}""" + assert expected_msg in result.stderr + + +@pytest.mark.parametrize( + "requirement_to_install, expected_url", + [ + ("simple2==2.0", "simple2-2.0.tar.gz.metadata"), + ("priority", "priority-1.0-py2.py3-none-any.whl.metadata"), + ], +) +def test_metadata_not_found( + install_with_generated_html_index: Callable[ + ..., Tuple[TestPipResult, Dict[str, Any]] + ], + requirement_to_install: str, + expected_url: str, +) -> None: + """Verify that if a data-dist-info-metadata attribute is provided, that pip will + fetch the .metadata file at the location specified by PEP 658, and error + if unavailable.""" + result, _ = install_with_generated_html_index( + [requirement_to_install], + allow_error=True, + ) + assert result.returncode != 0 + expected_re = re.escape(expected_url) + pattern = re.compile( + f"ERROR: 404 Client Error: FileNotFoundError for url:.*{expected_re}" + ) + assert pattern.search(result.stderr), (pattern, result.stderr) + + +def test_produces_error_for_mismatched_package_name_in_metadata( + install_with_generated_html_index: 
Callable[ + ..., Tuple[TestPipResult, Dict[str, Any]] + ], +) -> None: + """Verify that the package name from the metadata matches the requested package.""" + result, _ = install_with_generated_html_index( + ["simple2==3.0"], + allow_error=True, + ) + assert result.returncode != 0 + assert ( + "simple2-3.0.tar.gz has inconsistent Name: expected 'simple2', but metadata " + "has 'not-simple2'" + ) in result.stdout + + +@pytest.mark.parametrize( + "requirement", + [ + "requires-simple-extra==0.1", + "REQUIRES_SIMPLE-EXTRA==0.1", + "REQUIRES....simple-_-EXTRA==0.1", + ], +) +def test_canonicalizes_package_name_before_verifying_metadata( + install_with_generated_html_index: Callable[ + ..., Tuple[TestPipResult, Dict[str, Any]] + ], + requirement: str, +) -> None: + """Verify that the package name from the command line and the package's + METADATA are both canonicalized before comparison, while the name from the METADATA + is always used verbatim to represent the installed candidate in --report. + + Regression test for https://github.com/pypa/pip/issues/12038 + """ + _, report = install_with_generated_html_index( + [requirement], + ) + reqs = [str(r) for r, _ in iter_dists(report)] + assert reqs == ["Requires_Simple.Extra==0.1"] + + +@pytest.mark.parametrize( + "requirement,err_string", + [ + # It's important that we verify pip won't even attempt to fetch the file, so we + # construct an input that will cause it to error if it tries at all. + ("complex-dist==0.1", "404 Client Error: FileNotFoundError"), + ("corruptwheel==1.0", ".whl is invalid."), + ], +) +def test_dry_run_avoids_downloading_metadata_only_dists( + install_with_generated_html_index: Callable[ + ..., Tuple[TestPipResult, Dict[str, Any]] + ], + requirement: str, + err_string: str, +) -> None: + """Verify that the underlying dist files are not downloaded at all when + `install --dry-run` is used to resolve dists with PEP 658 metadata.""" + _, report = install_with_generated_html_index( + [requirement], + ) + assert [requirement] == [str(r) for r, _ in iter_dists(report)] + result, _ = install_with_generated_html_index( + [requirement], + dry_run=False, + allow_error=True, + ) + assert result.returncode != 0 + assert err_string in result.stderr From b02915a14662e628f5bbcefcbdc7fe486e57d093 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 9 Aug 2023 18:39:53 -0400 Subject: [PATCH 4/5] make LinkMetadataCache - catch an exception when parsing metadata which only occurs in CI - handle --no-cache-dir - call os.makedirs() before writing to cache too - catch InvalidSchema when attempting git urls with BatchDownloader - fix other test failures - reuse should_cache(req) logic - gzip compress link metadata for a slight reduction in disk space - only cache built sdists - don't check should_cache() when fetching - cache lazy wheel dists - add news - turn debug logs in fetching from cache into exceptions - use scandir over listdir when searching normal wheel cache - handle metadata email parsing errors - correctly handle mutable cached requirement - use bz2 over gzip for an extremely slight improvement in disk usage --- news/12256.feature.rst | 1 + src/pip/_internal/cache.py | 118 +++++++++++-- src/pip/_internal/cli/cmdoptions.py | 2 + src/pip/_internal/cli/req_command.py | 13 +- src/pip/_internal/exceptions.py | 19 +++ src/pip/_internal/metadata/__init__.py | 10 +- src/pip/_internal/metadata/base.py | 15 ++ src/pip/_internal/network/download.py | 2 +- 
src/pip/_internal/operations/prepare.py | 199 +++++++++++++++++++--- src/pip/_internal/req/constructors.py | 2 +- src/pip/_internal/req/req_install.py | 25 ++- src/pip/_internal/req/req_set.py | 2 +- src/pip/_internal/wheel_builder.py | 62 +------ tests/functional/test_check.py | 20 +-- tests/functional/test_install.py | 8 +- tests/functional/test_install_check.py | 2 +- tests/functional/test_install_metadata.py | 5 +- tests/functional/test_install_reqs.py | 2 +- tests/functional/test_wheel.py | 2 +- tests/unit/test_cache.py | 21 ++- tests/unit/test_wheel_builder.py | 11 +- 21 files changed, 404 insertions(+), 137 deletions(-) create mode 100644 news/12256.feature.rst diff --git a/news/12256.feature.rst b/news/12256.feature.rst new file mode 100644 index 00000000000..0b5bb356188 --- /dev/null +++ b/news/12256.feature.rst @@ -0,0 +1 @@ +Cache computed metadata from sdists and lazy wheels in ``~/.cache/pip/link-metadata`` when ``--use-feature=metadata-cache`` is enabled. diff --git a/src/pip/_internal/cache.py b/src/pip/_internal/cache.py index 6b4512672db..534d0e866b6 100644 --- a/src/pip/_internal/cache.py +++ b/src/pip/_internal/cache.py @@ -1,12 +1,14 @@ """Cache Management """ +import abc import hashlib import json import logging import os +import re from pathlib import Path -from typing import Any, Dict, List, Optional +from typing import Dict, Iterator, List, Optional, Tuple from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version from pip._vendor.packaging.utils import canonicalize_name @@ -15,21 +17,71 @@ from pip._internal.models.direct_url import DirectUrl from pip._internal.models.link import Link from pip._internal.models.wheel import Wheel +from pip._internal.req.req_install import InstallRequirement from pip._internal.utils.temp_dir import TempDirectory, tempdir_kinds from pip._internal.utils.urls import path_to_url +from pip._internal.vcs import vcs logger = logging.getLogger(__name__) +_egg_info_re = re.compile(r"([a-z0-9_.]+)-([a-z0-9_.!+-]+)", re.IGNORECASE) + ORIGIN_JSON_NAME = "origin.json" +def _contains_egg_info(s: str) -> bool: + """Determine whether the string looks like an egg_info. + + :param s: The string to parse. E.g. foo-2.1 + """ + return bool(_egg_info_re.search(s)) + + +def should_cache( + req: InstallRequirement, +) -> bool: + """ + Return whether a built InstallRequirement can be stored in the persistent + wheel cache, assuming the wheel cache is available, and _should_build() + has determined a wheel needs to be built. + """ + if not req.link: + return False + + if req.link.is_wheel: + return False + + if req.editable or not req.source_dir: + # never cache editable requirements + return False + + if req.link and req.link.is_vcs: + # VCS checkout. Do not cache + # unless it points to an immutable commit hash. + assert not req.editable + assert req.source_dir + vcs_backend = vcs.get_backend_for_scheme(req.link.scheme) + assert vcs_backend + if vcs_backend.is_immutable_rev_checkout(req.link.url, req.source_dir): + return True + return False + + assert req.link + base, ext = req.link.splitext() + if _contains_egg_info(base): + return True + + # Otherwise, do not cache. 
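+    # (e.g. an sdist named "pip-18.0.tar.gz" has an egg-info-style base and is
+    # cacheable, while an unversioned archive like "master.zip" is not.)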
+ return False + + def _hash_dict(d: Dict[str, str]) -> str: """Return a stable sha224 of a dictionary.""" s = json.dumps(d, sort_keys=True, separators=(",", ":"), ensure_ascii=True) return hashlib.sha224(s.encode("ascii")).hexdigest() -class Cache: +class Cache(abc.ABC): """An abstract class - provides cache directories for data from links :param cache_dir: The root of the cache. @@ -73,20 +125,28 @@ def _get_cache_path_parts(self, link: Link) -> List[str]: return parts - def _get_candidates(self, link: Link, canonical_package_name: str) -> List[Any]: - can_not_cache = not self.cache_dir or not canonical_package_name or not link - if can_not_cache: - return [] + @abc.abstractmethod + def get_path_for_link(self, link: Link) -> str: + """Return a directory to store cached items in for link.""" + ... + + def cache_path(self, link: Link) -> Path: + return Path(self.get_path_for_link(link)) - path = self.get_path_for_link(link) - if os.path.isdir(path): - return [(candidate, path) for candidate in os.listdir(path)] - return [] + +class LinkMetadataCache(Cache): + """Persistently store the metadata of dists found at each link.""" def get_path_for_link(self, link: Link) -> str: - """Return a directory to store cached items in for link.""" - raise NotImplementedError() + parts = self._get_cache_path_parts(link) + assert self.cache_dir + return os.path.join(self.cache_dir, "link-metadata", *parts) + +class WheelCacheBase(Cache): + """Specializations to the cache concept for wheels.""" + + @abc.abstractmethod def get( self, link: Link, @@ -96,10 +156,27 @@ def get( """Returns a link to a cached item if it exists, otherwise returns the passed link. """ - raise NotImplementedError() + ... + + def _can_cache(self, link: Link, canonical_package_name: str) -> bool: + return bool(self.cache_dir and canonical_package_name and link) + def _get_candidates( + self, link: Link, canonical_package_name: str + ) -> Iterator[Tuple[str, str]]: + if not self._can_cache(link, canonical_package_name): + return + + path = self.get_path_for_link(link) + if not os.path.isdir(path): + return -class SimpleWheelCache(Cache): + for candidate in os.scandir(path): + if candidate.is_file(): + yield (candidate.name, path) + + +class SimpleWheelCache(WheelCacheBase): """A cache of wheels for future installs.""" def __init__(self, cache_dir: str) -> None: @@ -131,7 +208,7 @@ def get( package_name: Optional[str], supported_tags: List[Tag], ) -> Link: - candidates = [] + candidates: List[Tuple[int, str, str]] = [] if not package_name: return link @@ -205,7 +282,7 @@ def __init__( ) -class WheelCache(Cache): +class WheelCache(WheelCacheBase): """Wraps EphemWheelCache and SimpleWheelCache into a single Cache This Cache allows for gracefully degradation, using the ephem wheel cache @@ -223,6 +300,15 @@ def get_path_for_link(self, link: Link) -> str: def get_ephem_path_for_link(self, link: Link) -> str: return self._ephem_cache.get_path_for_link(link) + def resolve_cache_dir(self, req: InstallRequirement) -> str: + """Return the persistent or temporary cache directory where the built or + downloaded wheel should be stored.""" + cache_available = bool(self.cache_dir) + assert req.link, req + if cache_available and should_cache(req): + return self.get_path_for_link(req.link) + return self.get_ephem_path_for_link(req.link) + def get( self, link: Link, diff --git a/src/pip/_internal/cli/cmdoptions.py b/src/pip/_internal/cli/cmdoptions.py index 0b7cff77bdd..cca4a92954f 100644 --- a/src/pip/_internal/cli/cmdoptions.py +++ 
b/src/pip/_internal/cli/cmdoptions.py @@ -1009,6 +1009,8 @@ def check_list_path_option(options: Values) -> None: default=[], choices=[ "fast-deps", + "metadata-cache", + "truststore", ] + ALWAYS_ENABLED_FEATURES, help="Enable new functionality, that may be backward incompatible.", diff --git a/src/pip/_internal/cli/req_command.py b/src/pip/_internal/cli/req_command.py index 92900f94ff4..c4355670546 100644 --- a/src/pip/_internal/cli/req_command.py +++ b/src/pip/_internal/cli/req_command.py @@ -10,7 +10,7 @@ from optparse import Values from typing import Any, List, Optional, Tuple -from pip._internal.cache import WheelCache +from pip._internal.cache import LinkMetadataCache, WheelCache from pip._internal.cli import cmdoptions from pip._internal.cli.index_command import IndexGroupCommand from pip._internal.cli.index_command import SessionCommandMixin as SessionCommandMixin @@ -127,6 +127,16 @@ def make_requirement_preparer( "fast-deps has no effect when used with the legacy resolver." ) + if options.cache_dir and "metadata-cache" in options.features_enabled: + logger.warning( + "pip is using a local cache for metadata information. " + "This experimental feature is enabled through " + "--use-feature=metadata-cache and it is not ready for " + "production." + ) + metadata_cache = LinkMetadataCache(options.cache_dir) + else: + metadata_cache = None return RequirementPreparer( build_dir=temp_build_dir_path, src_dir=options.src_dir, @@ -142,6 +152,7 @@ def make_requirement_preparer( lazy_wheel=lazy_wheel, verbosity=verbosity, legacy_resolver=legacy_resolver, + metadata_cache=metadata_cache, ) @classmethod diff --git a/src/pip/_internal/exceptions.py b/src/pip/_internal/exceptions.py index 2587740f73a..7f6c45adaf0 100644 --- a/src/pip/_internal/exceptions.py +++ b/src/pip/_internal/exceptions.py @@ -246,6 +246,25 @@ def __str__(self) -> str: return f"None {self.metadata_name} metadata found for distribution: {self.dist}" +class CacheMetadataError(PipError): + """Raised when de/serializing a requirement into the metadata cache.""" + + def __init__( + self, + req: "InstallRequirement", + reason: str, + ) -> None: + """ + :param req: The requirement we attempted to cache. + :param reason: Context about the precise error that occurred. 
+ """ + self.req = req + self.reason = reason + + def __str__(self) -> str: + return f"{self.reason} for {self.req} from {self.req.link}" + + class UserInstallationInvalid(InstallationError): """A --user install is requested on an environment without user site.""" diff --git a/src/pip/_internal/metadata/__init__.py b/src/pip/_internal/metadata/__init__.py index aa232b6cabd..2ed1fe8d64a 100644 --- a/src/pip/_internal/metadata/__init__.py +++ b/src/pip/_internal/metadata/__init__.py @@ -6,7 +6,14 @@ from pip._internal.utils.misc import strtobool -from .base import BaseDistribution, BaseEnvironment, FilesystemWheel, MemoryWheel, Wheel +from .base import ( + BaseDistribution, + BaseEnvironment, + FilesystemWheel, + MemoryWheel, + Wheel, + serialize_metadata, +) if TYPE_CHECKING: from typing import Literal, Protocol @@ -23,6 +30,7 @@ "get_environment", "get_wheel_distribution", "select_backend", + "serialize_metadata", ] diff --git a/src/pip/_internal/metadata/base.py b/src/pip/_internal/metadata/base.py index 7e8c5b6f177..9c95f9f7a31 100644 --- a/src/pip/_internal/metadata/base.py +++ b/src/pip/_internal/metadata/base.py @@ -1,6 +1,9 @@ import csv +import email.generator import email.message +import email.policy import functools +import io import json import logging import pathlib @@ -90,6 +93,18 @@ def _convert_installed_files_path( return str(pathlib.Path(*info, *entry)) +def serialize_metadata(msg: email.message.Message) -> str: + """Write a dist's metadata to a string. + + Calling ``str(dist.metadata)`` may raise an error by misinterpreting RST directives + as email headers. This method uses the more robust ``email.policy.EmailPolicy`` to + avoid those parsing errors.""" + out = io.StringIO() + g = email.generator.Generator(out, policy=email.policy.EmailPolicy()) + g.flatten(msg) + return out.getvalue() + + class RequiresEntry(NamedTuple): requirement: str extra: str diff --git a/src/pip/_internal/network/download.py b/src/pip/_internal/network/download.py index 5c3bce3d2fd..9a5d6fb9c5d 100644 --- a/src/pip/_internal/network/download.py +++ b/src/pip/_internal/network/download.py @@ -114,7 +114,7 @@ def _get_http_response_filename(resp: Response, link: Link) -> str: def _http_get_download(session: PipSession, link: Link) -> Response: - target_url = link.url.split("#", 1)[0] + target_url = link.url_without_fragment resp = session.get(target_url, headers=HEADERS, stream=True) raise_for_status(resp) return resp diff --git a/src/pip/_internal/operations/prepare.py b/src/pip/_internal/operations/prepare.py index 340e4a37f9d..a319a475ee9 100644 --- a/src/pip/_internal/operations/prepare.py +++ b/src/pip/_internal/operations/prepare.py @@ -4,17 +4,22 @@ # The following comment should be removed at some point in the future. 
# mypy: strict-optional=False +import bz2 +import json import mimetypes import os import shutil from dataclasses import dataclass from pathlib import Path -from typing import Dict, Iterable, Optional +from typing import Dict, Iterable, List, Optional, Tuple from pip._vendor.packaging.utils import canonicalize_name +from pip._vendor.requests.exceptions import InvalidSchema +from pip._internal.cache import LinkMetadataCache, should_cache from pip._internal.distributions import make_distribution_for_install_requirement from pip._internal.exceptions import ( + CacheMetadataError, DirectoryUrlHashUnsupported, HashMismatch, HashUnpinned, @@ -24,7 +29,11 @@ VcsHashUnsupported, ) from pip._internal.index.package_finder import PackageFinder -from pip._internal.metadata import BaseDistribution, get_metadata_distribution +from pip._internal.metadata import ( + BaseDistribution, + get_metadata_distribution, + serialize_metadata, +) from pip._internal.models.direct_url import ArchiveInfo from pip._internal.models.link import Link from pip._internal.models.wheel import Wheel @@ -62,7 +71,7 @@ def _get_prepared_distribution( finder: PackageFinder, build_isolation: bool, check_build_deps: bool, -) -> BaseDistribution: +) -> Tuple[bool, BaseDistribution]: """Prepare a distribution for installation. This method will only be called by the preparer at the end of the resolve, and only @@ -73,15 +82,20 @@ def _get_prepared_distribution( This method will execute ``req.cache_concrete_dist()``, so that after invocation, ``req.is_concrete`` will be True, because ``req.get_dist()`` will return a concrete ``Distribution``. + + :returns: a 2-tuple: + - (bool): whether the metadata had to be constructed (e.g. from an sdist build), + - (BaseDistribution): the concrete distribution which is ready to be installed. 
""" abstract_dist = make_distribution_for_install_requirement(req) tracker_id = abstract_dist.build_tracker_id - if tracker_id is not None: + builds_metadata = tracker_id is not None + if builds_metadata: with build_tracker.track(req, tracker_id): abstract_dist.prepare_distribution_metadata( finder, build_isolation, check_build_deps ) - return abstract_dist.get_metadata_distribution() + return (builds_metadata, abstract_dist.get_metadata_distribution()) def unpack_vcs_link(link: Link, location: str, verbosity: int) -> None: @@ -223,6 +237,45 @@ def _check_download_dir( return download_path +@dataclass(frozen=True) +class CacheableDist: + metadata: str + filename: Path + canonical_name: str + + @classmethod + def from_dist(cls, link: Link, dist: BaseDistribution) -> "CacheableDist": + """Extract the serializable data necessary to generate a metadata-only dist.""" + return cls( + metadata=serialize_metadata(dist.metadata), + filename=Path(link.filename), + canonical_name=dist.canonical_name, + ) + + def to_dist(self) -> BaseDistribution: + """Return a metadata-only dist from the deserialized cache entry.""" + return get_metadata_distribution( + metadata_contents=self.metadata.encode("utf-8"), + filename=str(self.filename), + canonical_name=self.canonical_name, + ) + + def to_json(self) -> Dict[str, str]: + return { + "metadata": self.metadata, + "filename": str(self.filename), + "canonical_name": self.canonical_name, + } + + @classmethod + def from_json(cls, args: Dict[str, str]) -> "CacheableDist": + return cls( + metadata=args["metadata"], + filename=Path(args["filename"]), + canonical_name=args["canonical_name"], + ) + + class RequirementPreparer: """Prepares a Requirement""" @@ -242,6 +295,7 @@ def __init__( lazy_wheel: bool, verbosity: int, legacy_resolver: bool, + metadata_cache: Optional[LinkMetadataCache] = None, ) -> None: super().__init__() @@ -284,6 +338,8 @@ def __init__( # Previous "header" printed for a link-based InstallRequirement self._previous_requirement_header = ("", "") + self._metadata_cache = metadata_cache + def _log_preparing_link(self, req: InstallRequirement) -> None: """Provide context for the requirement being prepared.""" if req.link.is_file and not req.is_wheel_from_cache: @@ -413,14 +469,81 @@ def _fetch_metadata_only( ) return None if self.require_hashes: + # Hash checking also means hashes are provided for all reqs, so no resolve + # is necessary and metadata-only fetching provides no speedup. logger.debug( "Metadata-only fetching is not used as hash checking is required", ) return None - # Try PEP 658 metadata first, then fall back to lazy wheel if unavailable. - return self._fetch_metadata_using_link_data_attr( - req - ) or self._fetch_metadata_using_lazy_wheel(req.link) + + return ( + self._fetch_cached_metadata(req) + or self._fetch_metadata_using_link_data_attr(req) + or self._fetch_metadata_using_lazy_wheel(req) + ) + + def _locate_metadata_cache_entry(self, link: Link) -> Optional[Path]: + """If the metadata cache is active, generate a filesystem path from the hash of + the given Link.""" + if self._metadata_cache is None: + return None + + return self._metadata_cache.cache_path(link) + + def _fetch_cached_metadata( + self, req: InstallRequirement + ) -> Optional[BaseDistribution]: + cached_path = self._locate_metadata_cache_entry(req.link) + if cached_path is None: + return None + + # Quietly continue if the cache entry does not exist. 
+ if not os.path.isfile(cached_path): + logger.debug( + "no cached metadata for link %s at %s", + req.link, + cached_path, + ) + return None + + try: + with bz2.open(cached_path, mode="rt", encoding="utf-8") as f: + logger.debug( + "found cached metadata for link %s at %s", req.link, cached_path + ) + args = json.load(f) + cached_dist = CacheableDist.from_json(args) + return cached_dist.to_dist() + except Exception: + raise CacheMetadataError(req, "error reading cached metadata") + + def _cache_metadata( + self, + req: InstallRequirement, + metadata_dist: BaseDistribution, + ) -> None: + cached_path = self._locate_metadata_cache_entry(req.link) + if cached_path is None: + return + + # The cache file exists already, so we have nothing to do. + if os.path.isfile(cached_path): + logger.debug( + "metadata for link %s is already cached at %s", req.link, cached_path + ) + return + + # The metadata cache is split across several subdirectories, so ensure the + # containing directory for the cache file exists before writing. + os.makedirs(str(cached_path.parent), exist_ok=True) + try: + cacheable_dist = CacheableDist.from_dist(req.link, metadata_dist) + args = cacheable_dist.to_json() + logger.debug("caching metadata for link %s at %s", req.link, cached_path) + with bz2.open(cached_path, mode="wt", encoding="utf-8") as f: + json.dump(args, f) + except Exception: + raise CacheMetadataError(req, "failed to serialize metadata") def _fetch_metadata_using_link_data_attr( self, @@ -438,6 +561,9 @@ def _fetch_metadata_using_link_data_attr( metadata_link, ) # (2) Download the contents of the METADATA file, separate from the dist itself. + # NB: this request will hit the CacheControl HTTP cache, which will be very + # quick since the METADATA file is very small. Therefore, we can rely on + # HTTP caching instead of LinkMetadataCache. metadata_file = get_http_url( metadata_link, self._download, @@ -465,36 +591,48 @@ def _fetch_metadata_using_link_data_attr( def _fetch_metadata_using_lazy_wheel( self, - link: Link, + req: InstallRequirement, ) -> Optional[BaseDistribution]: """Fetch metadata using lazy wheel, if possible.""" # --use-feature=fast-deps must be provided. if not self.use_lazy_wheel: return None - if link.is_file or not link.is_wheel: + if req.link.is_file or not req.link.is_wheel: logger.debug( "Lazy wheel is not used as %r does not point to a remote wheel", - link, + req.link, ) return None - wheel = Wheel(link.filename) + wheel = Wheel(req.link.filename) name = canonicalize_name(wheel.name) logger.info( "Obtaining dependency information from %s %s", name, wheel.version, ) - url = link.url.split("#", 1)[0] + try: - return dist_from_wheel_url(name, url, self._session) + lazy_wheel_dist = dist_from_wheel_url( + name, req.link.url_without_fragment, self._session + ) except HTTPRangeRequestUnsupported: - logger.debug("%s does not support range requests", url) + logger.debug("%s does not support range requests", req.link) return None + # If we've used the lazy wheel approach, then PEP 658 metadata is not available. + # If the wheel is very large (>1GB), then retrieving it from the CacheControl + # HTTP cache may take multiple seconds, even on a fast computer, and the + # preparer will unnecessarily copy the cached response to disk before deleting + # it at the end of the run. Caching the dist metadata in LinkMetadataCache means + # later pip executions can retrieve metadata within milliseconds and avoid + # thrashing the disk. 
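+        # (The entry itself is written by _cache_metadata() below, which stores
+        # the serialized METADATA text as bz2-compressed JSON under this link's
+        # cache path.)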
+ self._cache_metadata(req, lazy_wheel_dist) + return lazy_wheel_dist + def _complete_partial_requirements( self, - metadata_only_reqs: Iterable[InstallRequirement], + metadata_only_reqs: List[InstallRequirement], parallel_builds: bool = False, ) -> None: """Download any requirements which were only fetched by metadata.""" @@ -505,9 +643,24 @@ def _complete_partial_requirements( # Map each link to the requirement that owns it. This allows us to set # `req.local_file_path` on the appropriate requirement after passing # all the links at once into BatchDownloader. - links_to_fully_download: Dict[Link, InstallRequirement] = { - req.link: req for req in metadata_only_reqs - } + links_to_fully_download: Dict[Link, InstallRequirement] = {} + for req in metadata_only_reqs: + assert req.link + + # (1) File URLs don't need to be downloaded, so skip them. + if req.link.scheme == "file": + continue + # (2) If this is e.g. a git url, we don't know how to handle that in the + # BatchDownloader, so leave it for self._prepare_linked_requirement() at + # the end of this method, which knows how to handle any URL. + can_simply_download = True + try: + # This will raise InvalidSchema if our Session can't download it. + self._session.get_adapter(req.link.url) + except InvalidSchema: + can_simply_download = False + if can_simply_download: + links_to_fully_download[req.link] = req batch_download = self._batch_download( links_to_fully_download.keys(), @@ -685,7 +838,7 @@ def _ensure_local_file_path( req.local_file_path = local_file.path def _prepare_and_finalize_dist(self, req: InstallRequirement) -> BaseDistribution: - dist = _get_prepared_distribution( + (builds_metadata, dist) = _get_prepared_distribution( req, self.build_tracker, self.finder, @@ -695,6 +848,10 @@ def _prepare_and_finalize_dist(self, req: InstallRequirement) -> BaseDistributio assert dist.is_concrete assert req.is_concrete assert req.get_dist() is dist + + if builds_metadata and should_cache(req): + self._cache_metadata(req, dist) + return dist def _prepare_linked_requirement( diff --git a/src/pip/_internal/req/constructors.py b/src/pip/_internal/req/constructors.py index d73236e05c6..6055fd577f9 100644 --- a/src/pip/_internal/req/constructors.py +++ b/src/pip/_internal/req/constructors.py @@ -552,7 +552,7 @@ def install_req_extend_extras( """ result = copy.copy(ireq) result.extras = {*ireq.extras, *extras} - result.req = ( + result._req = ( _set_requirement_extras(ireq.req, result.extras) if ireq.req is not None else None diff --git a/src/pip/_internal/req/req_install.py b/src/pip/_internal/req/req_install.py index a5800c9e3bd..69315a9088e 100644 --- a/src/pip/_internal/req/req_install.py +++ b/src/pip/_internal/req/req_install.py @@ -97,7 +97,7 @@ def __init__( permit_editable_wheels: bool = False, ) -> None: assert req is None or isinstance(req, Requirement), req - self.req = req + self._req = req self.comes_from = comes_from self.constraint = constraint self.editable = editable @@ -210,6 +210,23 @@ def __init__( # This requirement needs to be unpacked before it can be installed. self._archive_source: Optional[Path] = None + @property + def req(self) -> Optional[Requirement]: + """Calculate a requirement from the cached dist, if populated. 
+ + The cached dist can be populated by either + ``self.cache_virtual_metadata_only_dist()`` or + ``self.cache_concrete_dist()`` and can also be retrieved with + ``self.get_dist()``.""" + if self._req is not None: + return self._req + if self._dist is not None: + name = self._dist.canonical_name + version = str(self._dist.version) + self._req = Requirement(f"{name}=={version}") + return self._req + return None + def __str__(self) -> str: if self.req: s = redact_auth_from_requirement(self.req) @@ -399,7 +416,7 @@ def ensure_build_location( def _set_requirement(self) -> None: """Set requirement after generating metadata.""" - assert self.req is None + assert self._req is None assert self.metadata is not None assert self.source_dir is not None @@ -409,7 +426,7 @@ def _set_requirement(self) -> None: else: op = "===" - self.req = get_requirement( + self._req = get_requirement( "".join( [ self.metadata["Name"], @@ -435,7 +452,7 @@ def warn_on_mismatching_name(self) -> None: metadata_name, self.name, ) - self.req = get_requirement(metadata_name) + self._req = get_requirement(metadata_name) def check_if_exists(self, use_user_site: bool) -> None: """Find an installed distribution that satisfies or conflicts diff --git a/src/pip/_internal/req/req_set.py b/src/pip/_internal/req/req_set.py index ec7a6e07a25..1bec85cc409 100644 --- a/src/pip/_internal/req/req_set.py +++ b/src/pip/_internal/req/req_set.py @@ -43,7 +43,7 @@ def add_unnamed_requirement(self, install_req: InstallRequirement) -> None: self.unnamed_requirements.append(install_req) def add_named_requirement(self, install_req: InstallRequirement) -> None: - assert install_req.name + assert install_req.name, install_req project_name = canonicalize_name(install_req.name) self.requirements[project_name] = install_req diff --git a/src/pip/_internal/wheel_builder.py b/src/pip/_internal/wheel_builder.py index 93f8e1f5b2f..efe7a54bd1e 100644 --- a/src/pip/_internal/wheel_builder.py +++ b/src/pip/_internal/wheel_builder.py @@ -3,7 +3,6 @@ import logging import os.path -import re import shutil from typing import Iterable, List, Optional, Tuple @@ -25,23 +24,12 @@ from pip._internal.utils.subprocess import call_subprocess from pip._internal.utils.temp_dir import TempDirectory from pip._internal.utils.urls import path_to_url -from pip._internal.vcs import vcs logger = logging.getLogger(__name__) -_egg_info_re = re.compile(r"([a-z0-9_.]+)-([a-z0-9_.!+-]+)", re.IGNORECASE) - BuildResult = Tuple[List[InstallRequirement], List[InstallRequirement]] -def _contains_egg_info(s: str) -> bool: - """Determine whether the string looks like an egg_info. - - :param s: The string to parse. E.g. foo-2.1 - """ - return bool(_egg_info_re.search(s)) - - def _should_build( req: InstallRequirement, need_wheel: bool, @@ -87,54 +75,6 @@ def should_build_for_install_command( return _should_build(req, need_wheel=False) -def _should_cache( - req: InstallRequirement, -) -> Optional[bool]: - """ - Return whether a built InstallRequirement can be stored in the persistent - wheel cache, assuming the wheel cache is available, and _should_build() - has determined a wheel needs to be built. - """ - if req.editable or not req.source_dir: - # never cache editable requirements - return False - - if req.link and req.link.is_vcs: - # VCS checkout. Do not cache - # unless it points to an immutable commit hash. 
- assert not req.editable - assert req.source_dir - vcs_backend = vcs.get_backend_for_scheme(req.link.scheme) - assert vcs_backend - if vcs_backend.is_immutable_rev_checkout(req.link.url, req.source_dir): - return True - return False - - assert req.link - base, ext = req.link.splitext() - if _contains_egg_info(base): - return True - - # Otherwise, do not cache. - return False - - -def _get_cache_dir( - req: InstallRequirement, - wheel_cache: WheelCache, -) -> str: - """Return the persistent or temporary cache directory where the built - wheel need to be stored. - """ - cache_available = bool(wheel_cache.cache_dir) - assert req.link - if cache_available and _should_cache(req): - cache_dir = wheel_cache.get_path_for_link(req.link) - else: - cache_dir = wheel_cache.get_ephem_path_for_link(req.link) - return cache_dir - - def _verify_one(req: InstallRequirement, wheel_path: str) -> None: canonical_name = canonicalize_name(req.name or "") w = Wheel(os.path.basename(wheel_path)) @@ -315,7 +255,7 @@ def build( build_successes, build_failures = [], [] for req in requirements: assert req.name - cache_dir = _get_cache_dir(req, wheel_cache) + cache_dir = wheel_cache.resolve_cache_dir(req) wheel_file = _build_one( req, cache_dir, diff --git a/tests/functional/test_check.py b/tests/functional/test_check.py index f50f5593e5c..9e59a4af3d0 100644 --- a/tests/functional/test_check.py +++ b/tests/functional/test_check.py @@ -123,10 +123,7 @@ def test_check_complicated_name_missing(script: PipTestEnvironment) -> None: # Without dependency result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert ( - "Successfully installed package_A-1.0" in result.stdout - or "Successfully installed package-A-1.0" in result.stdout - ), str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip("check", expect_error=True) expected_lines = ("package-a 1.0 requires dependency-b, which is not installed.",) @@ -149,10 +146,7 @@ def test_check_complicated_name_broken(script: PipTestEnvironment) -> None: # With broken dependency result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert ( - "Successfully installed package_A-1.0" in result.stdout - or "Successfully installed package-A-1.0" in result.stdout - ), str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip( "install", @@ -185,10 +179,7 @@ def test_check_complicated_name_clean(script: PipTestEnvironment) -> None: ) result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert ( - "Successfully installed package_A-1.0" in result.stdout - or "Successfully installed package-A-1.0" in result.stdout - ), str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip( "install", @@ -216,10 +207,7 @@ def test_check_considers_conditional_reqs(script: PipTestEnvironment) -> None: ) result = script.pip("install", "--no-index", package_a_path, "--no-deps") - assert ( - "Successfully installed package_A-1.0" in result.stdout - or "Successfully installed package-A-1.0" in result.stdout - ), str(result) + assert "Successfully installed package-a-1.0" in result.stdout, str(result) result = script.pip("check", expect_error=True) expected_lines = ("package-a 1.0 requires dependency-b, which is not installed.",) diff --git a/tests/functional/test_install.py b/tests/functional/test_install.py index 8c30a69b515..c4fc5685607 100644 --- a/tests/functional/test_install.py +++ 
b/tests/functional/test_install.py @@ -2094,7 +2094,7 @@ def test_install_conflict_results_in_warning( # Install pkgA without its dependency result1 = script.pip("install", "--no-index", pkgA_path, "--no-deps") - assert "Successfully installed pkgA-1.0" in result1.stdout, str(result1) + assert "Successfully installed pkga-1.0" in result1.stdout, str(result1) # Then install an incorrect version of the dependency result2 = script.pip( @@ -2104,7 +2104,7 @@ def test_install_conflict_results_in_warning( allow_stderr_error=True, ) assert "pkga 1.0 requires pkgb==1.0" in result2.stderr, str(result2) - assert "Successfully installed pkgB-2.0" in result2.stdout, str(result2) + assert "Successfully installed pkgb-2.0" in result2.stdout, str(result2) def test_install_conflict_warning_can_be_suppressed( @@ -2124,11 +2124,11 @@ def test_install_conflict_warning_can_be_suppressed( # Install pkgA without its dependency result1 = script.pip("install", "--no-index", pkgA_path, "--no-deps") - assert "Successfully installed pkgA-1.0" in result1.stdout, str(result1) + assert "Successfully installed pkga-1.0" in result1.stdout, str(result1) # Then install an incorrect version of the dependency; suppressing warning result2 = script.pip("install", "--no-index", pkgB_path, "--no-warn-conflicts") - assert "Successfully installed pkgB-2.0" in result2.stdout, str(result2) + assert "Successfully installed pkgb-2.0" in result2.stdout, str(result2) def test_target_install_ignores_distutils_config_install_prefix( diff --git a/tests/functional/test_install_check.py b/tests/functional/test_install_check.py index 8a8a7c93a80..d16165e03f4 100644 --- a/tests/functional/test_install_check.py +++ b/tests/functional/test_install_check.py @@ -28,7 +28,7 @@ def test_check_install_canonicalization(script: PipTestEnvironment) -> None: # Let's install pkgA without its dependency result = script.pip("install", "--no-index", pkga_path, "--no-deps") - assert "Successfully installed pkgA-1.0" in result.stdout, str(result) + assert "Successfully installed pkga-1.0" in result.stdout, str(result) # Install the first missing dependency. Only an error for the # second dependency should remain. diff --git a/tests/functional/test_install_metadata.py b/tests/functional/test_install_metadata.py index a7c4dd51807..fd04adf4e8d 100644 --- a/tests/functional/test_install_metadata.py +++ b/tests/functional/test_install_metadata.py @@ -210,7 +210,10 @@ def test_canonicalizes_package_name_before_verifying_metadata( [ # It's important that we verify pip won't even attempt to fetch the file, so we # construct an input that will cause it to error if it tries at all. 
- ("complex-dist==0.1", "404 Client Error: FileNotFoundError"), + ( + "complex-dist==0.1", + "Could not install packages due to an OSError: [Errno 2] No such file or directory", # noqa: E501 + ), ("corruptwheel==1.0", ".whl is invalid."), ], ) diff --git a/tests/functional/test_install_reqs.py b/tests/functional/test_install_reqs.py index b1aed6ad3f4..81668d9e9da 100644 --- a/tests/functional/test_install_reqs.py +++ b/tests/functional/test_install_reqs.py @@ -614,7 +614,7 @@ def test_install_distribution_full_union( result = script.pip_install_local( to_install, f"{to_install}[bar]", f"{to_install}[baz]" ) - assert "Building wheel for LocalExtras" in result.stdout + assert "Building wheel for localextras" in result.stdout result.did_create(script.site_packages / "simple") result.did_create(script.site_packages / "singlemodule.py") diff --git a/tests/functional/test_wheel.py b/tests/functional/test_wheel.py index 1bddd40dc41..69825899df7 100644 --- a/tests/functional/test_wheel.py +++ b/tests/functional/test_wheel.py @@ -285,7 +285,7 @@ def test_wheel_package_with_latin1_setup( pkg_to_wheel = data.packages.joinpath("SetupPyLatin1") result = script.pip("wheel", pkg_to_wheel) - assert "Successfully built SetupPyUTF8" in result.stdout + assert "Successfully built setuppyutf8" in result.stdout def test_pip_wheel_with_pep518_build_reqs( diff --git a/tests/unit/test_cache.py b/tests/unit/test_cache.py index 30cdb6ebece..8eb51d5f889 100644 --- a/tests/unit/test_cache.py +++ b/tests/unit/test_cache.py @@ -1,13 +1,32 @@ import os from pathlib import Path +import pytest from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version -from pip._internal.cache import WheelCache, _hash_dict +from pip._internal.cache import WheelCache, _contains_egg_info, _hash_dict from pip._internal.models.link import Link from pip._internal.utils.misc import ensure_dir +@pytest.mark.parametrize( + "s, expected", + [ + # Trivial. + ("pip-18.0", True), + # Ambiguous. + ("foo-2-2", True), + ("im-valid", True), + # Invalid. 
+ ("invalid", False), + ("im_invalid", False), + ], +) +def test_contains_egg_info(s: str, expected: bool) -> None: + result = _contains_egg_info(s) + assert result == expected + + def test_falsey_path_none() -> None: wc = WheelCache("") assert wc.cache_dir is None diff --git a/tests/unit/test_wheel_builder.py b/tests/unit/test_wheel_builder.py index d5f372dd5cd..eb23d74ea94 100644 --- a/tests/unit/test_wheel_builder.py +++ b/tests/unit/test_wheel_builder.py @@ -6,7 +6,8 @@ import pytest -from pip._internal import wheel_builder +from pip._internal import cache, wheel_builder +from pip._internal.cache import _contains_egg_info from pip._internal.models.link import Link from pip._internal.operations.build.wheel_legacy import format_command_result from pip._internal.req.req_install import InstallRequirement @@ -28,7 +29,7 @@ ], ) def test_contains_egg_info(s: str, expected: bool) -> None: - result = wheel_builder._contains_egg_info(s) + result = _contains_egg_info(s) assert result == expected @@ -116,7 +117,7 @@ def test_should_build_for_wheel_command(req: ReqMock, expected: bool) -> None: ], ) def test_should_cache(req: ReqMock, expected: bool) -> None: - assert wheel_builder._should_cache(cast(InstallRequirement, req)) is expected + assert cache.should_cache(cast(InstallRequirement, req)) is expected def test_should_cache_git_sha(tmpdir: Path) -> None: @@ -126,12 +127,12 @@ def test_should_cache_git_sha(tmpdir: Path) -> None: # a link referencing a sha should be cached url = "git+https://g.c/o/r@" + commit + "#egg=mypkg" req = ReqMock(link=Link(url), source_dir=repo_path) - assert wheel_builder._should_cache(cast(InstallRequirement, req)) + assert cache.should_cache(cast(InstallRequirement, req)) # a link not referencing a sha should not be cached url = "git+https://g.c/o/r@master#egg=mypkg" req = ReqMock(link=Link(url), source_dir=repo_path) - assert not wheel_builder._should_cache(cast(InstallRequirement, req)) + assert not cache.should_cache(cast(InstallRequirement, req)) def test_format_command_result__INFO(caplog: pytest.LogCaptureFixture) -> None: From f28ecfd27ee7f3dfd308e49cb012cd10cecb6ba9 Mon Sep 17 00:00:00 2001 From: Danny McClanahan <1305167+cosmicexplorer@users.noreply.github.com> Date: Wed, 9 Aug 2023 20:09:20 -0400 Subject: [PATCH 5/5] make FetchResolveCache - pipe in headers arg - provide full context in Link.comes_from - pull in etag and date and cache the outputs - handle --no-cache-dir - add NEWS - remove quotes from etag and use binary checksum to save a few bytes - parse http modified date to compress the cached representation --- news/12257.feature.rst | 1 + src/pip/_internal/cache.py | 32 ++- src/pip/_internal/cli/req_command.py | 7 +- src/pip/_internal/index/collector.py | 50 +++-- src/pip/_internal/index/package_finder.py | 244 +++++++++++++++++++++- src/pip/_internal/models/link.py | 6 +- tests/unit/test_collector.py | 12 +- 7 files changed, 316 insertions(+), 36 deletions(-) create mode 100644 news/12257.feature.rst diff --git a/news/12257.feature.rst b/news/12257.feature.rst new file mode 100644 index 00000000000..21ea8cf5942 --- /dev/null +++ b/news/12257.feature.rst @@ -0,0 +1 @@ +Store HTTP caching headers in ``~/.cache/pip/fetch-resolve`` to reduce bandwidth usage when ``--use-feature=metadata-cache`` is enabled. 
diff --git a/src/pip/_internal/cache.py b/src/pip/_internal/cache.py index 534d0e866b6..bed8b1b68e4 100644 --- a/src/pip/_internal/cache.py +++ b/src/pip/_internal/cache.py @@ -92,7 +92,9 @@ def __init__(self, cache_dir: str) -> None: assert not cache_dir or os.path.isabs(cache_dir) self.cache_dir = cache_dir or None - def _get_cache_path_parts(self, link: Link) -> List[str]: + def _get_cache_path_parts( + self, link: Link, *, interpreter_dependent: bool + ) -> List[str]: """Get parts of part that must be os.path.joined with cache_dir""" # We want to generate an url to use as our cache key, we don't want to @@ -104,13 +106,14 @@ def _get_cache_path_parts(self, link: Link) -> List[str]: if link.subdirectory_fragment: key_parts["subdirectory"] = link.subdirectory_fragment - # Include interpreter name, major and minor version in cache key - # to cope with ill-behaved sdists that build a different wheel - # depending on the python version their setup.py is being run on, - # and don't encode the difference in compatibility tags. - # https://github.com/pypa/pip/issues/7296 - key_parts["interpreter_name"] = interpreter_name() - key_parts["interpreter_version"] = interpreter_version() + if interpreter_dependent: + # Include interpreter name, major and minor version in cache key + # to cope with ill-behaved sdists that build a different wheel + # depending on the python version their setup.py is being run on, + # and don't encode the difference in compatibility tags. + # https://github.com/pypa/pip/issues/7296 + key_parts["interpreter_name"] = interpreter_name() + key_parts["interpreter_version"] = interpreter_version() # Encode our key url with sha224, we'll use this because it has similar # security properties to sha256, but with a shorter total output (and @@ -138,11 +141,20 @@ class LinkMetadataCache(Cache): """Persistently store the metadata of dists found at each link.""" def get_path_for_link(self, link: Link) -> str: - parts = self._get_cache_path_parts(link) + parts = self._get_cache_path_parts(link, interpreter_dependent=True) assert self.cache_dir return os.path.join(self.cache_dir, "link-metadata", *parts) +class FetchResolveCache(Cache): + def get_path_for_link(self, link: Link) -> str: + # We are reading index links to extract other links from, not executing any + # python code, so these caches are interpreter-independent. + parts = self._get_cache_path_parts(link, interpreter_dependent=False) + assert self.cache_dir + return os.path.join(self.cache_dir, "fetch-resolve", *parts) + + class WheelCacheBase(Cache): """Specializations to the cache concept for wheels.""" @@ -197,7 +209,7 @@ def get_path_for_link(self, link: Link) -> str: :param link: The link of the sdist for which this will cache wheels. 
""" - parts = self._get_cache_path_parts(link) + parts = self._get_cache_path_parts(link, interpreter_dependent=True) assert self.cache_dir # Store wheels within the root cache_dir return os.path.join(self.cache_dir, "wheels", *parts) diff --git a/src/pip/_internal/cli/req_command.py b/src/pip/_internal/cli/req_command.py index c4355670546..a335bd84dbc 100644 --- a/src/pip/_internal/cli/req_command.py +++ b/src/pip/_internal/cli/req_command.py @@ -10,7 +10,7 @@ from optparse import Values from typing import Any, List, Optional, Tuple -from pip._internal.cache import LinkMetadataCache, WheelCache +from pip._internal.cache import FetchResolveCache, LinkMetadataCache, WheelCache from pip._internal.cli import cmdoptions from pip._internal.cli.index_command import IndexGroupCommand from pip._internal.cli.index_command import SessionCommandMixin as SessionCommandMixin @@ -333,8 +333,13 @@ def _build_package_finder( ignore_requires_python=ignore_requires_python, ) + if bool(options.cache_dir) and ("metadata-cache" in options.features_enabled): + fetch_resolve_cache = FetchResolveCache(options.cache_dir) + else: + fetch_resolve_cache = None return PackageFinder.create( link_collector=link_collector, selection_prefs=selection_prefs, target_python=target_python, + fetch_resolve_cache=fetch_resolve_cache, ) diff --git a/src/pip/_internal/index/collector.py b/src/pip/_internal/index/collector.py index 5f8fdee3d46..ee616fcf42d 100644 --- a/src/pip/_internal/index/collector.py +++ b/src/pip/_internal/index/collector.py @@ -92,7 +92,9 @@ class _NotHTTP(Exception): pass -def _ensure_api_response(url: str, session: PipSession) -> None: +def _ensure_api_response( + url: str, session: PipSession, headers: Optional[Dict[str, str]] = None +) -> None: """ Send a HEAD request to the URL, and ensure the response contains a simple API Response. @@ -104,13 +106,15 @@ def _ensure_api_response(url: str, session: PipSession) -> None: if scheme not in {"http", "https"}: raise _NotHTTP() - resp = session.head(url, allow_redirects=True) + resp = session.head(url, allow_redirects=True, headers=headers) raise_for_status(resp) _ensure_api_header(resp) -def _get_simple_response(url: str, session: PipSession) -> Response: +def _get_simple_response( + url: str, session: PipSession, headers: Optional[Dict[str, str]] = None +) -> Response: """Access an Simple API response with GET, and return the response. This consists of three parts: @@ -124,10 +128,13 @@ def _get_simple_response(url: str, session: PipSession) -> Response: and raise `_NotAPIContent` otherwise. """ if is_archive_file(Link(url).filename): - _ensure_api_response(url, session=session) + _ensure_api_response(url, session=session, headers=headers) logger.debug("Getting page %s", redact_auth_from_url(url)) + logger.debug("headers: %s", str(headers)) + if headers is None: + headers = {} resp = session.get( url, headers={ @@ -152,6 +159,7 @@ def _get_simple_response(url: str, session: PipSession) -> Response: # once per 10 minutes. # For more information, please see pypa/pip#5670. 
"Cache-Control": "max-age=0", + **headers, }, ) raise_for_status(resp) @@ -230,7 +238,7 @@ def parse_links(page: "IndexContent") -> Iterable[Link]: if content_type_l.startswith("application/vnd.pypi.simple.v1+json"): data = json.loads(page.content) for file in data.get("files", []): - link = Link.from_json(file, page.url) + link = Link.from_json(file, page.url, page_content=page) if link is None: continue yield link @@ -243,7 +251,9 @@ def parse_links(page: "IndexContent") -> Iterable[Link]: url = page.url base_url = parser.base_url or url for anchor in parser.anchors: - link = Link.from_element(anchor, page_url=url, base_url=base_url) + link = Link.from_element( + anchor, page_url=url, base_url=base_url, page_content=page + ) if link is None: continue yield link @@ -258,6 +268,8 @@ class IndexContent: :param cache_link_parsing: whether links parsed from this page's url should be cached. PyPI index urls should have this set to False, for example. + :param etag: The ``ETag`` header from an HTTP request against ``url``. + :param date: The ``Date`` header from an HTTP request against ``url``. """ content: bytes @@ -265,6 +277,8 @@ class IndexContent: encoding: Optional[str] url: str cache_link_parsing: bool = True + etag: Optional[str] = None + date: Optional[str] = None def __str__(self) -> str: return redact_auth_from_url(self.url) @@ -309,7 +323,8 @@ def _handle_get_simple_fail( def _make_index_content( - response: Response, cache_link_parsing: bool = True + response: Response, + cache_link_parsing: bool = True, ) -> IndexContent: encoding = _get_encoding_from_headers(response.headers) return IndexContent( @@ -318,11 +333,15 @@ def _make_index_content( encoding=encoding, url=response.url, cache_link_parsing=cache_link_parsing, + etag=response.headers.get("ETag", None), + date=response.headers.get("Date", None), ) -def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexContent"]: - url = link.url.split("#", 1)[0] +def _get_index_content( + link: Link, *, session: PipSession, headers: Optional[Dict[str, str]] = None +) -> Optional["IndexContent"]: + url = link.url_without_fragment # Check for VCS schemes that do not support lookup as web pages. vcs_scheme = _match_vcs_scheme(url) @@ -349,7 +368,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon logger.debug(" file: URL is directory, getting %s", url) try: - resp = _get_simple_response(url, session=session) + resp = _get_simple_response(url, session=session, headers=headers) except _NotHTTP: logger.warning( "Skipping page %s because it looks like an archive, and cannot " @@ -365,9 +384,7 @@ def _get_index_content(link: Link, *, session: PipSession) -> Optional["IndexCon exc.request_desc, exc.content_type, ) - except NetworkConnectionError as exc: - _handle_get_simple_fail(link, exc) - except RetryError as exc: + except (NetworkConnectionError, RetryError) as exc: _handle_get_simple_fail(link, exc) except SSLError as exc: reason = "There was a problem confirming the ssl certificate: " @@ -441,11 +458,14 @@ def create( def find_links(self) -> List[str]: return self.search_scope.find_links - def fetch_response(self, location: Link) -> Optional[IndexContent]: + def fetch_response( + self, location: Link, headers: Optional[Dict[str, str]] = None + ) -> Optional[IndexContent]: """ Fetch an HTML page containing package links. 
""" - return _get_index_content(location, session=self.session) + logger.debug("headers: %s", str(headers)) + return _get_index_content(location, session=self.session, headers=headers) def collect_sources( self, diff --git a/src/pip/_internal/index/package_finder.py b/src/pip/_internal/index/package_finder.py index 0d65ce35f37..439dacf3e92 100644 --- a/src/pip/_internal/index/package_finder.py +++ b/src/pip/_internal/index/package_finder.py @@ -1,12 +1,28 @@ """Routines related to PyPI, indexes""" +import binascii +import datetime import enum import functools import itertools import logging +import os import re +import time from dataclasses import dataclass -from typing import TYPE_CHECKING, FrozenSet, Iterable, List, Optional, Set, Tuple, Union +from hashlib import sha256 +from pathlib import Path +from typing import ( + TYPE_CHECKING, + Dict, + FrozenSet, + Iterable, + List, + Optional, + Set, + Tuple, + Union, +) from pip._vendor.packaging import specifiers from pip._vendor.packaging.tags import Tag @@ -14,13 +30,14 @@ from pip._vendor.packaging.version import InvalidVersion, _BaseVersion from pip._vendor.packaging.version import parse as parse_version +from pip._internal.cache import FetchResolveCache from pip._internal.exceptions import ( BestVersionAlreadyInstalled, DistributionNotFound, InvalidWheelFilename, UnsupportedWheel, ) -from pip._internal.index.collector import LinkCollector, parse_links +from pip._internal.index.collector import IndexContent, LinkCollector, parse_links from pip._internal.models.candidate import InstallationCandidate from pip._internal.models.format_control import FormatControl from pip._internal.models.link import Link @@ -592,6 +609,7 @@ def __init__( format_control: Optional[FormatControl] = None, candidate_prefs: Optional[CandidatePreferences] = None, ignore_requires_python: Optional[bool] = None, + fetch_resolve_cache: Optional[FetchResolveCache] = None, ) -> None: """ This constructor is primarily meant to be used by the create() class @@ -619,6 +637,8 @@ def __init__( # These are boring links that have already been logged somehow. self._logged_links: Set[Tuple[Link, LinkType, str]] = set() + self._fetch_resolve_cache = fetch_resolve_cache + # Don't include an allow_yanked default value to make sure each call # site considers whether yanked releases are allowed. This also causes # that decision to be made explicit in the calling code, which helps @@ -629,6 +649,7 @@ def create( link_collector: LinkCollector, selection_prefs: SelectionPreferences, target_python: Optional[TargetPython] = None, + fetch_resolve_cache: Optional[FetchResolveCache] = None, ) -> "PackageFinder": """Create a PackageFinder. 
@@ -653,6 +674,7 @@ def create( allow_yanked=selection_prefs.allow_yanked, format_control=selection_prefs.format_control, ignore_requires_python=selection_prefs.ignore_requires_python, + fetch_resolve_cache=fetch_resolve_cache, ) @property @@ -774,18 +796,232 @@ def evaluate_links( return candidates - def process_project_url( + _HTTP_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %Z" + + @classmethod + def _try_load_http_cache_headers( + cls, + etag_path: Path, + date_path: Path, + checksum_path: Path, + project_url: Link, + headers: Dict[str, str], + ) -> Tuple[Optional[str], Optional[datetime.datetime], Optional[bytes]]: + etag: Optional[str] = None + try: + etag = etag_path.read_text() + etag = f'"{etag}"' + logger.debug( + "found cached etag for url %s at %s: %s", + project_url, + etag_path, + etag, + ) + headers["If-None-Match"] = etag + except OSError as e: + logger.debug("no etag found for url %s (%s)", project_url, str(e)) + + date: Optional[datetime.datetime] = None + try: + date_bytes = date_path.read_bytes() + date_int = int.from_bytes(date_bytes, byteorder="big", signed=False) + date = datetime.datetime.fromtimestamp(date_int, tz=datetime.timezone.utc) + logger.debug( + "found cached date for url %s at %s: '%s'", + project_url, + date_path, + date, + ) + headers["If-Modified-Since"] = date.strftime(cls._HTTP_DATE_FORMAT) + except OSError as e: + logger.debug("no date found for url %s (%s)", project_url, str(e)) + + checksum: Optional[bytes] = None + try: + checksum = checksum_path.read_bytes() + logger.debug( + "found checksum for url %s at %s: '%s'", + project_url, + checksum_path, + binascii.b2a_base64(checksum, newline=False).decode("ascii"), + ) + except OSError as e: + logger.debug("no checksum found for url %s (%s)", project_url, str(e)) + + return (etag, date, checksum) + + _quoted_value = re.compile(r'^"([^"]*)"$') + + @classmethod + def _strip_quoted_value(cls, value: str) -> str: + return cls._quoted_value.sub(r"\1", value) + + _now_local = datetime.datetime.now().astimezone() + _local_tz = _now_local.tzinfo + assert _local_tz is not None + _local_tz_name = _local_tz.tzname(_now_local) + + @classmethod + def _write_http_cache_info( + cls, + etag_path: Path, + date_path: Path, + checksum_path: Path, + project_url: Link, + index_response: IndexContent, + prev_etag: Optional[str], + prev_checksum: Optional[bytes], + ) -> Tuple[Optional[str], Optional[datetime.datetime], bytes, bool]: + hasher = sha256() + hasher.update(index_response.content) + new_checksum = hasher.digest() + checksum_path.write_bytes(new_checksum) + page_unmodified = new_checksum == prev_checksum + + new_etag: Optional[str] = index_response.etag + if new_etag is None: + logger.debug("no etag returned from fetch for url %s", project_url.url) + try: + etag_path.unlink() + except OSError: + pass + else: + new_etag = cls._strip_quoted_value(new_etag) + if new_etag != prev_etag: + logger.debug( + "etag for url %s updated from %s -> %s", + project_url.url, + prev_etag, + new_etag, + ) + etag_path.write_text(new_etag) + else: + logger.debug( + "etag was unmodified for url %s (%s)", project_url.url, prev_etag + ) + assert page_unmodified + + new_date: Optional[datetime.datetime] = None + date_str: Optional[str] = index_response.date + if date_str is None: + logger.debug( + "no date header was provided in response for url %s", project_url + ) + else: + date_str = date_str.strip() + new_time = time.strptime(date_str, cls._HTTP_DATE_FORMAT) + new_date = datetime.datetime.strptime(date_str, cls._HTTP_DATE_FORMAT) + # 
strptime() doesn't set the timezone according to the parsed %Z arg, which
+            # may be any of "UTC", "GMT", or any element of `time.tzname`.
+            if new_time.tm_zone in ["UTC", "GMT"]:
+                logger.debug(
+                    "a UTC timezone was found in response for url %s", project_url
+                )
+                new_date = new_date.replace(tzinfo=datetime.timezone.utc)
+            else:
+                assert new_time.tm_zone in time.tzname, new_time
+                logger.debug(
+                    "a local timezone %s was found in response for url %s",
+                    new_time.tm_zone,
+                    project_url,
+                )
+                if new_time.tm_zone == cls._local_tz_name:
+                    new_date = new_date.replace(tzinfo=cls._local_tz)
+                else:
+                    logger.debug(
+                        "a local timezone %s had to be discarded in response for url %s",
+                        new_time.tm_zone,
+                        project_url,
+                    )
+                    new_date = None
+
+        if new_date is not None:
+            timestamp = new_date.timestamp()
+            # The timestamp will only have second resolution according to the parse
+            # format string _HTTP_DATE_FORMAT.
+            assert not (timestamp % 1), (new_date, timestamp)
+            epoch = int(timestamp)
+            assert epoch >= 0, (new_date, timestamp, epoch)
+            date_bytes = epoch.to_bytes(length=4, byteorder="big", signed=False)
+            date_path.write_bytes(date_bytes)
+
+            logger.debug('date "%s" written for url %s', new_date, project_url)
+        if new_date is None:
+            try:
+                date_path.unlink()
+            except OSError:
+                pass
+
+        return (new_etag, new_date, new_checksum, page_unmodified)
+
+    def _process_project_url_uncached(
         self, project_url: Link, link_evaluator: LinkEvaluator
     ) -> List[InstallationCandidate]:
         logger.debug(
             "Fetching project page and analyzing links: %s",
             project_url,
         )
+
         index_response = self._link_collector.fetch_response(project_url)
         if index_response is None:
             return []
 
-        page_links = list(parse_links(index_response))
+        page_links = parse_links(index_response)
+
+        with indent_log():
+            package_links = self.evaluate_links(link_evaluator, links=page_links)
+            return package_links
+
+    @functools.lru_cache(maxsize=None)
+    def process_project_url(
+        self, project_url: Link, link_evaluator: LinkEvaluator
+    ) -> List[InstallationCandidate]:
+        if self._fetch_resolve_cache is None:
+            return self._process_project_url_uncached(project_url, link_evaluator)
+
+        cached_path = self._fetch_resolve_cache.cache_path(project_url)
+        os.makedirs(str(cached_path), exist_ok=True)
+
+        etag_path = cached_path / "etag"
+        date_path = cached_path / "modified-since-date"
+        checksum_path = cached_path / "checksum"
+
+        headers: Dict[str, str] = {}
+        # NB: mutates headers!
+        prev_etag, _prev_date, prev_checksum = self._try_load_http_cache_headers(
+            etag_path, date_path, checksum_path, project_url, headers
+        )
+
+        logger.debug(
+            "Fetching project page and analyzing links: %s",
+            project_url,
+        )
+
+        # A 304 Not Modified is implicitly converted into a reused cached response
+        # by the vendored CacheControl library, so we don't check for a 304 here.
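
To make the ``%Z`` timezone caveat documented in ``_write_http_cache_info()`` above concrete, the following self-contained snippet (illustrative only: it assumes stdlib calls, a C/English locale, and a platform that populates ``tm_zone``) walks an HTTP ``Date`` value through the same parse-then-encode steps:

    import datetime
    import time

    HTTP_DATE_FORMAT = "%a, %d %b %Y %H:%M:%S %Z"
    date_str = "Sat, 20 Jul 2024 18:13:50 GMT"

    # time.strptime() records the matched zone name in tm_zone, while
    # datetime.strptime() leaves the result naive, so tzinfo must be
    # attached by hand.
    new_time = time.strptime(date_str, HTTP_DATE_FORMAT)
    assert new_time.tm_zone in ("UTC", "GMT")
    new_date = datetime.datetime.strptime(date_str, HTTP_DATE_FORMAT).replace(
        tzinfo=datetime.timezone.utc
    )

    # HTTP dates carry second resolution, so the timestamp is a whole number
    # and survives the 4-byte big-endian round trip used for the cache file.
    epoch = int(new_date.timestamp())
    encoded = epoch.to_bytes(length=4, byteorder="big", signed=False)
    assert int.from_bytes(encoded, byteorder="big", signed=False) == epoch
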
+        index_response = self._link_collector.fetch_response(
+            project_url,
+            headers=headers,
+        )
+        if index_response is None:
+            return []
+
+        (
+            _new_etag,
+            _new_date,
+            _new_checksum,
+            page_unmodified,
+        ) = self._write_http_cache_info(
+            etag_path,
+            date_path,
+            checksum_path,
+            project_url,
+            index_response,
+            prev_etag=prev_etag,
+            prev_checksum=prev_checksum,
+        )
+
+        page_links = parse_links(index_response)
 
         with indent_log():
             package_links = self.evaluate_links(
diff --git a/src/pip/_internal/models/link.py b/src/pip/_internal/models/link.py
index 2f41f2f6a09..2297a93ccf8 100644
--- a/src/pip/_internal/models/link.py
+++ b/src/pip/_internal/models/link.py
@@ -262,6 +262,7 @@ def from_json(
         cls,
         file_data: Dict[str, Any],
         page_url: str,
+        page_content: Optional["IndexContent"] = None,
     ) -> Optional["Link"]:
         """
         Convert a pypi json document from a simple repository page into a Link.
@@ -301,7 +302,7 @@ def from_json(
 
         return cls(
             url,
-            comes_from=page_url,
+            comes_from=page_content or page_url,
             requires_python=pyrequire,
             yanked_reason=yanked_reason,
             hashes=hashes,
@@ -314,6 +315,7 @@ def from_element(
         anchor_attribs: Dict[str, Optional[str]],
         page_url: str,
         base_url: str,
+        page_content: Optional["IndexContent"] = None,
     ) -> Optional["Link"]:
         """
         Convert an anchor element's attributes in a simple repository page to a Link.
@@ -354,7 +356,7 @@ def from_element(
 
         return cls(
             url,
-            comes_from=page_url,
+            comes_from=page_content or page_url,
             requires_python=pyrequire,
             yanked_reason=yanked_reason,
             metadata_file_data=metadata_file_data,
diff --git a/tests/unit/test_collector.py b/tests/unit/test_collector.py
index 882f82ae4fe..792625099fa 100644
--- a/tests/unit/test_collector.py
+++ b/tests/unit/test_collector.py
@@ -95,7 +95,7 @@ def test_get_simple_response_archive_to_http_scheme(
 
     session.assert_has_calls(
         [
-            mock.call.head(url, allow_redirects=True),
+            mock.call.head(url, allow_redirects=True, headers=None),
         ]
     )
     mock_raise_for_status.assert_called_once_with(session.head.return_value)
@@ -157,7 +157,7 @@ def test_get_simple_response_archive_to_http_scheme_is_html(
 
     assert resp is not None
     assert session.mock_calls == [
-        mock.call.head(url, allow_redirects=True),
+        mock.call.head(url, allow_redirects=True, headers=None),
         mock.call.get(
             url,
             headers={
@@ -245,7 +245,7 @@ def test_get_simple_response_dont_log_clear_text_password(
 
     assert resp is not None
     mock_raise_for_status.assert_called_once_with(resp)
-    assert len(caplog.records) == 2
+    assert len(caplog.records) == 3
     record = caplog.records[0]
     assert record.levelname == "DEBUG"
     assert record.message.splitlines() == [
@@ -253,6 +253,9 @@ def test_get_simple_response_dont_log_clear_text_password(
     ]
     record = caplog.records[1]
    assert record.levelname == "DEBUG"
+    assert record.message.splitlines() == ["headers: None"]
+    record = caplog.records[2]
+    assert record.levelname == "DEBUG"
     assert record.message.splitlines() == [
         "Fetched page https://user:****@example.com/simple/ as text/html",
     ]
@@ -853,7 +856,7 @@ def test_get_index_content_directory_append_index(tmpdir: Path) -> None:
         mock_func.return_value = fake_response
         actual = _get_index_content(Link(dir_url), session=session)
         assert mock_func.mock_calls == [
-            mock.call(expected_url, session=session),
+            mock.call(expected_url, headers=None, session=session),
         ], f"actual calls: {mock_func.mock_calls}"
         assert actual is not None
@@ -969,6 +972,7 @@ def test_fetch_response(self, mock_get_simple_response: mock.Mock) -> None:
         # _get_simple_response().
mock_get_simple_response.assert_called_once_with( url, + headers=None, session=link_collector.session, )
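
For orientation, each project URL now maps to a small cache directory holding ``etag``, ``modified-since-date``, and ``checksum`` files. The sketch below is a simplified, hypothetical model of the read-fetch-compare cycle; only the file names and header semantics come from this patch, while the helper names are invented:

    from hashlib import sha256
    from pathlib import Path
    from typing import Dict, Optional, Tuple

    def load_validators(cache_dir: Path) -> Tuple[Dict[str, str], Optional[bytes]]:
        """Rebuild conditional-request headers from the previous fetch, if any."""
        headers: Dict[str, str] = {}
        try:
            # The etag is stored unquoted; the header value must restore the quotes.
            headers["If-None-Match"] = f'"{(cache_dir / "etag").read_text()}"'
        except OSError:
            pass
        try:
            prev_checksum: Optional[bytes] = (cache_dir / "checksum").read_bytes()
        except OSError:
            prev_checksum = None
        return headers, prev_checksum

    def store_and_compare(
        cache_dir: Path, body: bytes, prev_checksum: Optional[bytes]
    ) -> bool:
        """Persist the new page checksum; report whether the content changed."""
        new_checksum = sha256(body).digest()
        (cache_dir / "checksum").write_bytes(new_checksum)
        return new_checksum != prev_checksum

Even when a server ignores the validators and returns a full 200 response, the checksum comparison still detects an unchanged page, which is the role ``page_unmodified`` plays in ``process_project_url()`` above.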