Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

send HTTP caching headers for index pages to further reduce bandwidth usage #12257

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
make LinkMetadataCache
- catch an exception when parsing metadata which only occurs in CI
- handle --no-cache-dir
- call os.makedirs() before writing to cache too
- catch InvalidSchema when attempting git urls with BatchDownloader
- fix other test failures
- reuse should_cache(req) logic
- gzip compress link metadata for a slight reduction in disk space
- only cache built sdists
- don't check should_cache() when fetching
- cache lazy wheel dists
- add news
- turn debug logs in fetching from cache into exceptions
- use scandir over listdir when searching normal wheel cache
- handle metadata email parsing errors
- correctly handle mutable cached requirement
- use bz2 over gzip for an extremely slight improvement in disk usage
cosmicexplorer committed Aug 13, 2024

Verified

This commit was signed with the committer’s verified signature. The key has expired.
cosmicexplorer Danny McClanahan
commit b02915a14662e628f5bbcefcbdc7fe486e57d093
1 change: 1 addition & 0 deletions news/12256.feature.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Cache computed metadata from sdists and lazy wheels in ``~/.cache/pip/link-metadata`` when ``--use-feature=metadata-cache`` is enabled.
118 changes: 102 additions & 16 deletions src/pip/_internal/cache.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,14 @@
"""Cache Management
"""

import abc
import hashlib
import json
import logging
import os
import re
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Dict, Iterator, List, Optional, Tuple

from pip._vendor.packaging.tags import Tag, interpreter_name, interpreter_version
from pip._vendor.packaging.utils import canonicalize_name
@@ -15,21 +17,71 @@
from pip._internal.models.direct_url import DirectUrl
from pip._internal.models.link import Link
from pip._internal.models.wheel import Wheel
from pip._internal.req.req_install import InstallRequirement
from pip._internal.utils.temp_dir import TempDirectory, tempdir_kinds
from pip._internal.utils.urls import path_to_url
from pip._internal.vcs import vcs

logger = logging.getLogger(__name__)

_egg_info_re = re.compile(r"([a-z0-9_.]+)-([a-z0-9_.!+-]+)", re.IGNORECASE)

ORIGIN_JSON_NAME = "origin.json"


def _contains_egg_info(s: str) -> bool:
    """Report whether ``s`` contains something shaped like an egg_info name.

    :param s: The string to inspect, e.g. ``foo-2.1``.
    :return: True when the module-level ``_egg_info_re`` pattern matches
        anywhere in ``s``, False otherwise.
    """
    return _egg_info_re.search(s) is not None


def should_cache(
    req: InstallRequirement,
) -> bool:
    """
    Return whether a built InstallRequirement can be stored in the persistent
    wheel cache, assuming the wheel cache is available, and _should_build()
    has determined a wheel needs to be built.

    :param req: The requirement to inspect. Only ``link``, ``editable`` and
        ``source_dir`` are read.
    :return: True only for an immutable VCS checkout, or for a non-VCS link
        whose base name looks like an egg_info; False in every other case.
    """
    link = req.link
    if not link or link.is_wheel:
        # No link to key the cache on, or the link is itself a wheel.
        return False

    if req.editable or not req.source_dir:
        # Never cache editable requirements, nor requirements that have no
        # source directory.
        return False

    if link.is_vcs:
        # VCS checkout. Do not cache unless it points to an immutable
        # commit hash.
        vcs_backend = vcs.get_backend_for_scheme(link.scheme)
        assert vcs_backend
        return bool(vcs_backend.is_immutable_rev_checkout(link.url, req.source_dir))

    # Only the base name matters here; the extension is deliberately ignored.
    base, _ = link.splitext()
    return _contains_egg_info(base)


def _hash_dict(d: Dict[str, str]) -> str:
    """Return a stable sha224 hex digest of a dictionary.

    The dictionary is serialized to compact JSON with sorted keys and
    ASCII-only output, so the digest does not depend on insertion order.
    """
    canonical = json.dumps(
        d,
        sort_keys=True,
        separators=(",", ":"),
        ensure_ascii=True,
    )
    digest = hashlib.sha224()
    digest.update(canonical.encode("ascii"))
    return digest.hexdigest()


class Cache:
class Cache(abc.ABC):
"""An abstract class - provides cache directories for data from links

:param cache_dir: The root of the cache.
@@ -73,20 +125,28 @@ def _get_cache_path_parts(self, link: Link) -> List[str]:

return parts

def _get_candidates(self, link: Link, canonical_package_name: str) -> List[Any]:
can_not_cache = not self.cache_dir or not canonical_package_name or not link
if can_not_cache:
return []
@abc.abstractmethod
def get_path_for_link(self, link: Link) -> str:
"""Return a directory to store cached items in for link."""
...

def cache_path(self, link: Link) -> Path:
return Path(self.get_path_for_link(link))

path = self.get_path_for_link(link)
if os.path.isdir(path):
return [(candidate, path) for candidate in os.listdir(path)]
return []

class LinkMetadataCache(Cache):
    """Persistently store the metadata of dists found at each link."""

    def get_path_for_link(self, link: Link) -> str:
        """Return a directory to store cached items in for link.

        The directory is ``<cache_dir>/link-metadata/`` followed by the path
        parts that ``_get_cache_path_parts()`` derives from ``link``.
        """
        parts = self._get_cache_path_parts(link)
        # A metadata cache without a root directory is not usable.
        assert self.cache_dir
        return os.path.join(self.cache_dir, "link-metadata", *parts)


class WheelCacheBase(Cache):
"""Specializations to the cache concept for wheels."""

@abc.abstractmethod
def get(
self,
link: Link,
@@ -96,10 +156,27 @@ def get(
"""Returns a link to a cached item if it exists, otherwise returns the
passed link.
"""
raise NotImplementedError()
...

def _can_cache(self, link: Link, canonical_package_name: str) -> bool:
    """Tell whether a cache lookup is possible for this link and name.

    The cache directory, the canonical package name and the link must all
    be truthy.
    """
    return all((self.cache_dir, canonical_package_name, link))

def _get_candidates(
    self, link: Link, canonical_package_name: str
) -> Iterator[Tuple[str, str]]:
    """Yield ``(file name, directory)`` pairs found in the cache dir for link.

    Nothing is yielded when ``_can_cache()`` rejects the inputs or when the
    directory returned by ``get_path_for_link()`` does not exist.

    :param link: The link whose cache directory is searched.
    :param canonical_package_name: Passed to ``_can_cache()`` to decide
        whether a lookup is possible.
    """
    if not self._can_cache(link, canonical_package_name):
        return

    path = self.get_path_for_link(link)
    if not os.path.isdir(path):
        return

    # Use scandir as a context manager so the directory handle is released
    # even when the caller stops consuming this generator early.
    with os.scandir(path) as entries:
        for candidate in entries:
            if candidate.is_file():
                yield (candidate.name, path)


class SimpleWheelCache(WheelCacheBase):
"""A cache of wheels for future installs."""

def __init__(self, cache_dir: str) -> None:
@@ -131,7 +208,7 @@ def get(
package_name: Optional[str],
supported_tags: List[Tag],
) -> Link:
candidates = []
candidates: List[Tuple[int, str, str]] = []

if not package_name:
return link
@@ -205,7 +282,7 @@ def __init__(
)


class WheelCache(Cache):
class WheelCache(WheelCacheBase):
"""Wraps EphemWheelCache and SimpleWheelCache into a single Cache

This Cache allows for graceful degradation, using the ephem wheel cache
@@ -223,6 +300,15 @@ def get_path_for_link(self, link: Link) -> str:
def get_ephem_path_for_link(self, link: Link) -> str:
return self._ephem_cache.get_path_for_link(link)

def resolve_cache_dir(self, req: InstallRequirement) -> str:
    """Pick the directory in which the wheel for ``req`` should be stored.

    Returns the persistent cache path when a cache directory is configured
    and ``should_cache(req)`` is true; otherwise returns the ephemeral
    cache path. ``req.link`` must be set.
    """
    has_persistent_cache = bool(self.cache_dir)
    link = req.link
    assert link, req
    if has_persistent_cache and should_cache(req):
        path_for = self.get_path_for_link
    else:
        path_for = self.get_ephem_path_for_link
    return path_for(link)

def get(
self,
link: Link,
2 changes: 2 additions & 0 deletions src/pip/_internal/cli/cmdoptions.py
Original file line number Diff line number Diff line change
@@ -1009,6 +1009,8 @@ def check_list_path_option(options: Values) -> None:
default=[],
choices=[
"fast-deps",
"metadata-cache",
"truststore",
]
+ ALWAYS_ENABLED_FEATURES,
help="Enable new functionality, that may be backward incompatible.",
13 changes: 12 additions & 1 deletion src/pip/_internal/cli/req_command.py
Original file line number Diff line number Diff line change
@@ -10,7 +10,7 @@
from optparse import Values
from typing import Any, List, Optional, Tuple

from pip._internal.cache import WheelCache
from pip._internal.cache import LinkMetadataCache, WheelCache
from pip._internal.cli import cmdoptions
from pip._internal.cli.index_command import IndexGroupCommand
from pip._internal.cli.index_command import SessionCommandMixin as SessionCommandMixin
@@ -127,6 +127,16 @@ def make_requirement_preparer(
"fast-deps has no effect when used with the legacy resolver."
)

if options.cache_dir and "metadata-cache" in options.features_enabled:
logger.warning(
"pip is using a local cache for metadata information. "
"This experimental feature is enabled through "
"--use-feature=metadata-cache and it is not ready for "
"production."
)
metadata_cache = LinkMetadataCache(options.cache_dir)
else:
metadata_cache = None
return RequirementPreparer(
build_dir=temp_build_dir_path,
src_dir=options.src_dir,
@@ -142,6 +152,7 @@ def make_requirement_preparer(
lazy_wheel=lazy_wheel,
verbosity=verbosity,
legacy_resolver=legacy_resolver,
metadata_cache=metadata_cache,
)

@classmethod
19 changes: 19 additions & 0 deletions src/pip/_internal/exceptions.py
Original file line number Diff line number Diff line change
@@ -246,6 +246,25 @@ def __str__(self) -> str:
return f"None {self.metadata_name} metadata found for distribution: {self.dist}"


class CacheMetadataError(PipError):
    """Raised when de/serializing a requirement into the metadata cache."""

    def __init__(self, req: "InstallRequirement", reason: str) -> None:
        """Record the requirement and the reason for the failure.

        :param req: The requirement we attempted to cache.
        :param reason: Context about the precise error that occurred.
        """
        self.req = req
        self.reason = reason

    def __str__(self) -> str:
        # Rendered as "<reason> for <requirement> from <requirement's link>".
        req = self.req
        return f"{self.reason} for {req} from {req.link}"


class UserInstallationInvalid(InstallationError):
"""A --user install is requested on an environment without user site."""

10 changes: 9 additions & 1 deletion src/pip/_internal/metadata/__init__.py
Original file line number Diff line number Diff line change
@@ -6,7 +6,14 @@

from pip._internal.utils.misc import strtobool

from .base import BaseDistribution, BaseEnvironment, FilesystemWheel, MemoryWheel, Wheel
from .base import (
BaseDistribution,
BaseEnvironment,
FilesystemWheel,
MemoryWheel,
Wheel,
serialize_metadata,
)

if TYPE_CHECKING:
from typing import Literal, Protocol
@@ -23,6 +30,7 @@
"get_environment",
"get_wheel_distribution",
"select_backend",
"serialize_metadata",
]


15 changes: 15 additions & 0 deletions src/pip/_internal/metadata/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import csv
import email.generator
import email.message
import email.policy
import functools
import io
import json
import logging
import pathlib
@@ -90,6 +93,18 @@ def _convert_installed_files_path(
return str(pathlib.Path(*info, *entry))


def serialize_metadata(msg: email.message.Message) -> str:
    """Render a dist's metadata message as a string.

    Calling ``str(dist.metadata)`` may raise an error by misinterpreting RST
    directives as email headers. This function flattens the message with the
    more robust ``email.policy.EmailPolicy`` to avoid those parsing errors.

    :param msg: The metadata message to serialize.
    :return: The flattened text of ``msg``.
    """
    buffer = io.StringIO()
    generator = email.generator.Generator(
        buffer,
        policy=email.policy.EmailPolicy(),
    )
    generator.flatten(msg)
    return buffer.getvalue()


class RequiresEntry(NamedTuple):
requirement: str
extra: str
2 changes: 1 addition & 1 deletion src/pip/_internal/network/download.py
Original file line number Diff line number Diff line change
@@ -114,7 +114,7 @@ def _get_http_response_filename(resp: Response, link: Link) -> str:


def _http_get_download(session: PipSession, link: Link) -> Response:
    """Issue a streaming GET for ``link`` and return the response.

    The request goes to ``link.url_without_fragment`` with the module-level
    ``HEADERS``. The response is passed through ``raise_for_status()``
    before being returned.

    :param session: The session used to perform the request.
    :param link: The link to download.
    :return: The streaming response object.
    """
    target_url = link.url_without_fragment
    resp = session.get(target_url, headers=HEADERS, stream=True)
    raise_for_status(resp)
    return resp
Loading