Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

import/get: support pre-fetching LFS objects from Git-LFS repos #10072

Merged
Merged (2 commits) on Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion dvc/fs/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

from dvc.config import ConfigError as RepoConfigError
from dvc.config_schema import SCHEMA, Invalid

# pylint: disable=unused-import
from dvc_objects.fs import ( # noqa: F401
LocalFileSystem,
MemoryFileSystem,
Expand All @@ -27,7 +29,7 @@

from .callbacks import Callback
from .data import DataFileSystem # noqa: F401
from .dvc import DVCFileSystem # noqa: F401
from .dvc import DVCFileSystem
from .git import GitFileSystem # noqa: F401

known_implementations.update(
Expand All @@ -47,6 +49,8 @@
def download(
fs: "FileSystem", fs_path: str, to: str, jobs: Optional[int] = None
) -> int:
from dvc.scm import lfs_prefetch

with Callback.as_tqdm_callback(
desc=f"Downloading {fs.path.name(fs_path)}",
unit="files",
Expand All @@ -70,6 +74,8 @@ def download(
from_infos = [fs_path]
to_infos = [to]

if isinstance(fs, DVCFileSystem):
lfs_prefetch(fs, from_infos)
cb.set_size(len(from_infos))
jobs = jobs or fs.jobs
generic.copy(
Expand Down
29 changes: 29 additions & 0 deletions dvc/scm.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@
if TYPE_CHECKING:
from scmrepo.progress import GitProgressEvent

from dvc.fs import FileSystem


class SCMError(DvcException):
"""Base class for source control management errors."""
Expand Down Expand Up @@ -262,3 +264,30 @@ def _time_filter(rev):

rev_resolver = partial(resolve_rev, scm)
return group_by(rev_resolver, results)


def lfs_prefetch(fs: "FileSystem", paths: List[str]):
    """Pre-fetch Git-LFS objects for ``paths`` when ``fs`` is Git-backed.

    This is a no-op unless all of the following hold:

    * ``fs`` is a ``DVCFileSystem`` whose ``repo.fs`` is a ``GitFileSystem``;
    * ``.gitattributes`` can be opened through that Git filesystem;
    * its contents mention ``filter=lfs``.

    Args:
        fs: filesystem the paths were resolved against.
        paths: paths used to restrict the fetch; each one is passed to the
            LFS fetch as an include pattern with a leading ``/``.
    """
    from scmrepo.git.lfs import fetch as _lfs_fetch

    from dvc.fs.dvc import DVCFileSystem
    from dvc.fs.git import GitFileSystem

    if not isinstance(fs, DVCFileSystem):
        return
    if not isinstance(fs.repo.fs, GitFileSystem):
        return

    git_fs = fs.repo.fs
    scm = fs.repo.scm
    assert isinstance(scm, Git)

    try:
        # Read inside a context manager so the file object is always closed,
        # including on the early-return path.
        with git_fs.open(".gitattributes") as fobj:
            if "filter=lfs" not in fobj.read():
                return
    except OSError:
        # Missing or unreadable .gitattributes: skip the prefetch.
        return

    include = [path if path.startswith("/") else f"/{path}" for path in paths]
    with TqdmGit(desc="Checking for Git-LFS objects") as pbar:
        _lfs_fetch(
            scm,
            [git_fs.rev],
            include=include,
            progress=pbar.update_git,
        )
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ dependencies = [
"requests>=2.22",
"rich>=12",
"ruamel.yaml>=0.17.11",
"scmrepo>=1.4.1,<2",
"scmrepo>=1.5.0,<2",
"shortuuid>=0.5",
"shtab<2,>=1.3.4",
"tabulate>=0.8.7",
Expand Down
19 changes: 18 additions & 1 deletion tests/func/test_scm.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import pytest
from git import Repo

from dvc.scm import SCM, Git, NoSCM, SCMError
from dvc.scm import SCM, Git, NoSCM, SCMError, lfs_prefetch


def test_init_none(tmp_dir):
Expand All @@ -27,3 +27,20 @@ def test_init_sub_dir(tmp_dir):

scm = SCM(os.fspath(subdir))
assert scm.root_dir == os.fspath(tmp_dir)


def test_lfs_prefetch(tmp_dir, dvc, scm, mocker):
    """Check that ``lfs_prefetch`` only fetches once LFS is configured."""
    mock_fetch = mocker.patch("scmrepo.git.lfs.fetch")
    targets = ["foo"]

    # No .gitattributes committed yet: the prefetch must not call fetch.
    with dvc.switch(scm.get_rev()):
        lfs_prefetch(dvc.dvcfs, targets)
        mock_fetch.assert_not_called()

    # Commit a .gitattributes that declares an LFS filter.
    tmp_dir.scm_gen(
        ".gitattributes",
        ".lfs filter=lfs diff=lfs merge=lfs -text",
        commit="init lfs",
    )

    # At the new revision the prefetch must call fetch exactly once.
    with dvc.switch(scm.get_rev()):
        lfs_prefetch(dvc.dvcfs, targets)
        mock_fetch.assert_called_once()
Loading