diff --git a/dvc/dependency/base.py b/dvc/dependency/base.py
index 239c1bcb15..57a11a67ee 100644
--- a/dvc/dependency/base.py
+++ b/dvc/dependency/base.py
@@ -45,7 +45,7 @@ def update(self, rev=None):
             self.fs_path = self.fs.version_path(self.fs_path, self.meta.version_id)
 
     def download(self, to, jobs=None):
-        fs_download(self.fs, self.fs_path, to.fs_path, jobs=jobs)
+        return fs_download(self.fs, self.fs_path, to.fs_path, jobs=jobs)
 
     def save(self):
         super().save()
diff --git a/dvc/dependency/repo.py b/dvc/dependency/repo.py
index 058891b786..c229c780df 100644
--- a/dvc/dependency/repo.py
+++ b/dvc/dependency/repo.py
@@ -3,7 +3,6 @@
 
 import voluptuous as vol
 
-from dvc.prompt import confirm
 from dvc.utils import as_posix
 
 from .base import Dependency
@@ -12,6 +11,7 @@
     from dvc.fs import DVCFileSystem
     from dvc.output import Output
     from dvc.stage import Stage
+    from dvc_data.hashfile.hash_info import HashInfo
 
 
 class RepoDependency(Dependency):
@@ -94,29 +94,25 @@ def dumpd(self, **kwargs) -> dict[str, Union[str, dict[str, str]]]:
         }
 
     def download(self, to: "Output", jobs: Optional[int] = None):
-        from dvc_data.hashfile.build import build
-        from dvc_data.hashfile.checkout import CheckoutError, checkout
-
-        try:
-            repo = self._make_fs(locked=True).repo
-
-            _, _, obj = build(
-                repo.cache.local,
-                self.fs_path,
-                repo.dvcfs,
-                repo.cache.local.fs.PARAM_CHECKSUM,
-            )
-            checkout(
-                to.fs_path,
-                to.fs,
-                obj,
-                self.repo.cache.local,
-                ignore=None,
-                state=self.repo.state,
-                prompt=confirm,
-            )
-        except (CheckoutError, FileNotFoundError):
-            super().download(to=to, jobs=jobs)
+        from dvc.fs import LocalFileSystem
+
+        files = super().download(to=to, jobs=jobs)
+        if not isinstance(to.fs, LocalFileSystem):
+            return files
+
+        hashes: list[tuple[str, HashInfo, dict[str, Any]]] = []
+        for src_path, dest_path in files:
+            try:
+                hash_info = self.fs.info(src_path)["dvc_info"]["entry"].hash_info
+                dest_info = to.fs.info(dest_path)
+            except (OSError, KeyError, AttributeError):
+                # If no hash info found, just keep going and output will be hashed later
+                continue
+            if hash_info:
+                hashes.append((dest_path, hash_info, dest_info))
+        cache = to.cache if to.use_cache else to.local_cache
+        cache.state.save_many(hashes, to.fs)
+        return files
 
     def update(self, rev: Optional[str] = None):
         if rev:
diff --git a/dvc/fs/__init__.py b/dvc/fs/__init__.py
index f5d36f3148..0c9cf567ac 100644
--- a/dvc/fs/__init__.py
+++ b/dvc/fs/__init__.py
@@ -47,7 +47,7 @@
 
 def download(
     fs: "FileSystem", fs_path: str, to: str, jobs: Optional[int] = None
-) -> int:
+) -> list[tuple[str, str]]:
     from dvc.scm import lfs_prefetch
 
     from .callbacks import TqdmCallback
@@ -61,7 +61,7 @@ def download(
             ]
             if not from_infos:
                 localfs.makedirs(to, exist_ok=True)
-                return 0
+                return []
             to_infos = [
                 localfs.join(to, *fs.relparts(info, fs_path)) for info in from_infos
             ]
@@ -81,7 +81,7 @@ def download(
         cb.set_size(len(from_infos))
         jobs = jobs or fs.jobs
         generic.copy(fs, from_infos, localfs, to_infos, callback=cb, batch_size=jobs)
-        return len(to_infos)
+        return list(zip(from_infos, to_infos))
 
 
 def parse_external_url(url, fs_config=None, config=None):
diff --git a/dvc/repo/artifacts.py b/dvc/repo/artifacts.py
index e61cdaaaa2..78658542b3 100644
--- a/dvc/repo/artifacts.py
+++ b/dvc/repo/artifacts.py
@@ -220,7 +220,7 @@ def download(
 
         out = resolve_output(path, out, force=force)
         fs = self.repo.dvcfs
-        count = fs_download(fs, path, os.path.abspath(out), jobs=jobs)
+        count = len(fs_download(fs, path, os.path.abspath(out), jobs=jobs))
         return count, out
 
     @staticmethod
diff --git a/tests/func/test_import.py b/tests/func/test_import.py
index 86e8b3b967..df78e7a699 100644
--- a/tests/func/test_import.py
+++ b/tests/func/test_import.py
@@ -6,13 +6,13 @@
 
 from dvc.cachemgr import CacheManager
 from dvc.config import NoRemoteError
-from dvc.dependency import base
 from dvc.dvcfile import load_file
 from dvc.fs import system
 from dvc.scm import Git
 from dvc.stage.exceptions import StagePathNotFoundError
 from dvc.testing.tmp_dir import make_subrepo
 from dvc.utils.fs import remove
+from dvc_data.hashfile import hash
 from dvc_data.index.index import DataIndexDirError
 
 
@@ -725,14 +725,12 @@ def test_import_invalid_configs(tmp_dir, scm, dvc, erepo_dir):
         )
 
 
-def test_reimport(tmp_dir, scm, dvc, erepo_dir, mocker):
+def test_import_no_hash(tmp_dir, scm, dvc, erepo_dir, mocker):
     with erepo_dir.chdir():
         erepo_dir.dvc_gen("foo", "foo content", commit="create foo")
 
-    spy = mocker.spy(base, "fs_download")
-    dvc.imp(os.fspath(erepo_dir), "foo", "foo_imported")
-    assert spy.called
-
-    spy.reset_mock()
-    dvc.imp(os.fspath(erepo_dir), "foo", "foo_imported", force=True)
-    assert not spy.called
+    spy = mocker.spy(hash, "file_md5")
+    stage = dvc.imp(os.fspath(erepo_dir), "foo", "foo_imported")
+    assert spy.call_count == 1
+    for call in spy.call_args_list:
+        assert stage.outs[0].fs_path != call.args[0]