From 0979d8c74cc8620a0f51c6679caa3208c4159f83 Mon Sep 17 00:00:00 2001 From: Ruslan Kuprieiev Date: Thu, 4 Jan 2024 17:35:49 +0200 Subject: [PATCH] index: data_tree: handle partial imports --- dvc/repo/index.py | 57 +++++++++++++++++----------------- dvc/testing/workspace_tests.py | 25 +++++++++++++-- pyproject.toml | 2 +- 3 files changed, 52 insertions(+), 32 deletions(-) diff --git a/dvc/repo/index.py b/dvc/repo/index.py index ee1d6a798a7..35eea98f2c7 100644 --- a/dvc/repo/index.py +++ b/dvc/repo/index.py @@ -255,6 +255,33 @@ def _load_storage_from_out(storage_map, key, out): _load_storage_from_import(storage_map, key, out) +def _build_tree_from_outs(outs): + from dvc_data.hashfile.tree import Tree + + tree = Tree() + for out in outs: + if not out.use_cache: + continue + + ws, key = out.index_key + + if not out.stage.is_partial_import: + tree.add((ws, *key), out.meta, out.hash_info) + continue + + dep = out.stage.deps[0] + if not dep.files: + tree.add((ws, *key), dep.meta, dep.hash_info) + continue + + for okey, ometa, ohi in dep.get_obj(): + tree.add((ws, *key, *okey), ometa, ohi) + + tree.digest() + + return tree + + class Index: def __init__( self, @@ -504,20 +531,7 @@ def plot_keys(self) -> Dict[str, Set["DataIndexKey"]]: @cached_property def data_tree(self): - from dvc_data.hashfile.tree import Tree - - tree = Tree() - for out in self.outs: - if not out.use_cache: - continue - - ws, key = out.index_key - - tree.add((ws, *key), out.meta, out.hash_info) - - tree.digest() - - return tree + return _build_tree_from_outs(self.outs) @cached_property def data(self) -> "Dict[str, DataIndex]": @@ -772,20 +786,7 @@ def data_keys(self) -> Dict[str, Set["DataIndexKey"]]: @cached_property def data_tree(self): - from dvc_data.hashfile.tree import Tree - - tree = Tree() - for out in self.outs: - if not out.use_cache: - continue - - ws, key = out.index_key - - tree.add((ws, *key), out.meta, out.hash_info) - - tree.digest() - - return tree + return _build_tree_from_outs(self.outs) @cached_property def data(self) -> Dict[str, Union["DataIndex", "DataIndexView"]]: diff --git a/dvc/testing/workspace_tests.py b/dvc/testing/workspace_tests.py index c9846873c05..13bdaf7d3cb 100644 --- a/dvc/testing/workspace_tests.py +++ b/dvc/testing/workspace_tests.py @@ -149,14 +149,33 @@ def test_import_dir(self, tmp_dir, dvc, remote_version_aware): assert (tmp_dir / "data_dir" / "subdir" / "file").read_text() == "modified" assert (tmp_dir / "data_dir" / "new_file").read_text() == "new" - def test_import_no_download(self, tmp_dir, dvc, remote_version_aware): + def test_import_no_download(self, tmp_dir, dvc, remote_version_aware, scm): remote_version_aware.gen({"data_dir": {"subdir": {"file": "file"}}}) dvc.imp_url("remote://upstream/data_dir", version_aware=True, no_download=True) + scm.add(["data_dir.dvc", ".gitignore"]) + scm.commit("v1") + scm.tag("v1") + stage = first(dvc.index.stages) assert not stage.outs[0].can_push - dvc.pull() - assert (tmp_dir / "data_dir" / "subdir" / "file").read_text() == "file" + (remote_version_aware / "data_dir" / "foo").write_text("foo") + dvc.update(no_download=True) + assert dvc.pull()["fetched"] == 2 + assert (tmp_dir / "data_dir").read_text() == { + "foo": "foo", + "subdir": {"file": "file"}, + } + scm.add(["data_dir.dvc", ".gitignore"]) + scm.commit("update") + + scm.checkout("v1") + dvc.cache.local.clear() + remove(tmp_dir / "data_dir") + assert dvc.pull()["fetched"] == 1 + assert (tmp_dir / "data_dir").read_text() == { + "subdir": {"file": "file"}, + } dvc.commit(force=True) assert dvc.status() == {} diff --git a/pyproject.toml b/pyproject.toml index 5bf04c37b4e..1eb843141b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,7 +35,7 @@ dependencies = [ "configobj>=5.0.6", "distro>=1.3", "dpath<3,>=2.1.0", - "dvc-data>=3.6,<3.7", + "dvc-data>=3.7,<3.8", "dvc-http>=2.29.0", "dvc-render>=1.0.0,<2", "dvc-studio-client>=0.17.1,<1",