Skip to content

Commit

Permalink
diff: handle granular targets
Browse files Browse the repository at this point in the history
  • Loading branch information
efiop committed Sep 28, 2023
1 parent ab861d2 commit a1b1d21
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 25 deletions.
20 changes: 17 additions & 3 deletions dvc/repo/diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ def _hash(entry):
return None


def _diff(old, new, with_missing=False):
def _diff(old, new, data_keys, with_missing=False):
from dvc_data.index.diff import ADD, DELETE, MODIFY, RENAME
from dvc_data.index.diff import diff as idiff

Expand All @@ -34,12 +34,23 @@ def _diff(old, new, with_missing=False):
"not in cache": [],
}

def meta_cmp_key(meta):
if not meta:
return meta
return meta.isdir

for change in idiff(
old,
new,
with_renames=True,
hash_only=True,
meta_cmp_key=meta_cmp_key,
roots=data_keys,
):
if (change.old and change.old.isdir and not change.old.hash_info) or (
change.new and change.new.isdir and not change.new.hash_info
):
continue

if change.typ == ADD:
ret["added"].append(
{
Expand Down Expand Up @@ -116,6 +127,7 @@ def diff(
b_rev = "workspace"
with_missing = True

data_keys = set()
for rev in self.brancher(revs=[a_rev, b_rev]):
if rev == "workspace" and b_rev != "workspace":
# brancher always returns workspace, but we only need to compute
Expand All @@ -132,6 +144,8 @@ def onerror(target, _exc):
recursive=recursive,
)

data_keys.update(view.data_keys.get("repo", set()))

if rev == "workspace":
from .index import build_data_index

Expand Down Expand Up @@ -165,4 +179,4 @@ def onerror(target, _exc):
new = indexes[b_rev]

with ui.status("Calculating diff"):
return _diff(old, new, with_missing=with_missing)
return _diff(old, new, data_keys, with_missing=with_missing)
72 changes: 51 additions & 21 deletions dvc/repo/index.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
import time
from collections import defaultdict
from functools import partial
from typing import (
TYPE_CHECKING,
Expand Down Expand Up @@ -319,6 +320,22 @@ def outs(self) -> Iterator["Output"]:
for stage in self.stages:
yield from stage.outs

@cached_property
def out_data_keys(self) -> Dict[str, Set["DataIndexKey"]]:
by_workspace: Dict[str, Set["DataIndexKey"]] = defaultdict(set)

by_workspace["repo"] = set()
by_workspace["local"] = set()

for out in self.outs:
if not out.use_cache:
continue

ws, key = out.index_key
by_workspace[ws].add(key)

return dict(by_workspace)

@property
def decorated_outs(self) -> Iterator["Output"]:
for output in self.outs:
Expand Down Expand Up @@ -359,8 +376,6 @@ def _plot_sources(self) -> List[str]:

@cached_property
def data_keys(self) -> Dict[str, Set["DataIndexKey"]]:
from collections import defaultdict

by_workspace: Dict[str, Set["DataIndexKey"]] = defaultdict(set)

by_workspace["repo"] = set()
Expand All @@ -377,8 +392,6 @@ def data_keys(self) -> Dict[str, Set["DataIndexKey"]]:

@cached_property
def metric_keys(self) -> Dict[str, Set["DataIndexKey"]]:
from collections import defaultdict

from .metrics.show import _collect_top_level_metrics

by_workspace: Dict[str, Set["DataIndexKey"]] = defaultdict(set)
Expand All @@ -400,8 +413,6 @@ def metric_keys(self) -> Dict[str, Set["DataIndexKey"]]:

@cached_property
def plot_keys(self) -> Dict[str, Set["DataIndexKey"]]:
from collections import defaultdict

by_workspace: Dict[str, Set["DataIndexKey"]] = defaultdict(set)

by_workspace["repo"] = set()
Expand Down Expand Up @@ -520,8 +531,6 @@ def used_objs(
jobs: Optional[int] = None,
push: bool = False,
) -> "ObjectContainer":
from collections import defaultdict

used: "ObjectContainer" = defaultdict(set)
pairs = self.collect_targets(targets, recursive=recursive, with_deps=with_deps)
for stage, filter_info in pairs:
Expand Down Expand Up @@ -640,9 +649,23 @@ def outs(self) -> Iterator["Output"]:
yield from {out for (out, _) in self._filtered_outs}

@cached_property
def _data_prefixes(self) -> Dict[str, "_DataPrefixes"]:
from collections import defaultdict
def out_data_keys(self) -> Dict[str, Set["DataIndexKey"]]:
by_workspace: Dict[str, Set["DataIndexKey"]] = defaultdict(set)

by_workspace["repo"] = set()
by_workspace["local"] = set()

for out in self.outs:
if not out.use_cache:
continue

ws, key = out.index_key
by_workspace[ws].add(key)

return dict(by_workspace)

@cached_property
def _data_prefixes(self) -> Dict[str, "_DataPrefixes"]:
prefixes: Dict[str, "_DataPrefixes"] = defaultdict(
lambda: _DataPrefixes(set(), set())
)
Expand All @@ -660,8 +683,6 @@ def _data_prefixes(self) -> Dict[str, "_DataPrefixes"]:

@cached_property
def data_keys(self) -> Dict[str, Set["DataIndexKey"]]:
from collections import defaultdict

ret: Dict[str, Set["DataIndexKey"]] = defaultdict(set)

for out, filter_info in self._filtered_outs:
Expand Down Expand Up @@ -714,7 +735,7 @@ def key_filter(workspace: str, key: "DataIndexKey"):
return data


def build_data_index( # noqa: C901
def build_data_index( # noqa: C901, PLR0912
index: Union["Index", "IndexView"],
path: str,
fs: "FileSystem",
Expand Down Expand Up @@ -777,14 +798,6 @@ def build_data_index( # noqa: C901
data.add(entry)
callback.relative_update(1)

if compute_hash:
tree_meta, tree = build_tree(data, key, name=hash_name)
out_entry.meta = tree_meta
out_entry.hash_info = tree.hash_info
out_entry.loaded = True
data.add(out_entry)
callback.relative_update(1)

for key in parents:
parent_path = fs.path.join(path, *key)
if not fs.exists(parent_path):
Expand All @@ -793,6 +806,23 @@ def build_data_index( # noqa: C901
data.add(direntry)
callback.relative_update(1)

if compute_hash:
out_keys = index.out_data_keys.get(workspace, set())
data_keys = index.data_keys.get(workspace, set())
for key in data_keys.intersection(out_keys):
hash_name = _get_entry_hash_name(index, workspace, key)

out_entry = data.get(key)
if not out_entry or not out_entry.isdir:
continue

tree_meta, tree = build_tree(data, key, name=hash_name)
out_entry.meta = tree_meta
out_entry.hash_info = tree.hash_info
out_entry.loaded = True
data.add(out_entry)
callback.relative_update(1)

return data


Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ dependencies = [
"configobj>=5.0.6",
"distro>=1.3",
"dpath<3,>=2.1.0",
"dvc-data>=2.16.3,<2.17.0",
"dvc-data>=2.17.1,<2.18.0",
"dvc-http>=2.29.0",
"dvc-render>=0.3.1,<1",
"dvc-studio-client>=0.13.0,<1",
Expand Down
50 changes: 50 additions & 0 deletions tests/func/test_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -668,3 +668,53 @@ def test_rename_multiple_files_same_hashes(tmp_dir, scm, dvc):
},
],
}


def test_diff_granular(tmp_dir, dvc, scm):
tmp_dir.gen(
{
"dir": {
"data": {
"subdir": {"subfoo": "subfoo", "subbar": "subbar"},
"foo": "foo",
"bar": "bar",
},
},
}
)

dvc.add(os.path.join("dir", "data"))
scm.add(os.path.join("dir", "data.dvc"))
scm.add(os.path.join("dir", ".gitignore"))
scm.commit("data")

assert dvc.diff() == {}

(tmp_dir / "dir" / "data" / "subdir" / "new").write_text("new")

assert dvc.diff() == {
"added": [
{"hash": "22af645d1859cb5ca6da0c484f1f37ea", "path": "dir/data/subdir/new"}
],
"deleted": [],
"modified": [
{
"hash": {
"new": "efa5b20d5f935dcc5555b26db6e19b76.dir",
"old": "1aca2c799df82929bbdd976557975546.dir",
},
"path": "dir/data/",
}
],
"not in cache": [],
"renamed": [],
}
assert dvc.diff(targets=[os.path.join("dir", "data", "subdir")]) == {
"added": [
{"hash": "22af645d1859cb5ca6da0c484f1f37ea", "path": "dir/data/subdir/new"}
],
"deleted": [],
"modified": [],
"not in cache": [],
"renamed": [],
}

0 comments on commit a1b1d21

Please sign in to comment.