Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance(diff): faster rename detection #550

Merged
merged 18 commits into from
Aug 13, 2024
Merged
45 changes: 29 additions & 16 deletions src/dvc_data/index/diff.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from collections import deque
import itertools
from collections import defaultdict, deque
from collections.abc import Iterable
from typing import TYPE_CHECKING, Any, Callable, Optional
from typing import TYPE_CHECKING, Any, Callable, Optional, cast

from attrs import define
from fsspec.callbacks import DEFAULT_CALLBACK, Callback
Expand Down Expand Up @@ -246,44 +247,56 @@ def _detect_renames(changes: Iterable[Change]):

for change in changes:
if change.typ == ADD:
assert change.new
added.append(change)
elif change.typ == DELETE:
assert change.old
deleted.append(change)
else:
yield change

def _get_key(change):
return change.key

added[:] = sorted(added, key=_get_key)
deleted[:] = sorted(deleted, key=_get_key)
# Sort the lists to maintain the same order
# as the older implementation.
added.sort(key=_get_key)
Northo marked this conversation as resolved.
Show resolved Hide resolved
deleted.sort(key=_get_key)

# Create a dictionary for fast lookup of deletions by hash_info
deleted_dict: dict[Optional[HashInfo], deque[Change]] = defaultdict(deque)
for change in deleted:
# We checked change.old for all deleted above, so cast
change_hash = cast(DataIndexEntry, change.old).hash_info
# appendleft to get FIFO queue behaviour (we pop from the right)
deleted_dict[change_hash].appendleft(change)

Northo marked this conversation as resolved.
Show resolved Hide resolved
for change in added:
new_entry = change.new
assert new_entry
# We checked change.new for all new above, so cast
new_entry = cast(DataIndexEntry, change.new)

if not new_entry.hash_info:
yield change
continue

index, old_entry = None, None
for idx, ch in enumerate(deleted):
assert ch.old
if ch.old.hash_info == new_entry.hash_info:
index, old_entry = idx, ch.old
break
# If the new entry is the same as a deleted change,
# it is in fact a rename.
# Note: get instead of __getitem__, to avoid creating
# unnecessary entries.
if deleted_dict.get(new_entry.hash_info):
skshetry marked this conversation as resolved.
Show resolved Hide resolved
deletion = deleted_dict[new_entry.hash_info].pop()

if index is not None:
del deleted[index]
yield Change(
RENAME,
old_entry,
deletion.old,
new_entry,
)
else:
yield change

yield from deleted
# Yield the remaining unmatched deletions
if deleted_dict:
yield from itertools.chain.from_iterable(deleted_dict.values())


def diff( # noqa: PLR0913
Expand Down
Loading