From 0b72266287999ea26855b0c4f7b943d0e58743ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Saugat=20Pachhai=20=28=E0=A4=B8=E0=A5=8C=E0=A4=97=E0=A4=BE?= =?UTF-8?q?=E0=A4=A4=29?= Date: Wed, 7 Aug 2024 08:59:18 +0545 Subject: [PATCH] hashfile/checkout: use save_many to save state For the MNIST dataset, this drops the total runtime of `dvc add` from 24s to 12s for me. --- src/dvc_data/hashfile/checkout.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/dvc_data/hashfile/checkout.py b/src/dvc_data/hashfile/checkout.py index b607727c..e32bb8ff 100644 --- a/src/dvc_data/hashfile/checkout.py +++ b/src/dvc_data/hashfile/checkout.py @@ -13,6 +13,7 @@ from fsspec import Callback from ._ignore import Ignore + from .hash_info import HashInfo logger = logging.getLogger(__name__) @@ -110,10 +111,6 @@ def _checkout_file( else: link(cache, cache_path, fs, path) modified = True - - if state: - state.save(path, fs, change.new.oid) - return modified @@ -203,6 +200,7 @@ def _checkout( _remove(entry_path, fs, change.old.in_cache, force=force, prompt=prompt) failed = [] + hashes_to_update: list[tuple[str, HashInfo, None]] = [] for change in chain(diff.added, diff.modified): entry_path = fs.join(path, *change.new.key) if change.new.key != ROOT else path if change.new.oid.isdir: @@ -223,6 +221,11 @@ def _checkout( ) except CheckoutError as exc: failed.extend(exc.paths) + else: + hashes_to_update.append((entry_path, change.new.oid, fs.info(entry_path))) + + if state is not None: + state.save_many(hashes_to_update, fs) if failed: raise CheckoutError(failed)