Merge pull request #83 from SciCatProject/default-checksum-algo
Default checksum algorithm in downloads
jl-wynen authored Apr 5, 2023
2 parents 9a3dec5 + 0eda497 commit b18be12
Showing 6 changed files with 31 additions and 16 deletions.
3 changes: 3 additions & 0 deletions docs/release-notes.rst
@@ -43,10 +43,13 @@ Features

* Better HTML formatting of ESS-style scientific metadata.
* Added the ``force`` argument to :func:`scitacean.Client.download_files`.
+* Try the default checksum algorithm if the dataset has no algorithm set to check if a local file is up to date.

Breaking changes
~~~~~~~~~~~~~~~~

+* Changed default checksum algorithm from md5 to blake2b.

Bugfixes
~~~~~~~~

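For illustration, the new default corresponds to Python's standard `hashlib` blake2b digest. A minimal sketch of computing a file checksum with it; the helper name, chunk size, and the `data.csv` path are illustrative, not part of scitacean's API:

```python
import hashlib
from pathlib import Path


def file_checksum(path: Path, algorithm: str = "blake2b") -> str:
    """Hex digest of a file, computed in chunks so large files need not fit in memory."""
    chk = hashlib.new(algorithm)
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8 * 1024 * 1024), b""):
            chk.update(chunk)
    return chk.hexdigest()


# "data.csv" is a hypothetical example file.
print(file_checksum(Path("data.csv")))         # new default: blake2b
print(file_checksum(Path("data.csv"), "md5"))  # previous default: md5
```
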
2 changes: 1 addition & 1 deletion src/scitacean/_dataset_fields.py
@@ -565,7 +565,7 @@ def __init__(
        techniques: Optional[List[Technique]] = None,
        used_software: Optional[List[str]] = None,
        validation_status: Optional[str] = None,
-       checksum_algorithm: Optional[str] = "md5",
+       checksum_algorithm: Optional[str] = "blake2b",
        _read_only: Optional[Dict[str, Any]] = None,
        _orig_datablocks: Optional[List[OrigDatablockProxy]] = None,
    ):
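Since the generated `__init__` above now defaults `checksum_algorithm` to `"blake2b"`, code that must interoperate with an md5-based setup can opt back in explicitly. A minimal sketch, assuming a `Dataset` can be constructed from just its `type` (all other fields elided):

```python
from scitacean import Dataset, DatasetType

# New datasets hash their files with blake2b unless told otherwise.
dset = Dataset(type=DatasetType.RAW)
assert dset.checksum_algorithm == "blake2b"

# Explicitly restore the old default where a backend still expects md5.
legacy = Dataset(type=DatasetType.RAW, checksum_algorithm="md5")
assert legacy.checksum_algorithm == "md5"
```
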
16 changes: 11 additions & 5 deletions src/scitacean/file.py
@@ -235,14 +235,20 @@ def local_is_up_to_date(self) -> bool:
            return False
        if self.checksum_algorithm is None:
            warnings.warn(
-               f"Cannot check if local file {self.local_path} is up to date because "
-               "the checksum algorithm is not set. "
-               "Assuming the file needs to be updated."
+               "No checksum algorithm has been set, using the default 'blake2b' "
+               f"to check if local file '{self.local_path}' is up to date. "
+               "There is a very low chance that this yields a false positive "
+               "and the file is incorrect. Set an algorithm manually to avoid this."
            )
-           return False
+           return self._local_is_up_to_date_with_checksum_algorithm("blake2b")
+       return self._local_is_up_to_date_with_checksum_algorithm(
+           self.checksum_algorithm
+       )
+
+   def _local_is_up_to_date_with_checksum_algorithm(self, algorithm: str) -> bool:
        local_checksum = self._checksum_cache.get(  # type: ignore[union-attr]
            path=self.local_path,  # type: ignore[arg-type]
-           algorithm=self.checksum_algorithm,
+           algorithm=algorithm,
        )
        return self._remote_checksum == local_checksum

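Conceptually, the up-to-date check now hashes the local file with the dataset's algorithm, or with blake2b when none is set, and compares the digest against the checksum recorded remotely. A standalone sketch of that logic, independent of scitacean's `File` and checksum cache (all names here are illustrative):

```python
import hashlib
import warnings
from pathlib import Path
from typing import Optional

DEFAULT_ALGORITHM = "blake2b"


def local_is_up_to_date(
    local_path: Path, remote_checksum: str, algorithm: Optional[str]
) -> bool:
    """Return True if the local file's digest matches the remote checksum."""
    if algorithm is None:
        warnings.warn(
            f"No checksum algorithm set, falling back to '{DEFAULT_ALGORITHM}' "
            f"to check '{local_path}'."
        )
        algorithm = DEFAULT_ALGORITHM
    chk = hashlib.new(algorithm)
    chk.update(local_path.read_bytes())  # fine for small files; chunk for large ones
    return chk.hexdigest() == remote_checksum
```
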
10 changes: 5 additions & 5 deletions tests/dataset_test.py
@@ -43,7 +43,7 @@ def test_add_local_file_to_new_dataset(typ, fs):
    assert f.remote_access_path(dset.source_folder) is None
    assert f.local_path == Path("local/folder/data.dat")
    assert f.size == file_data["size"]
-   assert f.checksum_algorithm == "md5"
+   assert f.checksum_algorithm == "blake2b"

    assert abs(file_data["creation_time"] - f.creation_time) < timedelta(seconds=1)
    assert abs(file_data["creation_time"] - f.make_model().time) < timedelta(seconds=1)
@@ -72,14 +72,14 @@ def test_add_multiple_local_files_to_new_dataset(typ, fs):
    assert f0.remote_access_path(dset.source_folder) is None
    assert f0.local_path == Path("common/location1/data.dat")
    assert f0.size == file_data0["size"]
-   assert f0.checksum_algorithm == "md5"
+   assert f0.checksum_algorithm == "blake2b"

    assert not f1.is_on_remote
    assert f1.is_on_local
    assert f1.remote_access_path(dset.source_folder) is None
    assert f1.local_path == Path("common/song.mp3")
    assert f1.size == file_data1["size"]
-   assert f1.checksum_algorithm == "md5"
+   assert f1.checksum_algorithm == "blake2b"


@pytest.mark.parametrize("typ", (DatasetType.RAW, DatasetType.DERIVED))
@@ -107,14 +107,14 @@ def test_add_multiple_local_files_to_new_dataset_with_base_path(typ, fs):
    assert f0.remote_access_path(dset.source_folder) is None
    assert f0.local_path == Path("common/location1/data.dat")
    assert f0.size == file_data0["size"]
-   assert f0.checksum_algorithm == "md5"
+   assert f0.checksum_algorithm == "blake2b"

    assert not f1.is_on_remote
    assert f1.is_on_local
    assert f1.remote_access_path(dset.source_folder) is None
    assert f1.local_path == Path("common/song.mp3")
    assert f1.size == file_data1["size"]
-   assert f1.checksum_algorithm == "md5"
+   assert f1.checksum_algorithm == "blake2b"


@pytest.mark.parametrize("typ", (DatasetType.RAW, DatasetType.DERIVED))
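The updated assertions reflect that files added to a dataset pick up the dataset's checksum algorithm, which now defaults to blake2b. A rough usage sketch, assuming the `add_local_files` and `files` API these tests exercise; the path is hypothetical:

```python
from scitacean import Dataset, DatasetType

dset = Dataset(type=DatasetType.RAW)  # checksum_algorithm defaults to "blake2b"
dset.add_local_files("local/folder/data.dat", base_path="local")

# Files registered with the dataset carry the dataset's algorithm,
# which is what the assertions above now check for.
for f in dset.files:
    assert f.checksum_algorithm == "blake2b"
```
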
14 changes: 10 additions & 4 deletions tests/file_test.py
@@ -340,12 +340,18 @@ def test_local_is_up_to_date_for_local_file():
    assert file.local_is_up_to_date()


-def test_local_is_not_up_to_date_without_checksum_alg():
+def test_local_is_up_to_date_default_checksum_alg(fs):
+   contents = b"some file content"
+   checksum = hashlib.new("blake2b")
+   checksum.update(contents)
+   checksum = checksum.hexdigest()
+   fs.create_file("data.csv", contents=contents)
+
    file = File.from_scicat(
-       DataFile(path="data.csv", size=65178, chk="sha256")
+       DataFile(path="data.csv", size=65178, chk=checksum)
    ).downloaded(local_path="data.csv")
-   with pytest.warns(UserWarning, match="checksum"):
-       assert not file.local_is_up_to_date()
+   with pytest.warns(UserWarning):
+       assert file.local_is_up_to_date()


def test_local_is_up_to_date_matching_checksum(fake_file):
2 changes: 1 addition & 1 deletion tools/model-generation/templates/dataset.py.template
@@ -75,7 +75,7 @@ class DatasetFields:
        creation_time: Optional[Union[datetime, str]] = None,
        pid: Optional[Union[str, PID]] = None,
        $field_init_args,
-       checksum_algorithm: Optional[str] = "md5",
+       checksum_algorithm: Optional[str] = "blake2b",
        _read_only: Optional[Dict[str, Any]] = None,
        _orig_datablocks: Optional[List[OrigDatablockProxy]] = None,
    ):
