From 730422944053f58089e8467cda7ad224f04d5637 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 12 Aug 2024 11:39:00 -0700 Subject: [PATCH 01/33] setup temp conversion script --- temp_test.py | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 temp_test.py diff --git a/temp_test.py b/temp_test.py new file mode 100644 index 000000000..831c98d1e --- /dev/null +++ b/temp_test.py @@ -0,0 +1,61 @@ +import shutil +from datetime import datetime +from pathlib import Path + +from lerner_lab_to_nwb.seiler_2024.seiler_2024_convert_session import session_to_nwb + + +def main(): + # Parameters for conversion + data_dir_path = Path("/Volumes/T7/CatalystNeuro/Lerner/raw_data") + output_dir_path = Path("/Volumes/T7/CatalystNeuro/Lerner/conversion_nwb") + stub_test = False + + if output_dir_path.exists(): + shutil.rmtree( + output_dir_path, ignore_errors=True + ) # ignore errors due to MacOS race condition (https://github.com/python/cpython/issues/81441) + + # Fiber Photometry session + experiment_type = "FP" + experimental_group = "PS" + subject_id = "112.283" + start_datetime = datetime(2019, 6, 20, 9, 32, 4) + session_conditions = { + "Start Date": start_datetime.strftime("%m/%d/%y"), + "Start Time": start_datetime.strftime("%H:%M:%S"), + } + start_variable = "Start Date" + behavior_file_path = ( + data_dir_path + / f"{experiment_type} Experiments" + / "Behavior" + / f"{experimental_group}" + / f"{subject_id}" + / f"{subject_id}" + ) + fiber_photometry_folder_path = ( + data_dir_path + / f"{experiment_type} Experiments" + / "Photometry" + / f"Punishment Sensitive" + / f"Early RI60" + / f"Photo_{subject_id.split('.')[0]}_{subject_id.split('.')[1]}-190620-093542" + ) + session_to_nwb( + data_dir_path=data_dir_path, + output_dir_path=output_dir_path, + behavior_file_path=behavior_file_path, + fiber_photometry_folder_path=fiber_photometry_folder_path, + has_demodulated_commanded_voltages=False, + subject_id=subject_id, + session_conditions=session_conditions, + start_variable=start_variable, + experiment_type=experiment_type, + experimental_group=experimental_group, + stub_test=stub_test, + ) + + +if __name__ == "__main__": + main() From 4cc2a06a2ccb7c9a29f8f5f1ff4e33553b46df78 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 12 Aug 2024 15:31:35 -0700 Subject: [PATCH 02/33] added from_existing_neurodata_object for hdf5 --- .../_configuration_models/_hdf5_dataset_io.py | 29 +++++- temp_test.py | 91 ++++++++----------- 2 files changed, 67 insertions(+), 53 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py index 828a37998..03c29389e 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py @@ -3,9 +3,11 @@ from typing import Any, Dict, Literal, Union import h5py +from hdmf import Container from pydantic import Field, InstanceOf +from typing_extensions import Self -from ._base_dataset_io import DatasetIOConfiguration +from ._base_dataset_io import DatasetIOConfiguration, _find_location_in_memory_nwbfile from ...importing import is_package_installed _base_hdf5_filters = set(h5py.filters.decode) @@ -78,3 +80,28 @@ def get_data_io_kwargs(self) -> Dict[str, Any]: compression_bundle = dict(compression=self.compression_method, compression_opts=compression_opts) return dict(chunks=self.chunk_shape, 
**compression_bundle) + + @classmethod + def from_existing_neurodata_object( + cls, neurodata_object: Container, dataset_name: Literal["data", "timestamps"] + ) -> Self: + location_in_file = _find_location_in_memory_nwbfile(neurodata_object=neurodata_object, field_name=dataset_name) + full_shape = getattr(neurodata_object, dataset_name).shape + dtype = getattr(neurodata_object, dataset_name).dtype + chunk_shape = getattr(neurodata_object, dataset_name).chunks + buffer_shape = getattr(neurodata_object, dataset_name).maxshape + compression_method = getattr(neurodata_object, dataset_name).compression + compression_opts = getattr(neurodata_object, dataset_name).compression_opts + compression_options = dict(compression_opts=compression_opts) + return cls( + object_id=neurodata_object.object_id, + object_name=neurodata_object.name, + location_in_file=location_in_file, + dataset_name=dataset_name, + full_shape=full_shape, + dtype=dtype, + chunk_shape=chunk_shape, + buffer_shape=buffer_shape, + compression_method=compression_method, + compression_options=compression_options, + ) diff --git a/temp_test.py b/temp_test.py index 831c98d1e..c8ab00784 100644 --- a/temp_test.py +++ b/temp_test.py @@ -1,60 +1,47 @@ -import shutil -from datetime import datetime +import os from pathlib import Path -from lerner_lab_to_nwb.seiler_2024.seiler_2024_convert_session import session_to_nwb +import numpy as np +from pynwb import NWBHDF5IO, H5DataIO, TimeSeries +from pynwb.testing.mock.file import mock_NWBFile +from neuroconv.tools.nwb_helpers._configuration_models._hdf5_dataset_io import ( + HDF5DatasetIOConfiguration, +) -def main(): - # Parameters for conversion - data_dir_path = Path("/Volumes/T7/CatalystNeuro/Lerner/raw_data") - output_dir_path = Path("/Volumes/T7/CatalystNeuro/Lerner/conversion_nwb") - stub_test = False - - if output_dir_path.exists(): - shutil.rmtree( - output_dir_path, ignore_errors=True - ) # ignore errors due to MacOS race condition (https://github.com/python/cpython/issues/81441) - - # Fiber Photometry session - experiment_type = "FP" - experimental_group = "PS" - subject_id = "112.283" - start_datetime = datetime(2019, 6, 20, 9, 32, 4) - session_conditions = { - "Start Date": start_datetime.strftime("%m/%d/%y"), - "Start Time": start_datetime.strftime("%H:%M:%S"), - } - start_variable = "Start Date" - behavior_file_path = ( - data_dir_path - / f"{experiment_type} Experiments" - / "Behavior" - / f"{experimental_group}" - / f"{subject_id}" - / f"{subject_id}" - ) - fiber_photometry_folder_path = ( - data_dir_path - / f"{experiment_type} Experiments" - / "Photometry" - / f"Punishment Sensitive" - / f"Early RI60" - / f"Photo_{subject_id.split('.')[0]}_{subject_id.split('.')[1]}-190620-093542" - ) - session_to_nwb( - data_dir_path=data_dir_path, - output_dir_path=output_dir_path, - behavior_file_path=behavior_file_path, - fiber_photometry_folder_path=fiber_photometry_folder_path, - has_demodulated_commanded_voltages=False, - subject_id=subject_id, - session_conditions=session_conditions, - start_variable=start_variable, - experiment_type=experiment_type, - experimental_group=experimental_group, - stub_test=stub_test, + +def write_nwbfile(nwbfile_path: Path): + if nwbfile_path.exists(): + os.remove(nwbfile_path) + nwbfile = mock_NWBFile() + timestamps = np.arange(10.0) + data = np.arange(100, 200, 10) + time_series_with_timestamps = TimeSeries( + name="test_timeseries", + description="an example time series", + data=H5DataIO(data=data, compression="gzip", chunks=(1,), compression_opts=2), + 
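For reference, the filter settings mirrored by the new classmethod above come straight off the h5py dataset that backs the read NWBFile. A minimal sketch of the attributes it relies on, using a hypothetical file name and dataset path:

import h5py

# Hypothetical file and dataset path, for illustration only.
with h5py.File("example.nwb", "r") as file:
    dataset = file["acquisition/test_timeseries/data"]
    print(dataset.shape)             # -> full_shape
    print(dataset.dtype)             # -> dtype
    print(dataset.chunks)            # -> chunk_shape (None if the dataset is contiguous)
    print(dataset.maxshape)          # -> used above as buffer_shape
    print(dataset.compression)       # -> compression_method, e.g. "gzip"
    print(dataset.compression_opts)  # -> compression options, e.g. 2 for gzip level 2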
unit="m", + timestamps=timestamps, ) + nwbfile.add_acquisition(time_series_with_timestamps) + with NWBHDF5IO(nwbfile_path, mode="w") as io: + io.write(nwbfile) + + +def main(): + nwbfile_path = Path("/Volumes/T7/CatalystNeuro/temp.nwb") + write_nwbfile(nwbfile_path) + with NWBHDF5IO(nwbfile_path, mode="r") as io: + nwbfile = io.read() + for neurodata_object in nwbfile.objects.values(): + print(neurodata_object.name) + if isinstance(neurodata_object, TimeSeries): + config = HDF5DatasetIOConfiguration.from_existing_neurodata_object( + neurodata_object=neurodata_object, dataset_name="data" + ) + print(f"{config.chunk_shape = }") + print(f"{config.compression_method = }") + print(f"{config.compression_options = }") if __name__ == "__main__": From c33dfbfbb367bf978e99a85ce3130383233eeca4 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 12 Aug 2024 15:51:07 -0700 Subject: [PATCH 03/33] added get_existing_dataset_io_configurations --- .../nwb_helpers/_dataset_configuration.py | 60 +++++++++++++++++++ temp_test.py | 20 +++---- 2 files changed, 68 insertions(+), 12 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index f3d8e7560..bca151eaf 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -172,3 +172,63 @@ def get_default_dataset_io_configurations( ) yield dataset_io_configuration + + +def get_existing_dataset_io_configurations( + nwbfile: NWBFile, + backend: Literal["hdf5", "zarr"], +) -> Generator[DatasetIOConfiguration, None, None]: + + DatasetIOConfigurationClass = DATASET_IO_CONFIGURATIONS[backend] + + known_dataset_fields = ("data", "timestamps") + for neurodata_object in nwbfile.objects.values(): + if isinstance(neurodata_object, DynamicTable): + dynamic_table = neurodata_object # For readability + + for column in dynamic_table.columns: + candidate_dataset = column.data # VectorData object + + # Skip over columns whose values are links, such as the 'group' of an ElectrodesTable + if any(isinstance(value, Container) for value in candidate_dataset): + continue # Skip + + # Skip when columns whose values are a reference type + if isinstance(column, TimeSeriesReferenceVectorData): + continue + + # Skip datasets with any zero-length axes + dataset_name = "data" + candidate_dataset = getattr(column, dataset_name) + full_shape = get_data_shape(data=candidate_dataset) + if any(axis_length == 0 for axis_length in full_shape): + continue + + dataset_io_configuration = DatasetIOConfigurationClass.from_existing_neurodata_object( + neurodata_object=column, dataset_name=dataset_name + ) + + yield dataset_io_configuration + elif isinstance(neurodata_object, NWBContainer): + for known_dataset_field in known_dataset_fields: + # Skip optional fields that aren't present + if known_dataset_field not in neurodata_object.fields: + continue + + candidate_dataset = getattr(neurodata_object, known_dataset_field) + + # Skip edge case of in-memory ImageSeries with external mode; data is in fields and is empty array + if isinstance(candidate_dataset, np.ndarray) and candidate_dataset.size == 0: + continue + + # Skip datasets with any zero-length axes + candidate_dataset = getattr(neurodata_object, known_dataset_field) + full_shape = get_data_shape(data=candidate_dataset) + if any(axis_length == 0 for axis_length in full_shape): + continue + + dataset_io_configuration = DatasetIOConfigurationClass.from_existing_neurodata_object( + 
neurodata_object=neurodata_object, dataset_name=known_dataset_field + ) + + yield dataset_io_configuration diff --git a/temp_test.py b/temp_test.py index c8ab00784..74d0e17c2 100644 --- a/temp_test.py +++ b/temp_test.py @@ -5,8 +5,8 @@ from pynwb import NWBHDF5IO, H5DataIO, TimeSeries from pynwb.testing.mock.file import mock_NWBFile -from neuroconv.tools.nwb_helpers._configuration_models._hdf5_dataset_io import ( - HDF5DatasetIOConfiguration, +from neuroconv.tools.nwb_helpers._dataset_configuration import ( + get_existing_dataset_io_configurations, ) @@ -21,7 +21,9 @@ def write_nwbfile(nwbfile_path: Path): description="an example time series", data=H5DataIO(data=data, compression="gzip", chunks=(1,), compression_opts=2), unit="m", - timestamps=timestamps, + timestamps=H5DataIO( + timestamps, compression="gzip", chunks=(1,), compression_opts=2 + ), # TODO: add support for uncompressed timestamps ) nwbfile.add_acquisition(time_series_with_timestamps) with NWBHDF5IO(nwbfile_path, mode="w") as io: @@ -33,15 +35,9 @@ def main(): write_nwbfile(nwbfile_path) with NWBHDF5IO(nwbfile_path, mode="r") as io: nwbfile = io.read() - for neurodata_object in nwbfile.objects.values(): - print(neurodata_object.name) - if isinstance(neurodata_object, TimeSeries): - config = HDF5DatasetIOConfiguration.from_existing_neurodata_object( - neurodata_object=neurodata_object, dataset_name="data" - ) - print(f"{config.chunk_shape = }") - print(f"{config.compression_method = }") - print(f"{config.compression_options = }") + existing_dataset_io_configurations = get_existing_dataset_io_configurations(nwbfile, backend="hdf5") + for dataset_io_configuration in existing_dataset_io_configurations: + print(dataset_io_configuration) if __name__ == "__main__": From 80c1fbae37889ec8500de1db273b96bebed160b0 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 13 Aug 2024 11:49:14 -0700 Subject: [PATCH 04/33] added support for chunk_shape=None --- .../_configuration_models/_base_dataset_io.py | 15 +++++++++------ temp_test.py | 4 +--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py index 01e291034..20fcf9d0c 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_dataset_io.py @@ -147,7 +147,6 @@ def __str__(self) -> str: """ size_in_bytes = math.prod(self.full_shape) * self.dtype.itemsize maximum_ram_usage_per_iteration_in_bytes = math.prod(self.buffer_shape) * self.dtype.itemsize - disk_space_usage_per_chunk_in_bytes = math.prod(self.chunk_shape) * self.dtype.itemsize string = ( f"\n{self.location_in_file}" @@ -159,10 +158,14 @@ def __str__(self) -> str: f"\n buffer shape : {self.buffer_shape}" f"\n expected RAM usage : {human_readable_size(maximum_ram_usage_per_iteration_in_bytes)}" "\n" - f"\n chunk shape : {self.chunk_shape}" - f"\n disk space usage per chunk : {human_readable_size(disk_space_usage_per_chunk_in_bytes)}" - "\n" ) + if self.chunk_shape is not None: + disk_space_usage_per_chunk_in_bytes = math.prod(self.chunk_shape) * self.dtype.itemsize + string += ( + f"\n chunk shape : {self.chunk_shape}" + f"\n disk space usage per chunk : {human_readable_size(disk_space_usage_per_chunk_in_bytes)}" + "\n" + ) if self.compression_method is not None: string += f"\n compression method : {self.compression_method}" if self.compression_options is not None: @@ -182,9 
+185,9 @@ def validate_all_shapes(cls, values: Dict[str, Any]) -> Dict[str, Any]: dataset_name == location_in_file.split("/")[-1] ), f"The `dataset_name` ({dataset_name}) does not match the end of the `location_in_file` ({location_in_file})!" - chunk_shape = values["chunk_shape"] - buffer_shape = values["buffer_shape"] full_shape = values["full_shape"] + chunk_shape = values["chunk_shape"] if values["chunk_shape"] is not None else full_shape + buffer_shape = values["buffer_shape"] if values["buffer_shape"] is not None else full_shape if len(chunk_shape) != len(buffer_shape): raise ValueError( diff --git a/temp_test.py b/temp_test.py index 74d0e17c2..ec75b139a 100644 --- a/temp_test.py +++ b/temp_test.py @@ -21,9 +21,7 @@ def write_nwbfile(nwbfile_path: Path): description="an example time series", data=H5DataIO(data=data, compression="gzip", chunks=(1,), compression_opts=2), unit="m", - timestamps=H5DataIO( - timestamps, compression="gzip", chunks=(1,), compression_opts=2 - ), # TODO: add support for uncompressed timestamps + timestamps=timestamps, ) nwbfile.add_acquisition(time_series_with_timestamps) with NWBHDF5IO(nwbfile_path, mode="w") as io: From 7ee6fc6db71534047fd687fe5216616440168bed Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 13 Aug 2024 12:04:40 -0700 Subject: [PATCH 05/33] added from_existing_nwbfile to HDF5BackendConfiguration --- .../_configuration_models/_hdf5_backend.py | 14 +++++++++++++- temp_test.py | 9 ++++----- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_backend.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_backend.py index f85d388b7..a8c416292 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_backend.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_backend.py @@ -3,10 +3,12 @@ from typing import ClassVar, Dict, Literal, Type from pydantic import Field -from pynwb import H5DataIO +from pynwb import H5DataIO, NWBFile +from typing_extensions import Self from ._base_backend import BackendConfiguration from ._hdf5_dataset_io import HDF5DatasetIOConfiguration +from .._dataset_configuration import get_existing_dataset_io_configurations class HDF5BackendConfiguration(BackendConfiguration): @@ -22,3 +24,13 @@ class HDF5BackendConfiguration(BackendConfiguration): "information for writing the datasets to disk using the HDF5 backend." 
) ) + + @classmethod + def from_existing_nwbfile(cls, nwbfile: NWBFile) -> Self: + existing_dataset_configurations = get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend) + dataset_configurations = { + existing_dataset_configuration.location_in_file: existing_dataset_configuration + for existing_dataset_configuration in existing_dataset_configurations + } + + return cls(dataset_configurations=dataset_configurations) diff --git a/temp_test.py b/temp_test.py index ec75b139a..e0d99cacc 100644 --- a/temp_test.py +++ b/temp_test.py @@ -5,8 +5,8 @@ from pynwb import NWBHDF5IO, H5DataIO, TimeSeries from pynwb.testing.mock.file import mock_NWBFile -from neuroconv.tools.nwb_helpers._dataset_configuration import ( - get_existing_dataset_io_configurations, +from neuroconv.tools.nwb_helpers._configuration_models._hdf5_backend import ( + HDF5BackendConfiguration, ) @@ -33,9 +33,8 @@ def main(): write_nwbfile(nwbfile_path) with NWBHDF5IO(nwbfile_path, mode="r") as io: nwbfile = io.read() - existing_dataset_io_configurations = get_existing_dataset_io_configurations(nwbfile, backend="hdf5") - for dataset_io_configuration in existing_dataset_io_configurations: - print(dataset_io_configuration) + hdf5_backend_config = HDF5BackendConfiguration.from_existing_nwbfile(nwbfile) + print(hdf5_backend_config) if __name__ == "__main__": From dacdeeaa47a8f98018a08ec3a7912e2c54933214 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 13 Aug 2024 12:25:28 -0700 Subject: [PATCH 06/33] added get_existing_backend_configuration --- .../tools/nwb_helpers/_backend_configuration.py | 17 ++++++++++++++++- temp_test.py | 8 ++++---- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_backend_configuration.py b/src/neuroconv/tools/nwb_helpers/_backend_configuration.py index 8cb465c76..488e03a35 100644 --- a/src/neuroconv/tools/nwb_helpers/_backend_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_backend_configuration.py @@ -2,7 +2,8 @@ from typing import Literal, Union -from pynwb import NWBFile +from hdmf_zarr import NWBZarrIO +from pynwb import NWBHDF5IO, NWBFile from ._configuration_models._hdf5_backend import HDF5BackendConfiguration from ._configuration_models._zarr_backend import ZarrBackendConfiguration @@ -17,3 +18,17 @@ def get_default_backend_configuration( BackendConfigurationClass = BACKEND_CONFIGURATIONS[backend] return BackendConfigurationClass.from_nwbfile(nwbfile=nwbfile) + + +def get_existing_backend_configuration(nwbfile: NWBFile) -> Union[HDF5BackendConfiguration, ZarrBackendConfiguration]: + """Fill an existing backend configuration to serve as a starting point for further customization.""" + + read_io = nwbfile.read_io + if isinstance(read_io, NWBHDF5IO): + backend = "hdf5" + elif isinstance(read_io, NWBZarrIO): + backend = "zarr" + else: + raise ValueError(f"The backend of the NWBFile from io {read_io} is not recognized.") + BackendConfigurationClass = BACKEND_CONFIGURATIONS[backend] + return BackendConfigurationClass.from_existing_nwbfile(nwbfile=nwbfile) diff --git a/temp_test.py b/temp_test.py index e0d99cacc..dfc477918 100644 --- a/temp_test.py +++ b/temp_test.py @@ -5,8 +5,8 @@ from pynwb import NWBHDF5IO, H5DataIO, TimeSeries from pynwb.testing.mock.file import mock_NWBFile -from neuroconv.tools.nwb_helpers._configuration_models._hdf5_backend import ( - HDF5BackendConfiguration, +from neuroconv.tools.nwb_helpers._backend_configuration import ( + get_existing_backend_configuration, ) @@ -33,8 +33,8 @@ def main(): 
write_nwbfile(nwbfile_path) with NWBHDF5IO(nwbfile_path, mode="r") as io: nwbfile = io.read() - hdf5_backend_config = HDF5BackendConfiguration.from_existing_nwbfile(nwbfile) - print(hdf5_backend_config) + backend_config = get_existing_backend_configuration(nwbfile=nwbfile) + print(backend_config) if __name__ == "__main__": From dae04bfefa33a69901afa92ca03471945ea6fbf5 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 13 Aug 2024 15:07:56 -0700 Subject: [PATCH 07/33] added repack_nwbfile --- src/neuroconv/tools/nwb_helpers/__init__.py | 4 ++ .../nwb_helpers/_metadata_and_file_helpers.py | 52 ++++++++++++++++++- temp_test.py | 18 +++++-- 3 files changed, 69 insertions(+), 5 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index 6221aead6..a7f5924b0 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -6,6 +6,7 @@ from ._backend_configuration import ( BACKEND_CONFIGURATIONS, get_default_backend_configuration, + get_existing_backend_configuration, ) from ._configuration_models import DATASET_IO_CONFIGURATIONS from ._configuration_models._base_backend import BackendConfiguration @@ -30,6 +31,7 @@ get_module, make_nwbfile_from_metadata, make_or_load_nwbfile, + repack_nwbfile, ) __all__ = [ @@ -46,6 +48,7 @@ "ZarrDatasetIOConfiguration", "get_default_backend_configuration", "get_default_dataset_io_configurations", + "get_existing_backend_configuration", "configure_backend", "get_default_dataset_io_configurations", "get_default_backend_configuration", @@ -55,4 +58,5 @@ "get_module", "make_nwbfile_from_metadata", "make_or_load_nwbfile", + "repack_nwbfile", ] diff --git a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py index c3aaea48d..0711883ff 100644 --- a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py +++ b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py @@ -15,7 +15,12 @@ from pynwb import NWBHDF5IO, NWBFile from pynwb.file import Subject -from . import BackendConfiguration, configure_backend, get_default_backend_configuration +from . 
import ( + BackendConfiguration, + configure_backend, + get_default_backend_configuration, + get_existing_backend_configuration, +) from ...utils.dict import DeepDict, load_dict_from_file from ...utils.json_schema import validate_metadata @@ -370,3 +375,48 @@ def configure_and_write_nwbfile( with IO(output_filepath, mode="w") as io: io.write(nwbfile) + + +def configure_and_export_nwbfile( + nwbfile: NWBFile, + export_nwbfile_path: Path, + backend_configuration: BackendConfiguration, +) -> None: + configure_backend(nwbfile=nwbfile, backend_configuration=backend_configuration) + + IO = BACKEND_NWB_IO[backend_configuration.backend] + + with IO(export_nwbfile_path, mode="w") as io: + io.export(nwbfile=nwbfile, src_io=nwbfile.read_io) + + +def repack_nwbfile( + *, + nwbfile: NWBFile, + export_nwbfile_path: Path, + backend_configuration_changes: dict, + template: Literal["existing", "default"], +): + """Repack the NWBFile with the new backend configuration changes.""" + + if template == "existing": + backend_configuration = get_existing_backend_configuration(nwbfile=nwbfile) + elif template == "default": + read_io = nwbfile.read_io + if isinstance(read_io, NWBHDF5IO): + backend = "hdf5" + elif isinstance(read_io, NWBZarrIO): + backend = "zarr" + else: + raise ValueError(f"The backend of the NWBFile from io {read_io} is not recognized.") + backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend=backend) + dataset_configurations = backend_configuration.dataset_configurations + + for neurodata_object_location, dataset_config_changes in backend_configuration_changes.items(): + dataset_configuration = dataset_configurations[neurodata_object_location] + for dataset_config_key, dataset_config_value in dataset_config_changes.items(): + setattr(dataset_configuration, dataset_config_key, dataset_config_value) + + configure_and_export_nwbfile( + nwbfile=nwbfile, backend_configuration=backend_configuration, export_nwbfile_path=export_nwbfile_path + ) diff --git a/temp_test.py b/temp_test.py index dfc477918..7148a5466 100644 --- a/temp_test.py +++ b/temp_test.py @@ -5,8 +5,8 @@ from pynwb import NWBHDF5IO, H5DataIO, TimeSeries from pynwb.testing.mock.file import mock_NWBFile -from neuroconv.tools.nwb_helpers._backend_configuration import ( - get_existing_backend_configuration, +from neuroconv.tools.nwb_helpers import ( + repack_nwbfile, ) @@ -30,11 +30,21 @@ def write_nwbfile(nwbfile_path: Path): def main(): nwbfile_path = Path("/Volumes/T7/CatalystNeuro/temp.nwb") + repacked_nwbfile_path = Path("/Volumes/T7/CatalystNeuro/repacked_temp.nwb") write_nwbfile(nwbfile_path) with NWBHDF5IO(nwbfile_path, mode="r") as io: nwbfile = io.read() - backend_config = get_existing_backend_configuration(nwbfile=nwbfile) - print(backend_config) + backend_configuration_changes = {"acquisition/test_timeseries/data": dict(chunk_shape=(2,))} + repack_nwbfile( + nwbfile=nwbfile, + export_nwbfile_path=repacked_nwbfile_path, + backend_configuration_changes=backend_configuration_changes, + template="existing", + ) + + with NWBHDF5IO(repacked_nwbfile_path, mode="r") as io: + nwbfile = io.read() + print(f'{nwbfile.acquisition["test_timeseries"].data.chunks = }') if __name__ == "__main__": From 4ac6e33c1415e77c7c715501f3bbb53a7108011b Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 14 Aug 2024 14:15:25 -0700 Subject: [PATCH 08/33] fixed bug with export options and hdmf.container.Container.set_data_io --- .../tools/nwb_helpers/_metadata_and_file_helpers.py | 4 ++-- temp_test.py | 5 ++++- 2 files changed, 
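As introduced at this point in the series, repack_nwbfile takes an in-memory NWBFile still attached to its read io, a template ("existing" starts from the chunking/compression already on disk, "default" from get_default_backend_configuration), and a changes dict keyed by each dataset's location_in_file whose values overwrite attributes on the matching DatasetIOConfiguration. A minimal sketch of this early signature (file paths are hypothetical; later commits in this series replace the nwbfile argument with a path):

from pathlib import Path
from pynwb import NWBHDF5IO
from neuroconv.tools.nwb_helpers import repack_nwbfile

# Hypothetical paths for illustration.
backend_configuration_changes = {
    "acquisition/test_timeseries/data": dict(chunk_shape=(2,)),  # location_in_file -> overrides
}
with NWBHDF5IO("in.nwb", mode="r") as io:
    nwbfile = io.read()
    repack_nwbfile(
        nwbfile=nwbfile,
        export_nwbfile_path=Path("out.nwb"),
        backend_configuration_changes=backend_configuration_changes,
        template="existing",
    )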
6 insertions(+), 3 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py index 0711883ff..ef1ed28fc 100644 --- a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py +++ b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py @@ -385,9 +385,9 @@ def configure_and_export_nwbfile( configure_backend(nwbfile=nwbfile, backend_configuration=backend_configuration) IO = BACKEND_NWB_IO[backend_configuration.backend] - + nwbfile.set_modified() with IO(export_nwbfile_path, mode="w") as io: - io.export(nwbfile=nwbfile, src_io=nwbfile.read_io) + io.export(nwbfile=nwbfile, src_io=nwbfile.read_io, write_args=dict(link_data=False)) def repack_nwbfile( diff --git a/temp_test.py b/temp_test.py index 7148a5466..349fc4c18 100644 --- a/temp_test.py +++ b/temp_test.py @@ -31,7 +31,10 @@ def write_nwbfile(nwbfile_path: Path): def main(): nwbfile_path = Path("/Volumes/T7/CatalystNeuro/temp.nwb") repacked_nwbfile_path = Path("/Volumes/T7/CatalystNeuro/repacked_temp.nwb") - write_nwbfile(nwbfile_path) + if repacked_nwbfile_path.exists(): + os.remove(repacked_nwbfile_path) + if not nwbfile_path.exists(): + write_nwbfile(nwbfile_path) with NWBHDF5IO(nwbfile_path, mode="r") as io: nwbfile = io.read() backend_configuration_changes = {"acquisition/test_timeseries/data": dict(chunk_shape=(2,))} From ce267fb1df5e269eb2b260b6127cca8cd4b5fd97 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 14 Aug 2024 16:39:35 -0700 Subject: [PATCH 09/33] refactored from_ methods --- .../nwb_helpers/_backend_configuration.py | 2 +- .../_configuration_models/_base_backend.py | 16 ++++-- .../_configuration_models/_hdf5_backend.py | 14 +---- .../_configuration_models/_hdf5_dataset_io.py | 54 +++++++++++-------- .../nwb_helpers/_dataset_configuration.py | 12 +++-- 5 files changed, 54 insertions(+), 44 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_backend_configuration.py b/src/neuroconv/tools/nwb_helpers/_backend_configuration.py index 488e03a35..3e47876f5 100644 --- a/src/neuroconv/tools/nwb_helpers/_backend_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_backend_configuration.py @@ -31,4 +31,4 @@ def get_existing_backend_configuration(nwbfile: NWBFile) -> Union[HDF5BackendCon else: raise ValueError(f"The backend of the NWBFile from io {read_io} is not recognized.") BackendConfigurationClass = BACKEND_CONFIGURATIONS[backend] - return BackendConfigurationClass.from_existing_nwbfile(nwbfile=nwbfile) + return BackendConfigurationClass.from_nwbfile(nwbfile=nwbfile, mode="existing") diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py index 2c07a1bb0..ed159df8f 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py @@ -9,7 +9,10 @@ from ._base_dataset_io import DatasetIOConfiguration from ._pydantic_pure_json_schema_generator import PureJSONSchemaGenerator -from .._dataset_configuration import get_default_dataset_io_configurations +from .._dataset_configuration import ( + get_default_dataset_io_configurations, + get_existing_dataset_io_configurations, +) class BackendConfiguration(BaseModel): @@ -56,11 +59,16 @@ def model_json_schema(cls, **kwargs) -> Dict[str, Any]: return super().model_json_schema(mode="validation", schema_generator=PureJSONSchemaGenerator, **kwargs) @classmethod - def 
from_nwbfile(cls, nwbfile: NWBFile) -> Self: - default_dataset_configurations = get_default_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend) + def from_nwbfile(cls, nwbfile: NWBFile, mode: Literal["default", "existing"] = "default") -> Self: + if mode == "default": + dataset_io_configurations = get_default_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend) + elif mode == "existing": + dataset_io_configurations = get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend) + else: + raise ValueError(f"mode must be either 'default' or 'existing' but got {mode}") dataset_configurations = { default_dataset_configuration.location_in_file: default_dataset_configuration - for default_dataset_configuration in default_dataset_configurations + for default_dataset_configuration in dataset_io_configurations } return cls(dataset_configurations=dataset_configurations) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_backend.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_backend.py index a8c416292..f85d388b7 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_backend.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_backend.py @@ -3,12 +3,10 @@ from typing import ClassVar, Dict, Literal, Type from pydantic import Field -from pynwb import H5DataIO, NWBFile -from typing_extensions import Self +from pynwb import H5DataIO from ._base_backend import BackendConfiguration from ._hdf5_dataset_io import HDF5DatasetIOConfiguration -from .._dataset_configuration import get_existing_dataset_io_configurations class HDF5BackendConfiguration(BackendConfiguration): @@ -24,13 +22,3 @@ class HDF5BackendConfiguration(BackendConfiguration): "information for writing the datasets to disk using the HDF5 backend." 
) ) - - @classmethod - def from_existing_nwbfile(cls, nwbfile: NWBFile) -> Self: - existing_dataset_configurations = get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend) - dataset_configurations = { - existing_dataset_configuration.location_in_file: existing_dataset_configuration - for existing_dataset_configuration in existing_dataset_configurations - } - - return cls(dataset_configurations=dataset_configurations) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py index 03c29389e..6a0508106 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py @@ -82,26 +82,36 @@ def get_data_io_kwargs(self) -> Dict[str, Any]: return dict(chunks=self.chunk_shape, **compression_bundle) @classmethod - def from_existing_neurodata_object( - cls, neurodata_object: Container, dataset_name: Literal["data", "timestamps"] + def from_neurodata_object( + cls, + neurodata_object: Container, + dataset_name: Literal["data", "timestamps"], + mode: Literal["default", "existing"] = "default", ) -> Self: - location_in_file = _find_location_in_memory_nwbfile(neurodata_object=neurodata_object, field_name=dataset_name) - full_shape = getattr(neurodata_object, dataset_name).shape - dtype = getattr(neurodata_object, dataset_name).dtype - chunk_shape = getattr(neurodata_object, dataset_name).chunks - buffer_shape = getattr(neurodata_object, dataset_name).maxshape - compression_method = getattr(neurodata_object, dataset_name).compression - compression_opts = getattr(neurodata_object, dataset_name).compression_opts - compression_options = dict(compression_opts=compression_opts) - return cls( - object_id=neurodata_object.object_id, - object_name=neurodata_object.name, - location_in_file=location_in_file, - dataset_name=dataset_name, - full_shape=full_shape, - dtype=dtype, - chunk_shape=chunk_shape, - buffer_shape=buffer_shape, - compression_method=compression_method, - compression_options=compression_options, - ) + if mode == "default": + return super().from_neurodata_object(neurodata_object=neurodata_object, dataset_name=dataset_name) + elif mode == "existing": + location_in_file = _find_location_in_memory_nwbfile( + neurodata_object=neurodata_object, field_name=dataset_name + ) + full_shape = getattr(neurodata_object, dataset_name).shape + dtype = getattr(neurodata_object, dataset_name).dtype + chunk_shape = getattr(neurodata_object, dataset_name).chunks + buffer_shape = getattr(neurodata_object, dataset_name).maxshape + compression_method = getattr(neurodata_object, dataset_name).compression + compression_opts = getattr(neurodata_object, dataset_name).compression_opts + compression_options = dict(compression_opts=compression_opts) + return cls( + object_id=neurodata_object.object_id, + object_name=neurodata_object.name, + location_in_file=location_in_file, + dataset_name=dataset_name, + full_shape=full_shape, + dtype=dtype, + chunk_shape=chunk_shape, + buffer_shape=buffer_shape, + compression_method=compression_method, + compression_options=compression_options, + ) + else: + raise ValueError(f"mode must be either 'default' or 'existing' but got {mode}") diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index bca151eaf..a6428ae40 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ 
b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -204,8 +204,10 @@ def get_existing_dataset_io_configurations( if any(axis_length == 0 for axis_length in full_shape): continue - dataset_io_configuration = DatasetIOConfigurationClass.from_existing_neurodata_object( - neurodata_object=column, dataset_name=dataset_name + dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object( + neurodata_object=column, + dataset_name=dataset_name, + mode="existing", ) yield dataset_io_configuration @@ -227,8 +229,10 @@ def get_existing_dataset_io_configurations( if any(axis_length == 0 for axis_length in full_shape): continue - dataset_io_configuration = DatasetIOConfigurationClass.from_existing_neurodata_object( - neurodata_object=neurodata_object, dataset_name=known_dataset_field + dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object( + neurodata_object=neurodata_object, + dataset_name=known_dataset_field, + mode="existing", ) yield dataset_io_configuration From 49f42624e85f94fdfde0b48dd8a512dc1403fea0 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 14 Aug 2024 16:54:01 -0700 Subject: [PATCH 10/33] template and changes optional --- .../tools/nwb_helpers/_metadata_and_file_helpers.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py index ef1ed28fc..54674f899 100644 --- a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py +++ b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py @@ -394,8 +394,8 @@ def repack_nwbfile( *, nwbfile: NWBFile, export_nwbfile_path: Path, - backend_configuration_changes: dict, - template: Literal["existing", "default"], + template: Literal["existing", "default"] = "default", + backend_configuration_changes: dict = None, ): """Repack the NWBFile with the new backend configuration changes.""" @@ -410,8 +410,11 @@ def repack_nwbfile( else: raise ValueError(f"The backend of the NWBFile from io {read_io} is not recognized.") backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend=backend) + else: + raise ValueError(f"template must be either 'default' or 'existing' but got {template}") dataset_configurations = backend_configuration.dataset_configurations + backend_configuration_changes = backend_configuration_changes or dict() for neurodata_object_location, dataset_config_changes in backend_configuration_changes.items(): dataset_configuration = dataset_configurations[neurodata_object_location] for dataset_config_key, dataset_config_value in dataset_config_changes.items(): From d93a5c50045f56e131fd8576434e7e52ae0db212 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 14 Aug 2024 17:34:24 -0700 Subject: [PATCH 11/33] added image series test --- .../test_get_default_backend_configuration.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py index 55ab2db72..f426f0b25 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py @@ -8,6 +8,7 @@ import pytest 
from hdmf_zarr import NWBZarrIO from pynwb import NWBHDF5IO, NWBFile +from pynwb.image import ImageSeries from pynwb.testing.mock.base import mock_TimeSeries from pynwb.testing.mock.file import mock_NWBFile @@ -262,3 +263,22 @@ def test_complex_zarr(zarr_nwbfile_path): """ assert stdout.getvalue() == expected_print + + +def test_000_ImageSeries(): + nwbfile = mock_NWBFile() + + im_series = ImageSeries( + name="my_video", external_file=["my_video.mp4"], starting_frame=[0], format="external", rate=30.0 + ) + nwbfile.add_acquisition(im_series) + + with NWBHDF5IO("test.nwb", "w") as io: + io.write(nwbfile) + + io = NWBHDF5IO("test.nwb", "r") + nwbfile = io.read() + print(nwbfile.acquisition["my_video"]) + + backend_config = get_default_backend_configuration(nwbfile, "hdf5") + print(backend_config) # TODO: Figure out why this doesn't throw an error like Ben said it did From 1ad69ca5ab32bdca537fa09a8554d9310aeb245b Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Thu, 15 Aug 2024 16:45:36 -0700 Subject: [PATCH 12/33] added initial test --- .../test_helpers/test_repack_nwbfile.py | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py new file mode 100644 index 000000000..24b8a005a --- /dev/null +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py @@ -0,0 +1,69 @@ +from pathlib import Path + +import numpy as np +import pytest +from hdmf_zarr import NWBZarrIO +from pynwb import NWBHDF5IO, NWBFile +from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.testing.mock.file import mock_NWBFile + +from neuroconv.tools.nwb_helpers import ( + get_module, + repack_nwbfile, +) + + +def generate_complex_nwbfile() -> NWBFile: + nwbfile = mock_NWBFile() + + raw_array = np.array([[1, 2, 3], [4, 5, 6]]) + raw_time_series = mock_TimeSeries(name="RawTimeSeries", data=raw_array) + nwbfile.add_acquisition(raw_time_series) + + number_of_trials = 10 + for start_time, stop_time in zip( + np.linspace(start=0.0, stop=10.0, num=number_of_trials), np.linspace(start=1.0, stop=11.0, num=number_of_trials) + ): + nwbfile.add_trial(start_time=start_time, stop_time=stop_time) + + ecephys_module = get_module(nwbfile=nwbfile, name="ecephys") + processed_array = np.array([[7.0, 8.0], [9.0, 10.0], [11.0, 12.0], [13.0, 14.0]]) + processed_time_series = mock_TimeSeries(name="ProcessedTimeSeries", data=processed_array) + ecephys_module.add(processed_time_series) + + return nwbfile + + +@pytest.fixture(scope="session") +def hdf5_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_repack_nwbfile.nwb.h5") + if not Path(nwbfile_path).exists(): + nwbfile = generate_complex_nwbfile() + with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +@pytest.fixture(scope="session") +def zarr_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_repack_nwbfile.nwb.zarr") + if not Path(nwbfile_path).exists(): + nwbfile = generate_complex_nwbfile() + with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +def test_repack_nwbfile(hdf5_nwbfile_path): + export_path = 
Path(hdf5_nwbfile_path).parent / "repacked_test_repack_nwbfile.nwb.h5" + with NWBHDF5IO(hdf5_nwbfile_path, mode="r") as io: + nwbfile = io.read() + repack_nwbfile( + nwbfile=nwbfile, + export_nwbfile_path=export_path, + ) + + with NWBHDF5IO(export_path, mode="r") as io: + nwbfile = io.read() + assert nwbfile.acquisition["RawTimeSeries"].data.chunks == (2, 3) + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.chunks == (4, 2) From 04fb89c4701119faac45d45da0972b4bd16b6c73 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 16 Aug 2024 07:54:58 -0700 Subject: [PATCH 13/33] updated signature to use file_path --- .../nwb_helpers/_metadata_and_file_helpers.py | 47 +++++++++---------- temp_test.py | 17 ++++--- .../test_helpers/test_repack_nwbfile.py | 11 ++--- 3 files changed, 34 insertions(+), 41 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py index 54674f899..12d31ca0b 100644 --- a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py +++ b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py @@ -392,34 +392,29 @@ def configure_and_export_nwbfile( def repack_nwbfile( *, - nwbfile: NWBFile, + nwbfile_path: Path, export_nwbfile_path: Path, - template: Literal["existing", "default"] = "default", + backend: Literal["hdf5", "zarr"] = "hdf5", + export_backend: Literal["hdf5", "zarr", None] = None, + use_default_backend_configuration: bool = True, backend_configuration_changes: dict = None, ): """Repack the NWBFile with the new backend configuration changes.""" - - if template == "existing": - backend_configuration = get_existing_backend_configuration(nwbfile=nwbfile) - elif template == "default": - read_io = nwbfile.read_io - if isinstance(read_io, NWBHDF5IO): - backend = "hdf5" - elif isinstance(read_io, NWBZarrIO): - backend = "zarr" + IO = BACKEND_NWB_IO[backend] + with IO(nwbfile_path, mode="r") as io: + nwbfile = io.read() + if use_default_backend_configuration: + backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend=backend) else: - raise ValueError(f"The backend of the NWBFile from io {read_io} is not recognized.") - backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend=backend) - else: - raise ValueError(f"template must be either 'default' or 'existing' but got {template}") - dataset_configurations = backend_configuration.dataset_configurations - - backend_configuration_changes = backend_configuration_changes or dict() - for neurodata_object_location, dataset_config_changes in backend_configuration_changes.items(): - dataset_configuration = dataset_configurations[neurodata_object_location] - for dataset_config_key, dataset_config_value in dataset_config_changes.items(): - setattr(dataset_configuration, dataset_config_key, dataset_config_value) - - configure_and_export_nwbfile( - nwbfile=nwbfile, backend_configuration=backend_configuration, export_nwbfile_path=export_nwbfile_path - ) + backend_configuration = get_existing_backend_configuration(nwbfile=nwbfile) + dataset_configurations = backend_configuration.dataset_configurations + + backend_configuration_changes = backend_configuration_changes or dict() + for neurodata_object_location, dataset_config_changes in backend_configuration_changes.items(): + dataset_configuration = dataset_configurations[neurodata_object_location] + for dataset_config_key, dataset_config_value in dataset_config_changes.items(): + setattr(dataset_configuration, 
dataset_config_key, dataset_config_value) + + configure_and_export_nwbfile( + nwbfile=nwbfile, backend_configuration=backend_configuration, export_nwbfile_path=export_nwbfile_path + ) diff --git a/temp_test.py b/temp_test.py index 349fc4c18..4711388fe 100644 --- a/temp_test.py +++ b/temp_test.py @@ -35,15 +35,14 @@ def main(): os.remove(repacked_nwbfile_path) if not nwbfile_path.exists(): write_nwbfile(nwbfile_path) - with NWBHDF5IO(nwbfile_path, mode="r") as io: - nwbfile = io.read() - backend_configuration_changes = {"acquisition/test_timeseries/data": dict(chunk_shape=(2,))} - repack_nwbfile( - nwbfile=nwbfile, - export_nwbfile_path=repacked_nwbfile_path, - backend_configuration_changes=backend_configuration_changes, - template="existing", - ) + backend_configuration_changes = {"acquisition/test_timeseries/data": dict(chunk_shape=(2,))} + repack_nwbfile( + nwbfile_path=nwbfile_path, + export_nwbfile_path=repacked_nwbfile_path, + backend="hdf5", + backend_configuration_changes=backend_configuration_changes, + use_default_backend_configuration=False, + ) with NWBHDF5IO(repacked_nwbfile_path, mode="r") as io: nwbfile = io.read() diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py index 24b8a005a..54034acd8 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py @@ -56,12 +56,11 @@ def zarr_nwbfile_path(tmpdir_factory): def test_repack_nwbfile(hdf5_nwbfile_path): export_path = Path(hdf5_nwbfile_path).parent / "repacked_test_repack_nwbfile.nwb.h5" - with NWBHDF5IO(hdf5_nwbfile_path, mode="r") as io: - nwbfile = io.read() - repack_nwbfile( - nwbfile=nwbfile, - export_nwbfile_path=export_path, - ) + repack_nwbfile( + nwbfile_path=hdf5_nwbfile_path, + export_nwbfile_path=export_path, + backend="hdf5", + ) with NWBHDF5IO(export_path, mode="r") as io: nwbfile = io.read() From 6dab47778b38a117cce30fe3603497655e254317 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 16 Aug 2024 08:25:20 -0700 Subject: [PATCH 14/33] added test for trials table (fails) --- .../test_helpers/test_repack_nwbfile.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py index 54034acd8..26216d1a4 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py @@ -65,4 +65,5 @@ def test_repack_nwbfile(hdf5_nwbfile_path): with NWBHDF5IO(export_path, mode="r") as io: nwbfile = io.read() assert nwbfile.acquisition["RawTimeSeries"].data.chunks == (2, 3) + assert nwbfile.intervals["trials"].start_time.data.chunks == (10,) assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.chunks == (4, 2) From e6d31a69b604964fb27f3f945447263b1b932de2 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 16 Aug 2024 10:14:39 -0700 Subject: [PATCH 15/33] moved backend_configuration_changes to top of the fn --- src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py | 2 +- 1 file changed, 1 insertion(+), 1 
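With the path-based signature above, the caller no longer opens the file itself; passing use_default_backend_configuration=False keeps the chunking and compression already on disk as the starting point before any overrides are applied. A minimal sketch with hypothetical paths:

from pathlib import Path
from neuroconv.tools.nwb_helpers import repack_nwbfile

# Hypothetical paths for illustration.
repack_nwbfile(
    nwbfile_path=Path("in.nwb"),
    export_nwbfile_path=Path("out.nwb"),
    backend="hdf5",
    use_default_backend_configuration=False,
    backend_configuration_changes={"acquisition/test_timeseries/data": dict(chunk_shape=(2,))},
)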
deletion(-) diff --git a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py index 12d31ca0b..17d9d0682 100644 --- a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py +++ b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py @@ -400,6 +400,7 @@ def repack_nwbfile( backend_configuration_changes: dict = None, ): """Repack the NWBFile with the new backend configuration changes.""" + backend_configuration_changes = backend_configuration_changes or dict() IO = BACKEND_NWB_IO[backend] with IO(nwbfile_path, mode="r") as io: nwbfile = io.read() @@ -409,7 +410,6 @@ def repack_nwbfile( backend_configuration = get_existing_backend_configuration(nwbfile=nwbfile) dataset_configurations = backend_configuration.dataset_configurations - backend_configuration_changes = backend_configuration_changes or dict() for neurodata_object_location, dataset_config_changes in backend_configuration_changes.items(): dataset_configuration = dataset_configurations[neurodata_object_location] for dataset_config_key, dataset_config_value in dataset_config_changes.items(): From 72524495a1eee5a2d5eedaec50888058c878e2d3 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 16 Aug 2024 10:20:48 -0700 Subject: [PATCH 16/33] consolidated configure_and_export_nwbfile into configure_and_write_nwbfile --- .../nwb_helpers/_metadata_and_file_helpers.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py index 17d9d0682..8d24c8ed5 100644 --- a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py +++ b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py @@ -342,6 +342,7 @@ def configure_and_write_nwbfile( output_filepath: str, backend: Optional[Literal["hdf5"]] = None, backend_configuration: Optional[BackendConfiguration] = None, + export: bool = False, ) -> None: """ Write an NWB file using a specific backend or backend configuration. @@ -360,6 +361,8 @@ def configure_and_write_nwbfile( backend_configuration: BackendConfiguration, optional Specifies the backend type and the chunking and compression parameters of each dataset. If no ``backend_configuration`` is specified, the default configuration for the specified ``backend`` is used. + export: bool, default: False + Whether to export the NWB file instead of writing. 
""" @@ -374,20 +377,11 @@ def configure_and_write_nwbfile( IO = BACKEND_NWB_IO[backend_configuration.backend] with IO(output_filepath, mode="w") as io: - io.write(nwbfile) - - -def configure_and_export_nwbfile( - nwbfile: NWBFile, - export_nwbfile_path: Path, - backend_configuration: BackendConfiguration, -) -> None: - configure_backend(nwbfile=nwbfile, backend_configuration=backend_configuration) - - IO = BACKEND_NWB_IO[backend_configuration.backend] - nwbfile.set_modified() - with IO(export_nwbfile_path, mode="w") as io: - io.export(nwbfile=nwbfile, src_io=nwbfile.read_io, write_args=dict(link_data=False)) + if export: + nwbfile.set_modified() + io.export(nwbfile=nwbfile, src_io=nwbfile.read_io, write_args=dict(link_data=False)) + else: + io.write(nwbfile) def repack_nwbfile( @@ -401,6 +395,8 @@ def repack_nwbfile( ): """Repack the NWBFile with the new backend configuration changes.""" backend_configuration_changes = backend_configuration_changes or dict() + export_backend = export_backend or backend + IO = BACKEND_NWB_IO[backend] with IO(nwbfile_path, mode="r") as io: nwbfile = io.read() @@ -415,6 +411,10 @@ def repack_nwbfile( for dataset_config_key, dataset_config_value in dataset_config_changes.items(): setattr(dataset_configuration, dataset_config_key, dataset_config_value) - configure_and_export_nwbfile( - nwbfile=nwbfile, backend_configuration=backend_configuration, export_nwbfile_path=export_nwbfile_path + configure_and_write_nwbfile( + nwbfile=nwbfile, + backend_configuration=backend_configuration, + output_filepath=export_nwbfile_path, + backend=export_backend, + export=True, ) From 2ef5c44380a45667fb6d1b5a5cabd683585798ce Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Fri, 16 Aug 2024 11:12:34 -0700 Subject: [PATCH 17/33] parameterized for use_default_backend_configuration --- .../test_helpers/test_repack_nwbfile.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py index 26216d1a4..a8c23e4cd 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py @@ -3,7 +3,7 @@ import numpy as np import pytest from hdmf_zarr import NWBZarrIO -from pynwb import NWBHDF5IO, NWBFile +from pynwb import NWBHDF5IO, H5DataIO, NWBFile from pynwb.testing.mock.base import mock_TimeSeries from pynwb.testing.mock.file import mock_NWBFile @@ -39,6 +39,13 @@ def hdf5_nwbfile_path(tmpdir_factory): nwbfile_path = tmpdir_factory.mktemp("data").join("test_repack_nwbfile.nwb.h5") if not Path(nwbfile_path).exists(): nwbfile = generate_complex_nwbfile() + + # Add a H5DataIO-compressed time series + raw_array = np.array([[11, 21, 31], [41, 51, 61]], dtype="int32") + data = H5DataIO(data=raw_array, compression="gzip", compression_opts=2) + raw_time_series = mock_TimeSeries(name="CompressedRawTimeSeries", data=data) + nwbfile.add_acquisition(raw_time_series) + with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: io.write(nwbfile) return str(nwbfile_path) @@ -54,16 +61,26 @@ def zarr_nwbfile_path(tmpdir_factory): return str(nwbfile_path) -def test_repack_nwbfile(hdf5_nwbfile_path): +@pytest.mark.parametrize("use_default_backend_configuration", [True, False]) +def 
test_repack_nwbfile(hdf5_nwbfile_path, use_default_backend_configuration): export_path = Path(hdf5_nwbfile_path).parent / "repacked_test_repack_nwbfile.nwb.h5" repack_nwbfile( nwbfile_path=hdf5_nwbfile_path, export_nwbfile_path=export_path, backend="hdf5", + use_default_backend_configuration=use_default_backend_configuration, ) with NWBHDF5IO(export_path, mode="r") as io: nwbfile = io.read() - assert nwbfile.acquisition["RawTimeSeries"].data.chunks == (2, 3) - assert nwbfile.intervals["trials"].start_time.data.chunks == (10,) - assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.chunks == (4, 2) + + if use_default_backend_configuration: + assert nwbfile.acquisition["RawTimeSeries"].data.compression_opts == 4 + assert nwbfile.intervals["trials"].start_time.data.compression_opts == 4 + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts == 4 + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 4 + else: + assert nwbfile.acquisition["RawTimeSeries"].data.compression_opts is None + assert nwbfile.intervals["trials"].start_time.data.compression_opts is None + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts is None + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 2 From 80eb598c11d7eabdf416b92afa2759ea2b9ed0bf Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 19 Aug 2024 10:39:58 -0700 Subject: [PATCH 18/33] optional dci --- .../tools/nwb_helpers/_configure_backend.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_configure_backend.py b/src/neuroconv/tools/nwb_helpers/_configure_backend.py index a67308d43..0fcbe5756 100644 --- a/src/neuroconv/tools/nwb_helpers/_configure_backend.py +++ b/src/neuroconv/tools/nwb_helpers/_configure_backend.py @@ -4,6 +4,7 @@ from typing import Union from hdmf.common import Data +from hdmf.data_utils import DataChunkIterator from pynwb import NWBFile, TimeSeries from ._configuration_models._hdf5_backend import HDF5BackendConfiguration @@ -46,16 +47,24 @@ def configure_backend( # Table columns if isinstance(neurodata_object, Data): - neurodata_object.set_data_io(data_io_class=data_io_class, data_io_kwargs=data_io_kwargs) + neurodata_object.set_data_io( + data_io_class=data_io_class, data_io_kwargs=data_io_kwargs, data_chunk_iterator_class=DataChunkIterator + ) # TimeSeries data or timestamps elif isinstance(neurodata_object, TimeSeries) and not is_dataset_linked: neurodata_object.set_data_io( - dataset_name=dataset_name, data_io_class=data_io_class, data_io_kwargs=data_io_kwargs + dataset_name=dataset_name, + data_io_class=data_io_class, + data_io_kwargs=data_io_kwargs, + data_chunk_iterator_class=DataChunkIterator, ) # Special ndx-events v0.2.0 types elif is_ndx_events_installed and isinstance(neurodata_object, ndx_events.Events): neurodata_object.set_data_io( - dataset_name=dataset_name, data_io_class=data_io_class, data_io_kwargs=data_io_kwargs + dataset_name=dataset_name, + data_io_class=data_io_class, + data_io_kwargs=data_io_kwargs, + data_chunk_iterator_class=DataChunkIterator, ) # But temporarily skipping LabeledEvents elif is_ndx_events_installed and isinstance(neurodata_object, ndx_events.LabeledEvents): From 433f8c904f75b8f7e8d96372c2733bf9ae2ba071 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 19 Aug 2024 11:12:34 -0700 Subject: [PATCH 19/33] added test for backend config changes --- .../nwb_helpers/_metadata_and_file_helpers.py | 2 +- 
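The configure_backend change above threads a DataChunkIterator class through set_data_io, presumably so that data not already wrapped in an iterator (for example an h5py dataset read back from an existing file) can still be rewritten with the new chunking and compression on write. A minimal sketch of the resulting call on a single object, mirroring the keyword arguments in the diff; the mock TimeSeries and the io kwargs here are illustrative only, since configure_backend derives them from a DatasetIOConfiguration:

import numpy as np
from hdmf.data_utils import DataChunkIterator
from pynwb import H5DataIO
from pynwb.testing.mock.base import mock_TimeSeries

# Illustrative object and kwargs.
time_series = mock_TimeSeries(name="ExampleSeries", data=np.arange(10.0))
time_series.set_data_io(
    dataset_name="data",
    data_io_class=H5DataIO,
    data_io_kwargs=dict(chunks=(2,), compression="gzip", compression_opts=4),
    data_chunk_iterator_class=DataChunkIterator,
)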
.../test_helpers/test_repack_nwbfile.py | 29 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py index 8d24c8ed5..b4899b7c6 100644 --- a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py +++ b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py @@ -393,7 +393,7 @@ def repack_nwbfile( use_default_backend_configuration: bool = True, backend_configuration_changes: dict = None, ): - """Repack the NWBFile with the new backend configuration changes.""" + """Repack the NWBFile with the new backend configuration changes.""" # NOTE: keys for configuration_changes must be as they appear in the BackendConfiguration NOT how they appear in the H5DataIO backend_configuration_changes = backend_configuration_changes or dict() export_backend = export_backend or backend diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py index a8c23e4cd..b88c87b38 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py @@ -84,3 +84,32 @@ def test_repack_nwbfile(hdf5_nwbfile_path, use_default_backend_configuration): assert nwbfile.intervals["trials"].start_time.data.compression_opts is None assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts is None assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 2 + + +@pytest.mark.parametrize("use_default_backend_configuration", [True, False]) +def test_repack_nwbfile_with_changes(hdf5_nwbfile_path, use_default_backend_configuration): + export_path = Path(hdf5_nwbfile_path).parent / "repacked_test_repack_nwbfile.nwb.h5" + backend_configuration_changes = { + "acquisition/RawTimeSeries/data": dict(compression_method="gzip", compression_options=dict(compression_opts=1)) + } + repack_nwbfile( + nwbfile_path=hdf5_nwbfile_path, + export_nwbfile_path=export_path, + backend="hdf5", + use_default_backend_configuration=use_default_backend_configuration, + backend_configuration_changes=backend_configuration_changes, + ) + + with NWBHDF5IO(export_path, mode="r") as io: + nwbfile = io.read() + + if use_default_backend_configuration: + assert nwbfile.acquisition["RawTimeSeries"].data.compression_opts == 1 + assert nwbfile.intervals["trials"].start_time.data.compression_opts == 4 + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts == 4 + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 4 + else: + assert nwbfile.acquisition["RawTimeSeries"].data.compression_opts == 1 + assert nwbfile.intervals["trials"].start_time.data.compression_opts is None + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts is None + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 2 From dd906ac242efdd8d260895eb8c24dabfc4260e2d Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 19 Aug 2024 11:21:11 -0700 Subject: [PATCH 20/33] updated api to use boolean use_default flag instead of mode=existing --- src/neuroconv/tools/nwb_helpers/_backend_configuration.py | 2 +- .../nwb_helpers/_configuration_models/_base_backend.py | 8 +++----- 
.../nwb_helpers/_configuration_models/_hdf5_dataset_io.py | 8 +++----- src/neuroconv/tools/nwb_helpers/_dataset_configuration.py | 4 ++-- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_backend_configuration.py b/src/neuroconv/tools/nwb_helpers/_backend_configuration.py index 3e47876f5..794c843eb 100644 --- a/src/neuroconv/tools/nwb_helpers/_backend_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_backend_configuration.py @@ -31,4 +31,4 @@ def get_existing_backend_configuration(nwbfile: NWBFile) -> Union[HDF5BackendCon else: raise ValueError(f"The backend of the NWBFile from io {read_io} is not recognized.") BackendConfigurationClass = BACKEND_CONFIGURATIONS[backend] - return BackendConfigurationClass.from_nwbfile(nwbfile=nwbfile, mode="existing") + return BackendConfigurationClass.from_nwbfile(nwbfile=nwbfile, use_default_dataset_io_configurations=False) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py index ed159df8f..be75b646a 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py @@ -59,13 +59,11 @@ def model_json_schema(cls, **kwargs) -> Dict[str, Any]: return super().model_json_schema(mode="validation", schema_generator=PureJSONSchemaGenerator, **kwargs) @classmethod - def from_nwbfile(cls, nwbfile: NWBFile, mode: Literal["default", "existing"] = "default") -> Self: - if mode == "default": + def from_nwbfile(cls, nwbfile: NWBFile, use_default_dataset_io_configurations: bool = True) -> Self: + if use_default_dataset_io_configurations: dataset_io_configurations = get_default_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend) - elif mode == "existing": - dataset_io_configurations = get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend) else: - raise ValueError(f"mode must be either 'default' or 'existing' but got {mode}") + dataset_io_configurations = get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend) dataset_configurations = { default_dataset_configuration.location_in_file: default_dataset_configuration for default_dataset_configuration in dataset_io_configurations diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py index 6a0508106..9727818aa 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py @@ -86,11 +86,11 @@ def from_neurodata_object( cls, neurodata_object: Container, dataset_name: Literal["data", "timestamps"], - mode: Literal["default", "existing"] = "default", + use_default_dataset_io_configuration: bool = True, ) -> Self: - if mode == "default": + if use_default_dataset_io_configuration: return super().from_neurodata_object(neurodata_object=neurodata_object, dataset_name=dataset_name) - elif mode == "existing": + else: location_in_file = _find_location_in_memory_nwbfile( neurodata_object=neurodata_object, field_name=dataset_name ) @@ -113,5 +113,3 @@ def from_neurodata_object( compression_method=compression_method, compression_options=compression_options, ) - else: - raise ValueError(f"mode must be either 'default' or 'existing' but got {mode}") diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py 
b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index a6428ae40..6f68bb3a5 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -207,7 +207,7 @@ def get_existing_dataset_io_configurations( dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object( neurodata_object=column, dataset_name=dataset_name, - mode="existing", + use_default_dataset_io_configuration=False, ) yield dataset_io_configuration @@ -232,7 +232,7 @@ def get_existing_dataset_io_configurations( dataset_io_configuration = DatasetIOConfigurationClass.from_neurodata_object( neurodata_object=neurodata_object, dataset_name=known_dataset_field, - mode="existing", + use_default_dataset_io_configuration=False, ) yield dataset_io_configuration From 668cacca82990be2760175fb1947355f1ff97400 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 19 Aug 2024 12:32:35 -0700 Subject: [PATCH 21/33] added test for get_existing_backend_configuration --- ...test_get_existing_backend_configuration.py | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_backend_configuration.py diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_backend_configuration.py new file mode 100644 index 000000000..938c79069 --- /dev/null +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_backend_configuration.py @@ -0,0 +1,140 @@ +"""Integration tests for `get_default_backend_configuration`.""" + +from io import StringIO +from pathlib import Path +from unittest.mock import patch + +import numpy as np +import pytest +from pynwb import NWBHDF5IO, H5DataIO, NWBFile +from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.testing.mock.file import mock_NWBFile + +from neuroconv.tools.nwb_helpers import ( + HDF5BackendConfiguration, + get_existing_backend_configuration, + get_module, +) + + +def generate_complex_nwbfile() -> NWBFile: + nwbfile = mock_NWBFile() + + raw_array = np.array([[1, 2, 3], [4, 5, 6]]) + raw_time_series = mock_TimeSeries(name="RawTimeSeries", data=raw_array) + nwbfile.add_acquisition(raw_time_series) + + number_of_trials = 10 + for start_time, stop_time in zip( + np.linspace(start=0.0, stop=10.0, num=number_of_trials), np.linspace(start=1.0, stop=11.0, num=number_of_trials) + ): + nwbfile.add_trial(start_time=start_time, stop_time=stop_time) + + ecephys_module = get_module(nwbfile=nwbfile, name="ecephys") + processed_array = np.array([[7.0, 8.0], [9.0, 10.0], [11.0, 12.0], [13.0, 14.0]]) + processed_time_series = mock_TimeSeries(name="ProcessedTimeSeries", data=processed_array) + ecephys_module.add(processed_time_series) + + return nwbfile + + +@pytest.fixture(scope="session") +def hdf5_nwbfile_path(tmpdir_factory): + nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_backend_configuration_hdf5_nwbfile.nwb.h5") + if not Path(nwbfile_path).exists(): + nwbfile = generate_complex_nwbfile() + + # Add a H5DataIO-compressed time series + raw_array = np.array([[11, 21, 31], [41, 51, 61]], dtype="int32") + data = H5DataIO(data=raw_array, compression="gzip", compression_opts=2) + raw_time_series = mock_TimeSeries(name="CompressedRawTimeSeries", data=data) + 
nwbfile.add_acquisition(raw_time_series) + + with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + +def test_complex_hdf5(hdf5_nwbfile_path): + with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: + nwbfile = io.read() + backend_configuration = get_existing_backend_configuration(nwbfile=nwbfile) + + assert isinstance(backend_configuration, HDF5BackendConfiguration) + + dataset_configurations = backend_configuration.dataset_configurations + assert len(dataset_configurations) == 5 + + # Best summary test of expected output is the printout + with patch("sys.stdout", new=StringIO()) as stdout: + print(backend_configuration) + + expected_print = """ +HDF5 dataset configurations +--------------------------- + +intervals/trials/start_time/data +-------------------------------- + dtype : float64 + full shape of source array : (10,) + full size of source array : 80 B + + buffer shape : (10,) + expected RAM usage : 80 B + + compression options : {'compression_opts': None} + + +intervals/trials/stop_time/data +------------------------------- + dtype : float64 + full shape of source array : (10,) + full size of source array : 80 B + + buffer shape : (10,) + expected RAM usage : 80 B + + compression options : {'compression_opts': None} + + +processing/ecephys/ProcessedTimeSeries/data +------------------------------------------- + dtype : float64 + full shape of source array : (4, 2) + full size of source array : 64 B + + buffer shape : (4, 2) + expected RAM usage : 64 B + + compression options : {'compression_opts': None} + + +acquisition/RawTimeSeries/data +------------------------------ + dtype : int64 + full shape of source array : (2, 3) + full size of source array : 48 B + + buffer shape : (2, 3) + expected RAM usage : 48 B + + compression options : {'compression_opts': None} + + +acquisition/CompressedRawTimeSeries/data +---------------------------------------- + dtype : int32 + full shape of source array : (2, 3) + full size of source array : 24 B + + buffer shape : (2, 3) + expected RAM usage : 24 B + + chunk shape : (2, 3) + disk space usage per chunk : 24 B + + compression method : gzip + compression options : {'compression_opts': 2} + +""" + assert stdout.getvalue() == expected_print From 779619789117f691daa5fc76a937a9cf792b4ea3 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 19 Aug 2024 12:33:06 -0700 Subject: [PATCH 22/33] removed image_series test --- .../test_get_default_backend_configuration.py | 20 ------------------- 1 file changed, 20 deletions(-) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py index f426f0b25..55ab2db72 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_default_backend_configuration.py @@ -8,7 +8,6 @@ import pytest from hdmf_zarr import NWBZarrIO from pynwb import NWBHDF5IO, NWBFile -from pynwb.image import ImageSeries from pynwb.testing.mock.base import mock_TimeSeries from pynwb.testing.mock.file import mock_NWBFile @@ -263,22 +262,3 @@ def test_complex_zarr(zarr_nwbfile_path): """ assert stdout.getvalue() == expected_print - - -def test_000_ImageSeries(): - nwbfile = mock_NWBFile() - - im_series = ImageSeries( 
- name="my_video", external_file=["my_video.mp4"], starting_frame=[0], format="external", rate=30.0 - ) - nwbfile.add_acquisition(im_series) - - with NWBHDF5IO("test.nwb", "w") as io: - io.write(nwbfile) - - io = NWBHDF5IO("test.nwb", "r") - nwbfile = io.read() - print(nwbfile.acquisition["my_video"]) - - backend_config = get_default_backend_configuration(nwbfile, "hdf5") - print(backend_config) # TODO: Figure out why this doesn't throw an error like Ben said it did From b8a788c8c8f5073e65b32ca777a13c32e2992625 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 19 Aug 2024 13:00:35 -0700 Subject: [PATCH 23/33] added compressed trials table column --- ...test_get_existing_backend_configuration.py | 29 +++++++++++++++++-- .../test_helpers/test_repack_nwbfile.py | 13 +++++++++ 2 files changed, 40 insertions(+), 2 deletions(-) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_backend_configuration.py index 938c79069..dddb80140 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_backend_configuration.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_backend_configuration.py @@ -40,7 +40,7 @@ def generate_complex_nwbfile() -> NWBFile: @pytest.fixture(scope="session") def hdf5_nwbfile_path(tmpdir_factory): - nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_backend_configuration_hdf5_nwbfile.nwb.h5") + nwbfile_path = tmpdir_factory.mktemp("data").join("test_existing_backend_configuration_hdf5_nwbfile.nwb.h5") if not Path(nwbfile_path).exists(): nwbfile = generate_complex_nwbfile() @@ -50,6 +50,15 @@ def hdf5_nwbfile_path(tmpdir_factory): raw_time_series = mock_TimeSeries(name="CompressedRawTimeSeries", data=data) nwbfile.add_acquisition(raw_time_series) + # Add H5DataIO-compressed trials column + number_of_trials = 10 + start_time = np.linspace(start=0.0, stop=10.0, num=number_of_trials) + nwbfile.add_trial_column( + name="compressed_start_time", + description="start time of epoch", + data=H5DataIO(data=start_time, compression="gzip", compression_opts=2), + ) + with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: io.write(nwbfile) return str(nwbfile_path) @@ -63,7 +72,7 @@ def test_complex_hdf5(hdf5_nwbfile_path): assert isinstance(backend_configuration, HDF5BackendConfiguration) dataset_configurations = backend_configuration.dataset_configurations - assert len(dataset_configurations) == 5 + assert len(dataset_configurations) == 6 # Best summary test of expected output is the printout with patch("sys.stdout", new=StringIO()) as stdout: @@ -97,6 +106,22 @@ def test_complex_hdf5(hdf5_nwbfile_path): compression options : {'compression_opts': None} +intervals/trials/compressed_start_time/data +------------------------------------------- + dtype : float64 + full shape of source array : (10,) + full size of source array : 80 B + + buffer shape : (10,) + expected RAM usage : 80 B + + chunk shape : (10,) + disk space usage per chunk : 80 B + + compression method : gzip + compression options : {'compression_opts': 2} + + processing/ecephys/ProcessedTimeSeries/data ------------------------------------------- dtype : float64 diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py 
b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py index b88c87b38..10867e9db 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py @@ -46,6 +46,15 @@ def hdf5_nwbfile_path(tmpdir_factory): raw_time_series = mock_TimeSeries(name="CompressedRawTimeSeries", data=data) nwbfile.add_acquisition(raw_time_series) + # Add H5DataIO-compressed trials column + number_of_trials = 10 + start_time = np.linspace(start=0.0, stop=10.0, num=number_of_trials) + nwbfile.add_trial_column( + name="compressed_start_time", + description="start time of epoch", + data=H5DataIO(data=start_time, compression="gzip", compression_opts=2), + ) + with NWBHDF5IO(path=str(nwbfile_path), mode="w") as io: io.write(nwbfile) return str(nwbfile_path) @@ -79,11 +88,13 @@ def test_repack_nwbfile(hdf5_nwbfile_path, use_default_backend_configuration): assert nwbfile.intervals["trials"].start_time.data.compression_opts == 4 assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts == 4 assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 4 + assert nwbfile.intervals["trials"].compressed_start_time.data.compression_opts == 4 else: assert nwbfile.acquisition["RawTimeSeries"].data.compression_opts is None assert nwbfile.intervals["trials"].start_time.data.compression_opts is None assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts is None assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 2 + assert nwbfile.intervals["trials"].compressed_start_time.data.compression_opts == 2 @pytest.mark.parametrize("use_default_backend_configuration", [True, False]) @@ -108,8 +119,10 @@ def test_repack_nwbfile_with_changes(hdf5_nwbfile_path, use_default_backend_conf assert nwbfile.intervals["trials"].start_time.data.compression_opts == 4 assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts == 4 assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 4 + assert nwbfile.intervals["trials"].compressed_start_time.data.compression_opts == 4 else: assert nwbfile.acquisition["RawTimeSeries"].data.compression_opts == 1 assert nwbfile.intervals["trials"].start_time.data.compression_opts is None assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts is None assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 2 + assert nwbfile.intervals["trials"].compressed_start_time.data.compression_opts == 2 From f631fb4758c90e842f7fc4a0f0c3dc8de3779eab Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Mon, 19 Aug 2024 17:30:04 -0700 Subject: [PATCH 24/33] added test for get_existing_dataset_io.py --- src/neuroconv/tools/nwb_helpers/__init__.py | 3 +- ..._get_existing_dataset_io_configurations.py | 507 ++++++++++++++++++ 2 files changed, 509 insertions(+), 1 deletion(-) create mode 100644 tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_dataset_io_configurations.py diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index a7f5924b0..e93f30ecc 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -22,7 +22,7 @@ ZarrDatasetIOConfiguration, ) from ._configure_backend import 
configure_backend -from ._dataset_configuration import get_default_dataset_io_configurations +from ._dataset_configuration import get_default_dataset_io_configurations, get_existing_dataset_io_configurations from ._metadata_and_file_helpers import ( BACKEND_NWB_IO, add_device_from_metadata, @@ -49,6 +49,7 @@ "get_default_backend_configuration", "get_default_dataset_io_configurations", "get_existing_backend_configuration", + "get_existing_dataset_io_configurations", "configure_backend", "get_default_dataset_io_configurations", "get_default_backend_configuration", diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_dataset_io_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_dataset_io_configurations.py new file mode 100644 index 000000000..298733ea3 --- /dev/null +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_dataset_io_configurations.py @@ -0,0 +1,507 @@ +"""Unit tests for `get_default_dataset_io_configurations`.""" + +from typing import Literal + +import numpy as np +import pytest +from hdmf.common import VectorData +from pynwb import NWBHDF5IO, H5DataIO +from pynwb.base import DynamicTable +from pynwb.behavior import CompassDirection +from pynwb.image import ImageSeries +from pynwb.testing.mock.base import mock_TimeSeries +from pynwb.testing.mock.behavior import mock_SpatialSeries +from pynwb.testing.mock.file import mock_NWBFile + +from neuroconv.tools.importing import is_package_installed +from neuroconv.tools.nwb_helpers import ( + DATASET_IO_CONFIGURATIONS, + get_existing_dataset_io_configurations, + get_module, +) + + +@pytest.mark.parametrize("backend", ["hdf5"]) # ["hdf5", "zarr"]) TODO: Add zarr support +def test_configuration_on_time_series(tmp_path, backend: Literal["hdf5", "zarr"]): + data = np.array([[1, 2, 3], [4, 5, 6]]) + + nwbfile = mock_NWBFile() + time_series = mock_TimeSeries(name="TestTimeSeries", data=data) + nwbfile.add_acquisition(time_series) + compressed_time_series = mock_TimeSeries( + name="CompressedTimeSeries", + data=H5DataIO(data=data, compression="gzip", compression_opts=2, chunks=(1, 3)), + ) + nwbfile.add_acquisition(compressed_time_series) + + nwbfile_path = tmp_path / "test_existing_dataset_io_configurations_timeseries.nwb" + with NWBHDF5IO(nwbfile_path, "w") as io: + io.write(nwbfile) + with NWBHDF5IO(nwbfile_path, "r") as io: + nwbfile = io.read() + + dataset_configurations = list(get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + + assert len(dataset_configurations) == 2 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.object_id == time_series.object_id + assert dataset_configuration.location_in_file == "acquisition/TestTimeSeries/data" + assert dataset_configuration.full_shape == data.shape + assert dataset_configuration.dtype == data.dtype + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.buffer_shape == data.shape + assert dataset_configuration.compression_method is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + + dataset_configuration = dataset_configurations[1] + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.object_id == compressed_time_series.object_id + assert 
dataset_configuration.location_in_file == "acquisition/CompressedTimeSeries/data" + assert dataset_configuration.full_shape == data.shape + assert dataset_configuration.dtype == data.dtype + assert dataset_configuration.chunk_shape == (1, 3) + assert dataset_configuration.buffer_shape == data.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options["compression_opts"] == 2 + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +@pytest.mark.parametrize("backend", ["hdf5"]) # ["hdf5", "zarr"]) TODO: Add zarr support +def test_configuration_on_external_image_series(tmp_path, backend: Literal["hdf5", "zarr"]): + nwbfile = mock_NWBFile() + image_series = ImageSeries(name="TestImageSeries", external_file=[""], rate=1.0) + nwbfile.add_acquisition(image_series) + + nwbfile_path = tmp_path / "test_existing_dataset_io_configurations_external_image_series.nwb" + with NWBHDF5IO(nwbfile_path, "w") as io: + io.write(nwbfile) + with NWBHDF5IO(nwbfile_path, "r") as io: + nwbfile = io.read() + dataset_configurations = list(get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + assert len(dataset_configurations) == 0 + + +@pytest.mark.parametrize("backend", ["hdf5"]) # ["hdf5", "zarr"]) TODO: Add zarr support +def test_configuration_on_dynamic_table(tmp_path, backend: Literal["hdf5", "zarr"]): + data = np.array([0.1, 0.2, 0.3]) + + nwbfile = mock_NWBFile() + column = VectorData(name="TestColumn", description="", data=data) + compressed_column = VectorData( + name="CompressedColumn", + description="", + data=H5DataIO(data=data, compression="gzip", compression_opts=2, chunks=(1,)), + ) + dynamic_table = DynamicTable( + name="TestDynamicTable", description="", columns=[column, compressed_column], id=list(range(len(data))) + ) + nwbfile.add_acquisition(dynamic_table) + + nwbfile_path = tmp_path / "test_existing_dataset_io_configurations_dynamic_table.nwb" + with NWBHDF5IO(nwbfile_path, "w") as io: + io.write(nwbfile) + with NWBHDF5IO(nwbfile_path, "r") as io: + nwbfile = io.read() + + dataset_configurations = list(get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + + assert len(dataset_configurations) == 2 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.object_id == column.object_id + assert dataset_configuration.location_in_file == "acquisition/TestDynamicTable/TestColumn/data" + assert dataset_configuration.full_shape == data.shape + assert dataset_configuration.dtype == data.dtype + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.buffer_shape == data.shape + assert dataset_configuration.compression_method is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = dataset_configurations[1] + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.object_id == compressed_column.object_id + assert dataset_configuration.location_in_file == "acquisition/TestDynamicTable/CompressedColumn/data" + assert dataset_configuration.full_shape == data.shape + assert dataset_configuration.dtype == data.dtype + assert dataset_configuration.chunk_shape == 
(1,) + assert dataset_configuration.buffer_shape == data.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options == dict(compression_opts=2) + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +@pytest.mark.parametrize("backend", ["hdf5"]) # ["hdf5", "zarr"]) TODO: Add zarr support +def test_configuration_on_ragged_units_table(tmp_path, backend: Literal["hdf5", "zarr"]): + nwbfile = mock_NWBFile() + + spike_times1 = np.array([0.0, 1.0, 2.0]) + waveforms1 = np.array( + [[[1, 2, 3], [1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3], [1, 2, 3]], [[1, 2, 3], [1, 2, 3], [1, 2, 3]]], + dtype="int32", + ) + nwbfile.add_unit(spike_times=spike_times1, waveforms=waveforms1) + + spike_times2 = np.array([3.0, 4.0]) + waveforms2 = np.array([[[4, 5, 6], [4, 5, 6], [4, 5, 6]], [[4, 5, 6], [4, 5, 6], [4, 5, 6]]], dtype="int32") + nwbfile.add_unit(spike_times=spike_times2, waveforms=waveforms2) + + spike_times = np.concatenate([spike_times1, spike_times2]) + waveforms = np.concatenate([waveforms1, waveforms2], axis=0) + index = [len(spike_times1), len(spike_times1) + len(spike_times2)] + spike_times = H5DataIO(data=spike_times, compression="gzip", compression_opts=2, chunks=(2,)) + waveforms = H5DataIO(data=waveforms, compression="gzip", compression_opts=2, chunks=(1, 3, 3)) + nwbfile.add_unit_column(name="compressed_spike_times", description="", data=spike_times, index=index) + nwbfile.add_unit_column(name="compressed_waveforms", description="", data=waveforms, index=index) + + nwbfile_path = tmp_path / "test_existing_dataset_io_configurations_ragged_units_table.nwb" + with NWBHDF5IO(nwbfile_path, "w") as io: + io.write(nwbfile) + + with NWBHDF5IO(nwbfile_path, "r") as io: + nwbfile = io.read() + dataset_configurations = list(get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + + assert len(dataset_configurations) == 9 + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.location_in_file == "units/spike_times/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.full_shape == (5,) + assert dataset_configuration.dtype == np.dtype("float64") + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.buffer_shape == (5,) + assert dataset_configuration.compression_method is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.location_in_file == "units/spike_times_index/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.full_shape == (2,) + assert dataset_configuration.dtype == np.dtype("uint8") + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.buffer_shape == (2,) + assert dataset_configuration.compression_method is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + 
dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.location_in_file == "units/waveforms/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.full_shape == (15, 3) + assert dataset_configuration.dtype == np.dtype("int32") + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.buffer_shape == (15, 3) + assert dataset_configuration.compression_method is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.location_in_file == "units/waveforms_index/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.full_shape == (5,) + assert dataset_configuration.dtype == np.dtype("uint8") + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.buffer_shape == (5,) + assert dataset_configuration.compression_method is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.location_in_file == "units/waveforms_index_index/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.full_shape == (2,) + assert dataset_configuration.dtype == np.dtype("uint8") + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.buffer_shape == (2,) + assert dataset_configuration.compression_method is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.location_in_file == "units/compressed_spike_times/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.full_shape == (5,) + assert dataset_configuration.dtype == np.dtype("float64") + assert dataset_configuration.chunk_shape == (2,) + assert dataset_configuration.buffer_shape == (5,) + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options == dict(compression_opts=2) + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.location_in_file == "units/compressed_spike_times_index/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.full_shape == (2,) + assert dataset_configuration.dtype == np.dtype("uint8") + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.buffer_shape == (2,) + assert dataset_configuration.compression_method is None 
+ assert dataset_configuration.compression_options == dict(compression_opts=None) + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.location_in_file == "units/compressed_waveforms/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.full_shape == (5, 3, 3) + assert dataset_configuration.dtype == np.dtype("int32") + assert dataset_configuration.chunk_shape == (1, 3, 3) + assert dataset_configuration.buffer_shape == (5, 3, 3) + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options == dict(compression_opts=2) + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.location_in_file == "units/compressed_waveforms_index/data" + ) + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.full_shape == (2,) + assert dataset_configuration.dtype == np.dtype("uint8") + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.buffer_shape == (2,) + assert dataset_configuration.compression_method is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +@pytest.mark.parametrize("backend", ["hdf5"]) # ["hdf5", "zarr"]) TODO: Add zarr support +def test_configuration_on_compass_direction(tmp_path, backend: Literal["hdf5", "zarr"]): + data = np.array([[1, 2, 3], [4, 5, 6]]) + + nwbfile = mock_NWBFile() + spatial_series = mock_SpatialSeries(name="TestSpatialSeries", data=data) + compass_direction = CompassDirection(name="TestCompassDirection", spatial_series=spatial_series) + behavior_module = get_module(nwbfile=nwbfile, name="behavior") + behavior_module.add(compass_direction) + compressed_spatial_series = mock_SpatialSeries( + name="CompressedSpatialSeries", + data=H5DataIO(data=data, compression="gzip", compression_opts=2, chunks=(1, 3)), + ) + compressed_compass_direction = CompassDirection( + name="CompressedCompassDirection", spatial_series=compressed_spatial_series + ) + behavior_module.add(compressed_compass_direction) + nwbfile_path = tmp_path / "test_existing_dataset_io_configurations_compass_direction.nwb" + with NWBHDF5IO(nwbfile_path, "w") as io: + io.write(nwbfile) + + with NWBHDF5IO(nwbfile_path, "r") as io: + nwbfile = io.read() + dataset_configurations = list(get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + + assert len(dataset_configurations) == 2 + + dataset_configuration = dataset_configurations[0] + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.object_id == spatial_series.object_id + assert ( + dataset_configuration.location_in_file == "processing/behavior/TestCompassDirection/TestSpatialSeries/data" + ) + assert dataset_configuration.full_shape == data.shape + assert dataset_configuration.dtype == data.dtype + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.buffer_shape == 
data.shape + assert dataset_configuration.compression_method is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + dataset_configuration = dataset_configurations[1] + assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert dataset_configuration.object_id == compressed_spatial_series.object_id + assert ( + dataset_configuration.location_in_file + == "processing/behavior/CompressedCompassDirection/CompressedSpatialSeries/data" + ) + assert dataset_configuration.full_shape == data.shape + assert dataset_configuration.dtype == data.dtype + assert dataset_configuration.chunk_shape == (1, 3) + assert dataset_configuration.buffer_shape == data.shape + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options == dict(compression_opts=2) + + if backend == "zarr": + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None + + +@pytest.mark.skipif( + not is_package_installed(package_name="ndx_events"), + reason="The extra testing package 'ndx-events' is not installed!", +) +@pytest.mark.parametrize("backend", ["hdf5"]) # ["hdf5", "zarr"]) TODO: Add zarr support +def test_configuration_on_ndx_events(tmp_path, backend: Literal["hdf5", "zarr"]): + from ndx_events import LabeledEvents + + # ndx_events data fields do not support wrapping in DataChunkIterators - data is nearly always small enough + # to fit entirely in memory + data = np.array([1, 2, 3], dtype="uint32") + timestamps = np.array([4.5, 6.7, 8.9]) + + nwbfile = mock_NWBFile() + labeled_events = LabeledEvents( + name="TestLabeledEvents", + description="", + timestamps=timestamps, + data=data, + labels=["response_left", "cue_onset", "cue_offset"], + ) + behavior_module = get_module(nwbfile=nwbfile, name="behavior") + behavior_module.add(labeled_events) + compressed_labeled_events = LabeledEvents( + name="CompressedLabeledEvents", + description="", + timestamps=H5DataIO(data=timestamps, compression="gzip", compression_opts=2, chunks=(3,)), + data=H5DataIO(data=data, compression="gzip", compression_opts=2, chunks=(3,)), + labels=["response_left", "cue_onset", "cue_offset"], + ) + behavior_module.add(compressed_labeled_events) + nwbfile_path = tmp_path / "test_existing_dataset_io_configurations_ndx_events.nwb" + with NWBHDF5IO(nwbfile_path, "w") as io: + io.write(nwbfile) + + with NWBHDF5IO(nwbfile_path, "r") as io: + nwbfile = io.read() + + dataset_configurations = list(get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) + + # Note that the labels dataset is not caught since we search only for 'data' and 'timestamps' fields + assert len(dataset_configurations) == 4 + + data_dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.location_in_file == "processing/behavior/TestLabeledEvents/data" + ) + assert isinstance(data_dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert data_dataset_configuration.object_id == labeled_events.object_id + assert data_dataset_configuration.full_shape == data.shape + assert data_dataset_configuration.dtype == data.dtype + assert data_dataset_configuration.chunk_shape is None + assert data_dataset_configuration.buffer_shape == data.shape + assert data_dataset_configuration.compression_method is None + 
assert data_dataset_configuration.compression_options == dict(compression_opts=None) + + if backend == "zarr": + assert data_dataset_configuration.filter_methods is None + assert data_dataset_configuration.filter_options is None + + timestamps_dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.location_in_file == "processing/behavior/TestLabeledEvents/timestamps" + ) + assert isinstance(timestamps_dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert timestamps_dataset_configuration.object_id == labeled_events.object_id + assert timestamps_dataset_configuration.full_shape == timestamps.shape + assert timestamps_dataset_configuration.dtype == timestamps.dtype + assert timestamps_dataset_configuration.chunk_shape is None + assert timestamps_dataset_configuration.buffer_shape == timestamps.shape + assert timestamps_dataset_configuration.compression_method is None + assert timestamps_dataset_configuration.compression_options == dict(compression_opts=None) + + if backend == "zarr": + assert timestamps_dataset_configuration.filter_methods is None + assert timestamps_dataset_configuration.filter_options is None + + data_dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.location_in_file == "processing/behavior/CompressedLabeledEvents/data" + ) + assert isinstance(data_dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert data_dataset_configuration.object_id == compressed_labeled_events.object_id + assert data_dataset_configuration.full_shape == data.shape + assert data_dataset_configuration.dtype == data.dtype + assert data_dataset_configuration.chunk_shape == (3,) + assert data_dataset_configuration.buffer_shape == data.shape + assert data_dataset_configuration.compression_method == "gzip" + assert data_dataset_configuration.compression_options == dict(compression_opts=2) + + if backend == "zarr": + assert data_dataset_configuration.filter_methods is None + assert data_dataset_configuration.filter_options is None + + timestamps_dataset_configuration = next( + dataset_configuration + for dataset_configuration in dataset_configurations + if dataset_configuration.location_in_file == "processing/behavior/CompressedLabeledEvents/timestamps" + ) + assert isinstance(timestamps_dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) + assert timestamps_dataset_configuration.object_id == compressed_labeled_events.object_id + assert timestamps_dataset_configuration.full_shape == timestamps.shape + assert timestamps_dataset_configuration.dtype == timestamps.dtype + assert timestamps_dataset_configuration.chunk_shape == (3,) + assert timestamps_dataset_configuration.buffer_shape == timestamps.shape + assert timestamps_dataset_configuration.compression_method == "gzip" + assert timestamps_dataset_configuration.compression_options == dict(compression_opts=2) + + if backend == "zarr": + assert timestamps_dataset_configuration.filter_methods is None + assert timestamps_dataset_configuration.filter_options is None From c4647643a40413fd136005261b0895b07b8a6dfd Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 20 Aug 2024 08:45:28 -0700 Subject: [PATCH 25/33] added docstrings --- .../nwb_helpers/_backend_configuration.py | 14 ++++++++- .../_configuration_models/_base_backend.py | 17 +++++++++++ .../nwb_helpers/_dataset_configuration.py | 15 ++++++++++ .../nwb_helpers/_metadata_and_file_helpers.py | 29 
+++++++++++++++++-- 4 files changed, 72 insertions(+), 3 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_backend_configuration.py b/src/neuroconv/tools/nwb_helpers/_backend_configuration.py index 794c843eb..aa3c3a910 100644 --- a/src/neuroconv/tools/nwb_helpers/_backend_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_backend_configuration.py @@ -21,7 +21,19 @@ def get_default_backend_configuration( def get_existing_backend_configuration(nwbfile: NWBFile) -> Union[HDF5BackendConfiguration, ZarrBackendConfiguration]: - """Fill an existing backend configuration to serve as a starting point for further customization.""" + """Fill an existing backend configuration to serve as a starting point for further customization. + + Parameters + ---------- + nwbfile : NWBFile + The NWBFile object to extract the backend configuration from. The nwbfile must have been read from an io object + to work properly. + + Returns + ------- + Union[HDF5BackendConfiguration, ZarrBackendConfiguration] + The backend configuration extracted from the nwbfile. + """ read_io = nwbfile.read_io if isinstance(read_io, NWBHDF5IO): diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py index be75b646a..241da1d66 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_base_backend.py @@ -60,6 +60,23 @@ def model_json_schema(cls, **kwargs) -> Dict[str, Any]: @classmethod def from_nwbfile(cls, nwbfile: NWBFile, use_default_dataset_io_configurations: bool = True) -> Self: + """ + Create a backend configuration from an NWBFile. + + Parameters + ---------- + nwbfile : pynwb.NWBFile + The NWBFile object to extract the backend configuration from. + use_default_dataset_io_configurations : bool, optional + Whether to use default dataset configurations, by default True. If False, the existing dataset + configurations in the NWBFile will be used, which requires that the NWBFile was read from an io object. + + Returns + ------- + Self + The backend configuration extracted from the NWBFile. + """ + if use_default_dataset_io_configurations: dataset_io_configurations = get_default_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend) else: diff --git a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py index 6f68bb3a5..10f788cf7 100644 --- a/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_dataset_configuration.py @@ -178,6 +178,21 @@ def get_existing_dataset_io_configurations( nwbfile: NWBFile, backend: Literal["hdf5", "zarr"], ) -> Generator[DatasetIOConfiguration, None, None]: + """ + Generate DatasetIOConfiguration objects for each neurodata object in an nwbfile. + + Parameters + ---------- + nwbfile : pynwb.NWBFile + An NWBFile object that has been read from an existing file with an existing backend configuration. + backend : "hdf5" or "zarr" + Which backend format type you would like to use in configuring each dataset's compression methods and options. + + Yields + ------ + DatasetIOConfiguration + A configuration object for each dataset in the NWB file. 
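For illustration, a minimal sketch of how this generator might be called once the patch above is applied. The file path is hypothetical, the file is assumed to have been written with the HDF5 backend, and the fields printed depend on the datasets actually present in the file:

    from pynwb import NWBHDF5IO

    from neuroconv.tools.nwb_helpers import get_existing_dataset_io_configurations

    # "existing_file.nwb" is a hypothetical HDF5-backed NWB file on disk
    with NWBHDF5IO("existing_file.nwb", mode="r") as io:
        nwbfile = io.read()
        for dataset_io_configuration in get_existing_dataset_io_configurations(nwbfile=nwbfile, backend="hdf5"):
            # Each configuration records the dataset's existing chunking and compression settings
            print(dataset_io_configuration.location_in_file, dataset_io_configuration.compression_method)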
+ """ DatasetIOConfigurationClass = DATASET_IO_CONFIGURATIONS[backend] diff --git a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py index b4899b7c6..a5110eed4 100644 --- a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py +++ b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py @@ -391,9 +391,34 @@ def repack_nwbfile( backend: Literal["hdf5", "zarr"] = "hdf5", export_backend: Literal["hdf5", "zarr", None] = None, use_default_backend_configuration: bool = True, - backend_configuration_changes: dict = None, + backend_configuration_changes: dict[str, dict] = None, ): - """Repack the NWBFile with the new backend configuration changes.""" # NOTE: keys for configuration_changes must be as they appear in the BackendConfiguration NOT how they appear in the H5DataIO + """ + Repack an NWBFile with a new backend configuration. + + Parameters + ---------- + nwbfile_path : Path + Path to the NWB file to be repacked. + export_nwbfile_path : Path + Path to export the repacked NWB file. + backend : {"hdf5", "zarr"}, default: "hdf5" + The type of backend used to read the file. + export_backend : {"hdf5", "zarr", None}, default: None + The type of backend used to write the repacked file. If None, the same backend as the input file is used. + use_default_backend_configuration : bool, default: True + Whether to use the default backend configuration for the specified backend and nwbfile. If False, the nwbfile + must be written to disk and its existing backend configuration is used. + backend_configuration_changes : dict, default: None + Changes to the backend configuration. The keys are the locations of the datasets in the NWB file, and the values + are dictionaries of the changes to be made to the dataset configuration. + + Notes + ----- + The keys for the `backend_configuration_changes` must be as they appear in the BackendConfiguration NOT how they + appear in the H5DataIO. For example, if you want to change the chunking of the 'acquisition/RawTimeSeries/data' + dataset to (10,), you would pass {'acquisition/RawTimeSeries/data': {'chunk_shape': (10,)}}. 
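For illustration, a minimal usage sketch of repack_nwbfile with a backend configuration change, assuming hypothetical input and output paths and an HDF5-backed file that contains an 'acquisition/RawTimeSeries/data' dataset; note the key uses the BackendConfiguration field name chunk_shape rather than the H5DataIO keyword chunks:

    from neuroconv.tools.nwb_helpers import repack_nwbfile

    # Both paths are hypothetical placeholders
    repack_nwbfile(
        nwbfile_path="existing_file.nwb",
        export_nwbfile_path="repacked_file.nwb",
        backend="hdf5",
        use_default_backend_configuration=False,  # start from the file's existing chunking/compression
        backend_configuration_changes={"acquisition/RawTimeSeries/data": dict(chunk_shape=(10,))},
    )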
+ """ backend_configuration_changes = backend_configuration_changes or dict() export_backend = export_backend or backend From 1cf36292b4b262eb2437b15e6d032f4690e0f2fb Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 20 Aug 2024 08:56:57 -0700 Subject: [PATCH 26/33] used BACKEND_NWB_IO dict --- src/neuroconv/tools/nwb_helpers/__init__.py | 2 +- .../tools/nwb_helpers/_backend_configuration.py | 11 ++++------- .../tools/nwb_helpers/_metadata_and_file_helpers.py | 6 ++---- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/__init__.py b/src/neuroconv/tools/nwb_helpers/__init__.py index e93f30ecc..87a1e2081 100644 --- a/src/neuroconv/tools/nwb_helpers/__init__.py +++ b/src/neuroconv/tools/nwb_helpers/__init__.py @@ -5,6 +5,7 @@ from ._backend_configuration import ( BACKEND_CONFIGURATIONS, + BACKEND_NWB_IO, get_default_backend_configuration, get_existing_backend_configuration, ) @@ -24,7 +25,6 @@ from ._configure_backend import configure_backend from ._dataset_configuration import get_default_dataset_io_configurations, get_existing_dataset_io_configurations from ._metadata_and_file_helpers import ( - BACKEND_NWB_IO, add_device_from_metadata, configure_and_write_nwbfile, get_default_nwbfile_metadata, diff --git a/src/neuroconv/tools/nwb_helpers/_backend_configuration.py b/src/neuroconv/tools/nwb_helpers/_backend_configuration.py index aa3c3a910..860d1a2cf 100644 --- a/src/neuroconv/tools/nwb_helpers/_backend_configuration.py +++ b/src/neuroconv/tools/nwb_helpers/_backend_configuration.py @@ -9,6 +9,7 @@ from ._configuration_models._zarr_backend import ZarrBackendConfiguration BACKEND_CONFIGURATIONS = dict(hdf5=HDF5BackendConfiguration, zarr=ZarrBackendConfiguration) +BACKEND_NWB_IO = dict(hdf5=NWBHDF5IO, zarr=NWBZarrIO) def get_default_backend_configuration( @@ -34,13 +35,9 @@ def get_existing_backend_configuration(nwbfile: NWBFile) -> Union[HDF5BackendCon Union[HDF5BackendConfiguration, ZarrBackendConfiguration] The backend configuration extracted from the nwbfile. """ - read_io = nwbfile.read_io - if isinstance(read_io, NWBHDF5IO): - backend = "hdf5" - elif isinstance(read_io, NWBZarrIO): - backend = "zarr" - else: - raise ValueError(f"The backend of the NWBFile from io {read_io} is not recognized.") + for backend, io in BACKEND_NWB_IO.items(): + if isinstance(read_io, io): + break BackendConfigurationClass = BACKEND_CONFIGURATIONS[backend] return BackendConfigurationClass.from_nwbfile(nwbfile=nwbfile, use_default_dataset_io_configurations=False) diff --git a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py index a5110eed4..b92bf10ad 100644 --- a/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py +++ b/src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py @@ -10,12 +10,12 @@ from typing import Literal, Optional from warnings import warn -from hdmf_zarr import NWBZarrIO from pydantic import FilePath -from pynwb import NWBHDF5IO, NWBFile +from pynwb import NWBFile from pynwb.file import Subject from . import ( + BACKEND_NWB_IO, BackendConfiguration, configure_backend, get_default_backend_configuration, @@ -24,8 +24,6 @@ from ...utils.dict import DeepDict, load_dict_from_file from ...utils.json_schema import validate_metadata -BACKEND_NWB_IO = dict(hdf5=NWBHDF5IO, zarr=NWBZarrIO) - def get_module(nwbfile: NWBFile, name: str, description: str = None): """Check if processing module exists. If not, create it. 
Then return module.""" From 481529f00845ba0ef25d113477d298ed54498aeb Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Tue, 20 Aug 2024 14:25:52 -0700 Subject: [PATCH 27/33] added ZarrDatsetIOConfiguration.from_neurodata_object --- .../_configuration_models/_zarr_dataset_io.py | 37 +++++++++++++- temp_test.py | 48 ++++++++++++------- 2 files changed, 65 insertions(+), 20 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py index c070a20e9..30de2f05e 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py @@ -1,12 +1,13 @@ """Base Pydantic models for the ZarrDatasetConfiguration.""" -from typing import Any, Dict, List, Literal, Union +from typing import Any, Dict, List, Literal, Self, Union import numcodecs import zarr +from hdmf import Container from pydantic import Field, InstanceOf, model_validator -from ._base_dataset_io import DatasetIOConfiguration +from ._base_dataset_io import DatasetIOConfiguration, _find_location_in_memory_nwbfile _base_zarr_codecs = set(zarr.codec_registry.keys()) _lossy_zarr_codecs = set(("astype", "bitround", "quantize")) @@ -130,3 +131,35 @@ def get_data_io_kwargs(self) -> Dict[str, Any]: compressor = False return dict(chunks=self.chunk_shape, filters=filters, compressor=compressor) + + @classmethod + def from_neurodata_object( + cls, + neurodata_object: Container, + dataset_name: Literal["data", "timestamps"], + use_default_dataset_io_configuration: bool = True, + ) -> Self: + if use_default_dataset_io_configuration: + return super().from_neurodata_object(neurodata_object=neurodata_object, dataset_name=dataset_name) + else: # TODO: Remove else to decrease indentation + location_in_file = _find_location_in_memory_nwbfile( + neurodata_object=neurodata_object, field_name=dataset_name + ) + full_shape = getattr(neurodata_object, dataset_name).shape + dtype = getattr(neurodata_object, dataset_name).dtype + chunk_shape = getattr(neurodata_object, dataset_name).chunks + buffer_shape = full_shape # TODO: replace with default buffer shape + compression_method = getattr(neurodata_object, dataset_name).compressor + filter_methods = getattr(neurodata_object, dataset_name).filters + return cls( + object_id=neurodata_object.object_id, + object_name=neurodata_object.name, + location_in_file=location_in_file, + dataset_name=dataset_name, + full_shape=full_shape, + dtype=dtype, + chunk_shape=chunk_shape, + buffer_shape=buffer_shape, + compression_method=compression_method, + filter_methods=filter_methods, + ) diff --git a/temp_test.py b/temp_test.py index 4711388fe..a740e8311 100644 --- a/temp_test.py +++ b/temp_test.py @@ -2,51 +2,63 @@ from pathlib import Path import numpy as np +from hdmf_zarr import ZarrDataIO +from hdmf_zarr.nwb import NWBZarrIO from pynwb import NWBHDF5IO, H5DataIO, TimeSeries from pynwb.testing.mock.file import mock_NWBFile -from neuroconv.tools.nwb_helpers import ( - repack_nwbfile, +from neuroconv.tools.nwb_helpers._dataset_configuration import ( + get_existing_dataset_io_configurations, ) -def write_nwbfile(nwbfile_path: Path): +def write_nwbfile(nwbfile_path: Path, backend: str = "hdf5"): if nwbfile_path.exists(): os.remove(nwbfile_path) nwbfile = mock_NWBFile() timestamps = np.arange(10.0) data = np.arange(100, 200, 10) + if backend == "hdf5": + data = H5DataIO(data=data, compression="gzip", chunks=(1,), 
compression_opts=2) + elif backend == "zarr": + data = ZarrDataIO(data=data, chunks=(3,), compressor=True) time_series_with_timestamps = TimeSeries( name="test_timeseries", description="an example time series", - data=H5DataIO(data=data, compression="gzip", chunks=(1,), compression_opts=2), + data=data, unit="m", timestamps=timestamps, ) nwbfile.add_acquisition(time_series_with_timestamps) - with NWBHDF5IO(nwbfile_path, mode="w") as io: + IO = NWBHDF5IO if backend == "hdf5" else NWBZarrIO + with IO(str(nwbfile_path), mode="w") as io: io.write(nwbfile) def main(): - nwbfile_path = Path("/Volumes/T7/CatalystNeuro/temp.nwb") - repacked_nwbfile_path = Path("/Volumes/T7/CatalystNeuro/repacked_temp.nwb") + nwbfile_path = Path("temp.nwb.zarr") + repacked_nwbfile_path = Path("repacked_temp.nwb") if repacked_nwbfile_path.exists(): os.remove(repacked_nwbfile_path) if not nwbfile_path.exists(): - write_nwbfile(nwbfile_path) - backend_configuration_changes = {"acquisition/test_timeseries/data": dict(chunk_shape=(2,))} - repack_nwbfile( - nwbfile_path=nwbfile_path, - export_nwbfile_path=repacked_nwbfile_path, - backend="hdf5", - backend_configuration_changes=backend_configuration_changes, - use_default_backend_configuration=False, - ) + write_nwbfile(nwbfile_path, backend="zarr") - with NWBHDF5IO(repacked_nwbfile_path, mode="r") as io: + with NWBZarrIO(str(nwbfile_path), mode="r") as io: nwbfile = io.read() - print(f'{nwbfile.acquisition["test_timeseries"].data.chunks = }') + dataset_io_configurations = get_existing_dataset_io_configurations(nwbfile=nwbfile, backend="zarr") + print(next(dataset_io_configurations)) + # backend_configuration_changes = {"acquisition/test_timeseries/data": dict(chunk_shape=(2,))} + # repack_nwbfile( + # nwbfile_path=nwbfile_path, + # export_nwbfile_path=repacked_nwbfile_path, + # backend="hdf5", + # backend_configuration_changes=backend_configuration_changes, + # use_default_backend_configuration=False, + # ) + + # with NWBHDF5IO(repacked_nwbfile_path, mode="r") as io: + # nwbfile = io.read() + # print(f'{nwbfile.acquisition["test_timeseries"].data.chunks = }') if __name__ == "__main__": From 9f02b6188cfbc2c87e4c7c21ece084630a7cfd60 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 21 Aug 2024 10:31:40 -0700 Subject: [PATCH 28/33] removed unnecessary indent --- .../_configuration_models/_hdf5_dataset_io.py | 44 +++++++++---------- .../_configuration_models/_zarr_dataset_io.py | 42 +++++++++--------- 2 files changed, 41 insertions(+), 45 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py index 9727818aa..c8454770e 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py @@ -90,26 +90,24 @@ def from_neurodata_object( ) -> Self: if use_default_dataset_io_configuration: return super().from_neurodata_object(neurodata_object=neurodata_object, dataset_name=dataset_name) - else: - location_in_file = _find_location_in_memory_nwbfile( - neurodata_object=neurodata_object, field_name=dataset_name - ) - full_shape = getattr(neurodata_object, dataset_name).shape - dtype = getattr(neurodata_object, dataset_name).dtype - chunk_shape = getattr(neurodata_object, dataset_name).chunks - buffer_shape = getattr(neurodata_object, dataset_name).maxshape - compression_method = getattr(neurodata_object, dataset_name).compression - compression_opts = 
getattr(neurodata_object, dataset_name).compression_opts - compression_options = dict(compression_opts=compression_opts) - return cls( - object_id=neurodata_object.object_id, - object_name=neurodata_object.name, - location_in_file=location_in_file, - dataset_name=dataset_name, - full_shape=full_shape, - dtype=dtype, - chunk_shape=chunk_shape, - buffer_shape=buffer_shape, - compression_method=compression_method, - compression_options=compression_options, - ) + + location_in_file = _find_location_in_memory_nwbfile(neurodata_object=neurodata_object, field_name=dataset_name) + full_shape = getattr(neurodata_object, dataset_name).shape + dtype = getattr(neurodata_object, dataset_name).dtype + chunk_shape = getattr(neurodata_object, dataset_name).chunks + buffer_shape = getattr(neurodata_object, dataset_name).maxshape + compression_method = getattr(neurodata_object, dataset_name).compression + compression_opts = getattr(neurodata_object, dataset_name).compression_opts + compression_options = dict(compression_opts=compression_opts) + return cls( + object_id=neurodata_object.object_id, + object_name=neurodata_object.name, + location_in_file=location_in_file, + dataset_name=dataset_name, + full_shape=full_shape, + dtype=dtype, + chunk_shape=chunk_shape, + buffer_shape=buffer_shape, + compression_method=compression_method, + compression_options=compression_options, + ) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py index 30de2f05e..5c514ec0a 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py @@ -141,25 +141,23 @@ def from_neurodata_object( ) -> Self: if use_default_dataset_io_configuration: return super().from_neurodata_object(neurodata_object=neurodata_object, dataset_name=dataset_name) - else: # TODO: Remove else to decrease indentation - location_in_file = _find_location_in_memory_nwbfile( - neurodata_object=neurodata_object, field_name=dataset_name - ) - full_shape = getattr(neurodata_object, dataset_name).shape - dtype = getattr(neurodata_object, dataset_name).dtype - chunk_shape = getattr(neurodata_object, dataset_name).chunks - buffer_shape = full_shape # TODO: replace with default buffer shape - compression_method = getattr(neurodata_object, dataset_name).compressor - filter_methods = getattr(neurodata_object, dataset_name).filters - return cls( - object_id=neurodata_object.object_id, - object_name=neurodata_object.name, - location_in_file=location_in_file, - dataset_name=dataset_name, - full_shape=full_shape, - dtype=dtype, - chunk_shape=chunk_shape, - buffer_shape=buffer_shape, - compression_method=compression_method, - filter_methods=filter_methods, - ) + + location_in_file = _find_location_in_memory_nwbfile(neurodata_object=neurodata_object, field_name=dataset_name) + full_shape = getattr(neurodata_object, dataset_name).shape + dtype = getattr(neurodata_object, dataset_name).dtype + chunk_shape = getattr(neurodata_object, dataset_name).chunks + buffer_shape = full_shape # TODO: replace with default buffer shape + compression_method = getattr(neurodata_object, dataset_name).compressor + filter_methods = getattr(neurodata_object, dataset_name).filters + return cls( + object_id=neurodata_object.object_id, + object_name=neurodata_object.name, + location_in_file=location_in_file, + dataset_name=dataset_name, + full_shape=full_shape, + dtype=dtype, + 
chunk_shape=chunk_shape, + buffer_shape=buffer_shape, + compression_method=compression_method, + filter_methods=filter_methods, + ) From 9ee146f189cde56e2c3cf13d5931860b2965c049 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Wed, 21 Aug 2024 10:36:46 -0700 Subject: [PATCH 29/33] estimate buffer shape --- .../nwb_helpers/_configuration_models/_hdf5_dataset_io.py | 7 ++++++- .../nwb_helpers/_configuration_models/_zarr_dataset_io.py | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py index c8454770e..4a3180b60 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_hdf5_dataset_io.py @@ -3,11 +3,13 @@ from typing import Any, Dict, Literal, Union import h5py +import numpy as np from hdmf import Container from pydantic import Field, InstanceOf from typing_extensions import Self from ._base_dataset_io import DatasetIOConfiguration, _find_location_in_memory_nwbfile +from ...hdmf import SliceableDataChunkIterator from ...importing import is_package_installed _base_hdf5_filters = set(h5py.filters.decode) @@ -95,7 +97,10 @@ def from_neurodata_object( full_shape = getattr(neurodata_object, dataset_name).shape dtype = getattr(neurodata_object, dataset_name).dtype chunk_shape = getattr(neurodata_object, dataset_name).chunks - buffer_shape = getattr(neurodata_object, dataset_name).maxshape + buffer_chunk_shape = chunk_shape or full_shape + buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape( + buffer_gb=0.5, chunk_shape=buffer_chunk_shape, maxshape=full_shape, dtype=np.dtype(dtype) + ) compression_method = getattr(neurodata_object, dataset_name).compression compression_opts = getattr(neurodata_object, dataset_name).compression_opts compression_options = dict(compression_opts=compression_opts) diff --git a/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py b/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py index 5c514ec0a..3112f5480 100644 --- a/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py +++ b/src/neuroconv/tools/nwb_helpers/_configuration_models/_zarr_dataset_io.py @@ -3,11 +3,13 @@ from typing import Any, Dict, List, Literal, Self, Union import numcodecs +import numpy as np import zarr from hdmf import Container from pydantic import Field, InstanceOf, model_validator from ._base_dataset_io import DatasetIOConfiguration, _find_location_in_memory_nwbfile +from ...hdmf import SliceableDataChunkIterator _base_zarr_codecs = set(zarr.codec_registry.keys()) _lossy_zarr_codecs = set(("astype", "bitround", "quantize")) @@ -146,7 +148,10 @@ def from_neurodata_object( full_shape = getattr(neurodata_object, dataset_name).shape dtype = getattr(neurodata_object, dataset_name).dtype chunk_shape = getattr(neurodata_object, dataset_name).chunks - buffer_shape = full_shape # TODO: replace with default buffer shape + buffer_chunk_shape = chunk_shape or full_shape + buffer_shape = SliceableDataChunkIterator.estimate_default_buffer_shape( + buffer_gb=0.5, chunk_shape=buffer_chunk_shape, maxshape=full_shape, dtype=np.dtype(dtype) + ) compression_method = getattr(neurodata_object, dataset_name).compressor filter_methods = getattr(neurodata_object, dataset_name).filters return cls( From ee7ec524b401c0d40e9a1bfaeae19cf1a570d007 Mon Sep 17 00:00:00 2001 From: 
pauladkisson Date: Wed, 21 Aug 2024 14:18:04 -0700 Subject: [PATCH 30/33] updated temp_test --- temp_test.py | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/temp_test.py b/temp_test.py index a740e8311..74bd0294c 100644 --- a/temp_test.py +++ b/temp_test.py @@ -1,4 +1,5 @@ import os +import shutil from pathlib import Path import numpy as np @@ -7,9 +8,7 @@ from pynwb import NWBHDF5IO, H5DataIO, TimeSeries from pynwb.testing.mock.file import mock_NWBFile -from neuroconv.tools.nwb_helpers._dataset_configuration import ( - get_existing_dataset_io_configurations, -) +from neuroconv.tools.nwb_helpers import repack_nwbfile def write_nwbfile(nwbfile_path: Path, backend: str = "hdf5"): @@ -37,28 +36,27 @@ def write_nwbfile(nwbfile_path: Path, backend: str = "hdf5"): def main(): nwbfile_path = Path("temp.nwb.zarr") - repacked_nwbfile_path = Path("repacked_temp.nwb") + repacked_nwbfile_path = Path("repacked_temp.nwb.zarr") if repacked_nwbfile_path.exists(): - os.remove(repacked_nwbfile_path) + if repacked_nwbfile_path.is_dir(): + shutil.rmtree(repacked_nwbfile_path) + else: + os.remove(repacked_nwbfile_path) if not nwbfile_path.exists(): write_nwbfile(nwbfile_path, backend="zarr") - with NWBZarrIO(str(nwbfile_path), mode="r") as io: - nwbfile = io.read() - dataset_io_configurations = get_existing_dataset_io_configurations(nwbfile=nwbfile, backend="zarr") - print(next(dataset_io_configurations)) - # backend_configuration_changes = {"acquisition/test_timeseries/data": dict(chunk_shape=(2,))} - # repack_nwbfile( - # nwbfile_path=nwbfile_path, - # export_nwbfile_path=repacked_nwbfile_path, - # backend="hdf5", - # backend_configuration_changes=backend_configuration_changes, - # use_default_backend_configuration=False, - # ) + backend_configuration_changes = {"acquisition/test_timeseries/data": dict(chunk_shape=(2,))} + repack_nwbfile( + nwbfile_path=str(nwbfile_path), + export_nwbfile_path=str(repacked_nwbfile_path), + backend="zarr", + backend_configuration_changes=backend_configuration_changes, + use_default_backend_configuration=False, + ) - # with NWBHDF5IO(repacked_nwbfile_path, mode="r") as io: - # nwbfile = io.read() - # print(f'{nwbfile.acquisition["test_timeseries"].data.chunks = }') + with NWBZarrIO(str(repacked_nwbfile_path), mode="r") as io: + nwbfile = io.read() + print(f'{nwbfile.acquisition["test_timeseries"].data.chunks = }') if __name__ == "__main__": From a2145a1211ed0a1178eb9cad60aa4f1e7b3dfb1c Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Thu, 22 Aug 2024 09:15:36 -0700 Subject: [PATCH 31/33] added zarr to dataset_io tests --- ..._get_existing_dataset_io_configurations.py | 347 ++++++++++++------ 1 file changed, 235 insertions(+), 112 deletions(-) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_dataset_io_configurations.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_dataset_io_configurations.py index 298733ea3..2ea2ed189 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_dataset_io_configurations.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_dataset_io_configurations.py @@ -5,6 +5,9 @@ import numpy as np import pytest from hdmf.common import VectorData +from hdmf_zarr import ZarrDataIO +from hdmf_zarr.nwb import NWBZarrIO +from numcodecs import Blosc from pynwb import NWBHDF5IO, 
H5DataIO from pynwb.base import DynamicTable from pynwb.behavior import CompassDirection @@ -21,23 +24,36 @@ ) -@pytest.mark.parametrize("backend", ["hdf5"]) # ["hdf5", "zarr"]) TODO: Add zarr support +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) def test_configuration_on_time_series(tmp_path, backend: Literal["hdf5", "zarr"]): data = np.array([[1, 2, 3], [4, 5, 6]]) nwbfile = mock_NWBFile() + if backend == "zarr": # ZarrDataIO compresses by default, so we disable it to test no-compression + data = ZarrDataIO(data=data, compressor=False) time_series = mock_TimeSeries(name="TestTimeSeries", data=data) nwbfile.add_acquisition(time_series) + + data = np.array([[1, 2, 3], [4, 5, 6]]) + if backend == "hdf5": + data = H5DataIO(data=data, compression="gzip", compression_opts=2, chunks=(1, 3)) + elif backend == "zarr": + compressor = Blosc(cname="lz4", clevel=5, shuffle=Blosc.SHUFFLE, blocksize=0) + filter1 = Blosc(cname="zstd", clevel=1, shuffle=Blosc.SHUFFLE) + filter2 = Blosc(cname="zstd", clevel=2, shuffle=Blosc.SHUFFLE) + filters = [filter1, filter2] + data = ZarrDataIO(data=data, chunks=(1, 3), compressor=compressor, filters=filters) compressed_time_series = mock_TimeSeries( name="CompressedTimeSeries", - data=H5DataIO(data=data, compression="gzip", compression_opts=2, chunks=(1, 3)), + data=data, ) nwbfile.add_acquisition(compressed_time_series) nwbfile_path = tmp_path / "test_existing_dataset_io_configurations_timeseries.nwb" - with NWBHDF5IO(nwbfile_path, "w") as io: + IO = NWBHDF5IO if backend == "hdf5" else NWBZarrIO + with IO(str(nwbfile_path), "w") as io: io.write(nwbfile) - with NWBHDF5IO(nwbfile_path, "r") as io: + with IO(str(nwbfile_path), "r") as io: nwbfile = io.read() dataset_configurations = list(get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) @@ -50,10 +66,18 @@ def test_configuration_on_time_series(tmp_path, backend: Literal["hdf5", "zarr"] assert dataset_configuration.location_in_file == "acquisition/TestTimeSeries/data" assert dataset_configuration.full_shape == data.shape assert dataset_configuration.dtype == data.dtype - assert dataset_configuration.chunk_shape is None assert dataset_configuration.buffer_shape == data.shape assert dataset_configuration.compression_method is None - assert dataset_configuration.compression_options == dict(compression_opts=None) + + if backend == "hdf5": + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + + elif backend == "zarr": + assert dataset_configuration.chunk_shape == (2, 3) + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods is None + assert dataset_configuration.filter_options is None dataset_configuration = dataset_configurations[1] assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) @@ -63,39 +87,56 @@ def test_configuration_on_time_series(tmp_path, backend: Literal["hdf5", "zarr"] assert dataset_configuration.dtype == data.dtype assert dataset_configuration.chunk_shape == (1, 3) assert dataset_configuration.buffer_shape == data.shape - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options["compression_opts"] == 2 - if backend == "zarr": - assert dataset_configuration.filter_methods is None + if backend == "hdf5": + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options["compression_opts"] == 2 + + elif backend == "zarr": + 
assert dataset_configuration.compression_method == compressor + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods == filters assert dataset_configuration.filter_options is None -@pytest.mark.parametrize("backend", ["hdf5"]) # ["hdf5", "zarr"]) TODO: Add zarr support +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) def test_configuration_on_external_image_series(tmp_path, backend: Literal["hdf5", "zarr"]): nwbfile = mock_NWBFile() image_series = ImageSeries(name="TestImageSeries", external_file=[""], rate=1.0) nwbfile.add_acquisition(image_series) nwbfile_path = tmp_path / "test_existing_dataset_io_configurations_external_image_series.nwb" - with NWBHDF5IO(nwbfile_path, "w") as io: + IO = NWBHDF5IO if backend == "hdf5" else NWBZarrIO + with IO(str(nwbfile_path), "w") as io: io.write(nwbfile) - with NWBHDF5IO(nwbfile_path, "r") as io: + with IO(str(nwbfile_path), "r") as io: nwbfile = io.read() dataset_configurations = list(get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) assert len(dataset_configurations) == 0 -@pytest.mark.parametrize("backend", ["hdf5"]) # ["hdf5", "zarr"]) TODO: Add zarr support +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) def test_configuration_on_dynamic_table(tmp_path, backend: Literal["hdf5", "zarr"]): data = np.array([0.1, 0.2, 0.3]) nwbfile = mock_NWBFile() + if backend == "zarr": # ZarrDataIO compresses by default, so we disable it to test no-compression + data = ZarrDataIO(data=data, compressor=False) column = VectorData(name="TestColumn", description="", data=data) + + data = np.array([0.1, 0.2, 0.3]) + if backend == "hdf5": + data = H5DataIO(data=data, compression="gzip", compression_opts=2, chunks=(1,)) + elif backend == "zarr": + compressor = Blosc(cname="lz4", clevel=5, shuffle=Blosc.SHUFFLE, blocksize=0) + filter1 = Blosc(cname="zstd", clevel=1, shuffle=Blosc.SHUFFLE) + filter2 = Blosc(cname="zstd", clevel=2, shuffle=Blosc.SHUFFLE) + filters = [filter1, filter2] + data = ZarrDataIO(data=data, chunks=(1,), compressor=compressor, filters=filters) compressed_column = VectorData( name="CompressedColumn", description="", - data=H5DataIO(data=data, compression="gzip", compression_opts=2, chunks=(1,)), + data=data, ) dynamic_table = DynamicTable( name="TestDynamicTable", description="", columns=[column, compressed_column], id=list(range(len(data))) @@ -103,9 +144,10 @@ def test_configuration_on_dynamic_table(tmp_path, backend: Literal["hdf5", "zarr nwbfile.add_acquisition(dynamic_table) nwbfile_path = tmp_path / "test_existing_dataset_io_configurations_dynamic_table.nwb" - with NWBHDF5IO(nwbfile_path, "w") as io: + IO = NWBHDF5IO if backend == "hdf5" else NWBZarrIO + with IO(str(nwbfile_path), "w") as io: io.write(nwbfile) - with NWBHDF5IO(nwbfile_path, "r") as io: + with IO(str(nwbfile_path), "r") as io: nwbfile = io.read() dataset_configurations = list(get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) @@ -118,11 +160,15 @@ def test_configuration_on_dynamic_table(tmp_path, backend: Literal["hdf5", "zarr assert dataset_configuration.location_in_file == "acquisition/TestDynamicTable/TestColumn/data" assert dataset_configuration.full_shape == data.shape assert dataset_configuration.dtype == data.dtype - assert dataset_configuration.chunk_shape is None assert dataset_configuration.buffer_shape == data.shape assert dataset_configuration.compression_method is None - assert dataset_configuration.compression_options == dict(compression_opts=None) - if 
backend == "zarr": + + if backend == "hdf5": + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + elif backend == "zarr": + assert dataset_configuration.chunk_shape == (3,) + assert dataset_configuration.compression_options is None assert dataset_configuration.filter_methods is None assert dataset_configuration.filter_options is None @@ -134,14 +180,18 @@ def test_configuration_on_dynamic_table(tmp_path, backend: Literal["hdf5", "zarr assert dataset_configuration.dtype == data.dtype assert dataset_configuration.chunk_shape == (1,) assert dataset_configuration.buffer_shape == data.shape - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options == dict(compression_opts=2) - if backend == "zarr": - assert dataset_configuration.filter_methods is None + + if backend == "hdf5": + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options == dict(compression_opts=2) + elif backend == "zarr": + assert dataset_configuration.compression_method == compressor + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods == filters assert dataset_configuration.filter_options is None -@pytest.mark.parametrize("backend", ["hdf5"]) # ["hdf5", "zarr"]) TODO: Add zarr support +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) def test_configuration_on_ragged_units_table(tmp_path, backend: Literal["hdf5", "zarr"]): nwbfile = mock_NWBFile() @@ -159,16 +209,24 @@ def test_configuration_on_ragged_units_table(tmp_path, backend: Literal["hdf5", spike_times = np.concatenate([spike_times1, spike_times2]) waveforms = np.concatenate([waveforms1, waveforms2], axis=0) index = [len(spike_times1), len(spike_times1) + len(spike_times2)] - spike_times = H5DataIO(data=spike_times, compression="gzip", compression_opts=2, chunks=(2,)) - waveforms = H5DataIO(data=waveforms, compression="gzip", compression_opts=2, chunks=(1, 3, 3)) + if backend == "hdf5": + spike_times = H5DataIO(data=spike_times, compression="gzip", compression_opts=2, chunks=(2,)) + waveforms = H5DataIO(data=waveforms, compression="gzip", compression_opts=2, chunks=(1, 3, 3)) + elif backend == "zarr": + compressor = Blosc(cname="lz4", clevel=5, shuffle=Blosc.SHUFFLE, blocksize=0) + filter1 = Blosc(cname="zstd", clevel=1, shuffle=Blosc.SHUFFLE) + filter2 = Blosc(cname="zstd", clevel=2, shuffle=Blosc.SHUFFLE) + filters = [filter1, filter2] + spike_times = ZarrDataIO(data=spike_times, chunks=(2,), compressor=compressor, filters=filters) + waveforms = ZarrDataIO(data=waveforms, chunks=(1, 3, 3), compressor=compressor, filters=filters) nwbfile.add_unit_column(name="compressed_spike_times", description="", data=spike_times, index=index) nwbfile.add_unit_column(name="compressed_waveforms", description="", data=waveforms, index=index) nwbfile_path = tmp_path / "test_existing_dataset_io_configurations_ragged_units_table.nwb" - with NWBHDF5IO(nwbfile_path, "w") as io: + IO = NWBHDF5IO if backend == "hdf5" else NWBZarrIO + with IO(str(nwbfile_path), "w") as io: io.write(nwbfile) - - with NWBHDF5IO(nwbfile_path, "r") as io: + with IO(str(nwbfile_path), "r") as io: nwbfile = io.read() dataset_configurations = list(get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) @@ -182,12 +240,15 @@ def test_configuration_on_ragged_units_table(tmp_path, backend: Literal["hdf5", assert isinstance(dataset_configuration, 
DATASET_IO_CONFIGURATIONS[backend]) assert dataset_configuration.full_shape == (5,) assert dataset_configuration.dtype == np.dtype("float64") - assert dataset_configuration.chunk_shape is None assert dataset_configuration.buffer_shape == (5,) - assert dataset_configuration.compression_method is None - assert dataset_configuration.compression_options == dict(compression_opts=None) - - if backend == "zarr": + if backend == "hdf5": + assert dataset_configuration.compression_method is None + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + elif backend == "zarr": + assert dataset_configuration.compression_method == compressor + assert dataset_configuration.chunk_shape == (5,) + assert dataset_configuration.compression_options is None assert dataset_configuration.filter_methods is None assert dataset_configuration.filter_options is None @@ -199,12 +260,15 @@ def test_configuration_on_ragged_units_table(tmp_path, backend: Literal["hdf5", assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) assert dataset_configuration.full_shape == (2,) assert dataset_configuration.dtype == np.dtype("uint8") - assert dataset_configuration.chunk_shape is None assert dataset_configuration.buffer_shape == (2,) - assert dataset_configuration.compression_method is None - assert dataset_configuration.compression_options == dict(compression_opts=None) - - if backend == "zarr": + if backend == "hdf5": + assert dataset_configuration.compression_method is None + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + elif backend == "zarr": + assert dataset_configuration.compression_method == compressor + assert dataset_configuration.chunk_shape == (2,) + assert dataset_configuration.compression_options is None assert dataset_configuration.filter_methods is None assert dataset_configuration.filter_options is None @@ -216,12 +280,15 @@ def test_configuration_on_ragged_units_table(tmp_path, backend: Literal["hdf5", assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) assert dataset_configuration.full_shape == (15, 3) assert dataset_configuration.dtype == np.dtype("int32") - assert dataset_configuration.chunk_shape is None assert dataset_configuration.buffer_shape == (15, 3) - assert dataset_configuration.compression_method is None - assert dataset_configuration.compression_options == dict(compression_opts=None) - - if backend == "zarr": + if backend == "hdf5": + assert dataset_configuration.compression_method is None + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + elif backend == "zarr": + assert dataset_configuration.compression_method == compressor + assert dataset_configuration.chunk_shape == (15, 3) + assert dataset_configuration.compression_options is None assert dataset_configuration.filter_methods is None assert dataset_configuration.filter_options is None @@ -233,12 +300,15 @@ def test_configuration_on_ragged_units_table(tmp_path, backend: Literal["hdf5", assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) assert dataset_configuration.full_shape == (5,) assert dataset_configuration.dtype == np.dtype("uint8") - assert dataset_configuration.chunk_shape is None assert dataset_configuration.buffer_shape == (5,) - assert dataset_configuration.compression_method is None - assert 
dataset_configuration.compression_options == dict(compression_opts=None) - - if backend == "zarr": + if backend == "hdf5": + assert dataset_configuration.compression_method is None + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + elif backend == "zarr": + assert dataset_configuration.compression_method == compressor + assert dataset_configuration.chunk_shape == (5,) + assert dataset_configuration.compression_options is None assert dataset_configuration.filter_methods is None assert dataset_configuration.filter_options is None @@ -250,12 +320,15 @@ def test_configuration_on_ragged_units_table(tmp_path, backend: Literal["hdf5", assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) assert dataset_configuration.full_shape == (2,) assert dataset_configuration.dtype == np.dtype("uint8") - assert dataset_configuration.chunk_shape is None assert dataset_configuration.buffer_shape == (2,) - assert dataset_configuration.compression_method is None - assert dataset_configuration.compression_options == dict(compression_opts=None) - - if backend == "zarr": + if backend == "hdf5": + assert dataset_configuration.compression_method is None + assert dataset_configuration.chunk_shape is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + elif backend == "zarr": + assert dataset_configuration.compression_method == compressor + assert dataset_configuration.chunk_shape == (2,) + assert dataset_configuration.compression_options is None assert dataset_configuration.filter_methods is None assert dataset_configuration.filter_options is None @@ -267,13 +340,15 @@ def test_configuration_on_ragged_units_table(tmp_path, backend: Literal["hdf5", assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) assert dataset_configuration.full_shape == (5,) assert dataset_configuration.dtype == np.dtype("float64") - assert dataset_configuration.chunk_shape == (2,) assert dataset_configuration.buffer_shape == (5,) - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options == dict(compression_opts=2) - - if backend == "zarr": - assert dataset_configuration.filter_methods is None + assert dataset_configuration.chunk_shape == (2,) + if backend == "hdf5": + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options == dict(compression_opts=2) + elif backend == "zarr": + assert dataset_configuration.compression_method == compressor + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods == filters assert dataset_configuration.filter_options is None dataset_configuration = next( @@ -284,14 +359,17 @@ def test_configuration_on_ragged_units_table(tmp_path, backend: Literal["hdf5", assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) assert dataset_configuration.full_shape == (2,) assert dataset_configuration.dtype == np.dtype("uint8") - assert dataset_configuration.chunk_shape is None assert dataset_configuration.buffer_shape == (2,) - assert dataset_configuration.compression_method is None - assert dataset_configuration.compression_options == dict(compression_opts=None) - - if backend == "zarr": + if backend == "hdf5": + assert dataset_configuration.compression_method is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + assert dataset_configuration.chunk_shape is 
None + elif backend == "zarr": + assert dataset_configuration.compression_method == compressor + assert dataset_configuration.compression_options is None assert dataset_configuration.filter_methods is None assert dataset_configuration.filter_options is None + assert dataset_configuration.chunk_shape == (2,) dataset_configuration = next( dataset_configuration @@ -303,11 +381,13 @@ def test_configuration_on_ragged_units_table(tmp_path, backend: Literal["hdf5", assert dataset_configuration.dtype == np.dtype("int32") assert dataset_configuration.chunk_shape == (1, 3, 3) assert dataset_configuration.buffer_shape == (5, 3, 3) - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options == dict(compression_opts=2) - - if backend == "zarr": - assert dataset_configuration.filter_methods is None + if backend == "hdf5": + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options == dict(compression_opts=2) + elif backend == "zarr": + assert dataset_configuration.compression_method == compressor + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods == filters assert dataset_configuration.filter_options is None dataset_configuration = next( @@ -318,38 +398,53 @@ def test_configuration_on_ragged_units_table(tmp_path, backend: Literal["hdf5", assert isinstance(dataset_configuration, DATASET_IO_CONFIGURATIONS[backend]) assert dataset_configuration.full_shape == (2,) assert dataset_configuration.dtype == np.dtype("uint8") - assert dataset_configuration.chunk_shape is None assert dataset_configuration.buffer_shape == (2,) - assert dataset_configuration.compression_method is None - assert dataset_configuration.compression_options == dict(compression_opts=None) - - if backend == "zarr": + if backend == "hdf5": + assert dataset_configuration.compression_method is None + assert dataset_configuration.compression_options == dict(compression_opts=None) + assert dataset_configuration.chunk_shape is None + elif backend == "zarr": + assert dataset_configuration.compression_method == compressor + assert dataset_configuration.compression_options is None assert dataset_configuration.filter_methods is None assert dataset_configuration.filter_options is None + assert dataset_configuration.chunk_shape == (2,) -@pytest.mark.parametrize("backend", ["hdf5"]) # ["hdf5", "zarr"]) TODO: Add zarr support +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) def test_configuration_on_compass_direction(tmp_path, backend: Literal["hdf5", "zarr"]): data = np.array([[1, 2, 3], [4, 5, 6]]) nwbfile = mock_NWBFile() + if backend == "zarr": # ZarrDataIO compresses by default, so we disable it to test no-compression + data = ZarrDataIO(data=data, compressor=False) spatial_series = mock_SpatialSeries(name="TestSpatialSeries", data=data) compass_direction = CompassDirection(name="TestCompassDirection", spatial_series=spatial_series) behavior_module = get_module(nwbfile=nwbfile, name="behavior") behavior_module.add(compass_direction) + data = np.array([[1, 2, 3], [4, 5, 6]]) + if backend == "hdf5": + data = H5DataIO(data=data, compression="gzip", compression_opts=2, chunks=(1, 3)) + elif backend == "zarr": + filter1 = Blosc(cname="zstd", clevel=1, shuffle=Blosc.SHUFFLE) + filter2 = Blosc(cname="zstd", clevel=2, shuffle=Blosc.SHUFFLE) + filters = [filter1, filter2] + compressor = Blosc(cname="lz4", clevel=5, shuffle=Blosc.SHUFFLE, blocksize=0) + data = ZarrDataIO(data=data, chunks=(1, 3), 
compressor=compressor, filters=filters) compressed_spatial_series = mock_SpatialSeries( name="CompressedSpatialSeries", - data=H5DataIO(data=data, compression="gzip", compression_opts=2, chunks=(1, 3)), + data=data, ) compressed_compass_direction = CompassDirection( name="CompressedCompassDirection", spatial_series=compressed_spatial_series ) behavior_module.add(compressed_compass_direction) nwbfile_path = tmp_path / "test_existing_dataset_io_configurations_compass_direction.nwb" - with NWBHDF5IO(nwbfile_path, "w") as io: + IO = NWBHDF5IO if backend == "hdf5" else NWBZarrIO + with IO(str(nwbfile_path), "w") as io: io.write(nwbfile) - with NWBHDF5IO(nwbfile_path, "r") as io: + with IO(str(nwbfile_path), "r") as io: nwbfile = io.read() dataset_configurations = list(get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) @@ -363,12 +458,14 @@ def test_configuration_on_compass_direction(tmp_path, backend: Literal["hdf5", " ) assert dataset_configuration.full_shape == data.shape assert dataset_configuration.dtype == data.dtype - assert dataset_configuration.chunk_shape is None assert dataset_configuration.buffer_shape == data.shape assert dataset_configuration.compression_method is None - assert dataset_configuration.compression_options == dict(compression_opts=None) - - if backend == "zarr": + if backend == "hdf5": + assert dataset_configuration.compression_options == dict(compression_opts=None) + assert dataset_configuration.chunk_shape is None + elif backend == "zarr": + assert dataset_configuration.compression_options is None + assert dataset_configuration.chunk_shape == data.shape assert dataset_configuration.filter_methods is None assert dataset_configuration.filter_options is None @@ -383,11 +480,13 @@ def test_configuration_on_compass_direction(tmp_path, backend: Literal["hdf5", " assert dataset_configuration.dtype == data.dtype assert dataset_configuration.chunk_shape == (1, 3) assert dataset_configuration.buffer_shape == data.shape - assert dataset_configuration.compression_method == "gzip" - assert dataset_configuration.compression_options == dict(compression_opts=2) - - if backend == "zarr": - assert dataset_configuration.filter_methods is None + if backend == "hdf5": + assert dataset_configuration.compression_method == "gzip" + assert dataset_configuration.compression_options == dict(compression_opts=2) + elif backend == "zarr": + assert dataset_configuration.compression_method == compressor + assert dataset_configuration.compression_options is None + assert dataset_configuration.filter_methods == filters assert dataset_configuration.filter_options is None @@ -395,7 +494,7 @@ def test_configuration_on_compass_direction(tmp_path, backend: Literal["hdf5", " not is_package_installed(package_name="ndx_events"), reason="The extra testing package 'ndx-events' is not installed!", ) -@pytest.mark.parametrize("backend", ["hdf5"]) # ["hdf5", "zarr"]) TODO: Add zarr support +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) def test_configuration_on_ndx_events(tmp_path, backend: Literal["hdf5", "zarr"]): from ndx_events import LabeledEvents @@ -405,6 +504,9 @@ def test_configuration_on_ndx_events(tmp_path, backend: Literal["hdf5", "zarr"]) timestamps = np.array([4.5, 6.7, 8.9]) nwbfile = mock_NWBFile() + if backend == "zarr": # ZarrDataIO compresses by default, so we disable it to test no-compression + data = ZarrDataIO(data=data, compressor=False) + timestamps = ZarrDataIO(data=timestamps, compressor=False) labeled_events = LabeledEvents( name="TestLabeledEvents", 
description="", @@ -414,19 +516,32 @@ def test_configuration_on_ndx_events(tmp_path, backend: Literal["hdf5", "zarr"]) ) behavior_module = get_module(nwbfile=nwbfile, name="behavior") behavior_module.add(labeled_events) + data = np.array([1, 2, 3], dtype="uint32") + timestamps = np.array([4.5, 6.7, 8.9]) + if backend == "hdf5": + data = H5DataIO(data=data, compression="gzip", compression_opts=2, chunks=(3,)) + timestamps = H5DataIO(data=timestamps, compression="gzip", compression_opts=2, chunks=(3,)) + elif backend == "zarr": + compressor = Blosc(cname="lz4", clevel=5, shuffle=Blosc.SHUFFLE, blocksize=0) + filter1 = Blosc(cname="zstd", clevel=1, shuffle=Blosc.SHUFFLE) + filter2 = Blosc(cname="zstd", clevel=2, shuffle=Blosc.SHUFFLE) + filters = [filter1, filter2] + data = ZarrDataIO(data=data, chunks=(3,), compressor=compressor, filters=filters) + timestamps = ZarrDataIO(data=timestamps, chunks=(3,), compressor=compressor, filters=filters) compressed_labeled_events = LabeledEvents( name="CompressedLabeledEvents", description="", - timestamps=H5DataIO(data=timestamps, compression="gzip", compression_opts=2, chunks=(3,)), - data=H5DataIO(data=data, compression="gzip", compression_opts=2, chunks=(3,)), + timestamps=timestamps, + data=data, labels=["response_left", "cue_onset", "cue_offset"], ) behavior_module.add(compressed_labeled_events) nwbfile_path = tmp_path / "test_existing_dataset_io_configurations_ndx_events.nwb" - with NWBHDF5IO(nwbfile_path, "w") as io: + IO = NWBHDF5IO if backend == "hdf5" else NWBZarrIO + with IO(str(nwbfile_path), "w") as io: io.write(nwbfile) - with NWBHDF5IO(nwbfile_path, "r") as io: + with IO(str(nwbfile_path), "r") as io: nwbfile = io.read() dataset_configurations = list(get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=backend)) @@ -443,12 +558,14 @@ def test_configuration_on_ndx_events(tmp_path, backend: Literal["hdf5", "zarr"]) assert data_dataset_configuration.object_id == labeled_events.object_id assert data_dataset_configuration.full_shape == data.shape assert data_dataset_configuration.dtype == data.dtype - assert data_dataset_configuration.chunk_shape is None assert data_dataset_configuration.buffer_shape == data.shape assert data_dataset_configuration.compression_method is None - assert data_dataset_configuration.compression_options == dict(compression_opts=None) - - if backend == "zarr": + if backend == "hdf5": + assert data_dataset_configuration.compression_options == dict(compression_opts=None) + assert data_dataset_configuration.chunk_shape is None + elif backend == "zarr": + assert data_dataset_configuration.compression_options is None + assert data_dataset_configuration.chunk_shape == data.shape assert data_dataset_configuration.filter_methods is None assert data_dataset_configuration.filter_options is None @@ -461,12 +578,14 @@ def test_configuration_on_ndx_events(tmp_path, backend: Literal["hdf5", "zarr"]) assert timestamps_dataset_configuration.object_id == labeled_events.object_id assert timestamps_dataset_configuration.full_shape == timestamps.shape assert timestamps_dataset_configuration.dtype == timestamps.dtype - assert timestamps_dataset_configuration.chunk_shape is None assert timestamps_dataset_configuration.buffer_shape == timestamps.shape assert timestamps_dataset_configuration.compression_method is None - assert timestamps_dataset_configuration.compression_options == dict(compression_opts=None) - - if backend == "zarr": + if backend == "hdf5": + assert timestamps_dataset_configuration.compression_options == 
dict(compression_opts=None) + assert timestamps_dataset_configuration.chunk_shape is None + elif backend == "zarr": + assert timestamps_dataset_configuration.compression_options is None + assert timestamps_dataset_configuration.chunk_shape == timestamps.shape assert timestamps_dataset_configuration.filter_methods is None assert timestamps_dataset_configuration.filter_options is None @@ -481,11 +600,13 @@ def test_configuration_on_ndx_events(tmp_path, backend: Literal["hdf5", "zarr"]) assert data_dataset_configuration.dtype == data.dtype assert data_dataset_configuration.chunk_shape == (3,) assert data_dataset_configuration.buffer_shape == data.shape - assert data_dataset_configuration.compression_method == "gzip" - assert data_dataset_configuration.compression_options == dict(compression_opts=2) - - if backend == "zarr": - assert data_dataset_configuration.filter_methods is None + if backend == "hdf5": + assert data_dataset_configuration.compression_method == "gzip" + assert data_dataset_configuration.compression_options == dict(compression_opts=2) + elif backend == "zarr": + assert data_dataset_configuration.compression_method == compressor + assert data_dataset_configuration.compression_options is None + assert data_dataset_configuration.filter_methods == filters assert data_dataset_configuration.filter_options is None timestamps_dataset_configuration = next( @@ -499,9 +620,11 @@ def test_configuration_on_ndx_events(tmp_path, backend: Literal["hdf5", "zarr"]) assert timestamps_dataset_configuration.dtype == timestamps.dtype assert timestamps_dataset_configuration.chunk_shape == (3,) assert timestamps_dataset_configuration.buffer_shape == timestamps.shape - assert timestamps_dataset_configuration.compression_method == "gzip" - assert timestamps_dataset_configuration.compression_options == dict(compression_opts=2) - - if backend == "zarr": - assert timestamps_dataset_configuration.filter_methods is None + if backend == "hdf5": + assert timestamps_dataset_configuration.compression_method == "gzip" + assert timestamps_dataset_configuration.compression_options == dict(compression_opts=2) + elif backend == "zarr": + assert timestamps_dataset_configuration.compression_method == compressor + assert timestamps_dataset_configuration.compression_options is None + assert timestamps_dataset_configuration.filter_methods == filters assert timestamps_dataset_configuration.filter_options is None From 5785af0757743aa8e72886633c9daa93a259be78 Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Thu, 22 Aug 2024 09:27:26 -0700 Subject: [PATCH 32/33] added zarr to backend_configuration tests --- ...test_get_existing_backend_configuration.py | 154 +++++++++++++++++- 1 file changed, 153 insertions(+), 1 deletion(-) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_backend_configuration.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_backend_configuration.py index dddb80140..1497523cb 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_backend_configuration.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_get_existing_backend_configuration.py @@ -1,4 +1,4 @@ -"""Integration tests for `get_default_backend_configuration`.""" +"""Integration tests for `get_existing_backend_configuration`.""" from io import StringIO from pathlib import Path @@ -6,12 +6,16 @@ import numpy as np import pytest 
+from hdmf_zarr import ZarrDataIO +from hdmf_zarr.nwb import NWBZarrIO +from numcodecs import Blosc from pynwb import NWBHDF5IO, H5DataIO, NWBFile from pynwb.testing.mock.base import mock_TimeSeries from pynwb.testing.mock.file import mock_NWBFile from neuroconv.tools.nwb_helpers import ( HDF5BackendConfiguration, + ZarrBackendConfiguration, get_existing_backend_configuration, get_module, ) @@ -64,6 +68,38 @@ def hdf5_nwbfile_path(tmpdir_factory): return str(nwbfile_path) +@pytest.fixture(scope="session") +def zarr_nwbfile_path(tmpdir_factory): + compressor = Blosc(cname="lz4", clevel=5, shuffle=Blosc.SHUFFLE, blocksize=0) + filter1 = Blosc(cname="zstd", clevel=1, shuffle=Blosc.SHUFFLE) + filter2 = Blosc(cname="zstd", clevel=2, shuffle=Blosc.SHUFFLE) + filters = [filter1, filter2] + + nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_backend_configuration_hdf5_nwbfile.nwb.zarr") + if not Path(nwbfile_path).exists(): + nwbfile = generate_complex_nwbfile() + + # Add a ZarrDataIO-compressed time series + raw_array = np.array([[11, 21, 31], [41, 51, 61]], dtype="int32") + data = ZarrDataIO(data=raw_array, chunks=(1, 3), compressor=compressor, filters=filters) + raw_time_series = mock_TimeSeries(name="CompressedRawTimeSeries", data=data) + nwbfile.add_acquisition(raw_time_series) + + # Add ZarrDataIO-compressed trials column + number_of_trials = 10 + start_time = np.linspace(start=0.0, stop=10.0, num=number_of_trials) + data = ZarrDataIO(data=start_time, chunks=(5,), compressor=compressor, filters=filters) + nwbfile.add_trial_column( + name="compressed_start_time", + description="start time of epoch", + data=data, + ) + + with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: + io.write(nwbfile) + return str(nwbfile_path) + + def test_complex_hdf5(hdf5_nwbfile_path): with NWBHDF5IO(path=hdf5_nwbfile_path, mode="a") as io: nwbfile = io.read() @@ -163,3 +199,119 @@ def test_complex_hdf5(hdf5_nwbfile_path): """ assert stdout.getvalue() == expected_print + + +def test_complex_zarr(zarr_nwbfile_path): + with NWBZarrIO(path=zarr_nwbfile_path, mode="a") as io: + nwbfile = io.read() + backend_configuration = get_existing_backend_configuration(nwbfile=nwbfile) + + assert isinstance(backend_configuration, ZarrBackendConfiguration) + + dataset_configurations = backend_configuration.dataset_configurations + assert len(dataset_configurations) == 6 + + # Best summary test of expected output is the printout + print(backend_configuration) + with patch("sys.stdout", new=StringIO()) as stdout: + print(backend_configuration) + + expected_print = """ +Zarr dataset configurations +--------------------------- + +intervals/trials/start_time/data +-------------------------------- + dtype : float64 + full shape of source array : (10,) + full size of source array : 80 B + + buffer shape : (10,) + expected RAM usage : 80 B + + chunk shape : (10,) + disk space usage per chunk : 80 B + + compression method : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + + +intervals/trials/stop_time/data +------------------------------- + dtype : float64 + full shape of source array : (10,) + full size of source array : 80 B + + buffer shape : (10,) + expected RAM usage : 80 B + + chunk shape : (10,) + disk space usage per chunk : 80 B + + compression method : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + + +intervals/trials/compressed_start_time/data +------------------------------------------- + dtype : float64 + full shape of source array : (10,) + full size of source array : 80 B + + buffer 
shape : (10,) + expected RAM usage : 80 B + + chunk shape : (5,) + disk space usage per chunk : 40 B + + compression method : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + + filter methods : [Blosc(cname='zstd', clevel=1, shuffle=SHUFFLE, blocksize=0), Blosc(cname='zstd', clevel=2, shuffle=SHUFFLE, blocksize=0)] + + +processing/ecephys/ProcessedTimeSeries/data +------------------------------------------- + dtype : float64 + full shape of source array : (4, 2) + full size of source array : 64 B + + buffer shape : (4, 2) + expected RAM usage : 64 B + + chunk shape : (4, 2) + disk space usage per chunk : 64 B + + compression method : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + + +acquisition/RawTimeSeries/data +------------------------------ + dtype : int64 + full shape of source array : (2, 3) + full size of source array : 48 B + + buffer shape : (2, 3) + expected RAM usage : 48 B + + chunk shape : (2, 3) + disk space usage per chunk : 48 B + + compression method : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + + +acquisition/CompressedRawTimeSeries/data +---------------------------------------- + dtype : int32 + full shape of source array : (2, 3) + full size of source array : 24 B + + buffer shape : (2, 3) + expected RAM usage : 24 B + + chunk shape : (1, 3) + disk space usage per chunk : 12 B + + compression method : Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0) + + filter methods : [Blosc(cname='zstd', clevel=1, shuffle=SHUFFLE, blocksize=0), Blosc(cname='zstd', clevel=2, shuffle=SHUFFLE, blocksize=0)] + +""" + assert stdout.getvalue() == expected_print From b07c0022159a776dac3e1cfd39d4c6543ddff9dc Mon Sep 17 00:00:00 2001 From: pauladkisson Date: Thu, 22 Aug 2024 09:53:27 -0700 Subject: [PATCH 33/33] added zarr to repack_nwbfile tests --- .../test_helpers/test_repack_nwbfile.py | 187 +++++++++++++----- 1 file changed, 142 insertions(+), 45 deletions(-) diff --git a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py index 10867e9db..b0bf138b3 100644 --- a/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py +++ b/tests/test_minimal/test_tools/test_backend_and_dataset_configuration/test_helpers/test_repack_nwbfile.py @@ -2,7 +2,9 @@ import numpy as np import pytest -from hdmf_zarr import NWBZarrIO +from hdmf_zarr import NWBZarrIO, ZarrDataIO +from hdmf_zarr.nwb import NWBZarrIO +from numcodecs import Blosc, GZip from pynwb import NWBHDF5IO, H5DataIO, NWBFile from pynwb.testing.mock.base import mock_TimeSeries from pynwb.testing.mock.file import mock_NWBFile @@ -62,67 +64,162 @@ def hdf5_nwbfile_path(tmpdir_factory): @pytest.fixture(scope="session") def zarr_nwbfile_path(tmpdir_factory): - nwbfile_path = tmpdir_factory.mktemp("data").join("test_repack_nwbfile.nwb.zarr") + compressor = Blosc(cname="lz4", clevel=5, shuffle=Blosc.SHUFFLE, blocksize=0) + filter1 = Blosc(cname="zstd", clevel=1, shuffle=Blosc.SHUFFLE) + filter2 = Blosc(cname="zstd", clevel=2, shuffle=Blosc.SHUFFLE) + filters = [filter1, filter2] + + nwbfile_path = tmpdir_factory.mktemp("data").join("test_default_backend_configuration_hdf5_nwbfile.nwb.zarr") if not Path(nwbfile_path).exists(): nwbfile = generate_complex_nwbfile() + + # Add a ZarrDataIO-compressed time series + raw_array = np.array([[11, 21, 31], [41, 51, 61]], dtype="int32") + data = 
ZarrDataIO(data=raw_array, chunks=(1, 3), compressor=compressor, filters=filters) + raw_time_series = mock_TimeSeries(name="CompressedRawTimeSeries", data=data) + nwbfile.add_acquisition(raw_time_series) + + # Add ZarrDataIO-compressed trials column + number_of_trials = 10 + start_time = np.linspace(start=0.0, stop=10.0, num=number_of_trials) + data = ZarrDataIO(data=start_time, chunks=(5,), compressor=compressor, filters=filters) + nwbfile.add_trial_column( + name="compressed_start_time", + description="start time of epoch", + data=data, + ) + with NWBZarrIO(path=str(nwbfile_path), mode="w") as io: io.write(nwbfile) return str(nwbfile_path) +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) @pytest.mark.parametrize("use_default_backend_configuration", [True, False]) -def test_repack_nwbfile(hdf5_nwbfile_path, use_default_backend_configuration): - export_path = Path(hdf5_nwbfile_path).parent / "repacked_test_repack_nwbfile.nwb.h5" +def test_repack_nwbfile(hdf5_nwbfile_path, zarr_nwbfile_path, backend, use_default_backend_configuration): + compressor = Blosc(cname="lz4", clevel=5, shuffle=Blosc.SHUFFLE, blocksize=0) + filter1 = Blosc(cname="zstd", clevel=1, shuffle=Blosc.SHUFFLE) + filter2 = Blosc(cname="zstd", clevel=2, shuffle=Blosc.SHUFFLE) + filters = [filter1, filter2] + default_compressor = GZip(level=1) + + if backend == "hdf5": + nwbfile_path = hdf5_nwbfile_path + export_path = Path(hdf5_nwbfile_path).parent / "repacked_test_repack_nwbfile.nwb.h5" + elif backend == "zarr": + nwbfile_path = zarr_nwbfile_path + export_path = Path(hdf5_nwbfile_path).parent / "repacked_test_repack_nwbfile.nwb.zarr" repack_nwbfile( - nwbfile_path=hdf5_nwbfile_path, - export_nwbfile_path=export_path, - backend="hdf5", + nwbfile_path=str(nwbfile_path), + export_nwbfile_path=str(export_path), + backend=backend, use_default_backend_configuration=use_default_backend_configuration, ) - - with NWBHDF5IO(export_path, mode="r") as io: + IO = NWBHDF5IO if backend == "hdf5" else NWBZarrIO + with IO(str(export_path), mode="r") as io: nwbfile = io.read() - if use_default_backend_configuration: - assert nwbfile.acquisition["RawTimeSeries"].data.compression_opts == 4 - assert nwbfile.intervals["trials"].start_time.data.compression_opts == 4 - assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts == 4 - assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 4 - assert nwbfile.intervals["trials"].compressed_start_time.data.compression_opts == 4 - else: - assert nwbfile.acquisition["RawTimeSeries"].data.compression_opts is None - assert nwbfile.intervals["trials"].start_time.data.compression_opts is None - assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts is None - assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 2 - assert nwbfile.intervals["trials"].compressed_start_time.data.compression_opts == 2 - - + if backend == "hdf5": + if use_default_backend_configuration: + assert nwbfile.acquisition["RawTimeSeries"].data.compression_opts == 4 + assert nwbfile.intervals["trials"].start_time.data.compression_opts == 4 + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts == 4 + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 4 + assert nwbfile.intervals["trials"].compressed_start_time.data.compression_opts == 4 + else: + assert nwbfile.acquisition["RawTimeSeries"].data.compression_opts is None + assert 
nwbfile.intervals["trials"].start_time.data.compression_opts is None + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts is None + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 2 + assert nwbfile.intervals["trials"].compressed_start_time.data.compression_opts == 2 + elif backend == "zarr": + if use_default_backend_configuration: + assert nwbfile.acquisition["RawTimeSeries"].data.compressor == default_compressor + assert nwbfile.acquisition["RawTimeSeries"].data.filters is None + assert nwbfile.intervals["trials"].start_time.data.compressor == default_compressor + assert nwbfile.intervals["trials"].start_time.data.filters is None + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compressor == default_compressor + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.filters is None + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compressor == default_compressor + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.filters is None + else: + assert nwbfile.acquisition["RawTimeSeries"].data.compressor == compressor + assert nwbfile.acquisition["RawTimeSeries"].data.filters is None + assert nwbfile.intervals["trials"].start_time.data.compressor == compressor + assert nwbfile.intervals["trials"].start_time.data.filters is None + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compressor == compressor + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.filters is None + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compressor == compressor + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.filters == filters + + +@pytest.mark.parametrize("backend", ["hdf5", "zarr"]) @pytest.mark.parametrize("use_default_backend_configuration", [True, False]) -def test_repack_nwbfile_with_changes(hdf5_nwbfile_path, use_default_backend_configuration): - export_path = Path(hdf5_nwbfile_path).parent / "repacked_test_repack_nwbfile.nwb.h5" - backend_configuration_changes = { - "acquisition/RawTimeSeries/data": dict(compression_method="gzip", compression_options=dict(compression_opts=1)) - } +def test_repack_nwbfile_with_changes(hdf5_nwbfile_path, zarr_nwbfile_path, backend, use_default_backend_configuration): + compressor = Blosc(cname="lz4", clevel=5, shuffle=Blosc.SHUFFLE, blocksize=0) + filter1 = Blosc(cname="zstd", clevel=1, shuffle=Blosc.SHUFFLE) + filter2 = Blosc(cname="zstd", clevel=2, shuffle=Blosc.SHUFFLE) + filters = [filter1, filter2] + default_compressor = GZip(level=1) + + if backend == "hdf5": + nwbfile_path = hdf5_nwbfile_path + export_path = Path(hdf5_nwbfile_path).parent / "repacked_test_repack_nwbfile.nwb.h5" + backend_configuration_changes = { + "acquisition/RawTimeSeries/data": dict( + compression_method="gzip", compression_options=dict(compression_opts=1) + ) + } + elif backend == "zarr": + nwbfile_path = zarr_nwbfile_path + export_path = Path(hdf5_nwbfile_path).parent / "repacked_test_repack_nwbfile.nwb.zarr" + changed_compressor = Blosc(cname="lz4", clevel=3, shuffle=Blosc.SHUFFLE, blocksize=0) + changed_filters = [Blosc(cname="zstd", clevel=3, shuffle=Blosc.SHUFFLE)] + backend_configuration_changes = { + "acquisition/RawTimeSeries/data": dict( + compression_method=changed_compressor, filter_methods=changed_filters + ) + } repack_nwbfile( - nwbfile_path=hdf5_nwbfile_path, - export_nwbfile_path=export_path, - backend="hdf5", + nwbfile_path=str(nwbfile_path), + export_nwbfile_path=str(export_path), + backend=backend, 
use_default_backend_configuration=use_default_backend_configuration, backend_configuration_changes=backend_configuration_changes, ) - with NWBHDF5IO(export_path, mode="r") as io: + IO = NWBHDF5IO if backend == "hdf5" else NWBZarrIO + with IO(str(export_path), mode="r") as io: nwbfile = io.read() - - if use_default_backend_configuration: - assert nwbfile.acquisition["RawTimeSeries"].data.compression_opts == 1 - assert nwbfile.intervals["trials"].start_time.data.compression_opts == 4 - assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts == 4 - assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 4 - assert nwbfile.intervals["trials"].compressed_start_time.data.compression_opts == 4 - else: - assert nwbfile.acquisition["RawTimeSeries"].data.compression_opts == 1 - assert nwbfile.intervals["trials"].start_time.data.compression_opts is None - assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts is None - assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 2 - assert nwbfile.intervals["trials"].compressed_start_time.data.compression_opts == 2 + if backend == "hdf5": + if use_default_backend_configuration: + assert nwbfile.acquisition["RawTimeSeries"].data.compression_opts == 1 + assert nwbfile.intervals["trials"].start_time.data.compression_opts == 4 + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts == 4 + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 4 + assert nwbfile.intervals["trials"].compressed_start_time.data.compression_opts == 4 + else: + assert nwbfile.acquisition["RawTimeSeries"].data.compression_opts == 1 + assert nwbfile.intervals["trials"].start_time.data.compression_opts is None + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compression_opts is None + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compression_opts == 2 + assert nwbfile.intervals["trials"].compressed_start_time.data.compression_opts == 2 + elif backend == "zarr": + if use_default_backend_configuration: + assert nwbfile.acquisition["RawTimeSeries"].data.compressor == changed_compressor + assert nwbfile.acquisition["RawTimeSeries"].data.filters == changed_filters + assert nwbfile.intervals["trials"].start_time.data.compressor == default_compressor + assert nwbfile.intervals["trials"].start_time.data.filters is None + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compressor == default_compressor + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.filters is None + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compressor == default_compressor + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.filters is None + else: + assert nwbfile.acquisition["RawTimeSeries"].data.compressor == changed_compressor + assert nwbfile.acquisition["RawTimeSeries"].data.filters == changed_filters + assert nwbfile.intervals["trials"].start_time.data.compressor == compressor + assert nwbfile.intervals["trials"].start_time.data.filters is None + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.compressor == compressor + assert nwbfile.processing["ecephys"]["ProcessedTimeSeries"].data.filters is None + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.compressor == compressor + assert nwbfile.acquisition["CompressedRawTimeSeries"].data.filters == filters
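
Taken together, the patches above add a read-back path (get_existing_backend_configuration / get_existing_dataset_io_configurations) and a repack_nwbfile entry point that can rewrite an existing HDF5 or Zarr NWB file with either default or user-supplied dataset settings. A rough usage sketch follows; it is not part of any patch in this series, the file paths and the dataset location are placeholder values, and the keyword arguments simply mirror temp_test.py and the repack_nwbfile tests:

    from pathlib import Path

    from hdmf_zarr.nwb import NWBZarrIO

    from neuroconv.tools.nwb_helpers import (
        get_existing_backend_configuration,
        repack_nwbfile,
    )

    # Placeholder paths: any NWB-Zarr file written earlier, and where to write the repacked copy.
    nwbfile_path = Path("existing_file.nwb.zarr")
    repacked_path = Path("repacked_file.nwb.zarr")

    # Inspect the chunking/compression currently stored in the file.
    with NWBZarrIO(str(nwbfile_path), mode="r") as io:
        nwbfile = io.read()
        existing_configuration = get_existing_backend_configuration(nwbfile=nwbfile)
        print(existing_configuration)

    # Rewrite the file, keeping its existing dataset settings except for one override.
    # The dataset location assumes the file contains an acquisition named "test_timeseries".
    backend_configuration_changes = {"acquisition/test_timeseries/data": dict(chunk_shape=(2,))}
    repack_nwbfile(
        nwbfile_path=str(nwbfile_path),
        export_nwbfile_path=str(repacked_path),
        backend="zarr",
        backend_configuration_changes=backend_configuration_changes,
        use_default_backend_configuration=False,
    )

    # Confirm the override took effect in the repacked copy.
    with NWBZarrIO(str(repacked_path), mode="r") as io:
        nwbfile = io.read()
        print(f'{nwbfile.acquisition["test_timeseries"].data.chunks = }')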