Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Repack Nwb Files #1003

Draft
wants to merge 40 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 18 commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
7304229
setup temp conversion script
pauladkisson Aug 12, 2024
4cc2a06
added from_existing_neurodata_object for hdf5
pauladkisson Aug 12, 2024
c33dfbf
added get_existing_dataset_io_configurations
pauladkisson Aug 12, 2024
80c1fba
added support for chunk_shape=None
pauladkisson Aug 13, 2024
7ee6fc6
added from_existing_nwbfile to HDF5BackendConfiguration
pauladkisson Aug 13, 2024
dacdeea
added get_existing_backend_configuration
pauladkisson Aug 13, 2024
dae04bf
added repack_nwbfile
pauladkisson Aug 13, 2024
4ac6e33
fixed bug with export options and hdmf.container.Container.set_data_io
pauladkisson Aug 14, 2024
ce267fb
refactored from_ methods
pauladkisson Aug 14, 2024
49f4262
template and changes optional
pauladkisson Aug 14, 2024
d93a5c5
added image series test
pauladkisson Aug 15, 2024
ab8b22f
Merge branch 'main' into repack
bendichter Aug 15, 2024
934bb3a
Merge branch 'main' into repack
pauladkisson Aug 15, 2024
1ad69ca
added initial test
pauladkisson Aug 15, 2024
04fb89c
updated signature to use file_path
pauladkisson Aug 16, 2024
6dab477
added test for trials table (fails)
pauladkisson Aug 16, 2024
e6d31a6
moved backend_configuration_changes to top of the fn
pauladkisson Aug 16, 2024
7252449
consolidated configure_and_export_nwbfile into configure_and_write_nw…
pauladkisson Aug 16, 2024
2ef5c44
parameterized for use_default_backend_configuration
pauladkisson Aug 16, 2024
80eb598
optional dci
pauladkisson Aug 19, 2024
433f8c9
added test for backend config changes
pauladkisson Aug 19, 2024
dd906ac
updated api to use boolean use_default flag instead of mode=existing
pauladkisson Aug 19, 2024
668cacc
added test for get_existing_backend_configuration
pauladkisson Aug 19, 2024
7796197
removed image_series test
pauladkisson Aug 19, 2024
b8a788c
added compressed trials table column
pauladkisson Aug 19, 2024
f631fb4
added test for get_existing_dataset_io.py
pauladkisson Aug 20, 2024
b089eb3
Merge branch 'main' into repack
pauladkisson Aug 20, 2024
c464764
added docstrings
pauladkisson Aug 20, 2024
1cf3629
used BACKEND_NWB_IO dict
pauladkisson Aug 20, 2024
481529f
added ZarrDatasetIOConfiguration.from_neurodata_object
pauladkisson Aug 20, 2024
1e6b119
Merge branch 'main' into repack
bendichter Aug 20, 2024
9f02b61
removed unnecessary indent
pauladkisson Aug 21, 2024
9ee146f
estimate buffer shape
pauladkisson Aug 21, 2024
ee7ec52
updated temp_test
pauladkisson Aug 21, 2024
a2145a1
added zarr to dataset_io tests
pauladkisson Aug 22, 2024
5785af0
added zarr to backend_configuration tests
pauladkisson Aug 22, 2024
b07c002
added zarr to repack_nwbfile tests
pauladkisson Aug 22, 2024
a8c57e8
Merge branch 'main' into repack
pauladkisson Jan 25, 2025
ad641ef
fixed merge
pauladkisson Jan 25, 2025
8d8689f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 25, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/neuroconv/tools/nwb_helpers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from ._backend_configuration import (
BACKEND_CONFIGURATIONS,
get_default_backend_configuration,
get_existing_backend_configuration,
)
from ._configuration_models import DATASET_IO_CONFIGURATIONS
from ._configuration_models._base_backend import BackendConfiguration
Expand All @@ -30,6 +31,7 @@
get_module,
make_nwbfile_from_metadata,
make_or_load_nwbfile,
repack_nwbfile,
)

__all__ = [
Expand All @@ -46,6 +48,7 @@
"ZarrDatasetIOConfiguration",
"get_default_backend_configuration",
"get_default_dataset_io_configurations",
"get_existing_backend_configuration",
"configure_backend",
"get_default_dataset_io_configurations",
"get_default_backend_configuration",
Expand All @@ -55,4 +58,5 @@
"get_module",
"make_nwbfile_from_metadata",
"make_or_load_nwbfile",
"repack_nwbfile",
]
17 changes: 16 additions & 1 deletion src/neuroconv/tools/nwb_helpers/_backend_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@

from typing import Literal, Union

from pynwb import NWBFile
from hdmf_zarr import NWBZarrIO
from pynwb import NWBHDF5IO, NWBFile

from ._configuration_models._hdf5_backend import HDF5BackendConfiguration
from ._configuration_models._zarr_backend import ZarrBackendConfiguration
Expand All @@ -17,3 +18,17 @@ def get_default_backend_configuration(

BackendConfigurationClass = BACKEND_CONFIGURATIONS[backend]
return BackendConfigurationClass.from_nwbfile(nwbfile=nwbfile)


def get_existing_backend_configuration(nwbfile: NWBFile) -> Union[HDF5BackendConfiguration, ZarrBackendConfiguration]:
    """
    Build a backend configuration from an NWBFile that was read from disk.

    The backend is inferred from the type of ``nwbfile.read_io`` and the matching configuration class is
    populated through ``from_nwbfile(..., mode="existing")``, to serve as a starting point for further
    customization.

    Parameters
    ----------
    nwbfile : NWBFile
        An NWBFile whose ``read_io`` attribute is an ``NWBHDF5IO`` or an ``NWBZarrIO``.

    Returns
    -------
    HDF5BackendConfiguration or ZarrBackendConfiguration
        The configuration class registered in ``BACKEND_CONFIGURATIONS`` for the detected backend.

    Raises
    ------
    ValueError
        If ``nwbfile.read_io`` is neither an ``NWBHDF5IO`` nor an ``NWBZarrIO``.
    """
    read_io = nwbfile.read_io

    # Checked in this order: HDF5 first, then Zarr.
    io_class_to_backend = ((NWBHDF5IO, "hdf5"), (NWBZarrIO, "zarr"))
    detected_backend = None
    for io_class, backend_name in io_class_to_backend:
        if isinstance(read_io, io_class):
            detected_backend = backend_name
            break

    if detected_backend is None:
        raise ValueError(f"The backend of the NWBFile from io {read_io} is not recognized.")

    configuration_class = BACKEND_CONFIGURATIONS[detected_backend]
    return configuration_class.from_nwbfile(nwbfile=nwbfile, mode="existing")
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@

from ._base_dataset_io import DatasetIOConfiguration
from ._pydantic_pure_json_schema_generator import PureJSONSchemaGenerator
from .._dataset_configuration import get_default_dataset_io_configurations
from .._dataset_configuration import (
get_default_dataset_io_configurations,
get_existing_dataset_io_configurations,
)


class BackendConfiguration(BaseModel):
Expand Down Expand Up @@ -56,11 +59,16 @@ def model_json_schema(cls, **kwargs) -> Dict[str, Any]:
return super().model_json_schema(mode="validation", schema_generator=PureJSONSchemaGenerator, **kwargs)

@classmethod
def from_nwbfile(cls, nwbfile: NWBFile, mode: Literal["default", "existing"] = "default") -> Self:
    """
    Create a backend configuration from the datasets of an NWBFile.

    Parameters
    ----------
    nwbfile : NWBFile
        The file whose datasets are collected into the configuration.
    mode : {"default", "existing"}, default: "default"
        Which generator supplies the per-dataset configurations:
        "default" uses ``get_default_dataset_io_configurations`` and
        "existing" uses ``get_existing_dataset_io_configurations``.

    Returns
    -------
    Self
        An instance whose ``dataset_configurations`` maps each ``location_in_file`` to its dataset configuration.

    Raises
    ------
    ValueError
        If `mode` is neither "default" nor "existing".
    """
    if mode == "default":
        dataset_io_configurations = get_default_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend)
    elif mode == "existing":
        dataset_io_configurations = get_existing_dataset_io_configurations(nwbfile=nwbfile, backend=cls.backend)
    else:
        raise ValueError(f"mode must be either 'default' or 'existing' but got {mode}")

    # Keyed by location so individual datasets can be looked up and customized afterwards.
    dataset_configurations = {
        dataset_io_configuration.location_in_file: dataset_io_configuration
        for dataset_io_configuration in dataset_io_configurations
    }

    return cls(dataset_configurations=dataset_configurations)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,6 @@ def __str__(self) -> str:
"""
size_in_bytes = math.prod(self.full_shape) * self.dtype.itemsize
maximum_ram_usage_per_iteration_in_bytes = math.prod(self.buffer_shape) * self.dtype.itemsize
disk_space_usage_per_chunk_in_bytes = math.prod(self.chunk_shape) * self.dtype.itemsize

string = (
f"\n{self.location_in_file}"
Expand All @@ -159,10 +158,14 @@ def __str__(self) -> str:
f"\n buffer shape : {self.buffer_shape}"
f"\n expected RAM usage : {human_readable_size(maximum_ram_usage_per_iteration_in_bytes)}"
"\n"
f"\n chunk shape : {self.chunk_shape}"
f"\n disk space usage per chunk : {human_readable_size(disk_space_usage_per_chunk_in_bytes)}"
"\n"
)
if self.chunk_shape is not None:
disk_space_usage_per_chunk_in_bytes = math.prod(self.chunk_shape) * self.dtype.itemsize
string += (
f"\n chunk shape : {self.chunk_shape}"
f"\n disk space usage per chunk : {human_readable_size(disk_space_usage_per_chunk_in_bytes)}"
"\n"
)
if self.compression_method is not None:
string += f"\n compression method : {self.compression_method}"
if self.compression_options is not None:
Expand All @@ -182,9 +185,9 @@ def validate_all_shapes(cls, values: Dict[str, Any]) -> Dict[str, Any]:
dataset_name == location_in_file.split("/")[-1]
), f"The `dataset_name` ({dataset_name}) does not match the end of the `location_in_file` ({location_in_file})!"

chunk_shape = values["chunk_shape"]
buffer_shape = values["buffer_shape"]
full_shape = values["full_shape"]
chunk_shape = values["chunk_shape"] if values["chunk_shape"] is not None else full_shape
buffer_shape = values["buffer_shape"] if values["buffer_shape"] is not None else full_shape

if len(chunk_shape) != len(buffer_shape):
raise ValueError(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
from typing import Any, Dict, Literal, Union

import h5py
from hdmf import Container
from pydantic import Field, InstanceOf
from typing_extensions import Self

from ._base_dataset_io import DatasetIOConfiguration
from ._base_dataset_io import DatasetIOConfiguration, _find_location_in_memory_nwbfile
from ...importing import is_package_installed

_base_hdf5_filters = set(h5py.filters.decode)
Expand Down Expand Up @@ -78,3 +80,38 @@ def get_data_io_kwargs(self) -> Dict[str, Any]:
compression_bundle = dict(compression=self.compression_method, compression_opts=compression_opts)

return dict(chunks=self.chunk_shape, **compression_bundle)

@classmethod
def from_neurodata_object(
    cls,
    neurodata_object: Container,
    dataset_name: Literal["data", "timestamps"],
    mode: Literal["default", "existing"] = "default",
) -> Self:
    """
    Construct a dataset configuration for one field of a neurodata object.

    Parameters
    ----------
    neurodata_object : hdmf.Container
        The object that owns the dataset.
    dataset_name : {"data", "timestamps"}
        Name of the attribute on `neurodata_object` that holds the dataset.
    mode : {"default", "existing"}, default: "default"
        "default" defers entirely to the parent class implementation.
        "existing" copies the shape, dtype, chunks, maxshape, compression and compression_opts attributes
        from the dataset itself (presumably an ``h5py.Dataset`` read from file; verify against callers).

    Raises
    ------
    ValueError
        If `mode` is neither "default" nor "existing".
    """
    if mode == "default":
        return super().from_neurodata_object(neurodata_object=neurodata_object, dataset_name=dataset_name)
    if mode != "existing":
        raise ValueError(f"mode must be either 'default' or 'existing' but got {mode}")

    location_in_file = _find_location_in_memory_nwbfile(neurodata_object=neurodata_object, field_name=dataset_name)

    existing_dataset = getattr(neurodata_object, dataset_name)
    full_shape = existing_dataset.shape
    dtype = existing_dataset.dtype
    chunk_shape = existing_dataset.chunks
    # NOTE(review): the dataset's maxshape is used as the buffer shape; confirm this is intended for
    # datasets whose maxshape differs from their current shape.
    buffer_shape = existing_dataset.maxshape
    compression_method = existing_dataset.compression
    compression_options = dict(compression_opts=existing_dataset.compression_opts)

    return cls(
        object_id=neurodata_object.object_id,
        object_name=neurodata_object.name,
        location_in_file=location_in_file,
        dataset_name=dataset_name,
        full_shape=full_shape,
        dtype=dtype,
        chunk_shape=chunk_shape,
        buffer_shape=buffer_shape,
        compression_method=compression_method,
        compression_options=compression_options,
    )
64 changes: 64 additions & 0 deletions src/neuroconv/tools/nwb_helpers/_dataset_configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,3 +172,67 @@ def get_default_dataset_io_configurations(
)

yield dataset_io_configuration


def get_existing_dataset_io_configurations(
    nwbfile: NWBFile,
    backend: Literal["hdf5", "zarr"],
) -> Generator[DatasetIOConfiguration, None, None]:
    """
    Yield a dataset configuration, built in "existing" mode, for each eligible dataset of an NWBFile.

    Every column of every ``DynamicTable`` and the "data"/"timestamps" fields of every ``NWBContainer`` found in
    ``nwbfile.objects`` are considered. Columns holding links or references, absent optional fields, and datasets
    with a zero-length axis are skipped.

    Parameters
    ----------
    nwbfile : NWBFile
        The file whose objects are scanned.
    backend : {"hdf5", "zarr"}
        Selects the configuration class from ``DATASET_IO_CONFIGURATIONS``.

    Yields
    ------
    DatasetIOConfiguration
        The result of ``from_neurodata_object(..., mode="existing")`` for each retained dataset.
    """
    DatasetIOConfigurationClass = DATASET_IO_CONFIGURATIONS[backend]

    def _has_zero_length_axis(dataset) -> bool:
        return any(axis_length == 0 for axis_length in get_data_shape(data=dataset))

    for neurodata_object in nwbfile.objects.values():
        if isinstance(neurodata_object, DynamicTable):
            for column in neurodata_object.columns:
                column_data = column.data  # VectorData object

                # Skip columns whose values are links, such as the 'group' of an ElectrodesTable
                if any(isinstance(value, Container) for value in column_data):
                    continue

                # Skip columns whose values are a reference type
                if isinstance(column, TimeSeriesReferenceVectorData):
                    continue

                if _has_zero_length_axis(column_data):
                    continue

                yield DatasetIOConfigurationClass.from_neurodata_object(
                    neurodata_object=column,
                    dataset_name="data",
                    mode="existing",
                )
        elif isinstance(neurodata_object, NWBContainer):
            for field_name in ("data", "timestamps"):
                # Optional fields may simply be absent
                if field_name not in neurodata_object.fields:
                    continue

                field_dataset = getattr(neurodata_object, field_name)

                # Skip edge case of in-memory ImageSeries with external mode; data is in fields and is empty array
                if isinstance(field_dataset, np.ndarray) and field_dataset.size == 0:
                    continue

                if _has_zero_length_axis(field_dataset):
                    continue

                yield DatasetIOConfigurationClass.from_neurodata_object(
                    neurodata_object=neurodata_object,
                    dataset_name=field_name,
                    mode="existing",
                )
52 changes: 50 additions & 2 deletions src/neuroconv/tools/nwb_helpers/_metadata_and_file_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,12 @@
from pynwb import NWBHDF5IO, NWBFile
from pynwb.file import Subject

from . import BackendConfiguration, configure_backend, get_default_backend_configuration
from . import (
BackendConfiguration,
configure_backend,
get_default_backend_configuration,
get_existing_backend_configuration,
)
from ...utils.dict import DeepDict, load_dict_from_file
from ...utils.json_schema import validate_metadata

Expand Down Expand Up @@ -337,6 +342,7 @@ def configure_and_write_nwbfile(
output_filepath: str,
backend: Optional[Literal["hdf5"]] = None,
backend_configuration: Optional[BackendConfiguration] = None,
export: bool = False,
) -> None:
"""
Write an NWB file using a specific backend or backend configuration.
Expand All @@ -355,6 +361,8 @@ def configure_and_write_nwbfile(
backend_configuration: BackendConfiguration, optional
Specifies the backend type and the chunking and compression parameters of each dataset. If no
``backend_configuration`` is specified, the default configuration for the specified ``backend`` is used.
export: bool, default: False
Whether to export the NWB file instead of writing.

"""

Expand All @@ -369,4 +377,44 @@ def configure_and_write_nwbfile(
IO = BACKEND_NWB_IO[backend_configuration.backend]

with IO(output_filepath, mode="w") as io:
io.write(nwbfile)
if export:
nwbfile.set_modified()
io.export(nwbfile=nwbfile, src_io=nwbfile.read_io, write_args=dict(link_data=False))
else:
io.write(nwbfile)


def repack_nwbfile(
    *,
    nwbfile_path: Path,
    export_nwbfile_path: Path,
    backend: Literal["hdf5", "zarr"] = "hdf5",
    export_backend: Literal["hdf5", "zarr", None] = None,
    use_default_backend_configuration: bool = True,
    backend_configuration_changes: Optional[dict] = None,
) -> None:
    """
    Repack an NWB file: read it, adjust its backend configuration, and export it to a new file.

    Parameters
    ----------
    nwbfile_path : Path
        Path of the NWB file to read.
    export_nwbfile_path : Path
        Path the repacked file is exported to.
    backend : {"hdf5", "zarr"}, default: "hdf5"
        Backend used to read `nwbfile_path`.
    export_backend : {"hdf5", "zarr"}, optional
        Backend of the exported file. Defaults to `backend`.
    use_default_backend_configuration : bool, default: True
        If True, start from ``get_default_backend_configuration``; otherwise start from
        ``get_existing_backend_configuration``.
    backend_configuration_changes : dict, optional
        Maps a dataset location (a key of ``backend_configuration.dataset_configurations``) to a dict of
        ``{attribute_name: new_value}`` applied to that dataset configuration with ``setattr``,
        e.g. ``{"acquisition/test_timeseries/data": dict(chunk_shape=(2,))}``.

    Raises
    ------
    KeyError
        If a location in `backend_configuration_changes` is not part of the backend configuration.
    """
    backend_configuration_changes = backend_configuration_changes or dict()
    export_backend = export_backend or backend

    IO = BACKEND_NWB_IO[backend]
    # Everything happens while the source IO is open: the export reads from ``nwbfile.read_io``.
    with IO(nwbfile_path, mode="r") as io:
        nwbfile = io.read()
        if use_default_backend_configuration:
            # The defaults must target the *export* backend, since the writer IO is selected from
            # ``backend_configuration.backend``.
            backend_configuration = get_default_backend_configuration(nwbfile=nwbfile, backend=export_backend)
        else:
            # NOTE(review): the existing configuration describes the source backend; confirm the intended
            # behavior when `export_backend` differs from `backend`.
            backend_configuration = get_existing_backend_configuration(nwbfile=nwbfile)
        dataset_configurations = backend_configuration.dataset_configurations

        for neurodata_object_location, dataset_config_changes in backend_configuration_changes.items():
            try:
                dataset_configuration = dataset_configurations[neurodata_object_location]
            except KeyError:
                raise KeyError(
                    f"'{neurodata_object_location}' is not a dataset location in the backend configuration. "
                    f"Available locations: {list(dataset_configurations)}"
                ) from None
            for dataset_config_key, dataset_config_value in dataset_config_changes.items():
                setattr(dataset_configuration, dataset_config_key, dataset_config_value)

        configure_and_write_nwbfile(
            nwbfile=nwbfile,
            backend_configuration=backend_configuration,
            output_filepath=export_nwbfile_path,
            backend=export_backend,
            export=True,
        )
53 changes: 53 additions & 0 deletions temp_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
import os
from pathlib import Path

import numpy as np
from pynwb import NWBHDF5IO, H5DataIO, TimeSeries
from pynwb.testing.mock.file import mock_NWBFile

from neuroconv.tools.nwb_helpers import (
repack_nwbfile,
)


def write_nwbfile(nwbfile_path: Path):
    """Write a small mock NWB file with one gzip-compressed, chunked TimeSeries, replacing any existing file."""
    if nwbfile_path.exists():
        nwbfile_path.unlink()

    # The data is wrapped in H5DataIO so it is written with explicit chunking and compression.
    compressed_data = H5DataIO(
        data=np.arange(100, 200, 10),
        compression="gzip",
        chunks=(1,),
        compression_opts=2,
    )
    test_timeseries = TimeSeries(
        name="test_timeseries",
        description="an example time series",
        data=compressed_data,
        unit="m",
        timestamps=np.arange(10.0),
    )

    mock_nwbfile = mock_NWBFile()
    mock_nwbfile.add_acquisition(test_timeseries)
    with NWBHDF5IO(nwbfile_path, mode="w") as write_io:
        write_io.write(mock_nwbfile)


def main(base_directory: Path = Path("/Volumes/T7/CatalystNeuro")):
    """
    Repack a small test NWB file with a changed chunk shape and print the resulting chunks.

    Parameters
    ----------
    base_directory : Path, default: "/Volumes/T7/CatalystNeuro"
        Directory holding ``temp.nwb`` (created if missing) and ``repacked_temp.nwb`` (always regenerated).
        Exposed as a parameter so the script is not tied to a single machine's volume layout.
    """
    base_directory = Path(base_directory)
    nwbfile_path = base_directory / "temp.nwb"
    repacked_nwbfile_path = base_directory / "repacked_temp.nwb"

    # Always regenerate the repacked output, but reuse the source file when it already exists.
    if repacked_nwbfile_path.exists():
        os.remove(repacked_nwbfile_path)
    if not nwbfile_path.exists():
        write_nwbfile(nwbfile_path)

    backend_configuration_changes = {"acquisition/test_timeseries/data": dict(chunk_shape=(2,))}
    repack_nwbfile(
        nwbfile_path=nwbfile_path,
        export_nwbfile_path=repacked_nwbfile_path,
        backend="hdf5",
        backend_configuration_changes=backend_configuration_changes,
        use_default_backend_configuration=False,
    )

    with NWBHDF5IO(repacked_nwbfile_path, mode="r") as io:
        nwbfile = io.read()
        print(f'{nwbfile.acquisition["test_timeseries"].data.chunks = }')


if __name__ == "__main__":
    main()
Loading
Loading