Commit 9f67ec6

Add json schema validation for source data at the interface level (#1090)

h-mayorquin authored Sep 20, 2024
1 parent 1ccdb2a commit 9f67ec6
Showing 22 changed files with 191 additions and 44 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
@@ -6,6 +6,7 @@

## Features
* Using in-house `GenericDataChunkIterator` [PR #1068](https://github.com/catalystneuro/neuroconv/pull/1068)
* Data interfaces now validate their source inputs (constructor arguments) against the JSON schema [PR #1020](https://github.com/catalystneuro/neuroconv/pull/1020)

## Improvements
* Remove dev test from PR [PR #1092](https://github.com/catalystneuro/neuroconv/pull/1092)
@@ -27,7 +28,7 @@
## Features
* Added chunking/compression for string-only compound objects: [PR #1042](https://github.com/catalystneuro/neuroconv/pull/1042)
* Added automated EFS volume creation and mounting to the `submit_aws_job` helper function. [PR #1018](https://github.com/catalystneuro/neuroconv/pull/1018)
* Added a mock for segmentation extractor interfaces in ophys: `MockSegmentationInterface` [PR #1067](https://github.com/catalystneuro/neuroconv/pull/1067)
* Added a `MockSortingInterface` for testing purposes. [PR #1065](https://github.com/catalystneuro/neuroconv/pull/1065)
* BaseRecordingInterfaces have a new conversion option `always_write_timestamps` that can be used to force writing timestamps even if neuroconv's heuristics indicate a regular sampling rate [PR #1091](https://github.com/catalystneuro/neuroconv/pull/1091)

@@ -99,6 +100,7 @@
* Data interfaces `run_conversion` method now performs metadata validation before running the conversion. [PR #949](https://github.com/catalystneuro/neuroconv/pull/949)
* Introduced `null_values_for_properties` to `add_units_table` to give user control over null values behavior [PR #989](https://github.com/catalystneuro/neuroconv/pull/989)


## Bug fixes
* Fixed the default naming of multiple electrical series in the `SpikeGLXConverterPipe`. [PR #957](https://github.com/catalystneuro/neuroconv/pull/957)
* Writing new properties to the electrode table now uses the global identifiers `channel_name`, `group` [PR #984](https://github.com/catalystneuro/neuroconv/pull/984)
@@ -23,7 +23,7 @@ Convert LightningPose pose estimation data to NWB using :py:class:`~neuroconv.da
>>> labeled_video_file_path = str(folder_path / "labeled_videos/test_vid_labeled.mp4")
>>> converter = LightningPoseConverter(file_path=file_path, original_video_file_path=original_video_file_path, labeled_video_file_path=labeled_video_file_path, verbose=False)
Source data is valid!
>>> metadata = converter.get_metadata()
>>> # For data provenance we add the time zone information to the conversion
>>> session_start_time = metadata["NWBFile"]["session_start_time"]
@@ -208,6 +208,7 @@ Convert TDT Fiber Photometry data to NWB using
>>> editable_metadata_path = LOCAL_PATH / "tests" / "test_on_data" / "ophys" / "fiber_photometry_metadata.yaml"
>>> interface = TDTFiberPhotometryInterface(folder_path=folder_path, verbose=True)
Source data is valid!
>>> metadata = interface.get_metadata()
>>> metadata["NWBFile"]["session_start_time"] = datetime.now(tz=ZoneInfo("US/Pacific"))
>>> editable_metadata = load_dict_from_file(editable_metadata_path)
20 changes: 20 additions & 0 deletions src/neuroconv/basedatainterface.py
@@ -24,6 +24,7 @@
    load_dict_from_file,
)
from .utils.dict import DeepDict
from .utils.json_schema import _NWBSourceDataEncoder


class BaseDataInterface(ABC):
@@ -39,11 +40,30 @@ def get_source_schema(cls) -> dict:
"""Infer the JSON schema for the source_data from the method signature (annotation typing)."""
return get_json_schema_from_method_signature(cls, exclude=["source_data"])

    @classmethod
    def validate_source(cls, source_data: dict, verbose: bool = False):
        """Validate source_data against the interface's source schema."""
        cls._validate_source_data(source_data=source_data, verbose=verbose)

    @classmethod
    def _validate_source_data(cls, source_data: dict, verbose: bool = False):
        encoder = _NWBSourceDataEncoder()
        # The encoder produces a serialized object, so we deserialize it back for validation
        serialized_source_data = encoder.encode(source_data)
        decoded_source_data = json.loads(serialized_source_data)
        source_schema = cls.get_source_schema()
        validate(instance=decoded_source_data, schema=source_schema)
        if verbose:
            print("Source data is valid!")

    @validate_call
    def __init__(self, verbose: bool = False, **source_data):
        self.verbose = verbose
        self.source_data = source_data

        self._validate_source_data(source_data=source_data, verbose=verbose)

    def get_metadata_schema(self) -> dict:
        """Retrieve JSON schema for metadata."""
        metadata_schema = load_dict_from_file(Path(__file__).parent / "schemas" / "base_metadata_schema.json")
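The validation round-trip in `_validate_source_data` works by serializing values that `jsonschema` cannot handle natively (e.g. `pathlib.Path`) and decoding them back before validating. A minimal, self-contained sketch of the same idea; the encoder class and schema below are illustrative stand-ins, not the `_NWBSourceDataEncoder` implementation:

```python
import json
from pathlib import Path

from jsonschema import validate


class PathAwareEncoder(json.JSONEncoder):
    """Hypothetical stand-in for _NWBSourceDataEncoder: serialize Path objects as strings."""

    def default(self, obj):
        if isinstance(obj, Path):
            return str(obj)
        return super().default(obj)


# Illustrative schema, as would be inferred from an __init__ signature.
source_schema = {
    "type": "object",
    "properties": {"folder_path": {"type": "string"}, "verbose": {"type": "boolean"}},
    "additionalProperties": False,
}

source_data = dict(folder_path=Path("session_folder"), verbose=True)

# Encode (Path -> str), decode back to a plain dict, then validate against the schema.
decoded_source_data = json.loads(json.dumps(source_data, cls=PathAwareEncoder))
validate(instance=decoded_source_data, schema=source_schema)
print("Source data is valid!")
```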
10 changes: 10 additions & 0 deletions src/neuroconv/baseextractorinterface.py
@@ -29,3 +29,13 @@ def get_extractor(cls):
        )
        cls.Extractor = extractor
        return extractor

    def __init__(self, **source_data):
        super().__init__(**source_data)
        self.extractor = self.get_extractor()
        self.extractor_kwargs = self._source_data_to_extractor_kwargs(source_data)
        self._extractor_instance = self.extractor(**self.extractor_kwargs)

    def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
        """This function maps the source_data to the kwargs required to initialize the Extractor."""
        return source_data
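Subclasses override this hook to decouple the user-facing `source_data`, which must pass schema validation, from the keyword arguments their extractor actually needs. A hedged sketch of the pattern, mirroring the interface changes below (the class name is illustrative; the import path follows this commit's file layout):

```python
from neuroconv.datainterfaces.ecephys.baserecordingextractorinterface import (
    BaseRecordingExtractorInterface,
)


class MyStreamRecordingInterface(BaseRecordingExtractorInterface):
    """Illustrative subclass: the extractor needs a stream_id the user never supplies."""

    stream_id = "RAW"  # fixed, implementation-level detail

    def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
        # Start from the validated user inputs, then inject extractor-only kwargs.
        extractor_kwargs = source_data.copy()
        extractor_kwargs["stream_id"] = self.stream_id
        return extractor_kwargs
```

This keeps `stream_id` out of the source schema, so users are never asked for, or validated against, an argument they cannot meaningfully set.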
@@ -13,13 +13,19 @@ class AlphaOmegaRecordingInterface(BaseRecordingExtractorInterface):
    display_name = "AlphaOmega Recording"
    associated_suffixes = (".mpx",)
    info = "Interface class for converting AlphaOmega recording data."
    stream_id = "RAW"

    @classmethod
    def get_source_schema(cls) -> dict:
        source_schema = super().get_source_schema()
        source_schema["properties"]["folder_path"]["description"] = "Path to the folder of .mpx files."
        return source_schema

    def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
        extractor_kwargs = source_data.copy()
        extractor_kwargs["stream_id"] = self.stream_id
        return extractor_kwargs

    def __init__(self, folder_path: DirectoryPath, verbose: bool = True, es_key: str = "ElectricalSeries"):
        """
        Load and prepare data for AlphaOmega.
@@ -33,7 +39,7 @@ def __init__(self, folder_path: DirectoryPath, verbose: bool = True, es_key: str
            Default is True.
        es_key: str, default: "ElectricalSeries"
        """
        super().__init__(folder_path=folder_path, stream_id="RAW", verbose=verbose, es_key=es_key)
        super().__init__(folder_path=folder_path, verbose=verbose, es_key=es_key)

    def get_metadata(self) -> dict:
        metadata = super().get_metadata()
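With the schema and the hook in place, source arguments can be checked up front, before any extractor is constructed. A usage sketch, assuming the interface is exposed under `neuroconv.datainterfaces` and that the folder exists:

```python
from neuroconv.datainterfaces import AlphaOmegaRecordingInterface

# Hypothetical session folder containing .mpx files.
source_data = dict(folder_path="data/alpha_omega_session")

# Class-level check against the inferred source schema; no extractor is instantiated.
AlphaOmegaRecordingInterface.validate_source(source_data=source_data, verbose=True)
# prints: Source data is valid!
```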
23 changes: 20 additions & 3 deletions src/neuroconv/datainterfaces/ecephys/axona/axonadatainterface.py
@@ -30,6 +30,12 @@ def get_source_schema(cls) -> dict:
source_schema["properties"]["file_path"]["description"] = "Path to .bin file."
return source_schema

def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
extractor_kwargs = source_data.copy()
extractor_kwargs["all_annotations"] = True

return extractor_kwargs

    def __init__(self, file_path: FilePath, verbose: bool = True, es_key: str = "ElectricalSeries"):
        """
@@ -41,7 +47,7 @@ def __init__(self, file_path: FilePath, verbose: bool = True, es_key: str = "Ele
        es_key: str, default: "ElectricalSeries"
        """

        super().__init__(file_path=file_path, all_annotations=True, verbose=verbose, es_key=es_key)
        super().__init__(file_path=file_path, verbose=verbose, es_key=es_key)
        self.source_data = dict(file_path=file_path, verbose=verbose)
        self.metadata_in_set_file = self.recording_extractor.neo_reader.file_parameters["set"]["file_header"]

@@ -134,6 +140,7 @@ def __init__(self, file_path: FilePath, noise_std: float = 3.5):
class AxonaLFPDataInterface(BaseLFPExtractorInterface):
    """
    Primary data interface class for converting Axona LFP data.
    Note that this interface is not lazy and will load all data into memory.
    """

    display_name = "Axona LFP"
@@ -151,10 +158,20 @@ def get_source_schema(cls) -> dict:
            additionalProperties=False,
        )

    def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
        extractor_kwargs = source_data.copy()
        extractor_kwargs.pop("file_path")
        extractor_kwargs["traces_list"] = self.traces_list
        extractor_kwargs["sampling_frequency"] = self.sampling_frequency

        return extractor_kwargs

    def __init__(self, file_path: FilePath):
        data = read_all_eeg_file_lfp_data(file_path).T
        sampling_frequency = get_eeg_sampling_frequency(file_path)
        super().__init__(traces_list=[data], sampling_frequency=sampling_frequency)
        self.traces_list = [data]
        self.sampling_frequency = get_eeg_sampling_frequency(file_path)
        super().__init__(file_path=file_path)

        self.source_data = dict(file_path=file_path)

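Note the ordering constraint this pattern introduces: the base `__init__` calls `_source_data_to_extractor_kwargs` while constructing the extractor, so any attribute the hook reads must be assigned before `super().__init__()`. A condensed sketch of the constraint, reusing the helper names from the diff (the subclass name is illustrative, and the helpers are assumed importable from the Axona interface module):

```python
# Assumed available from the Axona interface module, as used in the diff:
# BaseLFPExtractorInterface, read_all_eeg_file_lfp_data, get_eeg_sampling_frequency


class InMemoryLFPInterface(BaseLFPExtractorInterface):
    """Illustrative: file_path is validated but never reaches the extractor."""

    def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
        extractor_kwargs = source_data.copy()
        extractor_kwargs.pop("file_path")  # validated input, not an extractor argument
        extractor_kwargs["traces_list"] = self.traces_list  # read during super().__init__()
        extractor_kwargs["sampling_frequency"] = self.sampling_frequency
        return extractor_kwargs

    def __init__(self, file_path):
        # Derived values must exist before super().__init__() triggers extractor construction.
        self.traces_list = [read_all_eeg_file_lfp_data(file_path).T]
        self.sampling_frequency = get_eeg_sampling_frequency(file_path)
        super().__init__(file_path=file_path)
```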
@@ -32,8 +32,9 @@ def __init__(self, verbose: bool = True, es_key: str = "ElectricalSeries", **sou
            The key-value pairs of extractor-specific arguments.
        """

        super().__init__(**source_data)
        self.recording_extractor = self.get_extractor()(**source_data)
        self.recording_extractor = self._extractor_instance
        property_names = self.recording_extractor.get_property_keys()
        # TODO remove this and go and change all the uses of channel_name once spikeinterface > 0.101.0 is released
        if "channel_name" not in property_names and "channel_names" in property_names:
@@ -118,7 +119,11 @@ def get_original_timestamps(self) -> Union[np.ndarray, list[np.ndarray]]:
            The timestamps for the data stream; if the recording has multiple segments, then a list of timestamps is returned.
        """
        new_recording = self.get_extractor()(
            **{keyword: value for keyword, value in self.source_data.items() if keyword not in ["verbose", "es_key"]}
            **{
                keyword: value
                for keyword, value in self.extractor_kwargs.items()
                if keyword not in ["verbose", "es_key"]
            }
        )
        if self._number_of_segments == 1:
            return new_recording.get_times()
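The switch from `self.source_data` to `self.extractor_kwargs` matters here: kwargs injected by `_source_data_to_extractor_kwargs`, such as a fixed `stream_id`, exist only in the latter, so rebuilding from `source_data` could construct a different extractor. A sketch of the rebuild, assuming an already-constructed `interface` object:

```python
# Rebuild a fresh, unmodified extractor from the kwargs the original extractor actually saw.
fresh_kwargs = {
    keyword: value
    for keyword, value in interface.extractor_kwargs.items()
    if keyword not in ["verbose", "es_key"]  # interface-level options, not extractor arguments
}
new_recording = interface.get_extractor()(**fresh_kwargs)
original_timestamps = new_recording.get_times()  # timestamps before any alignment was applied
```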
@@ -25,6 +25,12 @@ def get_source_schema(cls):
] = "Path to the Blackrock file with suffix being .ns1, .ns2, .ns3, .ns4m .ns4, or .ns6."
return source_schema

def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
extractor_kwargs = source_data.copy()
extractor_kwargs["stream_id"] = self.stream_id

return extractor_kwargs

    def __init__(
        self,
        file_path: FilePath,
@@ -55,7 +61,8 @@ def __init__(
            nsx_to_load = int(file_path.suffix[-1])
        self.file_path = file_path

        super().__init__(file_path=file_path, stream_id=str(nsx_to_load), verbose=verbose, es_key=es_key)
        self.stream_id = str(nsx_to_load)
        super().__init__(file_path=file_path, verbose=verbose, es_key=es_key)

    def get_metadata(self) -> dict:
        metadata = super().get_metadata()
@@ -83,7 +90,7 @@ def get_source_schema(cls) -> dict:
metadata_schema["properties"]["file_path"].update(description="Path to Blackrock .nev file.")
return metadata_schema

    def __init__(self, file_path: FilePath, sampling_frequency: float = None, verbose: bool = True):
    def __init__(self, file_path: FilePath, sampling_frequency: Optional[float] = None, verbose: bool = True):
        """
        Parameters
        ----------
@@ -419,6 +419,12 @@ class CellExplorerSortingInterface(BaseSortingExtractorInterface):
    associated_suffixes = (".mat", ".sessionInfo", ".spikes", ".cellinfo")
    info = "Interface for CellExplorer sorting data."

    def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
        extractor_kwargs = source_data.copy()
        extractor_kwargs["sampling_frequency"] = self.sampling_frequency

        return extractor_kwargs

    def __init__(self, file_path: FilePath, verbose: bool = True):
        """
        Initialize read of Cell Explorer file.
@@ -454,7 +460,8 @@ def __init__(self, file_path: FilePath, verbose: bool = True):
if "extracellular" in session_data.keys():
sampling_frequency = session_data["extracellular"].get("sr", None)

super().__init__(file_path=file_path, sampling_frequency=sampling_frequency, verbose=verbose)
self.sampling_frequency = sampling_frequency
super().__init__(file_path=file_path, verbose=verbose)
self.source_data = dict(file_path=file_path)
spikes_matfile_path = Path(file_path)

Expand Down
@@ -25,6 +25,13 @@ def get_source_schema(cls) -> dict:
source_schema["properties"]["file_path"]["description"] = "Path to either a .rhd or a .rhs file"
return source_schema

def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
extractor_kwargs = source_data.copy()
extractor_kwargs["all_annotations"] = True
extractor_kwargs["stream_id"] = self.stream_id

return extractor_kwargs

    def __init__(
        self,
        file_path: FilePath,
@@ -52,10 +59,8 @@ def __init__(

        init_kwargs = dict(
            file_path=self.file_path,
            stream_id=self.stream_id,
            verbose=verbose,
            es_key=es_key,
            all_annotations=True,
            ignore_integrity_checks=ignore_integrity_checks,
        )

Expand Down
@@ -32,6 +32,12 @@ def get_source_schema(cls) -> dict:
        ] = 'Path to Neuralynx directory containing ".ncs", ".nse", ".ntt", or ".nev" files.'
        return source_schema

    def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
        extractor_kwargs = source_data.copy()
        extractor_kwargs["all_annotations"] = True

        return extractor_kwargs

    def __init__(
        self,
        folder_path: DirectoryPath,
@@ -53,7 +59,10 @@ def __init__(
        es_key : str, default: "ElectricalSeries"
        """
        super().__init__(
            folder_path=folder_path, stream_name=stream_name, verbose=verbose, all_annotations=True, es_key=es_key
            folder_path=folder_path,
            stream_name=stream_name,
            verbose=verbose,
            es_key=es_key,
        )

        # convert properties of object dtype (e.g. datetime) and bool as these are not supported by nwb
@@ -103,7 +112,7 @@ class NeuralynxSortingInterface(BaseSortingExtractorInterface):
    associated_suffixes = (".nse", ".ntt", ".nev")
    info = "Interface for Neuralynx sorting data."

    def __init__(self, folder_path: DirectoryPath, sampling_frequency: float = None, verbose: bool = True):
    def __init__(self, folder_path: DirectoryPath, sampling_frequency: Optional[float] = None, verbose: bool = True):
        """_summary_
        Parameters
@@ -82,6 +82,13 @@ def get_source_schema(cls) -> dict:
source_schema["properties"]["file_path"]["description"] = "Path to the .pl2 file."
return source_schema

def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
extractor_kwargs = source_data.copy()
extractor_kwargs["all_annotations"] = True
extractor_kwargs["stream_id"] = self.stream_id

return extractor_kwargs

    @validate_call
    def __init__(self, file_path: FilePath, verbose: bool = True, es_key: str = "ElectricalSeries"):
        """
@@ -101,16 +108,14 @@ def __init__(self, file_path: FilePath, verbose: bool = True, es_key: str = "Ele

        neo_version = Version(neo.__version__)
        if neo_version <= Version("0.13.3"):
            stream_id = "3"
            self.stream_id = "3"
        else:
            stream_id = "WB"
            self.stream_id = "WB"
        assert Path(file_path).is_file(), f"Plexon file not found in: {file_path}"
        super().__init__(
            file_path=file_path,
            verbose=verbose,
            es_key=es_key,
            stream_id=stream_id,
            all_annotations=True,
        )

    def get_metadata(self) -> DeepDict: