Commit 9f67ec6

Add json schema validation for source data at the interface level (#1090)

h-mayorquin authored Sep 20, 2024
1 parent 1ccdb2a commit 9f67ec6
Showing 22 changed files with 191 additions and 44 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
@@ -6,6 +6,7 @@

## Features
* Using in-house `GenericDataChunkIterator` [PR #1068](https://github.com/catalystneuro/neuroconv/pull/1068)
* Data interfaces now validate their source inputs (constructor arguments) against the JSON schema [PR #1020](https://github.com/catalystneuro/neuroconv/pull/1020)

## Improvements
* Remove dev test from PR [PR #1092](https://github.com/catalystneuro/neuroconv/pull/1092)
@@ -27,7 +28,7 @@
## Features
* Added chunking/compression for string-only compound objects: [PR #1042](https://github.com/catalystneuro/neuroconv/pull/1042)
* Added automated EFS volume creation and mounting to the `submit_aws_job` helper function. [PR #1018](https://github.com/catalystneuro/neuroconv/pull/1018)
* Added a mock for segmentation extractor interfaces in ophys: `MockSegmentationInterface` [PR #1067](https://github.com/catalystneuro/neuroconv/pull/1067)
* Added a `MockSortingInterface` for testing purposes. [PR #1065](https://github.com/catalystneuro/neuroconv/pull/1065)
* BaseRecordingInterfaces have a new conversion option `always_write_timestamps` that can be used to force writing timestamps even if neuroconv's heuristics indicate a regular sampling rate [PR #1091](https://github.com/catalystneuro/neuroconv/pull/1091)

@@ -99,6 +100,7 @@
* Data interfaces `run_conversion` method now performs metadata validation before running the conversion. [PR #949](https://github.com/catalystneuro/neuroconv/pull/949)
* Introduced `null_values_for_properties` to `add_units_table` to give user control over null values behavior [PR #989](https://github.com/catalystneuro/neuroconv/pull/989)


## Bug fixes
* Fixed the default naming of multiple electrical series in the `SpikeGLXConverterPipe`. [PR #957](https://github.com/catalystneuro/neuroconv/pull/957)
* Writing new properties to the electrode table now uses the global identifiers `channel_name`, `group` [PR #984](https://github.com/catalystneuro/neuroconv/pull/984)
@@ -23,7 +23,7 @@ Convert LightningPose pose estimation data to NWB using :py:class:`~neuroconv.da
>>> labeled_video_file_path = str(folder_path / "labeled_videos/test_vid_labeled.mp4")
>>> converter = LightningPoseConverter(file_path=file_path, original_video_file_path=original_video_file_path, labeled_video_file_path=labeled_video_file_path, verbose=False)
Source data is valid!
>>> metadata = converter.get_metadata()
>>> # For data provenance we add the time zone information to the conversion
>>> session_start_time = metadata["NWBFile"]["session_start_time"]
@@ -208,6 +208,7 @@ Convert TDT Fiber Photometry data to NWB using
>>> editable_metadata_path = LOCAL_PATH / "tests" / "test_on_data" / "ophys" / "fiber_photometry_metadata.yaml"
>>> interface = TDTFiberPhotometryInterface(folder_path=folder_path, verbose=True)
Source data is valid!
>>> metadata = interface.get_metadata()
>>> metadata["NWBFile"]["session_start_time"] = datetime.now(tz=ZoneInfo("US/Pacific"))
>>> editable_metadata = load_dict_from_file(editable_metadata_path)
20 changes: 20 additions & 0 deletions src/neuroconv/basedatainterface.py
@@ -24,6 +24,7 @@
    load_dict_from_file,
)
from .utils.dict import DeepDict
from .utils.json_schema import _NWBSourceDataEncoder


class BaseDataInterface(ABC):
@@ -39,11 +40,30 @@ def get_source_schema(cls) -> dict:
"""Infer the JSON schema for the source_data from the method signature (annotation typing)."""
return get_json_schema_from_method_signature(cls, exclude=["source_data"])

    @classmethod
    def validate_source(cls, source_data: dict, verbose: bool = False):
        """Validate source_data against the interface's source schema."""
        cls._validate_source_data(source_data=source_data, verbose=verbose)

    @classmethod
    def _validate_source_data(cls, source_data: dict, verbose: bool = False):
        encoder = _NWBSourceDataEncoder()
        # The encoder produces a serialized object, so we deserialize it back for validation
        serialized_source_data = encoder.encode(source_data)
        decoded_source_data = json.loads(serialized_source_data)
        source_schema = cls.get_source_schema()
        validate(instance=decoded_source_data, schema=source_schema)
        if verbose:
            print("Source data is valid!")

    @validate_call
    def __init__(self, verbose: bool = False, **source_data):
        self.verbose = verbose
        self.source_data = source_data

        self._validate_source_data(source_data=source_data, verbose=verbose)

    def get_metadata_schema(self) -> dict:
        """Retrieve JSON schema for metadata."""
        metadata_schema = load_dict_from_file(Path(__file__).parent / "schemas" / "base_metadata_schema.json")
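The validation round-trip in `_validate_source_data` works by serializing values that `jsonschema` cannot handle natively (e.g. `pathlib.Path`) and decoding them back before validating. A minimal, self-contained sketch of the same idea; the encoder class and schema below are illustrative stand-ins, not the `_NWBSourceDataEncoder` implementation:

```python
import json
from pathlib import Path

from jsonschema import validate


class PathAwareEncoder(json.JSONEncoder):
    """Hypothetical stand-in for _NWBSourceDataEncoder: serialize Path objects as strings."""

    def default(self, obj):
        if isinstance(obj, Path):
            return str(obj)
        return super().default(obj)


# Illustrative schema, as would be inferred from an __init__ signature.
source_schema = {
    "type": "object",
    "properties": {"folder_path": {"type": "string"}, "verbose": {"type": "boolean"}},
    "additionalProperties": False,
}

source_data = dict(folder_path=Path("session_folder"), verbose=True)

# Encode (Path -> str), decode back to a plain dict, then validate against the schema.
decoded_source_data = json.loads(json.dumps(source_data, cls=PathAwareEncoder))
validate(instance=decoded_source_data, schema=source_schema)
print("Source data is valid!")
```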
10 changes: 10 additions & 0 deletions src/neuroconv/baseextractorinterface.py
@@ -29,3 +29,13 @@ def get_extractor(cls):
        )
        cls.Extractor = extractor
        return extractor

    def __init__(self, **source_data):
        super().__init__(**source_data)
        self.extractor = self.get_extractor()
        self.extractor_kwargs = self._source_data_to_extractor_kwargs(source_data)
        self._extractor_instance = self.extractor(**self.extractor_kwargs)

    def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
        """This function maps the source_data to the kwargs required to initialize the Extractor."""
        return source_data
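Subclasses override this hook to decouple the user-facing `source_data`, which must pass schema validation, from the keyword arguments their extractor actually needs. A hedged sketch of the pattern, mirroring the interface changes below (the class name is illustrative; the import path follows this commit's file layout):

```python
from neuroconv.datainterfaces.ecephys.baserecordingextractorinterface import (
    BaseRecordingExtractorInterface,
)


class MyStreamRecordingInterface(BaseRecordingExtractorInterface):
    """Illustrative subclass: the extractor needs a stream_id the user never supplies."""

    stream_id = "RAW"  # fixed, implementation-level detail

    def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
        # Start from the validated user inputs, then inject extractor-only kwargs.
        extractor_kwargs = source_data.copy()
        extractor_kwargs["stream_id"] = self.stream_id
        return extractor_kwargs
```

This keeps `stream_id` out of the source schema, so users are never asked for, or validated against, an argument they cannot meaningfully set.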
@@ -13,13 +13,19 @@ class AlphaOmegaRecordingInterface(BaseRecordingExtractorInterface):
    display_name = "AlphaOmega Recording"
    associated_suffixes = (".mpx",)
    info = "Interface class for converting AlphaOmega recording data."
    stream_id = "RAW"

    @classmethod
    def get_source_schema(cls) -> dict:
        source_schema = super().get_source_schema()
        source_schema["properties"]["folder_path"]["description"] = "Path to the folder of .mpx files."
        return source_schema

    def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
        extractor_kwargs = source_data.copy()
        extractor_kwargs["stream_id"] = self.stream_id
        return extractor_kwargs

    def __init__(self, folder_path: DirectoryPath, verbose: bool = True, es_key: str = "ElectricalSeries"):
        """
        Load and prepare data for AlphaOmega.
@@ -33,7 +39,7 @@ def __init__(self, folder_path: DirectoryPath, verbose: bool = True, es_key: str
            Default is True.
        es_key: str, default: "ElectricalSeries"
        """
        super().__init__(folder_path=folder_path, stream_id="RAW", verbose=verbose, es_key=es_key)
        super().__init__(folder_path=folder_path, verbose=verbose, es_key=es_key)

    def get_metadata(self) -> dict:
        metadata = super().get_metadata()
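With the schema and the hook in place, source arguments can be checked up front, before any extractor is constructed. A usage sketch, assuming the interface is exposed under `neuroconv.datainterfaces` and that the folder exists:

```python
from neuroconv.datainterfaces import AlphaOmegaRecordingInterface

# Hypothetical session folder containing .mpx files.
source_data = dict(folder_path="data/alpha_omega_session")

# Class-level check against the inferred source schema; no extractor is instantiated.
AlphaOmegaRecordingInterface.validate_source(source_data=source_data, verbose=True)
# prints: Source data is valid!
```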
23 changes: 20 additions & 3 deletions src/neuroconv/datainterfaces/ecephys/axona/axonadatainterface.py
@@ -30,6 +30,12 @@ def get_source_schema(cls) -> dict:
source_schema["properties"]["file_path"]["description"] = "Path to .bin file."
return source_schema

def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
extractor_kwargs = source_data.copy()
extractor_kwargs["all_annotations"] = True

return extractor_kwargs

    def __init__(self, file_path: FilePath, verbose: bool = True, es_key: str = "ElectricalSeries"):
        """
@@ -41,7 +47,7 @@ def __init__(self, file_path: FilePath, verbose: bool = True, es_key: str = "Ele
        es_key: str, default: "ElectricalSeries"
        """

        super().__init__(file_path=file_path, all_annotations=True, verbose=verbose, es_key=es_key)
        super().__init__(file_path=file_path, verbose=verbose, es_key=es_key)
        self.source_data = dict(file_path=file_path, verbose=verbose)
        self.metadata_in_set_file = self.recording_extractor.neo_reader.file_parameters["set"]["file_header"]

@@ -134,6 +140,7 @@ def __init__(self, file_path: FilePath, noise_std: float = 3.5):
class AxonaLFPDataInterface(BaseLFPExtractorInterface):
    """
    Primary data interface class for converting Axona LFP data.
    Note that this interface is not lazy and will load all data into memory.
    """

    display_name = "Axona LFP"
@@ -151,10 +158,20 @@ def get_source_schema(cls) -> dict:
            additionalProperties=False,
        )

    def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
        extractor_kwargs = source_data.copy()
        extractor_kwargs.pop("file_path")
        extractor_kwargs["traces_list"] = self.traces_list
        extractor_kwargs["sampling_frequency"] = self.sampling_frequency

        return extractor_kwargs

    def __init__(self, file_path: FilePath):
        data = read_all_eeg_file_lfp_data(file_path).T
        sampling_frequency = get_eeg_sampling_frequency(file_path)
        super().__init__(traces_list=[data], sampling_frequency=sampling_frequency)
        self.traces_list = [data]
        self.sampling_frequency = get_eeg_sampling_frequency(file_path)
        super().__init__(file_path=file_path)

        self.source_data = dict(file_path=file_path)

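Note the ordering constraint this pattern introduces: the base `__init__` calls `_source_data_to_extractor_kwargs` while constructing the extractor, so any attribute the hook reads must be assigned before `super().__init__()`. A condensed sketch of the constraint, reusing the helper names from the diff (the subclass name is illustrative, and the helpers are assumed importable from the Axona interface module):

```python
# Assumed available from the Axona interface module, as used in the diff:
# BaseLFPExtractorInterface, read_all_eeg_file_lfp_data, get_eeg_sampling_frequency


class InMemoryLFPInterface(BaseLFPExtractorInterface):
    """Illustrative: file_path is validated but never reaches the extractor."""

    def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
        extractor_kwargs = source_data.copy()
        extractor_kwargs.pop("file_path")  # validated input, not an extractor argument
        extractor_kwargs["traces_list"] = self.traces_list  # read during super().__init__()
        extractor_kwargs["sampling_frequency"] = self.sampling_frequency
        return extractor_kwargs

    def __init__(self, file_path):
        # Derived values must exist before super().__init__() triggers extractor construction.
        self.traces_list = [read_all_eeg_file_lfp_data(file_path).T]
        self.sampling_frequency = get_eeg_sampling_frequency(file_path)
        super().__init__(file_path=file_path)
```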
@@ -32,8 +32,9 @@ def __init__(self, verbose: bool = True, es_key: str = "ElectricalSeries", **sou
            The key-value pairs of extractor-specific arguments.
        """

        super().__init__(**source_data)
        self.recording_extractor = self.get_extractor()(**source_data)
        self.recording_extractor = self._extractor_instance
        property_names = self.recording_extractor.get_property_keys()
        # TODO remove this and go and change all the uses of channel_name once spikeinterface > 0.101.0 is released
        if "channel_name" not in property_names and "channel_names" in property_names:
@@ -118,7 +119,11 @@ def get_original_timestamps(self) -> Union[np.ndarray, list[np.ndarray]]:
            The timestamps for the data stream; if the recording has multiple segments, then a list of timestamps is returned.
        """
        new_recording = self.get_extractor()(
            **{keyword: value for keyword, value in self.source_data.items() if keyword not in ["verbose", "es_key"]}
            **{
                keyword: value
                for keyword, value in self.extractor_kwargs.items()
                if keyword not in ["verbose", "es_key"]
            }
        )
        if self._number_of_segments == 1:
            return new_recording.get_times()
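The switch from `self.source_data` to `self.extractor_kwargs` matters here: kwargs injected by `_source_data_to_extractor_kwargs`, such as a fixed `stream_id`, exist only in the latter, so rebuilding from `source_data` could construct a different extractor. A sketch of the rebuild, assuming an already-constructed `interface` object:

```python
# Rebuild a fresh, unmodified extractor from the kwargs the original extractor actually saw.
fresh_kwargs = {
    keyword: value
    for keyword, value in interface.extractor_kwargs.items()
    if keyword not in ["verbose", "es_key"]  # interface-level options, not extractor arguments
}
new_recording = interface.get_extractor()(**fresh_kwargs)
original_timestamps = new_recording.get_times()  # timestamps before any alignment was applied
```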
@@ -25,6 +25,12 @@ def get_source_schema(cls):
] = "Path to the Blackrock file with suffix being .ns1, .ns2, .ns3, .ns4m .ns4, or .ns6."
return source_schema

def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
extractor_kwargs = source_data.copy()
extractor_kwargs["stream_id"] = self.stream_id

return extractor_kwargs

    def __init__(
        self,
        file_path: FilePath,
@@ -55,7 +61,8 @@ def __init__(
            nsx_to_load = int(file_path.suffix[-1])
        self.file_path = file_path

        super().__init__(file_path=file_path, stream_id=str(nsx_to_load), verbose=verbose, es_key=es_key)
        self.stream_id = str(nsx_to_load)
        super().__init__(file_path=file_path, verbose=verbose, es_key=es_key)

    def get_metadata(self) -> dict:
        metadata = super().get_metadata()
@@ -83,7 +90,7 @@ def get_source_schema(cls) -> dict:
metadata_schema["properties"]["file_path"].update(description="Path to Blackrock .nev file.")
return metadata_schema

    def __init__(self, file_path: FilePath, sampling_frequency: float = None, verbose: bool = True):
    def __init__(self, file_path: FilePath, sampling_frequency: Optional[float] = None, verbose: bool = True):
        """
        Parameters
        ----------
@@ -419,6 +419,12 @@ class CellExplorerSortingInterface(BaseSortingExtractorInterface):
    associated_suffixes = (".mat", ".sessionInfo", ".spikes", ".cellinfo")
    info = "Interface for CellExplorer sorting data."

    def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
        extractor_kwargs = source_data.copy()
        extractor_kwargs["sampling_frequency"] = self.sampling_frequency

        return extractor_kwargs

    def __init__(self, file_path: FilePath, verbose: bool = True):
        """
        Initialize read of Cell Explorer file.
@@ -454,7 +460,8 @@ def __init__(self, file_path: FilePath, verbose: bool = True):
if "extracellular" in session_data.keys():
sampling_frequency = session_data["extracellular"].get("sr", None)

super().__init__(file_path=file_path, sampling_frequency=sampling_frequency, verbose=verbose)
self.sampling_frequency = sampling_frequency
super().__init__(file_path=file_path, verbose=verbose)
self.source_data = dict(file_path=file_path)
spikes_matfile_path = Path(file_path)

Expand Down
@@ -25,6 +25,13 @@ def get_source_schema(cls) -> dict:
source_schema["properties"]["file_path"]["description"] = "Path to either a .rhd or a .rhs file"
return source_schema

def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
extractor_kwargs = source_data.copy()
extractor_kwargs["all_annotations"] = True
extractor_kwargs["stream_id"] = self.stream_id

return extractor_kwargs

    def __init__(
        self,
        file_path: FilePath,
@@ -52,10 +59,8 @@ def __init__(

        init_kwargs = dict(
            file_path=self.file_path,
            stream_id=self.stream_id,
            verbose=verbose,
            es_key=es_key,
            all_annotations=True,
            ignore_integrity_checks=ignore_integrity_checks,
        )

Expand Down
@@ -32,6 +32,12 @@ def get_source_schema(cls) -> dict:
        ] = 'Path to Neuralynx directory containing ".ncs", ".nse", ".ntt", or ".nev" files.'
        return source_schema

    def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
        extractor_kwargs = source_data.copy()
        extractor_kwargs["all_annotations"] = True

        return extractor_kwargs

    def __init__(
        self,
        folder_path: DirectoryPath,
@@ -53,7 +59,10 @@ def __init__(
        es_key : str, default: "ElectricalSeries"
        """
        super().__init__(
            folder_path=folder_path, stream_name=stream_name, verbose=verbose, all_annotations=True, es_key=es_key
            folder_path=folder_path,
            stream_name=stream_name,
            verbose=verbose,
            es_key=es_key,
        )

        # convert properties of object dtype (e.g. datetime) and bool as these are not supported by nwb
@@ -103,7 +112,7 @@ class NeuralynxSortingInterface(BaseSortingExtractorInterface):
    associated_suffixes = (".nse", ".ntt", ".nev")
    info = "Interface for Neuralynx sorting data."

    def __init__(self, folder_path: DirectoryPath, sampling_frequency: float = None, verbose: bool = True):
    def __init__(self, folder_path: DirectoryPath, sampling_frequency: Optional[float] = None, verbose: bool = True):
        """_summary_
        Parameters
@@ -82,6 +82,13 @@ def get_source_schema(cls) -> dict:
source_schema["properties"]["file_path"]["description"] = "Path to the .pl2 file."
return source_schema

def _source_data_to_extractor_kwargs(self, source_data: dict) -> dict:
extractor_kwargs = source_data.copy()
extractor_kwargs["all_annotations"] = True
extractor_kwargs["stream_id"] = self.stream_id

return extractor_kwargs

    @validate_call
    def __init__(self, file_path: FilePath, verbose: bool = True, es_key: str = "ElectricalSeries"):
        """
@@ -101,16 +108,14 @@ def __init__(self, file_path: FilePath, verbose: bool = True, es_key: str = "Ele

        neo_version = Version(neo.__version__)
        if neo_version <= Version("0.13.3"):
            stream_id = "3"
            self.stream_id = "3"
        else:
            stream_id = "WB"
            self.stream_id = "WB"
        assert Path(file_path).is_file(), f"Plexon file not found in: {file_path}"
        super().__init__(
            file_path=file_path,
            verbose=verbose,
            es_key=es_key,
            stream_id=stream_id,
            all_annotations=True,
        )

    def get_metadata(self) -> DeepDict: