Generalize data modules and batch types to Hugging Face datasets (#203)

MaximilienLC · web-flow · commit ce34183394be · 2024-01-23T22:09:20.000-05:00
diff --git a/cneuromax/fitting/deeplearning/datamodule/base.py b/cneuromax/fitting/deeplearning/datamodule/base.py
@@ -4,6 +4,7 @@
 from typing import Annotated as An
 from typing import final
 
+from datasets import Dataset as HFDataset
 from lightning.pytorch import LightningDataModule
 from torch import Tensor
 from torch.utils.data import DataLoader, Dataset
@@ -13,7 +14,10 @@
 
 @dataclass
 class Datasets:
-    """Holds stage-specific :class:`~torch.utils.data.Dataset` objects.
+    """Holds phase-specific :class:`~torch.utils.data.Dataset` objects.
+
+    The word ``phase`` is used to avoid overloading the :mod:`lightning`
+    ``stage`` terminology (``fit``, ``validate`` and ``test``).
 
     Args:
         train: Training dataset.
@@ -22,10 +26,10 @@ class Datasets:
         predict: Prediction dataset.
     """
 
-    train: Dataset[Tensor] | None = None
-    val: Dataset[Tensor] | None = None
-    test: Dataset[Tensor] | None = None
-    predict: Dataset[Tensor] | None = None
+    train: Dataset[Tensor] | HFDataset | None = None
+    val: Dataset[Tensor] | HFDataset | None = None
+    test: Dataset[Tensor] | HFDataset | None = None
+    predict: Dataset[Tensor] | HFDataset | None = None
 
 
 @dataclass
@@ -44,16 +48,18 @@ class BaseDataModuleConfig:
 class BaseDataModule(LightningDataModule, metaclass=ABCMeta):
     """Base :mod:`lightning` ``DataModule``.
 
-    With ``<stage>`` being any of ``train``, ``val``, ``test`` or
+    With ``<phase>`` being any of ``train``, ``val``, ``test`` or
     ``predict``, subclasses need to properly define the
-    ``datasets.<stage>`` attribute(s) for each desired stage.
+    ``datasets.<phase>`` attribute(s) for each desired phase.
 
     Args:
         config: See :class:`BaseDataModuleConfig`.
 
     Attributes:
         config (:class:`BaseDataModuleConfig`)
         datasets (:class:`Datasets`)
+        collate_fn (``callable``): See \
+            :paramref:`torch.utils.data.DataLoader.collate_fn`.
         pin_memory (``bool``): Whether to copy tensors into device\
             pinned memory before returning them (is set to ``True`` by\
             default if :paramref:`~BaseDataModuleConfig.device` is\
@@ -72,6 +78,7 @@ def __init__(self: "BaseDataModule", config: BaseDataModuleConfig) -> None:
         super().__init__()
         self.config = config
         self.datasets = Datasets()
+        self.collate_fn = None
         self.pin_memory = self.config.device == "gpu"
         self.per_device_batch_size = 1
         self.per_device_num_workers = 0
@@ -108,7 +115,7 @@ def state_dict(self: "BaseDataModule") -> dict[str, int]:
     @final
     def x_dataloader(
         self: "BaseDataModule",
-        dataset: Dataset[Tensor] | None,
+        dataset: Dataset[Tensor] | HFDataset | None,
         *,
         shuffle: bool = True,
     ) -> DataLoader[Tensor]:
@@ -134,6 +141,7 @@ def x_dataloader(
             batch_size=self.per_device_batch_size,
             shuffle=shuffle,
             num_workers=self.per_device_num_workers,
+            collate_fn=self.collate_fn,
             pin_memory=self.pin_memory,
         )
 
diff --git a/cneuromax/fitting/deeplearning/litmodule/base.py b/cneuromax/fitting/deeplearning/litmodule/base.py
@@ -10,6 +10,7 @@
 from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LRScheduler
 
+from cneuromax.fitting.deeplearning.utils.type import Batch_type
 from cneuromax.utils.beartype import one_of
 
 
@@ -86,9 +87,7 @@ def __init__(
     @final
     def stage_step(
         self: "BaseLitModule",
-        batch: Num[Tensor, " ..."]
-        | tuple[Num[Tensor, " ..."], ...]
-        | list[Num[Tensor, " ..."]],
+        batch: Batch_type,
         stage: An[str, one_of("train", "val", "test", "predict")],
     ) -> Num[Tensor, " ..."]:
         """Generic stage wrapper around the :meth:`step` method.
@@ -105,17 +104,15 @@ def stage_step(
             The loss value(s).
         """
         if isinstance(batch, list):
-            tupled_batch: tuple[Num[Tensor, " ..."], ...] = tuple(batch)
-        loss: Num[Tensor, " ..."] = self.step(tupled_batch, stage)
+            batch = tuple(batch)
+        loss: Num[Tensor, " ..."] = self.step(batch, stage)
         self.log(name=f"{stage}/loss", value=loss)
         return loss
 
     @final
     def training_step(
         self: "BaseLitModule",
-        batch: Num[Tensor, " ..."]
-        | tuple[Num[Tensor, " ..."], ...]
-        | list[Num[Tensor, " ..."]],
+        batch: Batch_type,
     ) -> Num[Tensor, " ..."]:
         """Calls :meth:`stage_step` with argument ``stage="train"``.
 
@@ -130,9 +127,7 @@ def training_step(
     @final
     def validation_step(
         self: "BaseLitModule",
-        batch: Num[Tensor, " ..."]
-        | tuple[Num[Tensor, " ..."], ...]
-        | list[Num[Tensor, " ..."]],
+        batch: Batch_type,
         # :paramref:`*args` & :paramref:`**kwargs` type annotations
         # cannot be more specific because of
         # :meth:`LightningModule.validation_step`\'s signature.
@@ -154,9 +149,7 @@ def validation_step(
     @final
     def test_step(
         self: "BaseLitModule",
-        batch: Num[Tensor, " ..."]
-        | tuple[Num[Tensor, " ..."], ...]
-        | list[Num[Tensor, " ..."]],
+        batch: Batch_type,
     ) -> Num[Tensor, " ..."]:
         """Calls :meth:`stage_step` with argument ``stage="test"``.
 
diff --git a/cneuromax/fitting/deeplearning/utils/type.py b/cneuromax/fitting/deeplearning/utils/type.py
@@ -0,0 +1,11 @@
+"""Typing utilities."""
+
+from jaxtyping import Num
+from torch import Tensor
+
+Batch_type = (
+    Num[Tensor, " ..."]
+    | tuple[Num[Tensor, " ..."], ...]
+    | list[Num[Tensor, " ..."]]
+    | dict[str, Num[Tensor, " ..."]]
+)