fix: tests and notebooks after rebase
djaniak committed Apr 8, 2022
1 parent 2396b36 commit 38c1438
Showing 3 changed files with 100 additions and 91 deletions.
45 changes: 16 additions & 29 deletions tests/test_lightning_classification_pipeline.py
@@ -21,13 +21,7 @@ def tmp_path_module(tmpdir_factory: TempdirFactory) -> Path:
 
 
 @pytest.fixture(scope="module")
-def pipeline_kwargs() -> Dict[str, Any]:
-    return {"embedding_name_or_path": "allegro/herbert-base-cased"}
-
-
-@pytest.fixture(scope="module")
-def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
-    path = str(tmp_path_module)
+def dataset_kwargs(tmp_path_module: Path) -> Dict[str, Any]:
     pipeline = HuggingFacePreprocessingPipeline(
         dataset_name="clarin-pl/polemo2-official",
         load_dataset_kwargs={
@@ -36,7 +30,7 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
             "test_domains": ["hotels", "medicine"],
             "text_cfg": "text",
         },
-        persist_path=path,
+        persist_path=tmp_path_module.name,
         sample_missing_splits=None,
         ignore_test_subset=False,
         downsample_splits=(0.01, 0.01, 0.05),
@@ -45,7 +39,7 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
     pipeline.run()
 
     return {
-        "dataset_name_or_path": path,
+        "dataset_name_or_path": tmp_path_module.name,
         "input_column_name": ["text"],
         "target_column_name": "target",
     }
@@ -88,32 +82,25 @@ def config() -> LightningAdvancedConfig:
 def lightning_classification_pipeline(
     dataset_kwargs: Dict[str, Any],
     config: LightningAdvancedConfig,
-    result_path: "TemporaryDirectory[str]",
-) -> Tuple[
-    LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]],
-    "TemporaryDirectory[str]",
-]:
-    return (
-        LightningClassificationPipeline(
-            embedding_name_or_path="allegro/herbert-base-cased",
-            output_path=result_path.name,
-            config=config,
-            devices="auto",
-            accelerator="cpu",
-            **dataset_kwargs,
-        ),
-        result_path,
+    tmp_path_module: Path,
+) -> LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]]:
+    return LightningClassificationPipeline(
+        embedding_name_or_path="allegro/herbert-base-cased",
+        output_path=tmp_path_module.name,
+        config=config,
+        devices="auto",
+        accelerator="cpu",
+        **dataset_kwargs,
     )
 
 
 def test_lightning_classification_pipeline(
-    lightning_classification_pipeline: Tuple[
-        LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]],
-        "TemporaryDirectory[str]",
-    ],
+    lightning_classification_pipeline: LightningPipeline[
+        datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]
+    ]
 ) -> None:
     pl.seed_everything(441, workers=True)
-    pipeline, path = lightning_classification_pipeline
+    pipeline = lightning_classification_pipeline
     result = pipeline.run()
     np.testing.assert_almost_equal(
         result["accuracy"]["accuracy"], 0.3783783, decimal=pytest.decimal
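The net effect of this file's change: the per-argument kwargs fixtures are folded into a single LightningAdvancedConfig object, and trainer placement options (devices, accelerator) move to the pipeline constructor. Below is a minimal sketch of the refactored call site, assembled only from names that appear in this commit; the output and dataset paths are hypothetical, and the dataset directory is assumed to hold data persisted by HuggingFacePreprocessingPipeline as in the dataset_kwargs fixture.

# Sketch only: mirrors the refactored fixtures above. "./results" and
# "./polemo2-sample" are hypothetical paths; the dataset directory is assumed
# to contain a dataset persisted by HuggingFacePreprocessingPipeline.
from embeddings.config.lightning_config import LightningAdvancedConfig
from embeddings.pipeline.lightning_classification import LightningClassificationPipeline

config = LightningAdvancedConfig(
    finetune_last_n_layers=0,
    task_train_kwargs={"max_epochs": 1, "deterministic": True},
    task_model_kwargs={
        "learning_rate": 5e-4,
        "train_batch_size": 32,
        "eval_batch_size": 32,
        "use_scheduler": False,
        "optimizer": "AdamW",
        "adam_epsilon": 1e-8,
        "warmup_steps": 100,
        "weight_decay": 0.0,
    },
    datamodule_kwargs={"max_seq_length": 64},
    early_stopping_kwargs={"monitor": "val/Loss", "mode": "min", "patience": 3},
    tokenizer_kwargs={},
    batch_encoding_kwargs={},
    dataloader_kwargs={},
    model_config_kwargs={},
)

pipeline = LightningClassificationPipeline(
    embedding_name_or_path="allegro/herbert-base-cased",
    output_path="./results",                  # hypothetical output directory
    config=config,
    devices="auto",
    accelerator="cpu",
    dataset_name_or_path="./polemo2-sample",  # hypothetical persisted dataset
    input_column_name=["text"],
    target_column_name="target",
)
result = pipeline.run()  # dict of metrics and prediction data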
105 changes: 51 additions & 54 deletions tests/test_lightning_inference.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+from tempfile import TemporaryDirectory
 from typing import Any, Dict, Tuple
 
 import datasets
@@ -8,6 +9,7 @@
 import torch
 from _pytest.tmpdir import TempdirFactory
 
+from embeddings.config.lightning_config import LightningAdvancedConfig
 from embeddings.pipeline.hf_preprocessing_pipeline import HuggingFacePreprocessingPipeline
 from embeddings.pipeline.lightning_classification import LightningClassificationPipeline
 from embeddings.pipeline.lightning_pipeline import LightningPipeline
@@ -21,16 +23,7 @@ def tmp_path_module(tmpdir_factory: TempdirFactory) -> Path:
 
 
 @pytest.fixture(scope="module")
-def pipeline_kwargs() -> Dict[str, Any]:
-    return {
-        "embedding_name_or_path": "allegro/herbert-base-cased",
-        "finetune_last_n_layers": 0,
-    }
-
-
-@pytest.fixture(scope="module")
-def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
-    path = str(tmp_path_module)
+def dataset_kwargs(tmp_path_module: Path) -> Dict[str, Any]:
     pipeline = HuggingFacePreprocessingPipeline(
         dataset_name="clarin-pl/polemo2-official",
         load_dataset_kwargs={
@@ -39,7 +32,7 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
             "test_domains": ["hotels", "medicine"],
             "text_cfg": "text",
         },
-        persist_path=path,
+        persist_path=tmp_path_module.name,
         sample_missing_splits=None,
         ignore_test_subset=False,
         downsample_splits=(0.01, 0.01, 0.05),
@@ -48,79 +41,83 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
     pipeline.run()
 
     return {
-        "dataset_name_or_path": path,
+        "dataset_name_or_path": tmp_path_module.name,
         "input_column_name": ["text"],
         "target_column_name": "target",
     }
 
 
 @pytest.fixture(scope="module")
-def task_train_kwargs() -> Dict[str, Any]:
-    return {
-        "max_epochs": 1,
-        "devices": "auto",
-        "accelerator": "cpu",
-        "deterministic": True,
-    }
-
-
-@pytest.fixture(scope="module")
-def task_model_kwargs() -> Dict[str, Any]:
-    return {"learning_rate": 5e-4, "use_scheduler": False}
-
-
-@pytest.fixture(scope="module")
-def datamodule_kwargs() -> Dict[str, Any]:
-    return {"num_workers": 0}
+def config() -> LightningAdvancedConfig:
+    return LightningAdvancedConfig(
+        finetune_last_n_layers=0,
+        task_train_kwargs={
+            "max_epochs": 1,
+            "deterministic": True,
+        },
+        task_model_kwargs={
+            "learning_rate": 5e-4,
+            "train_batch_size": 32,
+            "eval_batch_size": 32,
+            "use_scheduler": False,
+            "optimizer": "AdamW",
+            "adam_epsilon": 1e-8,
+            "warmup_steps": 100,
+            "weight_decay": 0.0,
+        },
+        datamodule_kwargs={
+            "max_seq_length": 64,
+        },
+        early_stopping_kwargs={
+            "monitor": "val/Loss",
+            "mode": "min",
+            "patience": 3,
+        },
+        tokenizer_kwargs={},
+        batch_encoding_kwargs={},
+        dataloader_kwargs={},
+        model_config_kwargs={},
+    )
 
 
 @pytest.fixture(scope="module")
 def lightning_classification_pipeline(
-    pipeline_kwargs: Dict[str, Any],
     dataset_kwargs: Dict[str, Any],
-    datamodule_kwargs: Dict[str, Any],
-    task_train_kwargs: Dict[str, Any],
-    task_model_kwargs: Dict[str, Any],
-    result_path: Path,
-) -> Tuple[LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]], Path]:
-    return (
-        LightningClassificationPipeline(
-            output_path=result_path.name,
-            **pipeline_kwargs,
-            **dataset_kwargs,
-            datamodule_kwargs=datamodule_kwargs,
-            task_train_kwargs=task_train_kwargs,
-            task_model_kwargs=task_model_kwargs,
-        ),
-        result_path,
+    config: LightningAdvancedConfig,
+    tmp_path_module: Path,
+) -> LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]]:
+    return LightningClassificationPipeline(
+        embedding_name_or_path="allegro/herbert-base-cased",
+        output_path=tmp_path_module.name,
+        config=config,
+        devices="auto",
+        accelerator="cpu",
+        **dataset_kwargs,
    )
 
 
 def test_lightning_pipeline_inference(
-    lightning_classification_pipeline: Tuple[
-        LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]],
-        Path,
+    lightning_classification_pipeline: LightningPipeline[
+        datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]
    ],
+    tmp_path_module: "TemporaryDirectory[str]",
 ) -> None:
     pl.seed_everything(441, workers=True)
 
-    pipeline, path = lightning_classification_pipeline
+    pipeline = lightning_classification_pipeline
     results = pipeline.run()
 
-    ckpt_path = Path(path.name) / "checkpoints" / "epoch=0-step=1.ckpt"
+    ckpt_path = Path(tmp_path_module.name) / "checkpoints" / "epoch=0-step=1.ckpt"
     task_from_ckpt = TextClassificationTask.from_checkpoint(
         checkpoint_path=ckpt_path.resolve(),
-        output_path=path.name,
+        output_path=tmp_path_module.name,
     )
 
     model_state_dict = pipeline.model.task.model.model.state_dict()
     model_from_ckpt_state_dict = task_from_ckpt.model.model.state_dict()
 
     assert model_state_dict.keys() == model_from_ckpt_state_dict.keys()
     for k in model_state_dict.keys():
         assert torch.equal(model_state_dict[k], model_from_ckpt_state_dict[k])
 
     test_dataloader = pipeline.datamodule.test_dataloader()
     predictions = task_from_ckpt.predict(test_dataloader, return_names=False)
 
     assert np.array_equal(results["data"]["y_probabilities"], predictions["y_probabilities"])
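The rewritten inference test keeps its original purpose: after a one-epoch run, the task is reloaded from the Lightning checkpoint and must reproduce both the model weights and the test-set probabilities. A condensed sketch of that round-trip follows, continuing from the pipeline sketch after the previous file's diff; the import path for TextClassificationTask is an assumption (the diff uses the name without showing its import), and the checkpoint file name matches a single-epoch, single-step run.

# Sketch only: checkpoint round-trip as exercised by the test above.
# Assumes `pipeline` and `result` from the earlier sketch; the module path
# below is a guess, since the diff does not show the import.
from pathlib import Path

import numpy as np
import torch

from embeddings.task.lightning_task.text_classification import TextClassificationTask  # assumed path

ckpt_path = Path("./results") / "checkpoints" / "epoch=0-step=1.ckpt"
task_from_ckpt = TextClassificationTask.from_checkpoint(
    checkpoint_path=ckpt_path.resolve(),
    output_path="./results",
)

# Restored weights must match the in-memory model parameter by parameter.
state = pipeline.model.task.model.model.state_dict()
state_from_ckpt = task_from_ckpt.model.model.state_dict()
assert all(torch.equal(state[k], state_from_ckpt[k]) for k in state)

# And the restored task must reproduce the original predictions.
predictions = task_from_ckpt.predict(pipeline.datamodule.test_dataloader(), return_names=False)
assert np.array_equal(result["data"]["y_probabilities"], predictions["y_probabilities"])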
41 changes: 33 additions & 8 deletions tutorials/validate_lightning_models_inference.ipynb
@@ -24,6 +24,7 @@
 "from typing import Any, Dict\n",
 "\n",
 "import pytorch_lightning as pl\n",
+"from embeddings.config.lightning_config import LightningAdvancedConfig\n",
 "from embeddings.defaults import DATASET_PATH, RESULTS_PATH\n",
 "from embeddings.model.lightning_module.text_classification import (\n",
 "    TextClassificationModule,\n",
@@ -100,6 +101,35 @@
 "### Train simple downsampled pipeline"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "4cb7ebd4-182c-4797-b5de-a7069313a901",
+"metadata": {},
+"outputs": [],
+"source": [
+"config = LightningAdvancedConfig(\n",
+"    finetune_last_n_layers=0,\n",
+"    task_train_kwargs={\"max_epochs\": 1, \"deterministic\": True,},\n",
+"    task_model_kwargs={\n",
+"        \"learning_rate\": 5e-4,\n",
+"        \"train_batch_size\": 32,\n",
+"        \"eval_batch_size\": 32,\n",
+"        \"use_scheduler\": False,\n",
+"        \"optimizer\": \"AdamW\",\n",
+"        \"adam_epsilon\": 1e-8,\n",
+"        \"warmup_steps\": 100,\n",
+"        \"weight_decay\": 0.0,\n",
+"    },\n",
+"    datamodule_kwargs={\"max_seq_length\": 64,},\n",
+"    early_stopping_kwargs={\"monitor\": \"val/Loss\", \"mode\": \"min\", \"patience\": 3,},\n",
+"    tokenizer_kwargs={},\n",
+"    batch_encoding_kwargs={},\n",
+"    dataloader_kwargs={},\n",
+"    model_config_kwargs={},\n",
+")"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -112,14 +142,9 @@
 "pipeline = LightningClassificationPipeline(\n",
 "    embedding_name_or_path=embedding_name_or_path,\n",
 "    output_path=output_path,\n",
-"    finetune_last_n_layers=0,\n",
-"    datamodule_kwargs={\"max_seq_length\": 64,},\n",
-"    task_train_kwargs={\n",
-"        \"max_epochs\": 1,\n",
-"        \"devices\": \"auto\",\n",
-"        \"accelerator\": \"cpu\",\n",
-"        \"deterministic\": True,\n",
-"    },\n",
+"    config=config,\n",
+"    devices=\"auto\",\n",
+"    accelerator=\"cpu\",\n",
 "    **dataset_kwargs\n",
 ")\n",
 "result = pipeline.run()"
