From 38c1438583e068b47ff58efbfbebeb4122ae8587 Mon Sep 17 00:00:00 2001
From: djaniak
Date: Fri, 8 Apr 2022 16:33:27 +0200
Subject: [PATCH] fix: tests and notebooks after rebase

---
 .../test_lightning_classification_pipeline.py |  45 +++-----
 tests/test_lightning_inference.py             | 105 +++++++++---------
 .../validate_lightning_models_inference.ipynb |  41 +++++--
 3 files changed, 100 insertions(+), 91 deletions(-)

diff --git a/tests/test_lightning_classification_pipeline.py b/tests/test_lightning_classification_pipeline.py
index 5dc064f7..eb43122a 100644
--- a/tests/test_lightning_classification_pipeline.py
+++ b/tests/test_lightning_classification_pipeline.py
@@ -21,13 +21,7 @@ def tmp_path_module(tmpdir_factory: TempdirFactory) -> Path:
 
 
 @pytest.fixture(scope="module")
-def pipeline_kwargs() -> Dict[str, Any]:
-    return {"embedding_name_or_path": "allegro/herbert-base-cased"}
-
-
-@pytest.fixture(scope="module")
-def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
-    path = str(tmp_path_module)
+def dataset_kwargs(tmp_path_module: Path) -> Dict[str, Any]:
     pipeline = HuggingFacePreprocessingPipeline(
         dataset_name="clarin-pl/polemo2-official",
         load_dataset_kwargs={
@@ -36,7 +30,7 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
             "train_domains": ["hotels", "medicine"],
             "test_domains": ["hotels", "medicine"],
             "text_cfg": "text",
         },
-        persist_path=path,
+        persist_path=str(tmp_path_module),
         sample_missing_splits=None,
         ignore_test_subset=False,
         downsample_splits=(0.01, 0.01, 0.05),
@@ -45,7 +39,7 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
     pipeline.run()
 
     return {
-        "dataset_name_or_path": path,
+        "dataset_name_or_path": str(tmp_path_module),
         "input_column_name": ["text"],
         "target_column_name": "target",
     }
@@ -88,32 +82,25 @@ def config() -> LightningAdvancedConfig:
 def lightning_classification_pipeline(
     dataset_kwargs: Dict[str, Any],
     config: LightningAdvancedConfig,
-    result_path: "TemporaryDirectory[str]",
-) -> Tuple[
-    LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]],
-    "TemporaryDirectory[str]",
-]:
-    return (
-        LightningClassificationPipeline(
-            embedding_name_or_path="allegro/herbert-base-cased",
-            output_path=result_path.name,
-            config=config,
-            devices="auto",
-            accelerator="cpu",
-            **dataset_kwargs,
-        ),
-        result_path,
+    tmp_path_module: Path,
+) -> LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]]:
+    return LightningClassificationPipeline(
+        embedding_name_or_path="allegro/herbert-base-cased",
+        output_path=str(tmp_path_module),
+        config=config,
+        devices="auto",
+        accelerator="cpu",
+        **dataset_kwargs,
     )
 
 
 def test_lightning_classification_pipeline(
-    lightning_classification_pipeline: Tuple[
-        LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]],
-        "TemporaryDirectory[str]",
+    lightning_classification_pipeline: LightningPipeline[
+        datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]
     ],
 ) -> None:
     pl.seed_everything(441, workers=True)
-    pipeline, path = lightning_classification_pipeline
+    pipeline = lightning_classification_pipeline
     result = pipeline.run()
     np.testing.assert_almost_equal(
         result["accuracy"]["accuracy"], 0.3783783, decimal=pytest.decimal
diff --git a/tests/test_lightning_inference.py b/tests/test_lightning_inference.py
index 1c1e10af..aaa5fa4c 100644
--- a/tests/test_lightning_inference.py
+++ b/tests/test_lightning_inference.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+from tempfile import TemporaryDirectory
 from typing import Any, Dict, Tuple
 
 import datasets
@@ -8,6 +9,7 @@
 import torch
 from _pytest.tmpdir import TempdirFactory
 
+from embeddings.config.lightning_config import LightningAdvancedConfig
 from embeddings.pipeline.hf_preprocessing_pipeline import HuggingFacePreprocessingPipeline
 from embeddings.pipeline.lightning_classification import LightningClassificationPipeline
 from embeddings.pipeline.lightning_pipeline import LightningPipeline
@@ -21,16 +23,7 @@ def tmp_path_module(tmpdir_factory: TempdirFactory) -> Path:
 
 
 @pytest.fixture(scope="module")
-def pipeline_kwargs() -> Dict[str, Any]:
-    return {
-        "embedding_name_or_path": "allegro/herbert-base-cased",
-        "finetune_last_n_layers": 0,
-    }
-
-
-@pytest.fixture(scope="module")
-def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
-    path = str(tmp_path_module)
+def dataset_kwargs(tmp_path_module: Path) -> Dict[str, Any]:
     pipeline = HuggingFacePreprocessingPipeline(
         dataset_name="clarin-pl/polemo2-official",
         load_dataset_kwargs={
@@ -39,7 +32,7 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
             "train_domains": ["hotels", "medicine"],
             "test_domains": ["hotels", "medicine"],
             "text_cfg": "text",
         },
-        persist_path=path,
+        persist_path=str(tmp_path_module),
         sample_missing_splits=None,
         ignore_test_subset=False,
         downsample_splits=(0.01, 0.01, 0.05),
@@ -48,79 +41,83 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
     pipeline.run()
 
     return {
-        "dataset_name_or_path": path,
+        "dataset_name_or_path": str(tmp_path_module),
         "input_column_name": ["text"],
         "target_column_name": "target",
     }
 
 
 @pytest.fixture(scope="module")
-def task_train_kwargs() -> Dict[str, Any]:
-    return {
-        "max_epochs": 1,
-        "devices": "auto",
-        "accelerator": "cpu",
-        "deterministic": True,
-    }
-
-
-@pytest.fixture(scope="module")
-def task_model_kwargs() -> Dict[str, Any]:
-    return {"learning_rate": 5e-4, "use_scheduler": False}
-
-
-@pytest.fixture(scope="module")
-def datamodule_kwargs() -> Dict[str, Any]:
-    return {"num_workers": 0}
+def config() -> LightningAdvancedConfig:
+    return LightningAdvancedConfig(
+        finetune_last_n_layers=0,
+        task_train_kwargs={
+            "max_epochs": 1,
+            "deterministic": True,
+        },
+        task_model_kwargs={
+            "learning_rate": 5e-4,
+            "train_batch_size": 32,
+            "eval_batch_size": 32,
+            "use_scheduler": False,
+            "optimizer": "AdamW",
+            "adam_epsilon": 1e-8,
+            "warmup_steps": 100,
+            "weight_decay": 0.0,
+        },
+        datamodule_kwargs={
+            "max_seq_length": 64,
+        },
+        early_stopping_kwargs={
+            "monitor": "val/Loss",
+            "mode": "min",
+            "patience": 3,
+        },
+        tokenizer_kwargs={},
+        batch_encoding_kwargs={},
+        dataloader_kwargs={},
+        model_config_kwargs={},
+    )
 
 
 @pytest.fixture(scope="module")
 def lightning_classification_pipeline(
-    pipeline_kwargs: Dict[str, Any],
     dataset_kwargs: Dict[str, Any],
-    datamodule_kwargs: Dict[str, Any],
-    task_train_kwargs: Dict[str, Any],
-    task_model_kwargs: Dict[str, Any],
-    result_path: Path,
-) -> Tuple[LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]], Path]:
-    return (
-        LightningClassificationPipeline(
-            output_path=result_path.name,
-            **pipeline_kwargs,
-            **dataset_kwargs,
-            datamodule_kwargs=datamodule_kwargs,
-            task_train_kwargs=task_train_kwargs,
-            task_model_kwargs=task_model_kwargs,
-        ),
-        result_path,
+    config: LightningAdvancedConfig,
+    tmp_path_module: Path,
+) -> LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]]:
+    return LightningClassificationPipeline(
+        embedding_name_or_path="allegro/herbert-base-cased",
+        output_path=str(tmp_path_module),
+        config=config,
+        devices="auto",
+        accelerator="cpu",
+        **dataset_kwargs,
     )
 
 
 def test_lightning_pipeline_inference(
-    lightning_classification_pipeline: Tuple[
-        LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]],
-        Path,
+    lightning_classification_pipeline: LightningPipeline[
+        datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]
     ],
+    tmp_path_module: Path,
 ) -> None:
     pl.seed_everything(441, workers=True)
-
-    pipeline, path = lightning_classification_pipeline
+    pipeline = lightning_classification_pipeline
     results = pipeline.run()
 
-    ckpt_path = Path(path.name) / "checkpoints" / "epoch=0-step=1.ckpt"
+    ckpt_path = tmp_path_module / "checkpoints" / "epoch=0-step=1.ckpt"
     task_from_ckpt = TextClassificationTask.from_checkpoint(
         checkpoint_path=ckpt_path.resolve(),
-        output_path=path.name,
+        output_path=str(tmp_path_module),
     )
 
     model_state_dict = pipeline.model.task.model.model.state_dict()
     model_from_ckpt_state_dict = task_from_ckpt.model.model.state_dict()
-
     assert model_state_dict.keys() == model_from_ckpt_state_dict.keys()
     for k in model_state_dict.keys():
         assert torch.equal(model_state_dict[k], model_from_ckpt_state_dict[k])
 
     test_dataloader = pipeline.datamodule.test_dataloader()
     predictions = task_from_ckpt.predict(test_dataloader, return_names=False)
-
     assert np.array_equal(results["data"]["y_probabilities"], predictions["y_probabilities"])
diff --git a/tutorials/validate_lightning_models_inference.ipynb b/tutorials/validate_lightning_models_inference.ipynb
index 7318d1ae..d9326b1f 100644
--- a/tutorials/validate_lightning_models_inference.ipynb
+++ b/tutorials/validate_lightning_models_inference.ipynb
@@ -24,6 +24,7 @@
     "from typing import Any, Dict\n",
     "\n",
     "import pytorch_lightning as pl\n",
+    "from embeddings.config.lightning_config import LightningAdvancedConfig\n",
     "from embeddings.defaults import DATASET_PATH, RESULTS_PATH\n",
     "from embeddings.model.lightning_module.text_classification import (\n",
     "    TextClassificationModule,\n",
@@ -100,6 +101,35 @@
     "### Train simple downsampled pipeline"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4cb7ebd4-182c-4797-b5de-a7069313a901",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "config = LightningAdvancedConfig(\n",
+    "    finetune_last_n_layers=0,\n",
+    "    task_train_kwargs={\"max_epochs\": 1, \"deterministic\": True,},\n",
+    "    task_model_kwargs={\n",
+    "        \"learning_rate\": 5e-4,\n",
+    "        \"train_batch_size\": 32,\n",
+    "        \"eval_batch_size\": 32,\n",
+    "        \"use_scheduler\": False,\n",
+    "        \"optimizer\": \"AdamW\",\n",
+    "        \"adam_epsilon\": 1e-8,\n",
+    "        \"warmup_steps\": 100,\n",
+    "        \"weight_decay\": 0.0,\n",
+    "    },\n",
+    "    datamodule_kwargs={\"max_seq_length\": 64,},\n",
+    "    early_stopping_kwargs={\"monitor\": \"val/Loss\", \"mode\": \"min\", \"patience\": 3,},\n",
+    "    tokenizer_kwargs={},\n",
+    "    batch_encoding_kwargs={},\n",
+    "    dataloader_kwargs={},\n",
+    "    model_config_kwargs={},\n",
+    ")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -112,14 +142,9 @@
     "pipeline = LightningClassificationPipeline(\n",
     "    embedding_name_or_path=embedding_name_or_path,\n",
     "    output_path=output_path,\n",
-    "    finetune_last_n_layers=0,\n",
-    "    datamodule_kwargs={\"max_seq_length\": 64,},\n",
-    "    task_train_kwargs={\n",
-    "        \"max_epochs\": 1,\n",
-    "        \"devices\": \"auto\",\n",
-    "        \"accelerator\": \"cpu\",\n",
-    "        \"deterministic\": True,\n",
-    "    },\n",
+    "    config=config,\n",
+    "    devices=\"auto\",\n",
+    "    accelerator=\"cpu\",\n",
     "**dataset_kwargs\n",
     ")\n",
     "result = pipeline.run()"
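
Reviewer note: the checkpoint round trip that test_lightning_pipeline_inference exercises can be reproduced outside pytest. The sketch below is illustrative only, not part of the patch. It assumes the embeddings APIs exactly as they appear in the hunks above; the scratch directories are hypothetical, the HuggingFacePreprocessingPipeline argument list is abridged to what the diff shows, the import path of TextClassificationTask is a guess (the tests use the class without showing its import), and epoch=0-step=1.ckpt is the checkpoint name the test asserts for this one-epoch downsampled run.

    from pathlib import Path

    import numpy as np
    import pytorch_lightning as pl
    import torch

    from embeddings.config.lightning_config import LightningAdvancedConfig
    from embeddings.pipeline.hf_preprocessing_pipeline import HuggingFacePreprocessingPipeline
    from embeddings.pipeline.lightning_classification import LightningClassificationPipeline
    # Assumed import path: the tests call TextClassificationTask.from_checkpoint
    # without showing where the class lives.
    from embeddings.task.lightning_task.text_classification import TextClassificationTask

    dataset_path = "polemo2-downsampled"  # hypothetical scratch directories
    output_path = "lightning-output"

    # Persist a downsampled copy of polemo2, as the dataset_kwargs fixtures do
    # (arguments abridged to those visible in the diff).
    HuggingFacePreprocessingPipeline(
        dataset_name="clarin-pl/polemo2-official",
        load_dataset_kwargs={
            "train_domains": ["hotels", "medicine"],
            "test_domains": ["hotels", "medicine"],
            "text_cfg": "text",
        },
        persist_path=dataset_path,
        sample_missing_splits=None,
        ignore_test_subset=False,
        downsample_splits=(0.01, 0.01, 0.05),
    ).run()

    # Same hyperparameters as the config fixture introduced by this patch.
    config = LightningAdvancedConfig(
        finetune_last_n_layers=0,
        task_train_kwargs={"max_epochs": 1, "deterministic": True},
        task_model_kwargs={
            "learning_rate": 5e-4,
            "train_batch_size": 32,
            "eval_batch_size": 32,
            "use_scheduler": False,
            "optimizer": "AdamW",
            "adam_epsilon": 1e-8,
            "warmup_steps": 100,
            "weight_decay": 0.0,
        },
        datamodule_kwargs={"max_seq_length": 64},
        early_stopping_kwargs={"monitor": "val/Loss", "mode": "min", "patience": 3},
        tokenizer_kwargs={},
        batch_encoding_kwargs={},
        dataloader_kwargs={},
        model_config_kwargs={},
    )

    pl.seed_everything(441, workers=True)
    pipeline = LightningClassificationPipeline(
        embedding_name_or_path="allegro/herbert-base-cased",
        output_path=output_path,
        config=config,
        devices="auto",
        accelerator="cpu",
        dataset_name_or_path=dataset_path,
        input_column_name=["text"],
        target_column_name="target",
    )
    results = pipeline.run()

    # One epoch over the downsampled train split amounts to a single optimizer
    # step, hence the checkpoint file name asserted in the test.
    ckpt_path = Path(output_path) / "checkpoints" / "epoch=0-step=1.ckpt"
    task_from_ckpt = TextClassificationTask.from_checkpoint(
        checkpoint_path=ckpt_path.resolve(),
        output_path=output_path,
    )

    # The restored weights must match the in-memory model exactly...
    state = pipeline.model.task.model.model.state_dict()
    state_from_ckpt = task_from_ckpt.model.model.state_dict()
    assert state.keys() == state_from_ckpt.keys()
    assert all(torch.equal(state[k], state_from_ckpt[k]) for k in state)

    # ...and so must the probabilities predicted on the test split.
    predictions = task_from_ckpt.predict(pipeline.datamodule.test_dataloader(), return_names=False)
    assert np.array_equal(results["data"]["y_probabilities"], predictions["y_probabilities"])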