fix: tests and notebooks after rebase
djaniak committed Apr 8, 2022
1 parent 2396b36 commit 38c1438
Showing 3 changed files with 100 additions and 91 deletions.
45 changes: 16 additions & 29 deletions tests/test_lightning_classification_pipeline.py
@@ -21,13 +21,7 @@ def tmp_path_module(tmpdir_factory: TempdirFactory) -> Path:
 
 
 @pytest.fixture(scope="module")
-def pipeline_kwargs() -> Dict[str, Any]:
-    return {"embedding_name_or_path": "allegro/herbert-base-cased"}
-
-
-@pytest.fixture(scope="module")
-def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
-    path = str(tmp_path_module)
+def dataset_kwargs(tmp_path_module: Path) -> Dict[str, Any]:
     pipeline = HuggingFacePreprocessingPipeline(
         dataset_name="clarin-pl/polemo2-official",
         load_dataset_kwargs={
@@ -36,7 +30,7 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
             "test_domains": ["hotels", "medicine"],
             "text_cfg": "text",
         },
-        persist_path=path,
+        persist_path=tmp_path_module.name,
         sample_missing_splits=None,
         ignore_test_subset=False,
         downsample_splits=(0.01, 0.01, 0.05),
@@ -45,7 +39,7 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
     pipeline.run()
 
     return {
-        "dataset_name_or_path": path,
+        "dataset_name_or_path": tmp_path_module.name,
         "input_column_name": ["text"],
         "target_column_name": "target",
     }
@@ -88,32 +82,25 @@ def config() -> LightningAdvancedConfig:
 def lightning_classification_pipeline(
     dataset_kwargs: Dict[str, Any],
     config: LightningAdvancedConfig,
-    result_path: "TemporaryDirectory[str]",
-) -> Tuple[
-    LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]],
-    "TemporaryDirectory[str]",
-]:
-    return (
-        LightningClassificationPipeline(
-            embedding_name_or_path="allegro/herbert-base-cased",
-            output_path=result_path.name,
-            config=config,
-            devices="auto",
-            accelerator="cpu",
-            **dataset_kwargs,
-        ),
-        result_path,
+    tmp_path_module: Path,
+) -> LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]]:
+    return LightningClassificationPipeline(
+        embedding_name_or_path="allegro/herbert-base-cased",
+        output_path=tmp_path_module.name,
+        config=config,
+        devices="auto",
+        accelerator="cpu",
+        **dataset_kwargs,
     )
 
 
 def test_lightning_classification_pipeline(
-    lightning_classification_pipeline: Tuple[
-        LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]],
-        "TemporaryDirectory[str]",
-    ],
+    lightning_classification_pipeline: LightningPipeline[
+        datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]
+    ]
 ) -> None:
     pl.seed_everything(441, workers=True)
-    pipeline, path = lightning_classification_pipeline
+    pipeline = lightning_classification_pipeline
     result = pipeline.run()
     np.testing.assert_almost_equal(
         result["accuracy"]["accuracy"], 0.3783783, decimal=pytest.decimal
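The net effect of this file's change: the per-argument kwargs fixtures are folded into a single LightningAdvancedConfig object, and trainer placement options (devices, accelerator) move to the pipeline constructor. Below is a minimal sketch of the refactored call site, assembled only from names that appear in this commit; the output and dataset paths are hypothetical, and the dataset directory is assumed to hold data persisted by HuggingFacePreprocessingPipeline as in the dataset_kwargs fixture.

# Sketch only: mirrors the refactored fixtures above. "./results" and
# "./polemo2-sample" are hypothetical paths; the dataset directory is assumed
# to contain a dataset persisted by HuggingFacePreprocessingPipeline.
from embeddings.config.lightning_config import LightningAdvancedConfig
from embeddings.pipeline.lightning_classification import LightningClassificationPipeline

config = LightningAdvancedConfig(
    finetune_last_n_layers=0,
    task_train_kwargs={"max_epochs": 1, "deterministic": True},
    task_model_kwargs={
        "learning_rate": 5e-4,
        "train_batch_size": 32,
        "eval_batch_size": 32,
        "use_scheduler": False,
        "optimizer": "AdamW",
        "adam_epsilon": 1e-8,
        "warmup_steps": 100,
        "weight_decay": 0.0,
    },
    datamodule_kwargs={"max_seq_length": 64},
    early_stopping_kwargs={"monitor": "val/Loss", "mode": "min", "patience": 3},
    tokenizer_kwargs={},
    batch_encoding_kwargs={},
    dataloader_kwargs={},
    model_config_kwargs={},
)

pipeline = LightningClassificationPipeline(
    embedding_name_or_path="allegro/herbert-base-cased",
    output_path="./results",                  # hypothetical output directory
    config=config,
    devices="auto",
    accelerator="cpu",
    dataset_name_or_path="./polemo2-sample",  # hypothetical persisted dataset
    input_column_name=["text"],
    target_column_name="target",
)
result = pipeline.run()  # dict of metrics and prediction data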
105 changes: 51 additions & 54 deletions tests/test_lightning_inference.py
@@ -1,4 +1,5 @@
 from pathlib import Path
+from tempfile import TemporaryDirectory
 from typing import Any, Dict, Tuple
 
 import datasets
@@ -8,6 +9,7 @@
 import torch
 from _pytest.tmpdir import TempdirFactory
 
+from embeddings.config.lightning_config import LightningAdvancedConfig
 from embeddings.pipeline.hf_preprocessing_pipeline import HuggingFacePreprocessingPipeline
 from embeddings.pipeline.lightning_classification import LightningClassificationPipeline
 from embeddings.pipeline.lightning_pipeline import LightningPipeline
@@ -21,16 +23,7 @@ def tmp_path_module(tmpdir_factory: TempdirFactory) -> Path:
 
 
 @pytest.fixture(scope="module")
-def pipeline_kwargs() -> Dict[str, Any]:
-    return {
-        "embedding_name_or_path": "allegro/herbert-base-cased",
-        "finetune_last_n_layers": 0,
-    }
-
-
-@pytest.fixture(scope="module")
-def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
-    path = str(tmp_path_module)
+def dataset_kwargs(tmp_path_module: Path) -> Dict[str, Any]:
     pipeline = HuggingFacePreprocessingPipeline(
         dataset_name="clarin-pl/polemo2-official",
         load_dataset_kwargs={
@@ -39,7 +32,7 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
             "test_domains": ["hotels", "medicine"],
             "text_cfg": "text",
         },
-        persist_path=path,
+        persist_path=tmp_path_module.name,
         sample_missing_splits=None,
         ignore_test_subset=False,
         downsample_splits=(0.01, 0.01, 0.05),
@@ -48,79 +41,83 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]:
     pipeline.run()
 
     return {
-        "dataset_name_or_path": path,
+        "dataset_name_or_path": tmp_path_module.name,
         "input_column_name": ["text"],
         "target_column_name": "target",
     }
 
 
 @pytest.fixture(scope="module")
-def task_train_kwargs() -> Dict[str, Any]:
-    return {
-        "max_epochs": 1,
-        "devices": "auto",
-        "accelerator": "cpu",
-        "deterministic": True,
-    }
-
-
-@pytest.fixture(scope="module")
-def task_model_kwargs() -> Dict[str, Any]:
-    return {"learning_rate": 5e-4, "use_scheduler": False}
-
-
-@pytest.fixture(scope="module")
-def datamodule_kwargs() -> Dict[str, Any]:
-    return {"num_workers": 0}
+def config() -> LightningAdvancedConfig:
+    return LightningAdvancedConfig(
+        finetune_last_n_layers=0,
+        task_train_kwargs={
+            "max_epochs": 1,
+            "deterministic": True,
+        },
+        task_model_kwargs={
+            "learning_rate": 5e-4,
+            "train_batch_size": 32,
+            "eval_batch_size": 32,
+            "use_scheduler": False,
+            "optimizer": "AdamW",
+            "adam_epsilon": 1e-8,
+            "warmup_steps": 100,
+            "weight_decay": 0.0,
+        },
+        datamodule_kwargs={
+            "max_seq_length": 64,
+        },
+        early_stopping_kwargs={
+            "monitor": "val/Loss",
+            "mode": "min",
+            "patience": 3,
+        },
+        tokenizer_kwargs={},
+        batch_encoding_kwargs={},
+        dataloader_kwargs={},
+        model_config_kwargs={},
+    )
 
 
 @pytest.fixture(scope="module")
 def lightning_classification_pipeline(
-    pipeline_kwargs: Dict[str, Any],
     dataset_kwargs: Dict[str, Any],
-    datamodule_kwargs: Dict[str, Any],
-    task_train_kwargs: Dict[str, Any],
-    task_model_kwargs: Dict[str, Any],
-    result_path: Path,
-) -> Tuple[LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]], Path]:
-    return (
-        LightningClassificationPipeline(
-            output_path=result_path.name,
-            **pipeline_kwargs,
-            **dataset_kwargs,
-            datamodule_kwargs=datamodule_kwargs,
-            task_train_kwargs=task_train_kwargs,
-            task_model_kwargs=task_model_kwargs,
-        ),
-        result_path,
+    config: LightningAdvancedConfig,
+    tmp_path_module: Path,
+) -> LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]]:
+    return LightningClassificationPipeline(
+        embedding_name_or_path="allegro/herbert-base-cased",
+        output_path=tmp_path_module.name,
+        config=config,
+        devices="auto",
+        accelerator="cpu",
+        **dataset_kwargs,
    )
 
 
 def test_lightning_pipeline_inference(
-    lightning_classification_pipeline: Tuple[
-        LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]],
-        Path,
+    lightning_classification_pipeline: LightningPipeline[
+        datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]
    ],
+    tmp_path_module: "TemporaryDirectory[str]",
 ) -> None:
     pl.seed_everything(441, workers=True)
 
-    pipeline, path = lightning_classification_pipeline
+    pipeline = lightning_classification_pipeline
     results = pipeline.run()
 
-    ckpt_path = Path(path.name) / "checkpoints" / "epoch=0-step=1.ckpt"
+    ckpt_path = Path(tmp_path_module.name) / "checkpoints" / "epoch=0-step=1.ckpt"
     task_from_ckpt = TextClassificationTask.from_checkpoint(
         checkpoint_path=ckpt_path.resolve(),
-        output_path=path.name,
+        output_path=tmp_path_module.name,
     )
 
     model_state_dict = pipeline.model.task.model.model.state_dict()
     model_from_ckpt_state_dict = task_from_ckpt.model.model.state_dict()
 
     assert model_state_dict.keys() == model_from_ckpt_state_dict.keys()
     for k in model_state_dict.keys():
         assert torch.equal(model_state_dict[k], model_from_ckpt_state_dict[k])
 
     test_dataloader = pipeline.datamodule.test_dataloader()
     predictions = task_from_ckpt.predict(test_dataloader, return_names=False)
 
     assert np.array_equal(results["data"]["y_probabilities"], predictions["y_probabilities"])
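The rewritten inference test keeps its original purpose: after a one-epoch run, the task is reloaded from the Lightning checkpoint and must reproduce both the model weights and the test-set probabilities. A condensed sketch of that round-trip follows, continuing from the pipeline sketch after the previous file's diff; the import path for TextClassificationTask is an assumption (the diff uses the name without showing its import), and the checkpoint file name matches a single-epoch, single-step run.

# Sketch only: checkpoint round-trip as exercised by the test above.
# Assumes `pipeline` and `result` from the earlier sketch; the module path
# below is a guess, since the diff does not show the import.
from pathlib import Path

import numpy as np
import torch

from embeddings.task.lightning_task.text_classification import TextClassificationTask  # assumed path

ckpt_path = Path("./results") / "checkpoints" / "epoch=0-step=1.ckpt"
task_from_ckpt = TextClassificationTask.from_checkpoint(
    checkpoint_path=ckpt_path.resolve(),
    output_path="./results",
)

# Restored weights must match the in-memory model parameter by parameter.
state = pipeline.model.task.model.model.state_dict()
state_from_ckpt = task_from_ckpt.model.model.state_dict()
assert all(torch.equal(state[k], state_from_ckpt[k]) for k in state)

# And the restored task must reproduce the original predictions.
predictions = task_from_ckpt.predict(pipeline.datamodule.test_dataloader(), return_names=False)
assert np.array_equal(result["data"]["y_probabilities"], predictions["y_probabilities"])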
41 changes: 33 additions & 8 deletions tutorials/validate_lightning_models_inference.ipynb
@@ -24,6 +24,7 @@
 "from typing import Any, Dict\n",
 "\n",
 "import pytorch_lightning as pl\n",
+"from embeddings.config.lightning_config import LightningAdvancedConfig\n",
 "from embeddings.defaults import DATASET_PATH, RESULTS_PATH\n",
 "from embeddings.model.lightning_module.text_classification import (\n",
 "    TextClassificationModule,\n",
@@ -100,6 +101,35 @@
 "### Train simple downsampled pipeline"
 ]
 },
+{
+"cell_type": "code",
+"execution_count": null,
+"id": "4cb7ebd4-182c-4797-b5de-a7069313a901",
+"metadata": {},
+"outputs": [],
+"source": [
+"config = LightningAdvancedConfig(\n",
+"    finetune_last_n_layers=0,\n",
+"    task_train_kwargs={\"max_epochs\": 1, \"deterministic\": True,},\n",
+"    task_model_kwargs={\n",
+"        \"learning_rate\": 5e-4,\n",
+"        \"train_batch_size\": 32,\n",
+"        \"eval_batch_size\": 32,\n",
+"        \"use_scheduler\": False,\n",
+"        \"optimizer\": \"AdamW\",\n",
+"        \"adam_epsilon\": 1e-8,\n",
+"        \"warmup_steps\": 100,\n",
+"        \"weight_decay\": 0.0,\n",
+"    },\n",
+"    datamodule_kwargs={\"max_seq_length\": 64,},\n",
+"    early_stopping_kwargs={\"monitor\": \"val/Loss\", \"mode\": \"min\", \"patience\": 3,},\n",
+"    tokenizer_kwargs={},\n",
+"    batch_encoding_kwargs={},\n",
+"    dataloader_kwargs={},\n",
+"    model_config_kwargs={},\n",
+")"
+]
+},
 {
 "cell_type": "code",
 "execution_count": null,
@@ -112,14 +142,9 @@
 "pipeline = LightningClassificationPipeline(\n",
 "    embedding_name_or_path=embedding_name_or_path,\n",
 "    output_path=output_path,\n",
-"    finetune_last_n_layers=0,\n",
-"    datamodule_kwargs={\"max_seq_length\": 64,},\n",
-"    task_train_kwargs={\n",
-"        \"max_epochs\": 1,\n",
-"        \"devices\": \"auto\",\n",
-"        \"accelerator\": \"cpu\",\n",
-"        \"deterministic\": True,\n",
-"    },\n",
+"    config=config,\n",
+"    devices=\"auto\",\n",
+"    accelerator=\"cpu\",\n",
 "    **dataset_kwargs\n",
 ")\n",
 "result = pipeline.run()"
