From 57e7a75f86d97195d2ff127a547468c8f8fa89c7 Mon Sep 17 00:00:00 2001 From: djaniak Date: Thu, 24 Mar 2022 14:14:53 +0100 Subject: [PATCH 01/22] feat: allow for loading lightning model from ckpt --- .../lightning_module/huggingface_module.py | 28 +++++++++---------- .../lightning_module/sequence_labeling.py | 2 ++ .../lightning_module/text_classification.py | 2 ++ .../pipeline/lightning_classification.py | 1 + .../task/lightning_task/lightning_task.py | 4 +++ .../task/lightning_task/sequence_labeling.py | 6 ++++ .../lightning_task/text_classification.py | 6 ++++ 7 files changed, 34 insertions(+), 15 deletions(-) diff --git a/embeddings/model/lightning_module/huggingface_module.py b/embeddings/model/lightning_module/huggingface_module.py index a2196819..c23bd42e 100644 --- a/embeddings/model/lightning_module/huggingface_module.py +++ b/embeddings/model/lightning_module/huggingface_module.py @@ -15,6 +15,7 @@ def __init__( self, model_name_or_path: T_path, downstream_model_type: Type["AutoModel"], + num_classes: int, finetune_last_n_layers: int, metrics: Optional[MetricCollection] = None, config_kwargs: Optional[Dict[str, Any]] = None, @@ -24,11 +25,11 @@ def __init__( self.save_hyperparameters({"downstream_model_type": downstream_model_type.__name__}) self.downstream_model_type = downstream_model_type self.config_kwargs = config_kwargs if config_kwargs else {} + self.configure_model() + self.configure_metrics() def setup(self, stage: Optional[str] = None) -> None: if stage in ("fit", None): - self.configure_model() - self.configure_metrics() if self.hparams.use_scheduler: assert self.trainer is not None train_loader = self.trainer.datamodule.train_dataloader() @@ -40,10 +41,9 @@ def setup(self, stage: Optional[str] = None) -> None: ) def configure_model(self) -> None: - assert self.trainer is not None self.config = AutoConfig.from_pretrained( self.hparams.model_name_or_path, - num_labels=self.trainer.datamodule.num_classes, + num_labels=self.hparams.num_classes, **self.config_kwargs, ) self.model: AutoModel = self.downstream_model_type.from_pretrained( @@ -72,24 +72,22 @@ def freeze_transformer(self, finetune_last_n_layers: int) -> None: param.requires_grad = False def get_default_metrics(self) -> MetricCollection: - assert self.trainer is not None - num_classes = self.trainer.datamodule.num_classes - if num_classes > 2: + if self.hparams.num_classes > 2: metrics = MetricCollection( [ - Accuracy(num_classes=num_classes), - Precision(num_classes=num_classes, average="macro"), - Recall(num_classes=num_classes, average="macro"), - F1(num_classes=num_classes, average="macro"), + Accuracy(num_classes=self.hparams.num_classes), + Precision(num_classes=self.hparams.num_classes, average="macro"), + Recall(num_classes=self.hparams.num_classes, average="macro"), + F1(num_classes=self.hparams.num_classes, average="macro"), ] ) else: metrics = MetricCollection( [ - Accuracy(num_classes=num_classes), - Precision(num_classes=num_classes), - Recall(num_classes=num_classes), - F1(num_classes=num_classes), + Accuracy(num_classes=self.hparams.num_classes), + Precision(num_classes=self.hparams.num_classes), + Recall(num_classes=self.hparams.num_classes), + F1(num_classes=self.hparams.num_classes), ] ) return metrics diff --git a/embeddings/model/lightning_module/sequence_labeling.py b/embeddings/model/lightning_module/sequence_labeling.py index d6a6805c..1e27a4f2 100644 --- a/embeddings/model/lightning_module/sequence_labeling.py +++ b/embeddings/model/lightning_module/sequence_labeling.py @@ -18,6 +18,7 @@ 
class SequenceLabelingModule(HuggingFaceLightningModule): def __init__( self, model_name_or_path: T_path, + num_classes: int, finetune_last_n_layers: int, metrics: Optional[MetricCollection] = None, ignore_index: int = -100, @@ -27,6 +28,7 @@ def __init__( super().__init__( model_name_or_path=model_name_or_path, downstream_model_type=self.downstream_model_type, + num_classes=num_classes, finetune_last_n_layers=finetune_last_n_layers, metrics=metrics, config_kwargs=config_kwargs, diff --git a/embeddings/model/lightning_module/text_classification.py b/embeddings/model/lightning_module/text_classification.py index 0f7d8ca3..a79db619 100644 --- a/embeddings/model/lightning_module/text_classification.py +++ b/embeddings/model/lightning_module/text_classification.py @@ -18,6 +18,7 @@ class TextClassificationModule(HuggingFaceLightningModule): def __init__( self, model_name_or_path: T_path, + num_classes: int, finetune_last_n_layers: int, metrics: Optional[MetricCollection] = None, config_kwargs: Optional[Dict[str, Any]] = None, @@ -26,6 +27,7 @@ def __init__( super().__init__( model_name_or_path=model_name_or_path, downstream_model_type=self.downstream_model_type, + num_classes=num_classes, finetune_last_n_layers=finetune_last_n_layers, metrics=metrics, config_kwargs=config_kwargs, diff --git a/embeddings/pipeline/lightning_classification.py b/embeddings/pipeline/lightning_classification.py index f8afc414..e3ed9039 100644 --- a/embeddings/pipeline/lightning_classification.py +++ b/embeddings/pipeline/lightning_classification.py @@ -57,6 +57,7 @@ def __init__( task = TextClassificationTask( model_name_or_path=embedding_name_or_path, output_path=output_path, + num_classes=datamodule.num_classes, finetune_last_n_layers=config.finetune_last_n_layers, model_config_kwargs=config.model_config_kwargs, task_model_kwargs=config.task_model_kwargs, diff --git a/embeddings/task/lightning_task/lightning_task.py b/embeddings/task/lightning_task/lightning_task.py index e0e6be69..5621c131 100644 --- a/embeddings/task/lightning_task/lightning_task.py +++ b/embeddings/task/lightning_task/lightning_task.py @@ -114,3 +114,7 @@ def fit_predict( @abc.abstractmethod def build_task_model(self) -> None: pass + + @abc.abstractmethod + def restore_task_model(self, checkpoint_path: str) -> None: + pass diff --git a/embeddings/task/lightning_task/sequence_labeling.py b/embeddings/task/lightning_task/sequence_labeling.py index 82f58896..cd633b40 100644 --- a/embeddings/task/lightning_task/sequence_labeling.py +++ b/embeddings/task/lightning_task/sequence_labeling.py @@ -15,6 +15,7 @@ def __init__( self, model_name_or_path: T_path, output_path: T_path, + num_classes: int, model_config_kwargs: Dict[str, Any], task_model_kwargs: Dict[str, Any], task_train_kwargs: Dict[str, Any], @@ -26,6 +27,7 @@ def __init__( ) -> None: super().__init__(output_path, task_train_kwargs, early_stopping_kwargs, logging_config) self.model_name_or_path = model_name_or_path + self.num_classes = num_classes self.model_config_kwargs = model_config_kwargs self.task_model_kwargs = task_model_kwargs self.train_batch_size = train_batch_size @@ -35,11 +37,15 @@ def __init__( def build_task_model(self) -> None: self.model = SequenceLabelingModule( model_name_or_path=self.model_name_or_path, + num_classes=self.num_classes, finetune_last_n_layers=self.finetune_last_n_layers, config_kwargs=self.model_config_kwargs, task_model_kwargs=self.task_model_kwargs, ) + def restore_task_model(self, checkpoint_path: str) -> None: + self.model = 
SequenceLabelingModule.load_from_checkpoint(checkpoint_path) + def predict(self, dataloader: DataLoader[Any]) -> Dict[str, nptyping.NDArray[Any]]: assert self.model is not None results = self.model.predict(dataloader=dataloader) diff --git a/embeddings/task/lightning_task/text_classification.py b/embeddings/task/lightning_task/text_classification.py index 7b91ade8..495e6011 100644 --- a/embeddings/task/lightning_task/text_classification.py +++ b/embeddings/task/lightning_task/text_classification.py @@ -15,6 +15,7 @@ def __init__( self, model_name_or_path: T_path, output_path: T_path, + num_classes: int, model_config_kwargs: Dict[str, Any], task_model_kwargs: Dict[str, Any], task_train_kwargs: Dict[str, Any], @@ -24,6 +25,7 @@ def __init__( ) -> None: super().__init__(output_path, task_train_kwargs, early_stopping_kwargs, logging_config) self.model_name_or_path = model_name_or_path + self.num_classes = num_classes self.model_config_kwargs = model_config_kwargs self.task_model_kwargs = task_model_kwargs self.finetune_last_n_layers = finetune_last_n_layers @@ -31,11 +33,15 @@ def __init__( def build_task_model(self) -> None: self.model = TextClassificationModule( model_name_or_path=self.model_name_or_path, + num_classes=self.num_classes, finetune_last_n_layers=self.finetune_last_n_layers, config_kwargs=self.model_config_kwargs, task_model_kwargs=self.task_model_kwargs, ) + def restore_task_model(self, checkpoint_path: str) -> None: + self.model = TextClassificationModule.load_from_checkpoint(checkpoint_path) + def predict(self, dataloader: DataLoader[Any]) -> Dict[str, nptyping.NDArray[Any]]: assert self.model is not None results = self.model.predict(dataloader=dataloader) From 4e762359f34a5d8f61179db1457bcb408d79870f Mon Sep 17 00:00:00 2001 From: djaniak Date: Thu, 31 Mar 2022 13:17:09 +0200 Subject: [PATCH 02/22] fix: missing arg in lightning_sequence_labeling.py --- embeddings/pipeline/lightning_sequence_labeling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/embeddings/pipeline/lightning_sequence_labeling.py b/embeddings/pipeline/lightning_sequence_labeling.py index 1386bd34..fd970db5 100644 --- a/embeddings/pipeline/lightning_sequence_labeling.py +++ b/embeddings/pipeline/lightning_sequence_labeling.py @@ -62,6 +62,7 @@ def __init__( task = SequenceLabelingTask( model_name_or_path=embedding_name_or_path, output_path=output_path, + num_classes=datamodule.num_classes, finetune_last_n_layers=config.finetune_last_n_layers, model_config_kwargs=config.model_config_kwargs, task_model_kwargs=config.task_model_kwargs, From e76c9482c435489c4cc407d670d7e5d8543b3c13 Mon Sep 17 00:00:00 2001 From: djaniak Date: Thu, 31 Mar 2022 16:32:14 +0200 Subject: [PATCH 03/22] feat: notebook with inference example for lightning --- .../validate_lightning_models_inference.ipynb | 243 ++++++++++++++++++ 1 file changed, 243 insertions(+) create mode 100644 notebooks/validate_lightning_models_inference.ipynb diff --git a/notebooks/validate_lightning_models_inference.ipynb b/notebooks/validate_lightning_models_inference.ipynb new file mode 100644 index 00000000..fb3e5a41 --- /dev/null +++ b/notebooks/validate_lightning_models_inference.ipynb @@ -0,0 +1,243 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "3d3ac2b5-06e8-46bc-a626-9384a35920e5", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1019b750-cebe-438b-b1ab-434d6f756864", + "metadata": {}, + "outputs": 
[], + "source": [ + "import os\n", + "\n", + "os.chdir(\"..\")\n", + "\n", + "from pathlib import Path\n", + "from tempfile import TemporaryDirectory\n", + "\n", + "import pytorch_lightning as pl\n", + "import torch\n", + "from embeddings.defaults import RESULTS_PATH\n", + "from embeddings.model.lightning_module.text_classification import (\n", + " TextClassificationModule,\n", + ")\n", + "from embeddings.pipeline.lightning_classification import LightningClassificationPipeline\n", + "from embeddings.utils.utils import build_output_path, format_eval_result" + ] + }, + { + "cell_type": "markdown", + "id": "159445cd-fb59-4964-aca2-ce9c18a8cf5e", + "metadata": {}, + "source": [ + "### Train simple downsampled pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "148a0089-f461-4948-93fa-04f2e34ac9e0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "embedding_name_or_path = \"allegro/herbert-base-cased\"\n", + "dataset_name = \"clarin-pl/polemo2-official\"\n", + "input_columns_name = \"text\"\n", + "target_column_name = \"target\"\n", + "path = TemporaryDirectory()\n", + "output_path = path.name\n", + "\n", + "pipeline = LightningClassificationPipeline(\n", + " embedding_name_or_path=embedding_name_or_path,\n", + " dataset_name_or_path=dataset_name,\n", + " input_column_name=input_columns_name,\n", + " target_column_name=target_column_name,\n", + " output_path=output_path,\n", + " finetune_last_n_layers=0,\n", + " load_dataset_kwargs={\n", + " \"train_domains\": [\"hotels\", \"medicine\"],\n", + " \"dev_domains\": [\"hotels\", \"medicine\"],\n", + " \"test_domains\": [\"hotels\", \"medicine\"],\n", + " \"text_cfg\": \"text\",\n", + " },\n", + " datamodule_kwargs={\n", + " \"max_seq_length\": 64,\n", + " \"downsample_train\": 0.005,\n", + " \"downsample_val\": 0.01,\n", + " \"downsample_test\": 0.01,\n", + " },\n", + " task_train_kwargs={\n", + " \"max_epochs\": 1,\n", + " \"devices\": \"auto\",\n", + " \"accelerator\": \"cpu\",\n", + " \"deterministic\": True,\n", + " },\n", + ")\n", + "result = pipeline.run()" + ] + }, + { + "cell_type": "markdown", + "id": "491215dc-9960-4ad0-bc14-6d61d1fafac8", + "metadata": {}, + "source": [ + "### Load model from chechpoint automatically generated with Trainer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee9e824c-00f1-45b0-9e32-1bd33f364f3a", + "metadata": {}, + "outputs": [], + "source": [ + "ckpt_path = (\n", + " Path(output_path)\n", + " / \"lightning_logs\"\n", + " / \"version_0\"\n", + " / \"checkpoints\"\n", + " / \"epoch=0-step=0.ckpt\"\n", + ")\n", + "ckpt_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b32fd93-e43d-4c42-961e-53232bf9e02e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "model_from_ckpt = TextClassificationModule.load_from_checkpoint(\n", + " str(ckpt_path), strict=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a2c6e7fe-39c6-4fcb-87a1-1ed688c33adf", + "metadata": {}, + "source": [ + "### Validate model states (because of the warning)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "80999ea5-ef59-40db-8d39-2ae02937fec8", + "metadata": {}, + "outputs": [], + "source": [ + "model_state_dict = pipeline.model.task.model.model.state_dict()\n", + "model_from_ckpt_state_dict = model_from_ckpt.model.state_dict()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1207eb9f-2e68-46ec-bab7-e5c8d1cb5953", + "metadata": {}, + "outputs": [], + "source": [ + 
"model_state_dict.keys() == model_from_ckpt_state_dict.keys()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3875c361-ee96-4e79-8209-0f04e4f1f599", + "metadata": {}, + "outputs": [], + "source": [ + "for k in model_state_dict.keys():\n", + " assert torch.equal(model_state_dict[k], model_from_ckpt_state_dict[k])" + ] + }, + { + "cell_type": "markdown", + "id": "4a07428d-cdad-45bb-a92a-302b511de9dc", + "metadata": {}, + "source": [ + "### Manually save and load the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c17b7792-b59a-4051-888e-c8b960bf04cc", + "metadata": {}, + "outputs": [], + "source": [ + "pipeline.model.task.trainer.save_checkpoint(\"example.ckpt\")\n", + "new_model = TextClassificationModule.load_from_checkpoint(\n", + " checkpoint_path=\"example.ckpt\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "88e7a6c7-449f-4d0c-9042-a5f98aebc14b", + "metadata": {}, + "source": [ + "### Use model from checkpoint for predictions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f3afa250-2937-4aad-bb3c-172a68639892", + "metadata": {}, + "outputs": [], + "source": [ + "trainer = pl.Trainer()\n", + "test_dataloader = pipeline.datamodule.test_dataloader()\n", + "predictions = trainer.predict(model_from_ckpt, dataloaders=test_dataloader)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "09f45c8b-791b-43b4-9826-f798d48b9d97", + "metadata": {}, + "outputs": [], + "source": [ + "predictions" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:clarinpl-embeddings]", + "language": "python", + "name": "conda-env-clarinpl-embeddings-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 7858aa438a7deb5687d24dfb2c20137f7744aa8e Mon Sep 17 00:00:00 2001 From: djaniak Date: Thu, 31 Mar 2022 17:20:53 +0200 Subject: [PATCH 04/22] feat: notebook with inference example for flair --- .../validate_flair_models_inference.ipynb | 208 ++++++++++++++++++ 1 file changed, 208 insertions(+) create mode 100644 notebooks/validate_flair_models_inference.ipynb diff --git a/notebooks/validate_flair_models_inference.ipynb b/notebooks/validate_flair_models_inference.ipynb new file mode 100644 index 00000000..d7bce5bd --- /dev/null +++ b/notebooks/validate_flair_models_inference.ipynb @@ -0,0 +1,208 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "418b9661-aea2-4990-8e26-e7f0e167b9b2", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f23bfd51-d1f4-4321-aed9-96f51b171fe9", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.chdir(\"..\")\n", + "\n", + "\n", + "from pathlib import Path\n", + "from tempfile import TemporaryDirectory\n", + "from typing import Any, Dict, Tuple\n", + "\n", + "import datasets\n", + "import flair\n", + "import numpy as np\n", + "import pytest\n", + "import torch\n", + "from embeddings.data.data_loader import HuggingFaceDataLoader\n", + "from embeddings.data.dataset import HuggingFaceDataset\n", + "from embeddings.defaults import RESULTS_PATH\n", + "from embeddings.embedding.auto_flair import AutoFlairWordEmbedding\n", + 
"from embeddings.embedding.flair_embedding import FlairEmbedding\n", + "from embeddings.evaluator.sequence_labeling_evaluator import SequenceLabelingEvaluator\n", + "from embeddings.model.flair_model import FlairModel\n", + "from embeddings.pipeline.standard_pipeline import StandardPipeline\n", + "from embeddings.task.flair_task.sequence_labeling import SequenceLabeling\n", + "from embeddings.transformation.flair_transformation.column_corpus_transformation import (\n", + " ColumnCorpusTransformation,\n", + ")\n", + "from embeddings.transformation.flair_transformation.downsample_corpus_transformation import (\n", + " DownsampleFlairCorpusTransformation,\n", + ")\n", + "from embeddings.transformation.flair_transformation.split_sample_corpus_transformation import (\n", + " SampleSplitsFlairCorpusTransformation,\n", + ")\n", + "from flair.data import Corpus\n", + "from numpy import typing as nptyping" + ] + }, + { + "cell_type": "markdown", + "id": "5e4c2372-8314-4868-a576-8f0988aae888", + "metadata": {}, + "source": [ + "### Run downsampled flair pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd4fb7d6-1e81-4bea-9bd3-b4a4bec87fc9", + "metadata": {}, + "outputs": [], + "source": [ + "result_path = TemporaryDirectory()\n", + "\n", + "dataset = HuggingFaceDataset(\"clarin-pl/kpwr-ner\")\n", + "data_loader = HuggingFaceDataLoader()\n", + "transformation = (\n", + " ColumnCorpusTransformation(\"tokens\", \"ner\")\n", + " .then(SampleSplitsFlairCorpusTransformation(dev_fraction=0.1, seed=441))\n", + " .then(DownsampleFlairCorpusTransformation(percentage=0.005))\n", + ")\n", + "task = SequenceLabeling(\n", + " result_path.name,\n", + " hidden_size=256,\n", + " task_train_kwargs={\"max_epochs\": 1, \"mini_batch_size\": 256},\n", + ")\n", + "embedding = AutoFlairWordEmbedding.from_hub(\"allegro/herbert-base-cased\")\n", + "model = FlairModel(embedding, task)\n", + "evaluator = SequenceLabelingEvaluator()\n", + "\n", + "pipeline = StandardPipeline(dataset, data_loader, transformation, model, evaluator)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f69e538-332d-4278-977d-7002fe2b67bd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "_ = pipeline.run()" + ] + }, + { + "cell_type": "markdown", + "id": "44613ef9-a9d4-4d5c-980c-9e0f68bc3525", + "metadata": {}, + "source": [ + "### Load model from checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ffae7c5-1734-4e4a-81bd-55170a5c14ca", + "metadata": {}, + "outputs": [], + "source": [ + "!ls $result_path.name" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e75f303-fb82-4cfd-9a81-5d42e994e606", + "metadata": {}, + "outputs": [], + "source": [ + "from flair.models import SequenceTagger\n", + "\n", + "trained_model = SequenceTagger.load(result_path.name + \"/final-model.pt\")" + ] + }, + { + "cell_type": "markdown", + "id": "802762c3-8246-465b-bb9c-2336134a51bd", + "metadata": {}, + "source": [ + "### Predict for test data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c0fadf3-aa47-407d-ad79-e5633532eafa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "loaded_data = data_loader.load(dataset)\n", + "transformed_data = transformation.transform(loaded_data)\n", + "test_data = transformed_data.test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "944cc964-262d-49f8-ab9f-510aba3dad7d", + "metadata": {}, + "outputs": [], + "source": [ + 
"task.remove_labels_from_data(test_data, \"predicted\")\n", + "\n", + "loss = trained_model.predict(\n", + " sentences=test_data, mini_batch_size=64, label_name=\"predicted\", return_loss=True,\n", + ")\n", + "\n", + "y_pred = task.get_y(test_data, y_type=\"predicted\", y_dictionary=task.y_dictionary)\n", + "y_true = task.get_y(test_data, task.y_type, task.y_dictionary)\n", + "\n", + "task.remove_labels_from_data(test_data, \"predicted\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f9d6dde-601a-4d0c-bb48-e9177e7002c9", + "metadata": {}, + "outputs": [], + "source": [ + "_ = evaluator.evaluate({\"y_pred\": y_pred, \"y_true\": y_true})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:clarinpl-embeddings]", + "language": "python", + "name": "conda-env-clarinpl-embeddings-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 3da9c8776d3cbe0d3e01e6a4eba8d734eef444de Mon Sep 17 00:00:00 2001 From: djaniak Date: Wed, 6 Apr 2022 18:33:32 +0200 Subject: [PATCH 05/22] feat: implement load_from_ckpt method for LightningTask --- .../lightning_module/lightning_module.py | 2 +- .../task/lightning_task/lightning_task.py | 40 ++++++++++++- .../task/lightning_task/sequence_labeling.py | 25 +++++--- .../lightning_task/text_classification.py | 22 +++++-- .../validate_lightning_models_inference.ipynb | 58 ++++++++++++------- 5 files changed, 112 insertions(+), 35 deletions(-) diff --git a/embeddings/model/lightning_module/lightning_module.py b/embeddings/model/lightning_module/lightning_module.py index 1f20bc26..62ee6465 100644 --- a/embeddings/model/lightning_module/lightning_module.py +++ b/embeddings/model/lightning_module/lightning_module.py @@ -70,7 +70,7 @@ def predict( ) -> Dict[str, nptyping.NDArray[Any]]: assert self.trainer is not None logits_predictions = self.trainer.predict( - dataloaders=dataloader, return_predictions=True, ckpt_path="best" + model=self, dataloaders=dataloader, return_predictions=True, ckpt_path="best" ) logits, predictions = zip(*logits_predictions) probabilities = softmax(torch.cat(logits), dim=1).numpy() diff --git a/embeddings/task/lightning_task/lightning_task.py b/embeddings/task/lightning_task/lightning_task.py index 5621c131..1a5129af 100644 --- a/embeddings/task/lightning_task/lightning_task.py +++ b/embeddings/task/lightning_task/lightning_task.py @@ -1,6 +1,6 @@ import abc from pathlib import Path -from typing import Any, Dict, List, Optional, Sequence +from typing import Any, Dict, List, Optional, Sequence, Type import pytorch_lightning as pl import torch @@ -8,11 +8,13 @@ from pytorch_lightning.callbacks import Callback, ModelCheckpoint from pytorch_lightning.callbacks.early_stopping import EarlyStopping from torch.utils.data import DataLoader +from transformers import AutoModel from embeddings.data.datamodule import HuggingFaceDataModule from embeddings.data.dataset import LightingDataModuleSubset from embeddings.data.io import T_path from embeddings.model.lightning_module.huggingface_module import HuggingFaceLightningModule +from embeddings.model.lightning_module.lightning_module import LightningModule from embeddings.task.task import Task from embeddings.utils.lightning_callbacks.best_epoch_callback import BestEpochCallback from 
embeddings.utils.loggers import LightningLoggingConfig, get_logger @@ -115,6 +117,40 @@ def fit_predict( def build_task_model(self) -> None: pass + @classmethod + def restore_task_model( + cls, + checkpoint_path: T_path, + output_path: T_path, + lightning_module: Type[LightningModule[AutoModel]], + task_train_kwargs: Optional[Dict[str, Any]], + early_stopping_kwargs: Optional[Dict[str, Any]], + ) -> "LightningTask": + model = lightning_module.load_from_checkpoint(str(checkpoint_path)) + trainer = pl.Trainer(default_root_dir=str(output_path), **task_train_kwargs or {}) + init_kwargs = { + "model_name_or_path": model.hparams.model_name_or_path, + "output_path": output_path, + "num_classes": model.hparams.num_classes, + "finetune_last_n_layers": model.hparams.finetune_last_n_layers, + "model_config_kwargs": model.hparams.config_kwargs, + "task_model_kwargs": model.hparams.task_model_kwargs, + "task_train_kwargs": task_train_kwargs or {}, + "early_stopping_kwargs": early_stopping_kwargs or {}, + } + task = cls(**init_kwargs) + task.model = model + task.trainer = trainer + model.trainer = trainer + return task + + @classmethod @abc.abstractmethod - def restore_task_model(self, checkpoint_path: str) -> None: + def from_checkpoint( + cls, + checkpoint_path: T_path, + output_path: T_path, + task_train_kwargs: Optional[Dict[str, Any]], + early_stopping_kwargs: Optional[Dict[str, Any]], + ) -> "LightningTask": pass diff --git a/embeddings/task/lightning_task/sequence_labeling.py b/embeddings/task/lightning_task/sequence_labeling.py index cd633b40..7a698050 100644 --- a/embeddings/task/lightning_task/sequence_labeling.py +++ b/embeddings/task/lightning_task/sequence_labeling.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import numpy as np from numpy import typing as nptyping @@ -21,8 +21,6 @@ def __init__( task_train_kwargs: Dict[str, Any], early_stopping_kwargs: Dict[str, Any], logging_config: LightningLoggingConfig, - train_batch_size: int = 32, - eval_batch_size: int = 32, finetune_last_n_layers: int = -1, ) -> None: super().__init__(output_path, task_train_kwargs, early_stopping_kwargs, logging_config) @@ -30,8 +28,6 @@ def __init__( self.num_classes = num_classes self.model_config_kwargs = model_config_kwargs self.task_model_kwargs = task_model_kwargs - self.train_batch_size = train_batch_size - self.eval_batch_size = eval_batch_size self.finetune_last_n_layers = finetune_last_n_layers def build_task_model(self) -> None: @@ -43,9 +39,6 @@ def build_task_model(self) -> None: task_model_kwargs=self.task_model_kwargs, ) - def restore_task_model(self, checkpoint_path: str) -> None: - self.model = SequenceLabelingModule.load_from_checkpoint(checkpoint_path) - def predict(self, dataloader: DataLoader[Any]) -> Dict[str, nptyping.NDArray[Any]]: assert self.model is not None results = self.model.predict(dataloader=dataloader) @@ -80,3 +73,19 @@ def _map_filter_data( getattr(self.trainer, "datamodule").id2str(x.item()) for x in data[ground_truth_data != self.model.ignore_index] ] + + @classmethod + def from_checkpoint( + cls, + checkpoint_path: T_path, + output_path: T_path, + task_train_kwargs: Optional[Dict[str, Any]], + early_stopping_kwargs: Optional[Dict[str, Any]], + ) -> "LightningTask": + return cls.restore_task_model( + checkpoint_path=checkpoint_path, + output_path=output_path, + task_train_kwargs=task_train_kwargs, + early_stopping_kwargs=early_stopping_kwargs, + lightning_module=SequenceLabelingModule, + ) diff --git 
a/embeddings/task/lightning_task/text_classification.py b/embeddings/task/lightning_task/text_classification.py index 495e6011..dc014302 100644 --- a/embeddings/task/lightning_task/text_classification.py +++ b/embeddings/task/lightning_task/text_classification.py @@ -1,4 +1,4 @@ -from typing import Any, Dict +from typing import Any, Dict, Optional import numpy as np from numpy import typing as nptyping @@ -29,6 +29,7 @@ def __init__( self.model_config_kwargs = model_config_kwargs self.task_model_kwargs = task_model_kwargs self.finetune_last_n_layers = finetune_last_n_layers + self.task_train_kwargs = task_train_kwargs def build_task_model(self) -> None: self.model = TextClassificationModule( @@ -39,9 +40,6 @@ def build_task_model(self) -> None: task_model_kwargs=self.task_model_kwargs, ) - def restore_task_model(self, checkpoint_path: str) -> None: - self.model = TextClassificationModule.load_from_checkpoint(checkpoint_path) - def predict(self, dataloader: DataLoader[Any]) -> Dict[str, nptyping.NDArray[Any]]: assert self.model is not None results = self.model.predict(dataloader=dataloader) @@ -49,3 +47,19 @@ def predict(self, dataloader: DataLoader[Any]) -> Dict[str, nptyping.NDArray[Any assert hasattr(self.trainer, "datamodule") results["names"] = np.array(getattr(self.trainer, "datamodule").target_names) return results + + @classmethod + def from_checkpoint( + cls, + checkpoint_path: T_path, + output_path: T_path, + task_train_kwargs: Optional[Dict[str, Any]], + early_stopping_kwargs: Optional[Dict[str, Any]], + ) -> "LightningTask": + return cls.restore_task_model( + checkpoint_path=checkpoint_path, + output_path=output_path, + task_train_kwargs=task_train_kwargs, + early_stopping_kwargs=early_stopping_kwargs, + lightning_module=TextClassificationModule, + ) diff --git a/notebooks/validate_lightning_models_inference.ipynb b/notebooks/validate_lightning_models_inference.ipynb index fb3e5a41..151b8603 100644 --- a/notebooks/validate_lightning_models_inference.ipynb +++ b/notebooks/validate_lightning_models_inference.ipynb @@ -28,6 +28,7 @@ "import pytorch_lightning as pl\n", "import torch\n", "from embeddings.defaults import RESULTS_PATH\n", + "from embeddings.task.lightning_task.text_classification import TextClassificationTask\n", "from embeddings.model.lightning_module.text_classification import (\n", " TextClassificationModule,\n", ")\n", @@ -52,12 +53,13 @@ }, "outputs": [], "source": [ - "embedding_name_or_path = \"allegro/herbert-base-cased\"\n", + "embedding_name_or_path = \"hf-internal-testing/tiny-albert\"\n", "dataset_name = \"clarin-pl/polemo2-official\"\n", "input_columns_name = \"text\"\n", "target_column_name = \"target\"\n", - "path = TemporaryDirectory()\n", - "output_path = path.name\n", + "# path = TemporaryDirectory()\n", + "# output_path = path.name\n", + "output_path = \".\"\n", "\n", "pipeline = LightningClassificationPipeline(\n", " embedding_name_or_path=embedding_name_or_path,\n", @@ -72,12 +74,7 @@ " \"test_domains\": [\"hotels\", \"medicine\"],\n", " \"text_cfg\": \"text\",\n", " },\n", - " datamodule_kwargs={\n", - " \"max_seq_length\": 64,\n", - " \"downsample_train\": 0.005,\n", - " \"downsample_val\": 0.01,\n", - " \"downsample_test\": 0.01,\n", - " },\n", + " datamodule_kwargs={\"max_seq_length\": 64,},\n", " task_train_kwargs={\n", " \"max_epochs\": 1,\n", " \"devices\": \"auto\",\n", @@ -106,11 +103,26 @@ "ckpt_path = (\n", " Path(output_path)\n", " / \"lightning_logs\"\n", - " / \"version_0\"\n", + " / \"version_1\"\n", " / \"checkpoints\"\n", - " / 
\"epoch=0-step=0.ckpt\"\n", + " / \"epoch=0-step=180.ckpt\"\n", ")\n", - "ckpt_path" + "ckpt_path.resolve()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2785fcbc-1c95-4d23-807f-a14569992354", + "metadata": {}, + "outputs": [], + "source": [ + "task_from_ckpt = TextClassificationTask.from_checkpoint(\n", + " checkpoint_path=ckpt_path,\n", + " output_path=output_path,\n", + " task_train_kwargs={},\n", + " early_stopping_kwargs={},\n", + ")" ] }, { @@ -143,7 +155,7 @@ "outputs": [], "source": [ "model_state_dict = pipeline.model.task.model.model.state_dict()\n", - "model_from_ckpt_state_dict = model_from_ckpt.model.state_dict()" + "model_from_ckpt_state_dict = task_from_ckpt.model.model.state_dict()" ] }, { @@ -185,6 +197,12 @@ "pipeline.model.task.trainer.save_checkpoint(\"example.ckpt\")\n", "new_model = TextClassificationModule.load_from_checkpoint(\n", " checkpoint_path=\"example.ckpt\"\n", + ")\n", + "new_task_from_ckpt = TextClassificationTask.from_checkpoint(\n", + " checkpoint_path=ckpt_path,\n", + " output_path=output_path,\n", + " task_train_kwargs={},\n", + " early_stopping_kwargs={},\n", ")" ] }, @@ -199,31 +217,31 @@ { "cell_type": "code", "execution_count": null, - "id": "f3afa250-2937-4aad-bb3c-172a68639892", + "id": "4ad7b9b0-823a-4c8e-aac5-61a333558ed1", "metadata": {}, "outputs": [], "source": [ - "trainer = pl.Trainer()\n", "test_dataloader = pipeline.datamodule.test_dataloader()\n", - "predictions = trainer.predict(model_from_ckpt, dataloaders=test_dataloader)" + "preds = task_from_ckpt.predict(test_dataloader)" ] }, { "cell_type": "code", "execution_count": null, - "id": "09f45c8b-791b-43b4-9826-f798d48b9d97", + "id": "f3afa250-2937-4aad-bb3c-172a68639892", "metadata": {}, "outputs": [], "source": [ - "predictions" + "trainer = pl.Trainer()\n", + "preds_other = trainer.predict(model_from_ckpt, dataloaders=test_dataloader)" ] } ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:clarinpl-embeddings]", + "display_name": "Python [conda env:embeddings]", "language": "python", - "name": "conda-env-clarinpl-embeddings-py" + "name": "conda-env-embeddings-py" }, "language_info": { "codemirror_mode": { From 03943f774a3b7824a13b87a093c486fe269de07d Mon Sep 17 00:00:00 2001 From: djaniak Date: Thu, 7 Apr 2022 16:51:12 +0200 Subject: [PATCH 06/22] fix: restore inference after rebase for lightning --- .../lightning_module/lightning_module.py | 20 +++++++++++++----- .../task/lightning_task/lightning_task.py | 17 +++++++++++---- .../task/lightning_task/sequence_labeling.py | 21 ++++++++++++------- .../lightning_task/text_classification.py | 17 +++++++++------ 4 files changed, 52 insertions(+), 23 deletions(-) diff --git a/embeddings/model/lightning_module/lightning_module.py b/embeddings/model/lightning_module/lightning_module.py index 62ee6465..f0a8b0a9 100644 --- a/embeddings/model/lightning_module/lightning_module.py +++ b/embeddings/model/lightning_module/lightning_module.py @@ -6,6 +6,7 @@ import pytorch_lightning as pl import torch from numpy import typing as nptyping +from pytorch_lightning.utilities.exceptions import MisconfigurationException from pytorch_lightning.utilities.types import STEP_OUTPUT from torch.nn.functional import softmax from torch.optim import Optimizer @@ -68,11 +69,7 @@ def predict_step(self, *args: Any, **kwargs: Any) -> Optional[Tuple[STEP_OUTPUT, def predict( self, dataloader: DataLoader[HuggingFaceDataset] ) -> Dict[str, nptyping.NDArray[Any]]: - assert self.trainer is not None - logits_predictions = 
self.trainer.predict( - model=self, dataloaders=dataloader, return_predictions=True, ckpt_path="best" - ) - logits, predictions = zip(*logits_predictions) + logits, predictions = zip(*self._predict_with_trainer(dataloader)) probabilities = softmax(torch.cat(logits), dim=1).numpy() predictions = torch.cat(predictions).numpy() ground_truth = torch.cat([x["labels"] for x in dataloader]).numpy() @@ -80,6 +77,19 @@ def predict( assert all(isinstance(x, np.ndarray) for x in result.values()) return result + def _predict_with_trainer(self, dataloader: DataLoader[HuggingFaceDataset]) -> torch.Tensor: + assert self.trainer is not None + try: + return self.trainer.predict( + model=self, dataloaders=dataloader, return_predictions=True, ckpt_path="best" + ) + except MisconfigurationException: # model loaded but not fitted + return self.trainer.predict( + model=self, + dataloaders=dataloader, + return_predictions=True, + ) + def configure_metrics(self) -> None: if self.metrics is None: self.metrics = self.get_default_metrics() diff --git a/embeddings/task/lightning_task/lightning_task.py b/embeddings/task/lightning_task/lightning_task.py index 1a5129af..c798b7f6 100644 --- a/embeddings/task/lightning_task/lightning_task.py +++ b/embeddings/task/lightning_task/lightning_task.py @@ -93,7 +93,9 @@ def fit( raise e @abc.abstractmethod - def predict(self, dataloader: DataLoader[Any]) -> Dict[str, nptyping.NDArray[Any]]: + def predict( + self, dataloader: DataLoader[Any], return_names: bool = True + ) -> Dict[str, nptyping.NDArray[Any]]: pass def fit_predict( @@ -125,9 +127,14 @@ def restore_task_model( lightning_module: Type[LightningModule[AutoModel]], task_train_kwargs: Optional[Dict[str, Any]], early_stopping_kwargs: Optional[Dict[str, Any]], + logging_config: Optional[LightningLoggingConfig], ) -> "LightningTask": model = lightning_module.load_from_checkpoint(str(checkpoint_path)) - trainer = pl.Trainer(default_root_dir=str(output_path), **task_train_kwargs or {}) + trainer = pl.Trainer( + default_root_dir=str(output_path), + callbacks=[ModelCheckpoint(dirpath=Path(output_path).joinpath("checkpoints"))], + **task_train_kwargs or {} + ) init_kwargs = { "model_name_or_path": model.hparams.model_name_or_path, "output_path": output_path, @@ -137,6 +144,7 @@ def restore_task_model( "task_model_kwargs": model.hparams.task_model_kwargs, "task_train_kwargs": task_train_kwargs or {}, "early_stopping_kwargs": early_stopping_kwargs or {}, + "logging_config": logging_config or LightningLoggingConfig(), } task = cls(**init_kwargs) task.model = model @@ -150,7 +158,8 @@ def from_checkpoint( cls, checkpoint_path: T_path, output_path: T_path, - task_train_kwargs: Optional[Dict[str, Any]], - early_stopping_kwargs: Optional[Dict[str, Any]], + task_train_kwargs: Optional[Dict[str, Any]] = None, + early_stopping_kwargs: Optional[Dict[str, Any]] = None, + logging_config: Optional[LightningLoggingConfig] = None, ) -> "LightningTask": pass diff --git a/embeddings/task/lightning_task/sequence_labeling.py b/embeddings/task/lightning_task/sequence_labeling.py index 7a698050..d878caec 100644 --- a/embeddings/task/lightning_task/sequence_labeling.py +++ b/embeddings/task/lightning_task/sequence_labeling.py @@ -39,7 +39,9 @@ def build_task_model(self) -> None: task_model_kwargs=self.task_model_kwargs, ) - def predict(self, dataloader: DataLoader[Any]) -> Dict[str, nptyping.NDArray[Any]]: + def predict( + self, dataloader: DataLoader[Any], return_names: bool = True + ) -> Dict[str, nptyping.NDArray[Any]]: assert self.model is not 
None results = self.model.predict(dataloader=dataloader) predictions, ground_truth, probabilities = ( @@ -53,15 +55,16 @@ def predict(self, dataloader: DataLoader[Any]) -> Dict[str, nptyping.NDArray[Any ground_truth[i] = self._map_filter_data(gt, gt) probabilities[i] = [x for x in probs[gt != self.model.ignore_index]] - assert self.trainer is not None - assert hasattr(self.trainer, "datamodule") - names = getattr(self.trainer, "datamodule").target_names - return { + results = { "y_pred": np.array(predictions, dtype=object), "y_true": np.array(ground_truth, dtype=object), "y_probabilities": np.array(probabilities, dtype=object), - "names": np.array(names), } + if return_names: + assert self.trainer is not None + assert hasattr(self.trainer, "datamodule") + results["names"] = np.array(getattr(self.trainer, "datamodule").target_names) + return results def _map_filter_data( self, data: nptyping.NDArray[Any], ground_truth_data: nptyping.NDArray[Any] @@ -79,8 +82,9 @@ def from_checkpoint( cls, checkpoint_path: T_path, output_path: T_path, - task_train_kwargs: Optional[Dict[str, Any]], - early_stopping_kwargs: Optional[Dict[str, Any]], + task_train_kwargs: Optional[Dict[str, Any]] = None, + early_stopping_kwargs: Optional[Dict[str, Any]] = None, + logging_config: Optional[LightningLoggingConfig] = None, ) -> "LightningTask": return cls.restore_task_model( checkpoint_path=checkpoint_path, @@ -88,4 +92,5 @@ def from_checkpoint( task_train_kwargs=task_train_kwargs, early_stopping_kwargs=early_stopping_kwargs, lightning_module=SequenceLabelingModule, + logging_config=logging_config, ) diff --git a/embeddings/task/lightning_task/text_classification.py b/embeddings/task/lightning_task/text_classification.py index dc014302..996392a1 100644 --- a/embeddings/task/lightning_task/text_classification.py +++ b/embeddings/task/lightning_task/text_classification.py @@ -40,12 +40,15 @@ def build_task_model(self) -> None: task_model_kwargs=self.task_model_kwargs, ) - def predict(self, dataloader: DataLoader[Any]) -> Dict[str, nptyping.NDArray[Any]]: + def predict( + self, dataloader: DataLoader[Any], return_names: bool = True + ) -> Dict[str, nptyping.NDArray[Any]]: assert self.model is not None results = self.model.predict(dataloader=dataloader) - assert self.trainer is not None - assert hasattr(self.trainer, "datamodule") - results["names"] = np.array(getattr(self.trainer, "datamodule").target_names) + if return_names: + assert self.trainer is not None + assert hasattr(self.trainer, "datamodule") + results["names"] = np.array(getattr(self.trainer, "datamodule").target_names) return results @classmethod @@ -53,8 +56,9 @@ def from_checkpoint( cls, checkpoint_path: T_path, output_path: T_path, - task_train_kwargs: Optional[Dict[str, Any]], - early_stopping_kwargs: Optional[Dict[str, Any]], + task_train_kwargs: Optional[Dict[str, Any]] = None, + early_stopping_kwargs: Optional[Dict[str, Any]] = None, + logging_config: Optional[LightningLoggingConfig] = None, ) -> "LightningTask": return cls.restore_task_model( checkpoint_path=checkpoint_path, @@ -62,4 +66,5 @@ def from_checkpoint( task_train_kwargs=task_train_kwargs, early_stopping_kwargs=early_stopping_kwargs, lightning_module=TextClassificationModule, + logging_config=logging_config, ) From 29519d45b266740df44bd6e831c46b9cb1a78b53 Mon Sep 17 00:00:00 2001 From: djaniak Date: Thu, 7 Apr 2022 16:51:56 +0200 Subject: [PATCH 07/22] feat: tests for lightning inference --- tests/test_lightning_inference.py | 126 ++++++++++++++++++++++++++++++ 1 file changed, 126 
insertions(+) create mode 100644 tests/test_lightning_inference.py diff --git a/tests/test_lightning_inference.py b/tests/test_lightning_inference.py new file mode 100644 index 00000000..469f3da4 --- /dev/null +++ b/tests/test_lightning_inference.py @@ -0,0 +1,126 @@ +from pathlib import Path +from typing import Any, Dict, Tuple + +import datasets +import numpy as np +import pytest +import pytorch_lightning as pl +import torch +from _pytest.tmpdir import TempdirFactory + +from embeddings.pipeline.hf_preprocessing_pipeline import HuggingFacePreprocessingPipeline +from embeddings.pipeline.lightning_classification import LightningClassificationPipeline +from embeddings.pipeline.lightning_pipeline import LightningPipeline +from embeddings.task.lightning_task.text_classification import TextClassificationTask + + +@pytest.fixture(scope="module") +def tmp_path_module(tmpdir_factory: TempdirFactory) -> Path: + path = tmpdir_factory.mktemp(__name__) + return Path(path) + + +@pytest.fixture(scope="module") +def pipeline_kwargs() -> Dict[str, Any]: + return { + "embedding_name_or_path": "hf-internal-testing/tiny-albert", + "finetune_last_n_layers": 0, + } + + +@pytest.fixture(scope="module") +def dataset_kwargs(tmp_path_module) -> Dict[str, Any]: + path = str(tmp_path_module) + pipeline = HuggingFacePreprocessingPipeline( + dataset_name="clarin-pl/polemo2-official", + load_dataset_kwargs={ + "train_domains": ["hotels", "medicine"], + "dev_domains": ["hotels", "medicine"], + "test_domains": ["hotels", "medicine"], + "text_cfg": "text", + }, + persist_path=path, + sample_missing_splits=None, + ignore_test_subset=False, + downsample_splits=(0.01, 0.01, 0.05), + seed=441, + ) + pipeline.run() + + return { + "dataset_name_or_path": path, + "input_column_name": ["text"], + "target_column_name": "target", + } + + +@pytest.fixture(scope="module") +def task_train_kwargs() -> Dict[str, Any]: + return { + "max_epochs": 1, + "devices": "auto", + "accelerator": "cpu", + "deterministic": True, + } + + +@pytest.fixture(scope="module") +def task_model_kwargs() -> Dict[str, Any]: + return {"learning_rate": 5e-4, "use_scheduler": False} + + +@pytest.fixture(scope="module") +def datamodule_kwargs() -> Dict[str, Any]: + return {"num_workers": 0} + + +@pytest.fixture(scope="module") +def lightning_classification_pipeline( + pipeline_kwargs: Dict[str, Any], + dataset_kwargs: Dict[str, Any], + datamodule_kwargs: Dict[str, Any], + task_train_kwargs: Dict[str, Any], + task_model_kwargs: Dict[str, Any], + result_path: Path, +) -> Tuple[LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]], Path]: + return ( + LightningClassificationPipeline( + output_path=result_path.name, + **pipeline_kwargs, + **dataset_kwargs, + datamodule_kwargs=datamodule_kwargs, + task_train_kwargs=task_train_kwargs, + task_model_kwargs=task_model_kwargs, + ), + result_path, + ) + + +def test_lightning_pipeline_inference( + lightning_classification_pipeline: Tuple[ + LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]], + Path, + ], +) -> None: + pl.seed_everything(441, workers=True) + + pipeline, path = lightning_classification_pipeline + results = pipeline.run() + + ckpt_path = Path(path.name) / "checkpoints" / "epoch=0-step=1.ckpt" + task_from_ckpt = TextClassificationTask.from_checkpoint( + checkpoint_path=ckpt_path.resolve(), + output_path=path.name, + ) + + model_state_dict = pipeline.model.task.model.model.state_dict() + model_from_ckpt_state_dict = task_from_ckpt.model.model.state_dict() + + 
assert model_state_dict.keys() == model_from_ckpt_state_dict.keys() + for k in model_state_dict.keys(): + assert torch.equal(model_state_dict[k], model_from_ckpt_state_dict[k]) + + test_dataloader = pipeline.datamodule.test_dataloader() + predictions = task_from_ckpt.predict(test_dataloader, return_names=False) + + assert np.array_equal(results["data"]["y_probabilities"], predictions["y_probabilities"]) From 282f4ab6994d7b6b4dc90eb34084b169c5c752de Mon Sep 17 00:00:00 2001 From: djaniak Date: Thu, 7 Apr 2022 18:08:18 +0200 Subject: [PATCH 08/22] refactor: move lightning inference notebook to tutorials and refactor --- .../validate_lightning_models_inference.ipynb | 186 ++++++++++-------- 1 file changed, 102 insertions(+), 84 deletions(-) rename {notebooks => tutorials}/validate_lightning_models_inference.ipynb (52%) diff --git a/notebooks/validate_lightning_models_inference.ipynb b/tutorials/validate_lightning_models_inference.ipynb similarity index 52% rename from notebooks/validate_lightning_models_inference.ipynb rename to tutorials/validate_lightning_models_inference.ipynb index 151b8603..456ef63b 100644 --- a/notebooks/validate_lightning_models_inference.ipynb +++ b/tutorials/validate_lightning_models_inference.ipynb @@ -21,19 +21,75 @@ "import os\n", "\n", "os.chdir(\"..\")\n", - "\n", - "from pathlib import Path\n", - "from tempfile import TemporaryDirectory\n", + "from typing import Any, Dict\n", "\n", "import pytorch_lightning as pl\n", - "import torch\n", - "from embeddings.defaults import RESULTS_PATH\n", - "from embeddings.task.lightning_task.text_classification import TextClassificationTask\n", + "from embeddings.defaults import DATASET_PATH, RESULTS_PATH\n", "from embeddings.model.lightning_module.text_classification import (\n", " TextClassificationModule,\n", ")\n", + "from embeddings.pipeline.hf_preprocessing_pipeline import (\n", + " HuggingFacePreprocessingPipeline,\n", + ")\n", "from embeddings.pipeline.lightning_classification import LightningClassificationPipeline\n", - "from embeddings.utils.utils import build_output_path, format_eval_result" + "from embeddings.task.lightning_task.text_classification import TextClassificationTask\n", + "from embeddings.utils.utils import build_output_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2d0e06e2-3c5a-420b-b065-31d5ccd6b255", + "metadata": {}, + "outputs": [], + "source": [ + "embedding_name_or_path = \"hf-internal-testing/tiny-albert\"\n", + "dataset_name = \"clarin-pl/polemo2-official\"\n", + "\n", + "dataset_path = build_output_path(DATASET_PATH, embedding_name_or_path, dataset_name)\n", + "output_path = build_output_path(RESULTS_PATH, embedding_name_or_path, dataset_name)" + ] + }, + { + "cell_type": "markdown", + "id": "b6d0098c-41ec-473a-954a-709f7fb05922", + "metadata": {}, + "source": [ + "### Preprocess and downsample data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "095d1c88-900f-4275-a879-f9efdb73265a", + "metadata": {}, + "outputs": [], + "source": [ + "def preprocess_data(path: str) -> Dict[str, Any]:\n", + " pipeline = HuggingFacePreprocessingPipeline(\n", + " dataset_name=dataset_name,\n", + " load_dataset_kwargs={\n", + " \"train_domains\": [\"hotels\", \"medicine\"],\n", + " \"dev_domains\": [\"hotels\", \"medicine\"],\n", + " \"test_domains\": [\"hotels\", \"medicine\"],\n", + " \"text_cfg\": \"text\",\n", + " },\n", + " persist_path=path,\n", + " sample_missing_splits=None,\n", + " ignore_test_subset=False,\n", + " downsample_splits=(0.01, 0.01, 
0.05),\n", + " seed=441,\n", + " )\n", + " pipeline.run()\n", + "\n", + " return {\n", + " \"dataset_name_or_path\": path,\n", + " \"input_column_name\": [\"text\"],\n", + " \"target_column_name\": \"target\",\n", + " }\n", + "\n", + "\n", + "dataset_kwargs = preprocess_data(dataset_path)" ] }, { @@ -53,27 +109,10 @@ }, "outputs": [], "source": [ - "embedding_name_or_path = \"hf-internal-testing/tiny-albert\"\n", - "dataset_name = \"clarin-pl/polemo2-official\"\n", - "input_columns_name = \"text\"\n", - "target_column_name = \"target\"\n", - "# path = TemporaryDirectory()\n", - "# output_path = path.name\n", - "output_path = \".\"\n", - "\n", "pipeline = LightningClassificationPipeline(\n", " embedding_name_or_path=embedding_name_or_path,\n", - " dataset_name_or_path=dataset_name,\n", - " input_column_name=input_columns_name,\n", - " target_column_name=target_column_name,\n", " output_path=output_path,\n", " finetune_last_n_layers=0,\n", - " load_dataset_kwargs={\n", - " \"train_domains\": [\"hotels\", \"medicine\"],\n", - " \"dev_domains\": [\"hotels\", \"medicine\"],\n", - " \"test_domains\": [\"hotels\", \"medicine\"],\n", - " \"text_cfg\": \"text\",\n", - " },\n", " datamodule_kwargs={\"max_seq_length\": 64,},\n", " task_train_kwargs={\n", " \"max_epochs\": 1,\n", @@ -81,6 +120,7 @@ " \"accelerator\": \"cpu\",\n", " \"deterministic\": True,\n", " },\n", + " **dataset_kwargs\n", ")\n", "result = pipeline.run()" ] @@ -100,14 +140,8 @@ "metadata": {}, "outputs": [], "source": [ - "ckpt_path = (\n", - " Path(output_path)\n", - " / \"lightning_logs\"\n", - " / \"version_1\"\n", - " / \"checkpoints\"\n", - " / \"epoch=0-step=180.ckpt\"\n", - ")\n", - "ckpt_path.resolve()" + "ckpt_path = output_path / \"checkpoints\" / \"epoch=0-step=1.ckpt\"\n", + "ckpt_path" ] }, { @@ -118,13 +152,18 @@ "outputs": [], "source": [ "task_from_ckpt = TextClassificationTask.from_checkpoint(\n", - " checkpoint_path=ckpt_path,\n", - " output_path=output_path,\n", - " task_train_kwargs={},\n", - " early_stopping_kwargs={},\n", + " checkpoint_path=ckpt_path, output_path=output_path,\n", ")" ] }, + { + "cell_type": "markdown", + "id": "13272a49-8ef5-41af-80a3-5cf3b7b677c7", + "metadata": {}, + "source": [ + "#### Alternatively we can load the model" + ] + }, { "cell_type": "code", "execution_count": null, @@ -134,95 +173,73 @@ }, "outputs": [], "source": [ - "model_from_ckpt = TextClassificationModule.load_from_checkpoint(\n", - " str(ckpt_path), strict=True\n", - ")" + "model_from_ckpt = TextClassificationModule.load_from_checkpoint(str(ckpt_path))" ] }, { "cell_type": "markdown", - "id": "a2c6e7fe-39c6-4fcb-87a1-1ed688c33adf", + "id": "103e7972-c386-4c44-9b58-0385213f20f8", "metadata": {}, "source": [ - "### Validate model states (because of the warning)" + "The warning appears when loading the model, however, it was validated that the loaded weights are the same as the weights that are being saved. 
The reason for this is that when the model_state_dict keys are loaded from the cached huggingface model, some of them (cls.(...)) do not match the keys from the state_dict of the model weights that are saved.\n", "\n", "https://github.com/CLARIN-PL/embeddings/issues/225" ] }, { "cell_type": "markdown", "id": "88e7a6c7-449f-4d0c-9042-a5f98aebc14b", "metadata": {}, "source": [ "### Use task from checkpoint for predictions" ] }, { "cell_type": "markdown", "id": "c5eeab69-e13c-4ba4-b0ea-2473555915d9", "metadata": {}, "source": [ "`return_names` needs to be set to False since it uses the `datamodule` to retrieve the names, while the `datamodule` is not loaded into the `Trainer` in the `LightningTask` because we have not fitted it yet." ] }, { "cell_type": "code", "execution_count": null, "id": "4ad7b9b0-823a-4c8e-aac5-61a333558ed1", "metadata": {}, "outputs": [], "source": [ "test_dataloader = pipeline.datamodule.test_dataloader()\n", "preds = task_from_ckpt.predict(test_dataloader, return_names=False)\n", "preds" ] }, { "cell_type": "markdown", "id": "9c789d71-2368-4add-8a7b-f51571aecfbd", "metadata": {}, "source": [ "Alternatively, we can manually assign the `datamodule` to the `Trainer` in `LightningTask`." ] }, { "cell_type": "code", "execution_count": null, "id": "9836dc5d-8ee2-46fc-b7d8-94841cc13ce5", "metadata": {}, "outputs": [], "source": [ "task_from_ckpt.trainer.datamodule = pipeline.datamodule\n", "preds_with_names = task_from_ckpt.predict(test_dataloader, return_names=True)\n", "preds_with_names" ] }, { "cell_type": "markdown", "id": "29c321e2-9ecc-4b65-936b-c8e7cca1155a", "metadata": {}, "source": [ "We can also use the previously loaded lightning model (`LightningModule`) outside of the task and get the predictions. To do this we also need to initialize a `Trainer`."
] }, { @@ -232,8 +249,9 @@ "metadata": {}, "outputs": [], "source": [ - "trainer = pl.Trainer()\n", - "preds_other = trainer.predict(model_from_ckpt, dataloaders=test_dataloader)" + "trainer = pl.Trainer(default_root_dir=str(output_path))\n", + "preds_from_model = trainer.predict(model_from_ckpt, dataloaders=test_dataloader)\n", + "preds_from_model" ] } ], @@ -258,4 +276,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} +} \ No newline at end of file From b6a2771c9fddb48369001c9769d8589d7f0d4734 Mon Sep 17 00:00:00 2001 From: djaniak Date: Fri, 8 Apr 2022 00:08:36 +0200 Subject: [PATCH 09/22] refactor: switch to herbert for testing inference --- tests/test_lightning_inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_lightning_inference.py b/tests/test_lightning_inference.py index 469f3da4..1c1e10af 100644 --- a/tests/test_lightning_inference.py +++ b/tests/test_lightning_inference.py @@ -23,7 +23,7 @@ def tmp_path_module(tmpdir_factory: TempdirFactory) -> Path: @pytest.fixture(scope="module") def pipeline_kwargs() -> Dict[str, Any]: return { - "embedding_name_or_path": "hf-internal-testing/tiny-albert", + "embedding_name_or_path": "allegro/herbert-base-cased", "finetune_last_n_layers": 0, } From a99050219ed27ca3e5bb84b5d17ad56e9eed2287 Mon Sep 17 00:00:00 2001 From: djaniak Date: Fri, 8 Apr 2022 00:58:41 +0200 Subject: [PATCH 10/22] feat: implement flair task from_checkpoint method --- embeddings/task/flair_task/flair_task.py | 31 +- .../task/flair_task/sequence_labeling.py | 14 + .../task/flair_task/text_classification.py | 14 + .../validate_flair_models_inference.ipynb | 604 ++++++++++++++++-- 4 files changed, 626 insertions(+), 37 deletions(-) diff --git a/embeddings/task/flair_task/flair_task.py b/embeddings/task/flair_task/flair_task.py index 3d4bb5c9..1172ca3d 100644 --- a/embeddings/task/flair_task/flair_task.py +++ b/embeddings/task/flair_task/flair_task.py @@ -1,9 +1,10 @@ import abc from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple, Type import flair from flair.data import Corpus, Dictionary, Sentence +from flair.models import SequenceTagger from flair.trainers import ModelTrainer from numpy import typing as nptyping from typing_extensions import Literal @@ -106,3 +107,31 @@ def get_y(data: List[Sentence], y_type: str, y_dictionary: Dictionary) -> nptypi @abc.abstractmethod def remove_labels_from_data(data: List[Sentence], y_type: str) -> None: pass + + @classmethod + def restore_task_model( + cls, + checkpoint_path: T_path, + output_path: T_path, + flair_model: Type[flair.nn.Model], + task_train_kwargs: Optional[Dict[str, Any]], + ) -> "FlairTask": + model = flair_model.load(checkpoint_path) + task_model_kwargs = ( + {"hidden_size": model.hidden_size} if isinstance(model, SequenceTagger) else {} + ) + task = cls( + output_path=output_path, task_train_kwargs=task_train_kwargs or {}, **task_model_kwargs + ) + task.model = model + return task + + @classmethod + @abc.abstractmethod + def from_checkpoint( + cls, + checkpoint_path: T_path, + output_path: T_path, + task_train_kwargs: Optional[Dict[str, Any]] = None, + ) -> "FlairTask": + pass diff --git a/embeddings/task/flair_task/sequence_labeling.py b/embeddings/task/flair_task/sequence_labeling.py index aef4e189..6349f9c4 100644 --- a/embeddings/task/flair_task/sequence_labeling.py +++ b/embeddings/task/flair_task/sequence_labeling.py @@ -64,3 +64,17 @@ def remove_labels_from_data(data: List[Sentence], y_type: str) -> 
None: for sent in data: for token in sent: token.remove_labels(y_type) + + @classmethod + def from_checkpoint( + cls, + checkpoint_path: T_path, + output_path: T_path, + task_train_kwargs: Optional[Dict[str, Any]] = None, + ) -> "FlairTask": + return cls.restore_task_model( + checkpoint_path=checkpoint_path, + output_path=output_path, + flair_model=SequenceTagger, + task_train_kwargs=task_train_kwargs, + ) diff --git a/embeddings/task/flair_task/text_classification.py b/embeddings/task/flair_task/text_classification.py index 8120e08a..7dc8a31b 100644 --- a/embeddings/task/flair_task/text_classification.py +++ b/embeddings/task/flair_task/text_classification.py @@ -57,3 +57,17 @@ def get_y(data: List[Sentence], y_type: str, y_dictionary: Dictionary) -> nptypi def remove_labels_from_data(data: List[Sentence], y_type: str) -> None: for sentence in data: sentence.remove_labels(y_type) + + @classmethod + def from_checkpoint( + cls, + checkpoint_path: T_path, + output_path: T_path, + task_train_kwargs: Optional[Dict[str, Any]] = None, + ) -> "FlairTask": + return cls.restore_task_model( + checkpoint_path=checkpoint_path, + output_path=output_path, + flair_model=TextClassifier, + task_train_kwargs=task_train_kwargs, + ) diff --git a/notebooks/validate_flair_models_inference.ipynb b/notebooks/validate_flair_models_inference.ipynb index d7bce5bd..0257a589 100644 --- a/notebooks/validate_flair_models_inference.ipynb +++ b/notebooks/validate_flair_models_inference.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "418b9661-aea2-4990-8e26-e7f0e167b9b2", "metadata": {}, "outputs": [], @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "f23bfd51-d1f4-4321-aed9-96f51b171fe9", "metadata": {}, "outputs": [], @@ -22,21 +22,9 @@ "\n", "os.chdir(\"..\")\n", "\n", - "\n", - "from pathlib import Path\n", - "from tempfile import TemporaryDirectory\n", - "from typing import Any, Dict, Tuple\n", - "\n", - "import datasets\n", - "import flair\n", - "import numpy as np\n", - "import pytest\n", - "import torch\n", "from embeddings.data.data_loader import HuggingFaceDataLoader\n", - "from embeddings.data.dataset import HuggingFaceDataset\n", - "from embeddings.defaults import RESULTS_PATH\n", + "from embeddings.defaults import DATASET_PATH, RESULTS_PATH\n", "from embeddings.embedding.auto_flair import AutoFlairWordEmbedding\n", - "from embeddings.embedding.flair_embedding import FlairEmbedding\n", "from embeddings.evaluator.sequence_labeling_evaluator import SequenceLabelingEvaluator\n", "from embeddings.model.flair_model import FlairModel\n", "from embeddings.pipeline.standard_pipeline import StandardPipeline\n", @@ -44,14 +32,15 @@ "from embeddings.transformation.flair_transformation.column_corpus_transformation import (\n", " ColumnCorpusTransformation,\n", ")\n", + "from embeddings.data.dataset import Dataset\n", + "\n", "from embeddings.transformation.flair_transformation.downsample_corpus_transformation import (\n", " DownsampleFlairCorpusTransformation,\n", ")\n", "from embeddings.transformation.flair_transformation.split_sample_corpus_transformation import (\n", " SampleSplitsFlairCorpusTransformation,\n", ")\n", - "from flair.data import Corpus\n", - "from numpy import typing as nptyping" + "from embeddings.utils.utils import build_output_path" ] }, { @@ -64,26 +53,40 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "id": "dd4fb7d6-1e81-4bea-9bd3-b4a4bec87fc9", 
"metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2022-04-08 00:41:27,426 - embeddings.utils.utils - WARNING - String 'allegro/herbert-base-cased' contains '/'. Replacing it with '__'. Cleaned_text: allegro__herbert-base-cased.\n", + "2022-04-08 00:41:27,428 - embeddings.utils.utils - WARNING - String 'clarin-pl/kpwr-ner' contains '/'. Replacing it with '__'. Cleaned_text: clarin-pl__kpwr-ner.\n", + "2022-04-08 00:41:27,431 - embeddings.utils.utils - WARNING - String 'allegro/herbert-base-cased' contains '/'. Replacing it with '__'. Cleaned_text: allegro__herbert-base-cased.\n", + "2022-04-08 00:41:27,433 - embeddings.utils.utils - WARNING - String 'clarin-pl/kpwr-ner' contains '/'. Replacing it with '__'. Cleaned_text: clarin-pl__kpwr-ner.\n" + ] + } + ], "source": [ - "result_path = TemporaryDirectory()\n", + "embedding_name_or_path = \"allegro/herbert-base-cased\"\n", + "dataset_name = \"clarin-pl/kpwr-ner\"\n", + "\n", + "output_path = build_output_path(RESULTS_PATH, embedding_name_or_path, dataset_name)\n", "\n", - "dataset = HuggingFaceDataset(\"clarin-pl/kpwr-ner\")\n", + "dataset = Dataset(dataset_name)\n", "data_loader = HuggingFaceDataLoader()\n", "transformation = (\n", " ColumnCorpusTransformation(\"tokens\", \"ner\")\n", " .then(SampleSplitsFlairCorpusTransformation(dev_fraction=0.1, seed=441))\n", - " .then(DownsampleFlairCorpusTransformation(percentage=0.005))\n", + " .then(DownsampleFlairCorpusTransformation(downsample_train=0.005, downsample_dev=0.01, downsample_test=0.01))\n", ")\n", "task = SequenceLabeling(\n", - " result_path.name,\n", + " output_path,\n", " hidden_size=256,\n", - " task_train_kwargs={\"max_epochs\": 1, \"mini_batch_size\": 256},\n", + " task_train_kwargs={\"max_epochs\": 1, \"mini_batch_size\": 64},\n", ")\n", - "embedding = AutoFlairWordEmbedding.from_hub(\"allegro/herbert-base-cased\")\n", + "embedding = AutoFlairWordEmbedding.from_hub(embedding_name_or_path)\n", "model = FlairModel(embedding, task)\n", "evaluator = SequenceLabelingEvaluator()\n", "\n", @@ -92,12 +95,488 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "id": "2f69e538-332d-4278-977d-7002fe2b67bd", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Using custom data configuration default\n", + "Reusing dataset kpwrner (/home/djaniak/.cache/huggingface/datasets/clarin-pl___kpwrner/default/0.0.0/001e3d471298007e8412e3a6ccc06bec000dec1bce0cf8e0ba7e5b7e105b1342)\n" + ] + }, + { + "data": { + "text/plain": " 0%| | 0/2 [00:00\u001B[0;34m\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[0;32mfrom\u001B[0m \u001B[0mflair\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmodels\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mSequenceTagger\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 2\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m----> 3\u001B[0;31m \u001B[0mtask_from_ckpt\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mSequenceLabeling\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mfrom_checkpoint\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mcheckpoint_path\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0moutput_path\u001B[0m \u001B[0;34m/\u001B[0m \u001B[0;34m\"final-model.pt\"\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0moutput_path\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0moutput_path\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 4\u001B[0m 
\u001B[0;31m# trained_model = SequenceTagger.load(output_path / \"final-model.pt\")\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", + "\u001B[0;32m~/Projects/embeddings/embeddings/task/flair_task/sequence_labeling.py\u001B[0m in \u001B[0;36mfrom_checkpoint\u001B[0;34m(cls, checkpoint_path, output_path, task_train_kwargs)\u001B[0m\n\u001B[1;32m 73\u001B[0m \u001B[0mtask_train_kwargs\u001B[0m\u001B[0;34m:\u001B[0m \u001B[0mOptional\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mDict\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mstr\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mAny\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m]\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;32mNone\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 74\u001B[0m ) -> \"FlairTask\":\n\u001B[0;32m---> 75\u001B[0;31m return cls.restore_task_model(\n\u001B[0m\u001B[1;32m 76\u001B[0m \u001B[0mcheckpoint_path\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mcheckpoint_path\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 77\u001B[0m \u001B[0moutput_path\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0moutput_path\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", + "\u001B[0;32m~/Projects/embeddings/embeddings/task/flair_task/flair_task.py\u001B[0m in \u001B[0;36mrestore_task_model\u001B[0;34m(cls, checkpoint_path, output_path, flair_model, task_train_kwargs)\u001B[0m\n\u001B[1;32m 117\u001B[0m \u001B[0mtask_train_kwargs\u001B[0m\u001B[0;34m:\u001B[0m \u001B[0mOptional\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mDict\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mstr\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mAny\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 118\u001B[0m ) -> \"FlairTask\":\n\u001B[0;32m--> 119\u001B[0;31m \u001B[0mmodel\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mflair_model\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mload\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mcheckpoint_path\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 120\u001B[0m task_model_kwargs = (\n\u001B[1;32m 121\u001B[0m \u001B[0;34m{\u001B[0m\u001B[0;34m\"hidden_size\"\u001B[0m\u001B[0;34m:\u001B[0m \u001B[0mmodel\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mhidden_size\u001B[0m\u001B[0;34m}\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0misinstance\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmodel\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mSequenceTagger\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;32melse\u001B[0m \u001B[0;34m{\u001B[0m\u001B[0;34m}\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", + "\u001B[0;31mTypeError\u001B[0m: __init__() missing 1 required positional argument: 'hidden_size'" + ] + } + ], "source": [ "from flair.models import SequenceTagger\n", "\n", - "trained_model = SequenceTagger.load(result_path.name + \"/final-model.pt\")" + "task_from_ckpt = SequenceLabeling.from_checkpoint(checkpoint_path=(output_path / \"final-model.pt\"), output_path=output_path)\n", + "# trained_model = SequenceTagger.load(output_path / \"final-model.pt\")" ] }, { @@ -186,9 +718,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:clarinpl-embeddings]", + "name": "conda-env-embeddings-py", "language": "python", - "name": "conda-env-clarinpl-embeddings-py" + "display_name": "Python [conda env:embeddings]" }, "language_info": { "codemirror_mode": { @@ -205,4 +737,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} 
+} \ No newline at end of file From cb8758dd8764c7cbe821db219ad238da0ec0ecae Mon Sep 17 00:00:00 2001 From: djaniak Date: Fri, 8 Apr 2022 15:51:15 +0200 Subject: [PATCH 11/22] refactor: lightning inference and datamodule --- embeddings/data/datamodule.py | 18 ++++++++++++------ .../task/lightning_task/lightning_task.py | 6 +----- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/embeddings/data/datamodule.py b/embeddings/data/datamodule.py index c72bc16c..67ae6c48 100644 --- a/embeddings/data/datamodule.py +++ b/embeddings/data/datamodule.py @@ -60,6 +60,7 @@ def __init__( dataloader_kwargs: Optional[Dict[str, Any]] = None, seed: int = 441, ) -> None: + self.has_setup = False self.dataset_name_or_path = dataset_name_or_path self.tokenizer_name_or_path = tokenizer_name_or_path self.target_field = target_field @@ -74,9 +75,12 @@ def __init__( self.load_dataset_kwargs = load_dataset_kwargs if load_dataset_kwargs else {} self.dataloader_kwargs = dataloader_kwargs if dataloader_kwargs else {} self.seed = seed - dataset_info = self.load_dataset()["train"].info + + self.setup() + super().__init__( - dataset_info=dataset_info, dataset_version=dataset_info.version.version_str + dataset_info=self.dataset["train"].info, + dataset_version=self.dataset["train"].info.version.version_str, ) @abc.abstractmethod @@ -94,13 +98,15 @@ def convert_to_features( pass def prepare_data(self) -> None: - self.load_dataset(preparation_step=True) AutoTokenizer.from_pretrained(self.tokenizer_name_or_path) def setup(self, stage: Optional[str] = None) -> None: - self.dataset = self.load_dataset() - self.prepare_labels() - self.process_data() + if not self.has_setup: + self.dataset = self.load_dataset() + self.prepare_labels() + self.process_data() + self.has_setup = True + assert all(hasattr(self, attr) for attr in ["num_classes", "target_names", "dataset"]) def load_dataset(self, preparation_step: bool = False) -> DatasetDict: dataset = embeddings_dataset.Dataset( diff --git a/embeddings/task/lightning_task/lightning_task.py b/embeddings/task/lightning_task/lightning_task.py index c798b7f6..c2a25672 100644 --- a/embeddings/task/lightning_task/lightning_task.py +++ b/embeddings/task/lightning_task/lightning_task.py @@ -130,11 +130,7 @@ def restore_task_model( logging_config: Optional[LightningLoggingConfig], ) -> "LightningTask": model = lightning_module.load_from_checkpoint(str(checkpoint_path)) - trainer = pl.Trainer( - default_root_dir=str(output_path), - callbacks=[ModelCheckpoint(dirpath=Path(output_path).joinpath("checkpoints"))], - **task_train_kwargs or {} - ) + trainer = pl.Trainer(default_root_dir=str(output_path), **task_train_kwargs or {}) init_kwargs = { "model_name_or_path": model.hparams.model_name_or_path, "output_path": output_path, From a5eae476ac5f8ca7ac81712f9a73105e5df9db03 Mon Sep 17 00:00:00 2001 From: djaniak Date: Fri, 8 Apr 2022 15:52:22 +0200 Subject: [PATCH 12/22] tests: add flair trained model inference --- embeddings/task/flair_task/flair_task.py | 5 +- tests/test_flair_inference.py | 146 +++++++++++++++++++++++ 2 files changed, 149 insertions(+), 2 deletions(-) create mode 100644 tests/test_flair_inference.py diff --git a/embeddings/task/flair_task/flair_task.py b/embeddings/task/flair_task/flair_task.py index 1172ca3d..6fc746b0 100644 --- a/embeddings/task/flair_task/flair_task.py +++ b/embeddings/task/flair_task/flair_task.py @@ -25,6 +25,7 @@ def __init__( self, output_path: T_path = RESULTS_PATH, task_train_kwargs: Optional[Dict[str, Any]] = None, + **kwargs: Any ): 
super().__init__() self.model: Optional[flair.nn.Model] = None @@ -117,11 +118,11 @@ def restore_task_model( task_train_kwargs: Optional[Dict[str, Any]], ) -> "FlairTask": model = flair_model.load(checkpoint_path) - task_model_kwargs = ( + task_kwargs = ( {"hidden_size": model.hidden_size} if isinstance(model, SequenceTagger) else {} ) task = cls( - output_path=output_path, task_train_kwargs=task_train_kwargs or {}, **task_model_kwargs + output_path=output_path, task_train_kwargs=task_train_kwargs or {}, **task_kwargs ) task.model = model return task diff --git a/tests/test_flair_inference.py b/tests/test_flair_inference.py new file mode 100644 index 00000000..60cf7ab5 --- /dev/null +++ b/tests/test_flair_inference.py @@ -0,0 +1,146 @@ +from pathlib import Path +from tempfile import TemporaryDirectory +from typing import Any, Dict + +import datasets +import flair +import numpy as np +import pytest +from _pytest.tmpdir import TempdirFactory +from flair.data import Corpus +from numpy import typing as nptyping + +from embeddings.data.data_loader import HuggingFaceDataLoader +from embeddings.data.dataset import Dataset +from embeddings.embedding.flair_embedding import FlairDocumentPoolEmbedding +from embeddings.embedding.flair_loader import ( + FlairDocumentPoolEmbeddingLoader, + FlairWordEmbeddingLoader, +) +from embeddings.evaluator.sequence_labeling_evaluator import SequenceLabelingEvaluator +from embeddings.evaluator.text_classification_evaluator import TextClassificationEvaluator +from embeddings.model.flair_model import FlairModel +from embeddings.pipeline.standard_pipeline import StandardPipeline +from embeddings.task.flair_task.sequence_labeling import SequenceLabeling +from embeddings.task.flair_task.text_classification import TextClassification +from embeddings.transformation.flair_transformation.classification_corpus_transformation import ( + ClassificationCorpusTransformation, +) +from embeddings.transformation.flair_transformation.column_corpus_transformation import ( + ColumnCorpusTransformation, +) +from embeddings.transformation.flair_transformation.downsample_corpus_transformation import ( + DownsampleFlairCorpusTransformation, +) +from embeddings.transformation.flair_transformation.split_sample_corpus_transformation import ( + SampleSplitsFlairCorpusTransformation, +) + + +@pytest.fixture(scope="module") +def tmp_path_module(tmpdir_factory: TempdirFactory) -> Path: + path = tmpdir_factory.mktemp(__name__) + return Path(path) + + +@pytest.fixture(scope="module") +def text_classification_pipeline( + tmp_path_module: "TemporaryDirectory[str]", +) -> StandardPipeline[ + str, datasets.DatasetDict, Corpus, Dict[str, nptyping.NDArray[Any]], Dict[str, Any] +]: + output_path = tmp_path_module + dataset = Dataset( + "clarin-pl/polemo2-official", + train_domains=["reviews"], + dev_domains=["reviews"], + test_domains=["reviews"], + text_cfg="sentence", + ) + data_loader = HuggingFaceDataLoader() + transformation = ClassificationCorpusTransformation("text", "target").then( + DownsampleFlairCorpusTransformation(*(0.005, 0.01, 0.01), stratify=False) + ) + embedding_loader = FlairDocumentPoolEmbeddingLoader("clarin-pl/word2vec-kgr10", "") + embedding = embedding_loader.get_embedding(FlairDocumentPoolEmbedding) + task = TextClassification(output_path.name, task_train_kwargs={"max_epochs": 1}) + model = FlairModel(embedding, task) + evaluator = TextClassificationEvaluator() + pipeline = StandardPipeline(dataset, data_loader, transformation, model, evaluator) + return pipeline + + 
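# Illustrative expansion of the starred downsampling arguments used above, assuming
# the positional order is (train, dev, test) as in the keyword form used by the
# notebooks in this series:
#
#     DownsampleFlairCorpusTransformation(
#         downsample_train=0.005, downsample_dev=0.01, downsample_test=0.01, stratify=False
#     )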
+@pytest.fixture(scope="module") +def sequence_labeling_pipeline( + tmp_path_module: "TemporaryDirectory[str]", +) -> StandardPipeline[ + str, datasets.DatasetDict, Corpus, Dict[str, nptyping.NDArray[Any]], Dict[str, Any] +]: + output_path = tmp_path_module + dataset = Dataset("clarin-pl/kpwr-ner") + data_loader = HuggingFaceDataLoader() + transformation = ( + ColumnCorpusTransformation("tokens", "ner") + .then(SampleSplitsFlairCorpusTransformation(dev_fraction=0.1, seed=441)) + .then(DownsampleFlairCorpusTransformation(*(0.005, 0.01, 0.01), stratify=False)) + ) + task = SequenceLabeling( + output_path.name, + hidden_size=256, + task_train_kwargs={"max_epochs": 1, "mini_batch_size": 64}, + ) + embedding_loader = FlairWordEmbeddingLoader("clarin-pl/word2vec-kgr10", "") + embedding = embedding_loader.get_embedding() + model = FlairModel(embedding, task) + evaluator = SequenceLabelingEvaluator() + pipeline = StandardPipeline(dataset, data_loader, transformation, model, evaluator) + return pipeline + + +def test_text_classification_inference( + text_classification_pipeline: StandardPipeline[ + str, datasets.DatasetDict, Corpus, Dict[str, nptyping.NDArray[Any]], Dict[str, Any] + ], + tmp_path_module: "TemporaryDirectory[str]", +) -> None: + flair.set_seed(441) + output_path = tmp_path_module + pipeline = text_classification_pipeline + result = pipeline.run() + + task_from_ckpt = TextClassification.from_checkpoint( + checkpoint_path=(Path(output_path.name) / "final-model.pt"), output_path=output_path.name + ) + loaded_data = pipeline.data_loader.load(pipeline.dataset) + transformed_data = pipeline.transformation.transform(loaded_data) + test_data = transformed_data.test + + y_pred, loss = task_from_ckpt.predict(test_data) + y_true = task_from_ckpt.get_y(test_data, task_from_ckpt.y_type, task_from_ckpt.y_dictionary) + results_from_ckpt = pipeline.evaluator.evaluate({"y_pred": y_pred, "y_true": y_true}) + assert np.array_equal(result["data"]["y_pred"], results_from_ckpt["data"]["y_pred"]) + + +def test_sequence_labeling_inference( + sequence_labeling_pipeline: StandardPipeline[ + str, datasets.DatasetDict, Corpus, Dict[str, nptyping.NDArray[Any]], Dict[str, Any] + ], + tmp_path_module: "TemporaryDirectory[str]", +) -> None: + flair.set_seed(441) + output_path = tmp_path_module + pipeline = sequence_labeling_pipeline + result = pipeline.run() + + task_from_ckpt = SequenceLabeling.from_checkpoint( + checkpoint_path=(Path(output_path.name) / "final-model.pt"), output_path=output_path.name + ) + loaded_data = pipeline.data_loader.load(pipeline.dataset) + transformed_data = pipeline.transformation.transform(loaded_data) + test_data = transformed_data.test + + y_pred, loss = task_from_ckpt.predict(test_data) + y_true = task_from_ckpt.get_y(test_data, task_from_ckpt.y_type, task_from_ckpt.y_dictionary) + results_from_ckpt = pipeline.evaluator.evaluate({"y_pred": y_pred, "y_true": y_true}) + + assert np.array_equal(result["data"]["y_pred"], results_from_ckpt["data"]["y_pred"]) From 8ef0b25e186aa8657a900380e47f818ed12215eb Mon Sep 17 00:00:00 2001 From: djaniak Date: Fri, 8 Apr 2022 15:52:39 +0200 Subject: [PATCH 13/22] refactor: update notebooks with current code --- .../validate_flair_models_inference.ipynb | 740 ------------------ .../validate_flair_models_inference.ipynb | 190 +++++ .../validate_lightning_models_inference.ipynb | 6 +- 3 files changed, 193 insertions(+), 743 deletions(-) delete mode 100644 notebooks/validate_flair_models_inference.ipynb create mode 100644 
tutorials/validate_flair_models_inference.ipynb diff --git a/notebooks/validate_flair_models_inference.ipynb b/notebooks/validate_flair_models_inference.ipynb deleted file mode 100644 index 0257a589..00000000 --- a/notebooks/validate_flair_models_inference.ipynb +++ /dev/null @@ -1,740 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "id": "418b9661-aea2-4990-8e26-e7f0e167b9b2", - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "f23bfd51-d1f4-4321-aed9-96f51b171fe9", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "os.chdir(\"..\")\n", - "\n", - "from embeddings.data.data_loader import HuggingFaceDataLoader\n", - "from embeddings.defaults import DATASET_PATH, RESULTS_PATH\n", - "from embeddings.embedding.auto_flair import AutoFlairWordEmbedding\n", - "from embeddings.evaluator.sequence_labeling_evaluator import SequenceLabelingEvaluator\n", - "from embeddings.model.flair_model import FlairModel\n", - "from embeddings.pipeline.standard_pipeline import StandardPipeline\n", - "from embeddings.task.flair_task.sequence_labeling import SequenceLabeling\n", - "from embeddings.transformation.flair_transformation.column_corpus_transformation import (\n", - " ColumnCorpusTransformation,\n", - ")\n", - "from embeddings.data.dataset import Dataset\n", - "\n", - "from embeddings.transformation.flair_transformation.downsample_corpus_transformation import (\n", - " DownsampleFlairCorpusTransformation,\n", - ")\n", - "from embeddings.transformation.flair_transformation.split_sample_corpus_transformation import (\n", - " SampleSplitsFlairCorpusTransformation,\n", - ")\n", - "from embeddings.utils.utils import build_output_path" - ] - }, - { - "cell_type": "markdown", - "id": "5e4c2372-8314-4868-a576-8f0988aae888", - "metadata": {}, - "source": [ - "### Run downsampled flair pipeline" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "dd4fb7d6-1e81-4bea-9bd3-b4a4bec87fc9", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2022-04-08 00:41:27,426 - embeddings.utils.utils - WARNING - String 'allegro/herbert-base-cased' contains '/'. Replacing it with '__'. Cleaned_text: allegro__herbert-base-cased.\n", - "2022-04-08 00:41:27,428 - embeddings.utils.utils - WARNING - String 'clarin-pl/kpwr-ner' contains '/'. Replacing it with '__'. Cleaned_text: clarin-pl__kpwr-ner.\n", - "2022-04-08 00:41:27,431 - embeddings.utils.utils - WARNING - String 'allegro/herbert-base-cased' contains '/'. Replacing it with '__'. Cleaned_text: allegro__herbert-base-cased.\n", - "2022-04-08 00:41:27,433 - embeddings.utils.utils - WARNING - String 'clarin-pl/kpwr-ner' contains '/'. Replacing it with '__'. 
Cleaned_text: clarin-pl__kpwr-ner.\n" - ] - } - ], - "source": [ - "embedding_name_or_path = \"allegro/herbert-base-cased\"\n", - "dataset_name = \"clarin-pl/kpwr-ner\"\n", - "\n", - "output_path = build_output_path(RESULTS_PATH, embedding_name_or_path, dataset_name)\n", - "\n", - "dataset = Dataset(dataset_name)\n", - "data_loader = HuggingFaceDataLoader()\n", - "transformation = (\n", - " ColumnCorpusTransformation(\"tokens\", \"ner\")\n", - " .then(SampleSplitsFlairCorpusTransformation(dev_fraction=0.1, seed=441))\n", - " .then(DownsampleFlairCorpusTransformation(downsample_train=0.005, downsample_dev=0.01, downsample_test=0.01))\n", - ")\n", - "task = SequenceLabeling(\n", - " output_path,\n", - " hidden_size=256,\n", - " task_train_kwargs={\"max_epochs\": 1, \"mini_batch_size\": 64},\n", - ")\n", - "embedding = AutoFlairWordEmbedding.from_hub(embedding_name_or_path)\n", - "model = FlairModel(embedding, task)\n", - "evaluator = SequenceLabelingEvaluator()\n", - "\n", - "pipeline = StandardPipeline(dataset, data_loader, transformation, model, evaluator)" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "2f69e538-332d-4278-977d-7002fe2b67bd", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Using custom data configuration default\n", - "Reusing dataset kpwrner (/home/djaniak/.cache/huggingface/datasets/clarin-pl___kpwrner/default/0.0.0/001e3d471298007e8412e3a6ccc06bec000dec1bce0cf8e0ba7e5b7e105b1342)\n" - ] - }, - { - "data": { - "text/plain": " 0%| | 0/2 [00:00\u001B[0;34m\u001B[0m\n\u001B[1;32m 1\u001B[0m \u001B[0;32mfrom\u001B[0m \u001B[0mflair\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mmodels\u001B[0m \u001B[0;32mimport\u001B[0m \u001B[0mSequenceTagger\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 2\u001B[0m \u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0;32m----> 3\u001B[0;31m \u001B[0mtask_from_ckpt\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mSequenceLabeling\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mfrom_checkpoint\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mcheckpoint_path\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0moutput_path\u001B[0m \u001B[0;34m/\u001B[0m \u001B[0;34m\"final-model.pt\"\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0moutput_path\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0moutput_path\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 4\u001B[0m \u001B[0;31m# trained_model = SequenceTagger.load(output_path / \"final-model.pt\")\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", - "\u001B[0;32m~/Projects/embeddings/embeddings/task/flair_task/sequence_labeling.py\u001B[0m in \u001B[0;36mfrom_checkpoint\u001B[0;34m(cls, checkpoint_path, output_path, task_train_kwargs)\u001B[0m\n\u001B[1;32m 73\u001B[0m \u001B[0mtask_train_kwargs\u001B[0m\u001B[0;34m:\u001B[0m \u001B[0mOptional\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mDict\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mstr\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mAny\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m]\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0;32mNone\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 74\u001B[0m ) -> \"FlairTask\":\n\u001B[0;32m---> 75\u001B[0;31m return cls.restore_task_model(\n\u001B[0m\u001B[1;32m 76\u001B[0m 
\u001B[0mcheckpoint_path\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0mcheckpoint_path\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 77\u001B[0m \u001B[0moutput_path\u001B[0m\u001B[0;34m=\u001B[0m\u001B[0moutput_path\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", - "\u001B[0;32m~/Projects/embeddings/embeddings/task/flair_task/flair_task.py\u001B[0m in \u001B[0;36mrestore_task_model\u001B[0;34m(cls, checkpoint_path, output_path, flair_model, task_train_kwargs)\u001B[0m\n\u001B[1;32m 117\u001B[0m \u001B[0mtask_train_kwargs\u001B[0m\u001B[0;34m:\u001B[0m \u001B[0mOptional\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mDict\u001B[0m\u001B[0;34m[\u001B[0m\u001B[0mstr\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mAny\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m]\u001B[0m\u001B[0;34m,\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[1;32m 118\u001B[0m ) -> \"FlairTask\":\n\u001B[0;32m--> 119\u001B[0;31m \u001B[0mmodel\u001B[0m \u001B[0;34m=\u001B[0m \u001B[0mflair_model\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mload\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mcheckpoint_path\u001B[0m\u001B[0;34m)\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n\u001B[0m\u001B[1;32m 120\u001B[0m task_model_kwargs = (\n\u001B[1;32m 121\u001B[0m \u001B[0;34m{\u001B[0m\u001B[0;34m\"hidden_size\"\u001B[0m\u001B[0;34m:\u001B[0m \u001B[0mmodel\u001B[0m\u001B[0;34m.\u001B[0m\u001B[0mhidden_size\u001B[0m\u001B[0;34m}\u001B[0m \u001B[0;32mif\u001B[0m \u001B[0misinstance\u001B[0m\u001B[0;34m(\u001B[0m\u001B[0mmodel\u001B[0m\u001B[0;34m,\u001B[0m \u001B[0mSequenceTagger\u001B[0m\u001B[0;34m)\u001B[0m \u001B[0;32melse\u001B[0m \u001B[0;34m{\u001B[0m\u001B[0;34m}\u001B[0m\u001B[0;34m\u001B[0m\u001B[0;34m\u001B[0m\u001B[0m\n", - "\u001B[0;31mTypeError\u001B[0m: __init__() missing 1 required positional argument: 'hidden_size'" - ] - } - ], - "source": [ - "from flair.models import SequenceTagger\n", - "\n", - "task_from_ckpt = SequenceLabeling.from_checkpoint(checkpoint_path=(output_path / \"final-model.pt\"), output_path=output_path)\n", - "# trained_model = SequenceTagger.load(output_path / \"final-model.pt\")" - ] - }, - { - "cell_type": "markdown", - "id": "802762c3-8246-465b-bb9c-2336134a51bd", - "metadata": {}, - "source": [ - "### Predict for test data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7c0fadf3-aa47-407d-ad79-e5633532eafa", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "loaded_data = data_loader.load(dataset)\n", - "transformed_data = transformation.transform(loaded_data)\n", - "test_data = transformed_data.test" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "944cc964-262d-49f8-ab9f-510aba3dad7d", - "metadata": {}, - "outputs": [], - "source": [ - "task.remove_labels_from_data(test_data, \"predicted\")\n", - "\n", - "loss = trained_model.predict(\n", - " sentences=test_data, mini_batch_size=64, label_name=\"predicted\", return_loss=True,\n", - ")\n", - "\n", - "y_pred = task.get_y(test_data, y_type=\"predicted\", y_dictionary=task.y_dictionary)\n", - "y_true = task.get_y(test_data, task.y_type, task.y_dictionary)\n", - "\n", - "task.remove_labels_from_data(test_data, \"predicted\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5f9d6dde-601a-4d0c-bb48-e9177e7002c9", - "metadata": {}, - "outputs": [], - "source": [ - "_ = evaluator.evaluate({\"y_pred\": y_pred, \"y_true\": y_true})" - ] - } - ], - "metadata": { - 
"kernelspec": { - "name": "conda-env-embeddings-py", - "language": "python", - "display_name": "Python [conda env:embeddings]" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file diff --git a/tutorials/validate_flair_models_inference.ipynb b/tutorials/validate_flair_models_inference.ipynb new file mode 100644 index 00000000..22858c62 --- /dev/null +++ b/tutorials/validate_flair_models_inference.ipynb @@ -0,0 +1,190 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "418b9661-aea2-4990-8e26-e7f0e167b9b2", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f23bfd51-d1f4-4321-aed9-96f51b171fe9", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.chdir(\"..\")\n", + "\n", + "from embeddings.data.data_loader import HuggingFaceDataLoader\n", + "from embeddings.defaults import DATASET_PATH, RESULTS_PATH\n", + "from embeddings.embedding.auto_flair import AutoFlairWordEmbedding\n", + "from embeddings.evaluator.sequence_labeling_evaluator import SequenceLabelingEvaluator\n", + "from embeddings.model.flair_model import FlairModel\n", + "from embeddings.pipeline.standard_pipeline import StandardPipeline\n", + "from embeddings.task.flair_task.sequence_labeling import SequenceLabeling\n", + "from embeddings.transformation.flair_transformation.column_corpus_transformation import (\n", + " ColumnCorpusTransformation,\n", + ")\n", + "from embeddings.data.dataset import Dataset\n", + "\n", + "from embeddings.transformation.flair_transformation.downsample_corpus_transformation import (\n", + " DownsampleFlairCorpusTransformation,\n", + ")\n", + "from embeddings.transformation.flair_transformation.split_sample_corpus_transformation import (\n", + " SampleSplitsFlairCorpusTransformation,\n", + ")\n", + "from embeddings.utils.utils import build_output_path" + ] + }, + { + "cell_type": "markdown", + "id": "5e4c2372-8314-4868-a576-8f0988aae888", + "metadata": {}, + "source": [ + "### Run downsampled flair pipeline" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd4fb7d6-1e81-4bea-9bd3-b4a4bec87fc9", + "metadata": {}, + "outputs": [], + "source": [ + "embedding_name_or_path = \"clarin-pl/word2vec-kgr10\"\n", + "dataset_name = \"clarin-pl/kpwr-ner\"\n", + "\n", + "output_path = build_output_path(RESULTS_PATH, embedding_name_or_path, dataset_name)\n", + "\n", + "dataset = Dataset(dataset_name)\n", + "data_loader = HuggingFaceDataLoader()\n", + "transformation = (\n", + " ColumnCorpusTransformation(\"tokens\", \"ner\")\n", + " .then(SampleSplitsFlairCorpusTransformation(dev_fraction=0.1, seed=441))\n", + " .then(DownsampleFlairCorpusTransformation(downsample_train=0.005, downsample_dev=0.01, downsample_test=0.01))\n", + ")\n", + "task = SequenceLabeling(\n", + " output_path,\n", + " hidden_size=256,\n", + " task_train_kwargs={\"max_epochs\": 1, \"mini_batch_size\": 64},\n", + ")\n", + "embedding = AutoFlairWordEmbedding.from_hub(embedding_name_or_path)\n", + "model = FlairModel(embedding, task)\n", + "evaluator = SequenceLabelingEvaluator()\n", + "\n", + "pipeline = StandardPipeline(dataset, data_loader, transformation, model, evaluator)" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "2f69e538-332d-4278-977d-7002fe2b67bd", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "_ = pipeline.run()" + ] + }, + { + "cell_type": "markdown", + "id": "44613ef9-a9d4-4d5c-980c-9e0f68bc3525", + "metadata": {}, + "source": [ + "### Load model from checkpoint" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ffae7c5-1734-4e4a-81bd-55170a5c14ca", + "metadata": {}, + "outputs": [], + "source": [ + "!ls $output_path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8e75f303-fb82-4cfd-9a81-5d42e994e606", + "metadata": {}, + "outputs": [], + "source": [ + "task_from_ckpt = SequenceLabeling.from_checkpoint(checkpoint_path=(output_path / \"final-model.pt\"), output_path=output_path)" + ] + }, + { + "cell_type": "markdown", + "id": "802762c3-8246-465b-bb9c-2336134a51bd", + "metadata": {}, + "source": [ + "### Predict for test data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7c0fadf3-aa47-407d-ad79-e5633532eafa", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "loaded_data = data_loader.load(dataset)\n", + "transformed_data = transformation.transform(loaded_data)\n", + "test_data = transformed_data.test" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "df0d1eda-5bf0-40d7-97ed-16eab007a0f3", + "metadata": {}, + "outputs": [], + "source": [ + "y_pred, loss = task_from_ckpt.predict(test_data)\n", + "y_true = task_from_ckpt.get_y(test_data, task_from_ckpt.y_type, task_from_ckpt.y_dictionary)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5f9d6dde-601a-4d0c-bb48-e9177e7002c9", + "metadata": {}, + "outputs": [], + "source": [ + "evaluator.evaluate({\"y_pred\": y_pred, \"y_true\": y_true})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:clarinpl-embeddings]", + "language": "python", + "name": "conda-env-clarinpl-embeddings-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.7" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/tutorials/validate_lightning_models_inference.ipynb b/tutorials/validate_lightning_models_inference.ipynb index 456ef63b..7318d1ae 100644 --- a/tutorials/validate_lightning_models_inference.ipynb +++ b/tutorials/validate_lightning_models_inference.ipynb @@ -257,9 +257,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:embeddings]", + "display_name": "Python [conda env:clarinpl-embeddings]", "language": "python", - "name": "conda-env-embeddings-py" + "name": "conda-env-clarinpl-embeddings-py" }, "language_info": { "codemirror_mode": { @@ -276,4 +276,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +} From 2396b36f7b3af5184e4e6b0256c24df049104191 Mon Sep 17 00:00:00 2001 From: djaniak Date: Fri, 8 Apr 2022 16:11:19 +0200 Subject: [PATCH 14/22] refactor: naming in Lightning modules --- embeddings/model/lightning_module/huggingface_module.py | 6 +++--- embeddings/model/lightning_module/lightning_module.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/embeddings/model/lightning_module/huggingface_module.py b/embeddings/model/lightning_module/huggingface_module.py index c23bd42e..b772af83 100644 --- 
a/embeddings/model/lightning_module/huggingface_module.py +++ b/embeddings/model/lightning_module/huggingface_module.py @@ -25,8 +25,8 @@ def __init__( self.save_hyperparameters({"downstream_model_type": downstream_model_type.__name__}) self.downstream_model_type = downstream_model_type self.config_kwargs = config_kwargs if config_kwargs else {} - self.configure_model() - self.configure_metrics() + self._init_model() + self._init_metrics() def setup(self, stage: Optional[str] = None) -> None: if stage in ("fit", None): @@ -40,7 +40,7 @@ def setup(self, stage: Optional[str] = None) -> None: (len(train_loader.dataset) / ab_size) * float(self.trainer.max_epochs) ) - def configure_model(self) -> None: + def _init_model(self) -> None: self.config = AutoConfig.from_pretrained( self.hparams.model_name_or_path, num_labels=self.hparams.num_classes, diff --git a/embeddings/model/lightning_module/lightning_module.py b/embeddings/model/lightning_module/lightning_module.py index f0a8b0a9..0a7592da 100644 --- a/embeddings/model/lightning_module/lightning_module.py +++ b/embeddings/model/lightning_module/lightning_module.py @@ -90,7 +90,7 @@ def _predict_with_trainer(self, dataloader: DataLoader[HuggingFaceDataset]) -> t return_predictions=True, ) - def configure_metrics(self) -> None: + def _init_metrics(self) -> None: if self.metrics is None: self.metrics = self.get_default_metrics() self.train_metrics = self.metrics.clone(prefix="train/") @@ -142,13 +142,13 @@ def configure_optimizers(self) -> Tuple[List[Optimizer], List[Any]]: ) if self.hparams.use_scheduler: - lr_schedulers = self.configure_schedulers(optimizer=optimizer) + lr_schedulers = self._get_schedulers(optimizer=optimizer) else: lr_schedulers = [] return [optimizer], lr_schedulers - def configure_schedulers(self, optimizer: Optimizer) -> List[Dict[str, Any]]: + def _get_schedulers(self, optimizer: Optimizer) -> List[Dict[str, Any]]: scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=self.hparams.warmup_steps, From 38c1438583e068b47ff58efbfbebeb4122ae8587 Mon Sep 17 00:00:00 2001 From: djaniak Date: Fri, 8 Apr 2022 16:33:27 +0200 Subject: [PATCH 15/22] fix: tests and notebooks after rebase --- .../test_lightning_classification_pipeline.py | 45 +++----- tests/test_lightning_inference.py | 105 +++++++++--------- .../validate_lightning_models_inference.ipynb | 41 +++++-- 3 files changed, 100 insertions(+), 91 deletions(-) diff --git a/tests/test_lightning_classification_pipeline.py b/tests/test_lightning_classification_pipeline.py index 5dc064f7..eb43122a 100644 --- a/tests/test_lightning_classification_pipeline.py +++ b/tests/test_lightning_classification_pipeline.py @@ -21,13 +21,7 @@ def tmp_path_module(tmpdir_factory: TempdirFactory) -> Path: @pytest.fixture(scope="module") -def pipeline_kwargs() -> Dict[str, Any]: - return {"embedding_name_or_path": "allegro/herbert-base-cased"} - - -@pytest.fixture(scope="module") -def dataset_kwargs(tmp_path_module) -> Dict[str, Any]: - path = str(tmp_path_module) +def dataset_kwargs(tmp_path_module: Path) -> Dict[str, Any]: pipeline = HuggingFacePreprocessingPipeline( dataset_name="clarin-pl/polemo2-official", load_dataset_kwargs={ @@ -36,7 +30,7 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]: "test_domains": ["hotels", "medicine"], "text_cfg": "text", }, - persist_path=path, + persist_path=tmp_path_module.name, sample_missing_splits=None, ignore_test_subset=False, downsample_splits=(0.01, 0.01, 0.05), @@ -45,7 +39,7 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, 
Any]: pipeline.run() return { - "dataset_name_or_path": path, + "dataset_name_or_path": tmp_path_module.name, "input_column_name": ["text"], "target_column_name": "target", } @@ -88,32 +82,25 @@ def config() -> LightningAdvancedConfig: def lightning_classification_pipeline( dataset_kwargs: Dict[str, Any], config: LightningAdvancedConfig, - result_path: "TemporaryDirectory[str]", -) -> Tuple[ - LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]], - "TemporaryDirectory[str]", -]: - return ( - LightningClassificationPipeline( - embedding_name_or_path="allegro/herbert-base-cased", - output_path=result_path.name, - config=config, - devices="auto", - accelerator="cpu", - **dataset_kwargs, - ), - result_path, + tmp_path_module: Path, +) -> LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]]: + return LightningClassificationPipeline( + embedding_name_or_path="allegro/herbert-base-cased", + output_path=tmp_path_module.name, + config=config, + devices="auto", + accelerator="cpu", + **dataset_kwargs, ) def test_lightning_classification_pipeline( - lightning_classification_pipeline: Tuple[ - LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]], - "TemporaryDirectory[str]", - ], + lightning_classification_pipeline: LightningPipeline[ + datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any] + ] ) -> None: pl.seed_everything(441, workers=True) - pipeline, path = lightning_classification_pipeline + pipeline = lightning_classification_pipeline result = pipeline.run() np.testing.assert_almost_equal( result["accuracy"]["accuracy"], 0.3783783, decimal=pytest.decimal diff --git a/tests/test_lightning_inference.py b/tests/test_lightning_inference.py index 1c1e10af..aaa5fa4c 100644 --- a/tests/test_lightning_inference.py +++ b/tests/test_lightning_inference.py @@ -1,4 +1,5 @@ from pathlib import Path +from tempfile import TemporaryDirectory from typing import Any, Dict, Tuple import datasets @@ -8,6 +9,7 @@ import torch from _pytest.tmpdir import TempdirFactory +from embeddings.config.lightning_config import LightningAdvancedConfig from embeddings.pipeline.hf_preprocessing_pipeline import HuggingFacePreprocessingPipeline from embeddings.pipeline.lightning_classification import LightningClassificationPipeline from embeddings.pipeline.lightning_pipeline import LightningPipeline @@ -21,16 +23,7 @@ def tmp_path_module(tmpdir_factory: TempdirFactory) -> Path: @pytest.fixture(scope="module") -def pipeline_kwargs() -> Dict[str, Any]: - return { - "embedding_name_or_path": "allegro/herbert-base-cased", - "finetune_last_n_layers": 0, - } - - -@pytest.fixture(scope="module") -def dataset_kwargs(tmp_path_module) -> Dict[str, Any]: - path = str(tmp_path_module) +def dataset_kwargs(tmp_path_module: Path) -> Dict[str, Any]: pipeline = HuggingFacePreprocessingPipeline( dataset_name="clarin-pl/polemo2-official", load_dataset_kwargs={ @@ -39,7 +32,7 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]: "test_domains": ["hotels", "medicine"], "text_cfg": "text", }, - persist_path=path, + persist_path=tmp_path_module.name, sample_missing_splits=None, ignore_test_subset=False, downsample_splits=(0.01, 0.01, 0.05), @@ -48,79 +41,83 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]: pipeline.run() return { - "dataset_name_or_path": path, + "dataset_name_or_path": tmp_path_module.name, "input_column_name": ["text"], "target_column_name": "target", } @pytest.fixture(scope="module") -def task_train_kwargs() -> Dict[str, Any]: - return { - 
"max_epochs": 1, - "devices": "auto", - "accelerator": "cpu", - "deterministic": True, - } - - -@pytest.fixture(scope="module") -def task_model_kwargs() -> Dict[str, Any]: - return {"learning_rate": 5e-4, "use_scheduler": False} - - -@pytest.fixture(scope="module") -def datamodule_kwargs() -> Dict[str, Any]: - return {"num_workers": 0} +def config() -> LightningAdvancedConfig: + return LightningAdvancedConfig( + finetune_last_n_layers=0, + task_train_kwargs={ + "max_epochs": 1, + "deterministic": True, + }, + task_model_kwargs={ + "learning_rate": 5e-4, + "train_batch_size": 32, + "eval_batch_size": 32, + "use_scheduler": False, + "optimizer": "AdamW", + "adam_epsilon": 1e-8, + "warmup_steps": 100, + "weight_decay": 0.0, + }, + datamodule_kwargs={ + "max_seq_length": 64, + }, + early_stopping_kwargs={ + "monitor": "val/Loss", + "mode": "min", + "patience": 3, + }, + tokenizer_kwargs={}, + batch_encoding_kwargs={}, + dataloader_kwargs={}, + model_config_kwargs={}, + ) @pytest.fixture(scope="module") def lightning_classification_pipeline( - pipeline_kwargs: Dict[str, Any], dataset_kwargs: Dict[str, Any], - datamodule_kwargs: Dict[str, Any], - task_train_kwargs: Dict[str, Any], - task_model_kwargs: Dict[str, Any], - result_path: Path, -) -> Tuple[LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]], Path]: - return ( - LightningClassificationPipeline( - output_path=result_path.name, - **pipeline_kwargs, - **dataset_kwargs, - datamodule_kwargs=datamodule_kwargs, - task_train_kwargs=task_train_kwargs, - task_model_kwargs=task_model_kwargs, - ), - result_path, + config: LightningAdvancedConfig, + tmp_path_module: Path, +) -> LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]]: + return LightningClassificationPipeline( + embedding_name_or_path="allegro/herbert-base-cased", + output_path=tmp_path_module.name, + config=config, + devices="auto", + accelerator="cpu", + **dataset_kwargs, ) def test_lightning_pipeline_inference( - lightning_classification_pipeline: Tuple[ - LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]], - Path, + lightning_classification_pipeline: LightningPipeline[ + datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any] ], + tmp_path_module: "TemporaryDirectory[str]", ) -> None: pl.seed_everything(441, workers=True) - - pipeline, path = lightning_classification_pipeline + pipeline = lightning_classification_pipeline results = pipeline.run() - ckpt_path = Path(path.name) / "checkpoints" / "epoch=0-step=1.ckpt" + ckpt_path = Path(tmp_path_module.name) / "checkpoints" / "epoch=0-step=1.ckpt" task_from_ckpt = TextClassificationTask.from_checkpoint( checkpoint_path=ckpt_path.resolve(), - output_path=path.name, + output_path=tmp_path_module.name, ) model_state_dict = pipeline.model.task.model.model.state_dict() model_from_ckpt_state_dict = task_from_ckpt.model.model.state_dict() - assert model_state_dict.keys() == model_from_ckpt_state_dict.keys() for k in model_state_dict.keys(): assert torch.equal(model_state_dict[k], model_from_ckpt_state_dict[k]) test_dataloader = pipeline.datamodule.test_dataloader() predictions = task_from_ckpt.predict(test_dataloader, return_names=False) - assert np.array_equal(results["data"]["y_probabilities"], predictions["y_probabilities"]) diff --git a/tutorials/validate_lightning_models_inference.ipynb b/tutorials/validate_lightning_models_inference.ipynb index 7318d1ae..d9326b1f 100644 --- a/tutorials/validate_lightning_models_inference.ipynb +++ 
b/tutorials/validate_lightning_models_inference.ipynb @@ -24,6 +24,7 @@ "from typing import Any, Dict\n", "\n", "import pytorch_lightning as pl\n", + "from embeddings.config.lightning_config import LightningAdvancedConfig\n", "from embeddings.defaults import DATASET_PATH, RESULTS_PATH\n", "from embeddings.model.lightning_module.text_classification import (\n", " TextClassificationModule,\n", @@ -100,6 +101,35 @@ "### Train simple downsampled pipeline" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "4cb7ebd4-182c-4797-b5de-a7069313a901", + "metadata": {}, + "outputs": [], + "source": [ + "config = LightningAdvancedConfig(\n", + " finetune_last_n_layers=0,\n", + " task_train_kwargs={\"max_epochs\": 1, \"deterministic\": True,},\n", + " task_model_kwargs={\n", + " \"learning_rate\": 5e-4,\n", + " \"train_batch_size\": 32,\n", + " \"eval_batch_size\": 32,\n", + " \"use_scheduler\": False,\n", + " \"optimizer\": \"AdamW\",\n", + " \"adam_epsilon\": 1e-8,\n", + " \"warmup_steps\": 100,\n", + " \"weight_decay\": 0.0,\n", + " },\n", + " datamodule_kwargs={\"max_seq_length\": 64,},\n", + " early_stopping_kwargs={\"monitor\": \"val/Loss\", \"mode\": \"min\", \"patience\": 3,},\n", + " tokenizer_kwargs={},\n", + " batch_encoding_kwargs={},\n", + " dataloader_kwargs={},\n", + " model_config_kwargs={},\n", + ")" + ] + }, { "cell_type": "code", "execution_count": null, @@ -112,14 +142,9 @@ "pipeline = LightningClassificationPipeline(\n", " embedding_name_or_path=embedding_name_or_path,\n", " output_path=output_path,\n", - " finetune_last_n_layers=0,\n", - " datamodule_kwargs={\"max_seq_length\": 64,},\n", - " task_train_kwargs={\n", - " \"max_epochs\": 1,\n", - " \"devices\": \"auto\",\n", - " \"accelerator\": \"cpu\",\n", - " \"deterministic\": True,\n", - " },\n", + " config=config,\n", + " devices=\"auto\",\n", + " accelerator=\"cpu\",\n", " **dataset_kwargs\n", ")\n", "result = pipeline.run()" From 5bebc1a46124ed0a41f4f2860478c12551fcbe62 Mon Sep 17 00:00:00 2001 From: djaniak Date: Fri, 8 Apr 2022 16:42:56 +0200 Subject: [PATCH 16/22] refactor: loading model from ckpt for lightning --- embeddings/task/lightning_task/lightning_task.py | 4 +--- embeddings/task/lightning_task/sequence_labeling.py | 2 -- embeddings/task/lightning_task/text_classification.py | 2 -- 3 files changed, 1 insertion(+), 7 deletions(-) diff --git a/embeddings/task/lightning_task/lightning_task.py b/embeddings/task/lightning_task/lightning_task.py index c2a25672..6ac3ccbf 100644 --- a/embeddings/task/lightning_task/lightning_task.py +++ b/embeddings/task/lightning_task/lightning_task.py @@ -126,7 +126,6 @@ def restore_task_model( output_path: T_path, lightning_module: Type[LightningModule[AutoModel]], task_train_kwargs: Optional[Dict[str, Any]], - early_stopping_kwargs: Optional[Dict[str, Any]], logging_config: Optional[LightningLoggingConfig], ) -> "LightningTask": model = lightning_module.load_from_checkpoint(str(checkpoint_path)) @@ -139,7 +138,7 @@ def restore_task_model( "model_config_kwargs": model.hparams.config_kwargs, "task_model_kwargs": model.hparams.task_model_kwargs, "task_train_kwargs": task_train_kwargs or {}, - "early_stopping_kwargs": early_stopping_kwargs or {}, + "early_stopping_kwargs": {}, "logging_config": logging_config or LightningLoggingConfig(), } task = cls(**init_kwargs) @@ -155,7 +154,6 @@ def from_checkpoint( checkpoint_path: T_path, output_path: T_path, task_train_kwargs: Optional[Dict[str, Any]] = None, - early_stopping_kwargs: Optional[Dict[str, Any]] = None, logging_config: 
Optional[LightningLoggingConfig] = None, ) -> "LightningTask": pass diff --git a/embeddings/task/lightning_task/sequence_labeling.py b/embeddings/task/lightning_task/sequence_labeling.py index d878caec..b1414f96 100644 --- a/embeddings/task/lightning_task/sequence_labeling.py +++ b/embeddings/task/lightning_task/sequence_labeling.py @@ -83,14 +83,12 @@ def from_checkpoint( checkpoint_path: T_path, output_path: T_path, task_train_kwargs: Optional[Dict[str, Any]] = None, - early_stopping_kwargs: Optional[Dict[str, Any]] = None, logging_config: Optional[LightningLoggingConfig] = None, ) -> "LightningTask": return cls.restore_task_model( checkpoint_path=checkpoint_path, output_path=output_path, task_train_kwargs=task_train_kwargs, - early_stopping_kwargs=early_stopping_kwargs, lightning_module=SequenceLabelingModule, logging_config=logging_config, ) diff --git a/embeddings/task/lightning_task/text_classification.py b/embeddings/task/lightning_task/text_classification.py index 996392a1..a817cce8 100644 --- a/embeddings/task/lightning_task/text_classification.py +++ b/embeddings/task/lightning_task/text_classification.py @@ -57,14 +57,12 @@ def from_checkpoint( checkpoint_path: T_path, output_path: T_path, task_train_kwargs: Optional[Dict[str, Any]] = None, - early_stopping_kwargs: Optional[Dict[str, Any]] = None, logging_config: Optional[LightningLoggingConfig] = None, ) -> "LightningTask": return cls.restore_task_model( checkpoint_path=checkpoint_path, output_path=output_path, task_train_kwargs=task_train_kwargs, - early_stopping_kwargs=early_stopping_kwargs, lightning_module=TextClassificationModule, logging_config=logging_config, ) From 8434919616a2e4b491ea2c7124a32e4f567b0cd0 Mon Sep 17 00:00:00 2001 From: djaniak Date: Sat, 9 Apr 2022 12:41:01 +0200 Subject: [PATCH 17/22] fix(tests): flair inference tests --- examples/evaluate_sequence_labelling.py | 1 - tests/test_flair_inference.py | 14 +++++--------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/examples/evaluate_sequence_labelling.py b/examples/evaluate_sequence_labelling.py index f5484ef0..d92d11e7 100644 --- a/examples/evaluate_sequence_labelling.py +++ b/examples/evaluate_sequence_labelling.py @@ -43,7 +43,6 @@ def run( input_column_name=input_column_name, target_column_name=target_column_name, output_path=output_path, - hidden_size=hidden_size, evaluation_mode=evaluation_mode, tagging_scheme=tagging_scheme, ) diff --git a/tests/test_flair_inference.py b/tests/test_flair_inference.py index 60cf7ab5..a1e880c9 100644 --- a/tests/test_flair_inference.py +++ b/tests/test_flair_inference.py @@ -12,11 +12,8 @@ from embeddings.data.data_loader import HuggingFaceDataLoader from embeddings.data.dataset import Dataset -from embeddings.embedding.flair_embedding import FlairDocumentPoolEmbedding -from embeddings.embedding.flair_loader import ( - FlairDocumentPoolEmbeddingLoader, - FlairWordEmbeddingLoader, -) +from embeddings.embedding.auto_flair import AutoFlairDocumentEmbedding +from embeddings.embedding.flair_loader import FlairWordEmbeddingLoader from embeddings.evaluator.sequence_labeling_evaluator import SequenceLabelingEvaluator from embeddings.evaluator.text_classification_evaluator import TextClassificationEvaluator from embeddings.model.flair_model import FlairModel @@ -59,10 +56,9 @@ def text_classification_pipeline( ) data_loader = HuggingFaceDataLoader() transformation = ClassificationCorpusTransformation("text", "target").then( - DownsampleFlairCorpusTransformation(*(0.005, 0.01, 0.01), stratify=False) + 
DownsampleFlairCorpusTransformation(*(0.01, 0.01, 0.01), stratify=False) ) - embedding_loader = FlairDocumentPoolEmbeddingLoader("clarin-pl/word2vec-kgr10", "") - embedding = embedding_loader.get_embedding(FlairDocumentPoolEmbedding) + embedding = AutoFlairDocumentEmbedding.from_hub("allegro/herbert-base-cased") task = TextClassification(output_path.name, task_train_kwargs={"max_epochs": 1}) model = FlairModel(embedding, task) evaluator = TextClassificationEvaluator() @@ -89,7 +85,7 @@ def sequence_labeling_pipeline( hidden_size=256, task_train_kwargs={"max_epochs": 1, "mini_batch_size": 64}, ) - embedding_loader = FlairWordEmbeddingLoader("clarin-pl/word2vec-kgr10", "") + embedding_loader = FlairWordEmbeddingLoader("allegro/herbert-base-cased", "model_type_reference") embedding = embedding_loader.get_embedding() model = FlairModel(embedding, task) evaluator = SequenceLabelingEvaluator() From cfb2cb190ec8f30997ea5f281f7a9ed24d192de6 Mon Sep 17 00:00:00 2001 From: djaniak Date: Sun, 10 Apr 2022 21:39:12 +0200 Subject: [PATCH 18/22] fix(tests): isort --- tests/test_flair_inference.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_flair_inference.py b/tests/test_flair_inference.py index a1e880c9..75b16d72 100644 --- a/tests/test_flair_inference.py +++ b/tests/test_flair_inference.py @@ -85,7 +85,9 @@ def sequence_labeling_pipeline( hidden_size=256, task_train_kwargs={"max_epochs": 1, "mini_batch_size": 64}, ) - embedding_loader = FlairWordEmbeddingLoader("allegro/herbert-base-cased", "model_type_reference") + embedding_loader = FlairWordEmbeddingLoader( + "allegro/herbert-base-cased", "model_type_reference" + ) embedding = embedding_loader.get_embedding() model = FlairModel(embedding, task) evaluator = SequenceLabelingEvaluator() From 2c830c86ba691b830b0ff7df2afddcab3e623017 Mon Sep 17 00:00:00 2001 From: djaniak Date: Tue, 12 Apr 2022 17:16:49 +0200 Subject: [PATCH 19/22] fix: inference for lightning pipelines --- embeddings/data/datamodule.py | 2 - .../lightning_module/huggingface_module.py | 13 +++- .../lightning_module/lightning_module.py | 6 ++ .../lightning_module/sequence_labeling.py | 76 ++++++++++++++++++- .../task/lightning_task/lightning_task.py | 2 +- .../task/lightning_task/sequence_labeling.py | 9 +-- .../lightning_task/text_classification.py | 2 +- 7 files changed, 96 insertions(+), 14 deletions(-) diff --git a/embeddings/data/datamodule.py b/embeddings/data/datamodule.py index 67ae6c48..4182198a 100644 --- a/embeddings/data/datamodule.py +++ b/embeddings/data/datamodule.py @@ -75,9 +75,7 @@ def __init__( self.load_dataset_kwargs = load_dataset_kwargs if load_dataset_kwargs else {} self.dataloader_kwargs = dataloader_kwargs if dataloader_kwargs else {} self.seed = seed - self.setup() - super().__init__( dataset_info=self.dataset["train"].info, dataset_version=self.dataset["train"].info.version.version_str, diff --git a/embeddings/model/lightning_module/huggingface_module.py b/embeddings/model/lightning_module/huggingface_module.py index b772af83..ed23b6e5 100644 --- a/embeddings/model/lightning_module/huggingface_module.py +++ b/embeddings/model/lightning_module/huggingface_module.py @@ -1,7 +1,7 @@ import abc import sys from collections import ChainMap -from typing import Any, Dict, Optional, Type +from typing import Any, Dict, List, Optional, Type from torchmetrics import F1, Accuracy, MetricCollection, Precision, Recall from transformers import AutoConfig, AutoModel @@ -25,13 +25,15 @@ def __init__( 
self.save_hyperparameters({"downstream_model_type": downstream_model_type.__name__}) self.downstream_model_type = downstream_model_type self.config_kwargs = config_kwargs if config_kwargs else {} + self.target_names: Optional[List[str]] = None self._init_model() self._init_metrics() def setup(self, stage: Optional[str] = None) -> None: if stage in ("fit", None): + assert self.trainer is not None + self.target_names = self.trainer.datamodule.target_names if self.hparams.use_scheduler: - assert self.trainer is not None train_loader = self.trainer.datamodule.train_dataloader() gpus = getattr(self.trainer, "gpus") if getattr(self.trainer, "gpus") else 0 tb_size = self.hparams.train_batch_size * max(1, gpus) @@ -98,3 +100,10 @@ def forward(self, *args: Any, **kwargs: Any) -> Any: if isinstance(inputs, tuple): inputs = dict(ChainMap(*inputs)) return self.model(**inputs) + + def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None: + assert self.trainer is not None + checkpoint["target_names"] = self.trainer.datamodule.target_names + + def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: + self.target_names = checkpoint["target_names"] diff --git a/embeddings/model/lightning_module/lightning_module.py b/embeddings/model/lightning_module/lightning_module.py index 0a7592da..9c5a9808 100644 --- a/embeddings/model/lightning_module/lightning_module.py +++ b/embeddings/model/lightning_module/lightning_module.py @@ -15,9 +15,12 @@ from transformers import get_linear_schedule_with_warmup from embeddings.data.datamodule import HuggingFaceDataset +from embeddings.utils.loggers import get_logger Model = TypeVar("Model") +_logger = get_logger(__name__) + class LightningModule(pl.LightningModule, abc.ABC, Generic[Model]): def __init__( @@ -84,6 +87,9 @@ def _predict_with_trainer(self, dataloader: DataLoader[HuggingFaceDataset]) -> t model=self, dataloaders=dataloader, return_predictions=True, ckpt_path="best" ) except MisconfigurationException: # model loaded but not fitted + _logger.warning( + "The best model checkpoint cannot be loaded because trainer.fit has not been called. Using current weights for prediction." 
+ ) return self.trainer.predict( model=self, dataloaders=dataloader, diff --git a/embeddings/model/lightning_module/sequence_labeling.py b/embeddings/model/lightning_module/sequence_labeling.py index 1e27a4f2..d6b38cfd 100644 --- a/embeddings/model/lightning_module/sequence_labeling.py +++ b/embeddings/model/lightning_module/sequence_labeling.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Optional, Tuple +from typing import Any, Dict, Iterable, Optional, Tuple, Union import torch from pytorch_lightning.utilities.types import STEP_OUTPUT @@ -35,6 +35,19 @@ def __init__( task_model_kwargs=task_model_kwargs, ) self.ignore_index = ignore_index + self._str2int: Optional[Dict[str, int]] = None + self._int2str: Optional[Dict[int, str]] = None + + def setup(self, stage: Optional[str] = None) -> None: + if stage in ("fit", None): + assert self.trainer is not None + self._int2str = ( + self.trainer.datamodule.dataset["train"].features["labels"].feature._int2str + ) + self._str2int = ( + self.trainer.datamodule.dataset["train"].features["labels"].feature._str2int + ) + super().setup(stage=stage) def shared_step(self, **batch: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: outputs = self.forward(**batch) @@ -75,3 +88,64 @@ def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: else: _logger.warning("Missing labels for the test data") return None + + def str2int(self, values: Union[str, Iterable[Any]]) -> Union[int, Iterable[Any]]: + """Conversion class name string => integer duplicated from huggingface ClassLabel.""" + assert isinstance(values, str) or isinstance( + values, Iterable + ), f"Values {values} should be a string or an Iterable (list, numpy array, pytorch, tensorflow tensors)" + return_list = True + if isinstance(values, str): + values = [values] + return_list = False + + output = [] + for value in values: + if self._str2int: + # strip key if not in dict + if value not in self._str2int: + value = str(value).strip() + output.append(self._str2int[str(value)]) + else: + # No names provided, try to integerize + failed_parse = False + try: + output.append(int(value)) + if not 0 <= int(value) < self.hparams.num_classes: + failed_parse = True + except ValueError: + failed_parse = True + if failed_parse: + raise ValueError(f"Invalid string class label {value}") + return output if return_list else output[0] + + def int2str(self, values: Union[int, Iterable[Any]]) -> Union[str, Iterable[Any]]: + """Conversion integer => class name string duplicated from huggingface ClassLabel.""" + assert isinstance(values, int) or isinstance( + values, Iterable + ), f"Values {values} should be an integer or an Iterable (list, numpy array, pytorch, tensorflow tensors)" + return_list = True + if isinstance(values, int): + values = [values] + return_list = False + + for v in values: + if not 0 <= v < self.hparams.num_classes: + raise ValueError(f"Invalid integer class label {v:d}") + + if self._int2str: + output = [self._int2str[int(v)] for v in values] + else: + # No names provided, return str(values) + output = [str(v) for v in values] + return output if return_list else output[0] + + def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None: + checkpoint["_int2str"] = self._int2str + checkpoint["_str2int"] = self._str2int + super().on_save_checkpoint(checkpoint=checkpoint) + + def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: + self._int2str = checkpoint["_int2str"] + self._str2int = checkpoint["_str2int"] + super().on_load_checkpoint(checkpoint=checkpoint) diff 
--git a/embeddings/task/lightning_task/lightning_task.py b/embeddings/task/lightning_task/lightning_task.py index 6ac3ccbf..55a5d61b 100644 --- a/embeddings/task/lightning_task/lightning_task.py +++ b/embeddings/task/lightning_task/lightning_task.py @@ -62,7 +62,7 @@ def best_validation_score(self) -> Optional[float]: def _get_callbacks(self, dataset_subsets: Sequence[str]) -> List[Callback]: callbacks: List[Callback] = [ - ModelCheckpoint(dirpath=self.output_path.joinpath("checkpoints")) + ModelCheckpoint(dirpath=self.output_path.joinpath("checkpoints"), save_last=True) ] if "validation" in dataset_subsets: callbacks.append(BestEpochCallback()) diff --git a/embeddings/task/lightning_task/sequence_labeling.py b/embeddings/task/lightning_task/sequence_labeling.py index b1414f96..c31efa04 100644 --- a/embeddings/task/lightning_task/sequence_labeling.py +++ b/embeddings/task/lightning_task/sequence_labeling.py @@ -61,20 +61,15 @@ def predict( "y_probabilities": np.array(probabilities, dtype=object), } if return_names: - assert self.trainer is not None - assert hasattr(self.trainer, "datamodule") - results["names"] = np.array(getattr(self.trainer, "datamodule").target_names) + results["names"] = np.array(self.model.target_names) return results def _map_filter_data( self, data: nptyping.NDArray[Any], ground_truth_data: nptyping.NDArray[Any] ) -> List[str]: assert self.model is not None - assert self.trainer is not None - assert hasattr(self.trainer, "datamodule") return [ - getattr(self.trainer, "datamodule").id2str(x.item()) - for x in data[ground_truth_data != self.model.ignore_index] + self.model.int2str(x.item()) for x in data[ground_truth_data != self.model.ignore_index] ] @classmethod diff --git a/embeddings/task/lightning_task/text_classification.py b/embeddings/task/lightning_task/text_classification.py index a817cce8..c852a126 100644 --- a/embeddings/task/lightning_task/text_classification.py +++ b/embeddings/task/lightning_task/text_classification.py @@ -48,7 +48,7 @@ def predict( if return_names: assert self.trainer is not None assert hasattr(self.trainer, "datamodule") - results["names"] = np.array(getattr(self.trainer, "datamodule").target_names) + results["names"] = np.array(self.model.target_names) return results @classmethod From 63e96b7c150822ddc0eac316ebf818e423b59d31 Mon Sep 17 00:00:00 2001 From: djaniak Date: Tue, 12 Apr 2022 17:17:07 +0200 Subject: [PATCH 20/22] refactor: inference tests for flair and lightning --- tests/test_flair_inference.py | 144 ------------------ .../test_lightning_classification_pipeline.py | 41 ++++- tests/test_lightning_inference.py | 123 --------------- ...st_lightning_sequence_labeling_pipeline.py | 66 +++++--- tests/test_sequence_labelling.py | 38 ++++- tests/test_text_classification.py | 31 +++- 6 files changed, 146 insertions(+), 297 deletions(-) delete mode 100644 tests/test_flair_inference.py delete mode 100644 tests/test_lightning_inference.py diff --git a/tests/test_flair_inference.py b/tests/test_flair_inference.py deleted file mode 100644 index 75b16d72..00000000 --- a/tests/test_flair_inference.py +++ /dev/null @@ -1,144 +0,0 @@ -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Any, Dict - -import datasets -import flair -import numpy as np -import pytest -from _pytest.tmpdir import TempdirFactory -from flair.data import Corpus -from numpy import typing as nptyping - -from embeddings.data.data_loader import HuggingFaceDataLoader -from embeddings.data.dataset import Dataset -from 
embeddings.embedding.auto_flair import AutoFlairDocumentEmbedding -from embeddings.embedding.flair_loader import FlairWordEmbeddingLoader -from embeddings.evaluator.sequence_labeling_evaluator import SequenceLabelingEvaluator -from embeddings.evaluator.text_classification_evaluator import TextClassificationEvaluator -from embeddings.model.flair_model import FlairModel -from embeddings.pipeline.standard_pipeline import StandardPipeline -from embeddings.task.flair_task.sequence_labeling import SequenceLabeling -from embeddings.task.flair_task.text_classification import TextClassification -from embeddings.transformation.flair_transformation.classification_corpus_transformation import ( - ClassificationCorpusTransformation, -) -from embeddings.transformation.flair_transformation.column_corpus_transformation import ( - ColumnCorpusTransformation, -) -from embeddings.transformation.flair_transformation.downsample_corpus_transformation import ( - DownsampleFlairCorpusTransformation, -) -from embeddings.transformation.flair_transformation.split_sample_corpus_transformation import ( - SampleSplitsFlairCorpusTransformation, -) - - -@pytest.fixture(scope="module") -def tmp_path_module(tmpdir_factory: TempdirFactory) -> Path: - path = tmpdir_factory.mktemp(__name__) - return Path(path) - - -@pytest.fixture(scope="module") -def text_classification_pipeline( - tmp_path_module: "TemporaryDirectory[str]", -) -> StandardPipeline[ - str, datasets.DatasetDict, Corpus, Dict[str, nptyping.NDArray[Any]], Dict[str, Any] -]: - output_path = tmp_path_module - dataset = Dataset( - "clarin-pl/polemo2-official", - train_domains=["reviews"], - dev_domains=["reviews"], - test_domains=["reviews"], - text_cfg="sentence", - ) - data_loader = HuggingFaceDataLoader() - transformation = ClassificationCorpusTransformation("text", "target").then( - DownsampleFlairCorpusTransformation(*(0.01, 0.01, 0.01), stratify=False) - ) - embedding = AutoFlairDocumentEmbedding.from_hub("allegro/herbert-base-cased") - task = TextClassification(output_path.name, task_train_kwargs={"max_epochs": 1}) - model = FlairModel(embedding, task) - evaluator = TextClassificationEvaluator() - pipeline = StandardPipeline(dataset, data_loader, transformation, model, evaluator) - return pipeline - - -@pytest.fixture(scope="module") -def sequence_labeling_pipeline( - tmp_path_module: "TemporaryDirectory[str]", -) -> StandardPipeline[ - str, datasets.DatasetDict, Corpus, Dict[str, nptyping.NDArray[Any]], Dict[str, Any] -]: - output_path = tmp_path_module - dataset = Dataset("clarin-pl/kpwr-ner") - data_loader = HuggingFaceDataLoader() - transformation = ( - ColumnCorpusTransformation("tokens", "ner") - .then(SampleSplitsFlairCorpusTransformation(dev_fraction=0.1, seed=441)) - .then(DownsampleFlairCorpusTransformation(*(0.005, 0.01, 0.01), stratify=False)) - ) - task = SequenceLabeling( - output_path.name, - hidden_size=256, - task_train_kwargs={"max_epochs": 1, "mini_batch_size": 64}, - ) - embedding_loader = FlairWordEmbeddingLoader( - "allegro/herbert-base-cased", "model_type_reference" - ) - embedding = embedding_loader.get_embedding() - model = FlairModel(embedding, task) - evaluator = SequenceLabelingEvaluator() - pipeline = StandardPipeline(dataset, data_loader, transformation, model, evaluator) - return pipeline - - -def test_text_classification_inference( - text_classification_pipeline: StandardPipeline[ - str, datasets.DatasetDict, Corpus, Dict[str, nptyping.NDArray[Any]], Dict[str, Any] - ], - tmp_path_module: "TemporaryDirectory[str]", -) -> None: 
- flair.set_seed(441) - output_path = tmp_path_module - pipeline = text_classification_pipeline - result = pipeline.run() - - task_from_ckpt = TextClassification.from_checkpoint( - checkpoint_path=(Path(output_path.name) / "final-model.pt"), output_path=output_path.name - ) - loaded_data = pipeline.data_loader.load(pipeline.dataset) - transformed_data = pipeline.transformation.transform(loaded_data) - test_data = transformed_data.test - - y_pred, loss = task_from_ckpt.predict(test_data) - y_true = task_from_ckpt.get_y(test_data, task_from_ckpt.y_type, task_from_ckpt.y_dictionary) - results_from_ckpt = pipeline.evaluator.evaluate({"y_pred": y_pred, "y_true": y_true}) - assert np.array_equal(result["data"]["y_pred"], results_from_ckpt["data"]["y_pred"]) - - -def test_sequence_labeling_inference( - sequence_labeling_pipeline: StandardPipeline[ - str, datasets.DatasetDict, Corpus, Dict[str, nptyping.NDArray[Any]], Dict[str, Any] - ], - tmp_path_module: "TemporaryDirectory[str]", -) -> None: - flair.set_seed(441) - output_path = tmp_path_module - pipeline = sequence_labeling_pipeline - result = pipeline.run() - - task_from_ckpt = SequenceLabeling.from_checkpoint( - checkpoint_path=(Path(output_path.name) / "final-model.pt"), output_path=output_path.name - ) - loaded_data = pipeline.data_loader.load(pipeline.dataset) - transformed_data = pipeline.transformation.transform(loaded_data) - test_data = transformed_data.test - - y_pred, loss = task_from_ckpt.predict(test_data) - y_true = task_from_ckpt.get_y(test_data, task_from_ckpt.y_type, task_from_ckpt.y_dictionary) - results_from_ckpt = pipeline.evaluator.evaluate({"y_pred": y_pred, "y_true": y_true}) - - assert np.array_equal(result["data"]["y_pred"], results_from_ckpt["data"]["y_pred"]) diff --git a/tests/test_lightning_classification_pipeline.py b/tests/test_lightning_classification_pipeline.py index eb43122a..64257bbd 100644 --- a/tests/test_lightning_classification_pipeline.py +++ b/tests/test_lightning_classification_pipeline.py @@ -6,12 +6,14 @@ import numpy as np import pytest import pytorch_lightning as pl +import torch from _pytest.tmpdir import TempdirFactory from embeddings.config.lightning_config import LightningAdvancedConfig from embeddings.pipeline.hf_preprocessing_pipeline import HuggingFacePreprocessingPipeline from embeddings.pipeline.lightning_classification import LightningClassificationPipeline from embeddings.pipeline.lightning_pipeline import LightningPipeline +from embeddings.task.lightning_task.text_classification import TextClassificationTask @pytest.fixture(scope="module") @@ -30,7 +32,7 @@ def dataset_kwargs(tmp_path_module: Path) -> Dict[str, Any]: "test_domains": ["hotels", "medicine"], "text_cfg": "text", }, - persist_path=tmp_path_module.name, + persist_path=str(tmp_path_module), sample_missing_splits=None, ignore_test_subset=False, downsample_splits=(0.01, 0.01, 0.05), @@ -39,7 +41,7 @@ def dataset_kwargs(tmp_path_module: Path) -> Dict[str, Any]: pipeline.run() return { - "dataset_name_or_path": tmp_path_module.name, + "dataset_name_or_path": tmp_path_module, "input_column_name": ["text"], "target_column_name": "target", } @@ -86,7 +88,7 @@ def lightning_classification_pipeline( ) -> LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]]: return LightningClassificationPipeline( embedding_name_or_path="allegro/herbert-base-cased", - output_path=tmp_path_module.name, + output_path=tmp_path_module, config=config, devices="auto", accelerator="cpu", @@ -97,11 +99,19 @@ def 
lightning_classification_pipeline( def test_lightning_classification_pipeline( lightning_classification_pipeline: LightningPipeline[ datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any] - ] + ], + tmp_path_module: Path, ) -> None: pl.seed_everything(441, workers=True) pipeline = lightning_classification_pipeline result = pipeline.run() + + assert_result_values(result) + assert_result_types(result) + assert_inference_from_checkpoint(result, pipeline, tmp_path_module) + + +def assert_result_values(result: Dict[str, Any]) -> None: np.testing.assert_almost_equal( result["accuracy"]["accuracy"], 0.3783783, decimal=pytest.decimal ) @@ -115,6 +125,8 @@ def test_lightning_classification_pipeline( result["recall__average_macro"]["recall"], 0.2333333, decimal=pytest.decimal ) + +def assert_result_types(result: Dict[str, Any]) -> None: assert "data" in result assert "y_pred" in result["data"] assert "y_true" in result["data"] @@ -128,3 +140,24 @@ def test_lightning_classification_pipeline( assert result["data"]["y_true"].dtype == np.int64 assert result["data"]["y_probabilities"].dtype == np.float32 assert isinstance(result["data"]["names"][0], str) + + +def assert_inference_from_checkpoint( + result: Dict[str, Any], + pipeline: LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]], + tmp_path_module: Path, +) -> None: + ckpt_path = tmp_path_module / "checkpoints" / "last.ckpt" + task_from_ckpt = TextClassificationTask.from_checkpoint( + checkpoint_path=ckpt_path.resolve(), + output_path=tmp_path_module, + ) + + model_state_dict = pipeline.model.task.model.model.state_dict() + model_from_ckpt_state_dict = task_from_ckpt.model.model.state_dict() + assert model_state_dict.keys() == model_from_ckpt_state_dict.keys() + for k in model_state_dict.keys(): + assert torch.equal(model_state_dict[k], model_from_ckpt_state_dict[k]) + + predictions = task_from_ckpt.predict(pipeline.datamodule.test_dataloader()) + assert np.array_equal(result["data"]["y_probabilities"], predictions["y_probabilities"]) diff --git a/tests/test_lightning_inference.py b/tests/test_lightning_inference.py deleted file mode 100644 index aaa5fa4c..00000000 --- a/tests/test_lightning_inference.py +++ /dev/null @@ -1,123 +0,0 @@ -from pathlib import Path -from tempfile import TemporaryDirectory -from typing import Any, Dict, Tuple - -import datasets -import numpy as np -import pytest -import pytorch_lightning as pl -import torch -from _pytest.tmpdir import TempdirFactory - -from embeddings.config.lightning_config import LightningAdvancedConfig -from embeddings.pipeline.hf_preprocessing_pipeline import HuggingFacePreprocessingPipeline -from embeddings.pipeline.lightning_classification import LightningClassificationPipeline -from embeddings.pipeline.lightning_pipeline import LightningPipeline -from embeddings.task.lightning_task.text_classification import TextClassificationTask - - -@pytest.fixture(scope="module") -def tmp_path_module(tmpdir_factory: TempdirFactory) -> Path: - path = tmpdir_factory.mktemp(__name__) - return Path(path) - - -@pytest.fixture(scope="module") -def dataset_kwargs(tmp_path_module: Path) -> Dict[str, Any]: - pipeline = HuggingFacePreprocessingPipeline( - dataset_name="clarin-pl/polemo2-official", - load_dataset_kwargs={ - "train_domains": ["hotels", "medicine"], - "dev_domains": ["hotels", "medicine"], - "test_domains": ["hotels", "medicine"], - "text_cfg": "text", - }, - persist_path=tmp_path_module.name, - sample_missing_splits=None, - ignore_test_subset=False, - 
downsample_splits=(0.01, 0.01, 0.05), - seed=441, - ) - pipeline.run() - - return { - "dataset_name_or_path": tmp_path_module.name, - "input_column_name": ["text"], - "target_column_name": "target", - } - - -@pytest.fixture(scope="module") -def config() -> LightningAdvancedConfig: - return LightningAdvancedConfig( - finetune_last_n_layers=0, - task_train_kwargs={ - "max_epochs": 1, - "deterministic": True, - }, - task_model_kwargs={ - "learning_rate": 5e-4, - "train_batch_size": 32, - "eval_batch_size": 32, - "use_scheduler": False, - "optimizer": "AdamW", - "adam_epsilon": 1e-8, - "warmup_steps": 100, - "weight_decay": 0.0, - }, - datamodule_kwargs={ - "max_seq_length": 64, - }, - early_stopping_kwargs={ - "monitor": "val/Loss", - "mode": "min", - "patience": 3, - }, - tokenizer_kwargs={}, - batch_encoding_kwargs={}, - dataloader_kwargs={}, - model_config_kwargs={}, - ) - - -@pytest.fixture(scope="module") -def lightning_classification_pipeline( - dataset_kwargs: Dict[str, Any], - config: LightningAdvancedConfig, - tmp_path_module: Path, -) -> LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]]: - return LightningClassificationPipeline( - embedding_name_or_path="allegro/herbert-base-cased", - output_path=tmp_path_module.name, - config=config, - devices="auto", - accelerator="cpu", - **dataset_kwargs, - ) - - -def test_lightning_pipeline_inference( - lightning_classification_pipeline: LightningPipeline[ - datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any] - ], - tmp_path_module: "TemporaryDirectory[str]", -) -> None: - pl.seed_everything(441, workers=True) - pipeline = lightning_classification_pipeline - results = pipeline.run() - - ckpt_path = Path(tmp_path_module.name) / "checkpoints" / "epoch=0-step=1.ckpt" - task_from_ckpt = TextClassificationTask.from_checkpoint( - checkpoint_path=ckpt_path.resolve(), - output_path=tmp_path_module.name, - ) - - model_state_dict = pipeline.model.task.model.model.state_dict() - model_from_ckpt_state_dict = task_from_ckpt.model.model.state_dict() - assert model_state_dict.keys() == model_from_ckpt_state_dict.keys() - for k in model_state_dict.keys(): - assert torch.equal(model_state_dict[k], model_from_ckpt_state_dict[k]) - - test_dataloader = pipeline.datamodule.test_dataloader() - predictions = task_from_ckpt.predict(test_dataloader, return_names=False) - assert np.array_equal(results["data"]["y_probabilities"], predictions["y_probabilities"]) diff --git a/tests/test_lightning_sequence_labeling_pipeline.py b/tests/test_lightning_sequence_labeling_pipeline.py index 9804f19a..ed710f14 100644 --- a/tests/test_lightning_sequence_labeling_pipeline.py +++ b/tests/test_lightning_sequence_labeling_pipeline.py @@ -5,12 +5,14 @@ import numpy as np import pytest import pytorch_lightning as pl +import torch from _pytest.tmpdir import TempdirFactory from embeddings.config.lightning_config import LightningAdvancedConfig from embeddings.pipeline.hf_preprocessing_pipeline import HuggingFacePreprocessingPipeline from embeddings.pipeline.lightning_pipeline import LightningPipeline from embeddings.pipeline.lightning_sequence_labeling import LightningSequenceLabelingPipeline +from embeddings.task.lightning_task.sequence_labeling import SequenceLabelingTask @pytest.fixture(scope="module") @@ -20,12 +22,11 @@ def tmp_path_module(tmpdir_factory: TempdirFactory) -> Path: @pytest.fixture(scope="module") -def dataset_kwargs(tmp_path_module) -> Dict[str, Any]: - path = str(tmp_path_module) +def dataset_kwargs(tmp_path_module: Path) -> 
Dict[str, Any]: pipeline = HuggingFacePreprocessingPipeline( dataset_name="clarin-pl/kpwr-ner", load_dataset_kwargs=None, - persist_path=path, + persist_path=str(tmp_path_module), sample_missing_splits=None, ignore_test_subset=False, downsample_splits=(0.01, 0.01, 0.05), @@ -34,7 +35,7 @@ def dataset_kwargs(tmp_path_module) -> Dict[str, Any]: pipeline.run() return { - "dataset_name_or_path": path, + "dataset_name_or_path": tmp_path_module, "input_column_name": "tokens", "target_column_name": "ner", } @@ -79,28 +80,32 @@ def config() -> LightningAdvancedConfig: def lightning_sequence_labeling_pipeline( dataset_kwargs: Dict[str, Any], config: LightningAdvancedConfig, - tmp_path: Path, -) -> Tuple[LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]], Path]: - return ( - LightningSequenceLabelingPipeline( - output_path=tmp_path, - embedding_name_or_path="allegro/herbert-base-cased", - config=config, - **dataset_kwargs, - ), - tmp_path, + tmp_path_module: Path, +) -> LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]]: + return LightningSequenceLabelingPipeline( + output_path=tmp_path_module, + embedding_name_or_path="allegro/herbert-base-cased", + config=config, + **dataset_kwargs, ) def test_lightning_sequence_labeling_pipeline( - lightning_sequence_labeling_pipeline: Tuple[ - LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]], - Path, + lightning_sequence_labeling_pipeline: LightningPipeline[ + datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any] ], + tmp_path_module: Path, ) -> None: pl.seed_everything(441) - pipeline, path = lightning_sequence_labeling_pipeline + pipeline = lightning_sequence_labeling_pipeline result = pipeline.run() + + assert_result_values(result) + assert_result_types(result) + assert_inference_from_checkpoint(result, pipeline, tmp_path_module) + + +def assert_result_values(result: Dict[str, Any]) -> None: np.testing.assert_almost_equal( result["seqeval__mode_None__scheme_None"]["overall_accuracy"], 0.0015690, @@ -120,6 +125,8 @@ def test_lightning_sequence_labeling_pipeline( decimal=pytest.decimal, ) + +def assert_result_types(result: Dict[str, Any]) -> None: assert "data" in result assert "y_pred" in result["data"] assert "y_true" in result["data"] @@ -138,3 +145,26 @@ def test_lightning_sequence_labeling_pipeline( assert isinstance(result["data"]["y_probabilities"][0][0], np.ndarray) assert isinstance(result["data"]["names"][0], str) assert isinstance(result["data"]["y_probabilities"][0][0][0], np.float32) + + +def assert_inference_from_checkpoint( + result: Dict[str, Any], + pipeline: LightningPipeline[datasets.DatasetDict, Dict[str, np.ndarray], Dict[str, Any]], + tmp_path_module: Path, +) -> None: + ckpt_path = tmp_path_module / "checkpoints" / "last.ckpt" + task_from_ckpt = SequenceLabelingTask.from_checkpoint( + checkpoint_path=ckpt_path.resolve(), + output_path=tmp_path_module, + ) + + model_state_dict = pipeline.model.task.model.model.state_dict() + model_from_ckpt_state_dict = task_from_ckpt.model.model.state_dict() + assert model_state_dict.keys() == model_from_ckpt_state_dict.keys() + for k in model_state_dict.keys(): + assert torch.equal(model_state_dict[k], model_from_ckpt_state_dict[k]) + + predictions = task_from_ckpt.predict(pipeline.datamodule.test_dataloader()) + assert np.array_equal( + result["data"]["y_probabilities"][0][0], predictions["y_probabilities"][0][0] + ) diff --git a/tests/test_sequence_labelling.py b/tests/test_sequence_labelling.py index 
5233514b..adebfa4b 100644 --- a/tests/test_sequence_labelling.py +++ b/tests/test_sequence_labelling.py @@ -162,10 +162,24 @@ def test_pos_tagging_pipeline( flair.device = torch.device("cpu") pipeline, path = pos_tagging_pipeline result = pipeline.run() - path.cleanup() np.testing.assert_almost_equal(result["UnitSeqeval"]["overall_f1"], 0.1450381) + task_from_ckpt = SequenceLabeling.from_checkpoint( + checkpoint_path=(Path(path.name) / "final-model.pt"), output_path=path.name + ) + loaded_data = pipeline.data_loader.load(pipeline.dataset) + transformed_data = pipeline.transformation.transform(loaded_data) + test_data = transformed_data.test + + y_pred, loss = task_from_ckpt.predict(test_data) + y_true = task_from_ckpt.get_y(test_data, task_from_ckpt.y_type, task_from_ckpt.y_dictionary) + results_from_ckpt = pipeline.evaluator.evaluate({"y_pred": y_pred, "y_true": y_true}) + + assert np.array_equal(result["data"]["y_pred"], results_from_ckpt["data"]["y_pred"]) + + path.cleanup() + def test_ner_tagging_pipeline( ner_tagging_pipeline: Tuple[ @@ -179,10 +193,24 @@ def test_ner_tagging_pipeline( flair.device = torch.device("cpu") pipeline, path = ner_tagging_pipeline result = pipeline.run() - path.cleanup() np.testing.assert_almost_equal(result["seqeval__mode_None__scheme_None"]["overall_f1"], 0.0) + task_from_ckpt = SequenceLabeling.from_checkpoint( + checkpoint_path=(Path(path.name) / "final-model.pt"), output_path=path.name + ) + loaded_data = pipeline.data_loader.load(pipeline.dataset) + transformed_data = pipeline.transformation.transform(loaded_data) + test_data = transformed_data.test + + y_pred, loss = task_from_ckpt.predict(test_data) + y_true = task_from_ckpt.get_y(test_data, task_from_ckpt.y_type, task_from_ckpt.y_dictionary) + results_from_ckpt = pipeline.evaluator.evaluate({"y_pred": y_pred, "y_true": y_true}) + + assert np.array_equal(result["data"]["y_pred"], results_from_ckpt["data"]["y_pred"]) + + path.cleanup() + def test_pos_tagging_pipeline_local_embedding( pos_tagging_pipeline_local_embedding: Tuple[ @@ -196,9 +224,8 @@ def test_pos_tagging_pipeline_local_embedding( flair.device = torch.device("cpu") pipeline, path = pos_tagging_pipeline_local_embedding result = pipeline.run() - path.cleanup() - np.testing.assert_almost_equal(result["UnitSeqeval"]["overall_f1"], 0.1832061) + path.cleanup() def test_ner_tagging_pipeline_local_embedding( @@ -213,8 +240,7 @@ def test_ner_tagging_pipeline_local_embedding( flair.device = torch.device("cpu") pipeline, path = ner_tagging_pipeline_local_embedding result = pipeline.run() - path.cleanup() - np.testing.assert_almost_equal( result["seqeval__mode_None__scheme_None"]["overall_f1"], 0.0107816 ) + path.cleanup() diff --git a/tests/test_text_classification.py b/tests/test_text_classification.py index 433ebbed..56d895ac 100644 --- a/tests/test_text_classification.py +++ b/tests/test_text_classification.py @@ -63,12 +63,25 @@ def test_text_classification_pipeline( flair.set_seed(441) pipeline, path = text_classification_pipeline result = pipeline.run() - path.cleanup() np.testing.assert_almost_equal(result["accuracy"]["accuracy"], 0.3333333) np.testing.assert_almost_equal(result["f1__average_macro"]["f1"], 0.1666666) np.testing.assert_almost_equal(result["precision__average_macro"]["precision"], 0.1111111) np.testing.assert_almost_equal(result["recall__average_macro"]["recall"], 0.3333333) + task_from_ckpt = TextClassification.from_checkpoint( + checkpoint_path=(Path(path.name) / "final-model.pt"), output_path=path.name + ) + loaded_data = 
pipeline.data_loader.load(pipeline.dataset) + transformed_data = pipeline.transformation.transform(loaded_data) + test_data = transformed_data.test + + y_pred, loss = task_from_ckpt.predict(test_data) + y_true = task_from_ckpt.get_y(test_data, task_from_ckpt.y_type, task_from_ckpt.y_dictionary) + results_from_ckpt = pipeline.evaluator.evaluate({"y_pred": y_pred, "y_true": y_true}) + assert np.array_equal(result["data"]["y_pred"], results_from_ckpt["data"]["y_pred"]) + + path.cleanup() + @pytest.fixture(scope="module") def text_classification_pipeline_local_embedding( @@ -111,8 +124,22 @@ def test_text_classification_pipeline_local_embedding( flair.set_seed(441) pipeline, path = text_classification_pipeline_local_embedding result = pipeline.run() - path.cleanup() + np.testing.assert_almost_equal(result["accuracy"]["accuracy"], 0.3333333) np.testing.assert_almost_equal(result["f1__average_macro"]["f1"], 0.3333333) np.testing.assert_almost_equal(result["precision__average_macro"]["precision"], 0.3333333) np.testing.assert_almost_equal(result["recall__average_macro"]["recall"], 0.3333333) + + task_from_ckpt = TextClassification.from_checkpoint( + checkpoint_path=(Path(path.name) / "final-model.pt"), output_path=path.name + ) + loaded_data = pipeline.data_loader.load(pipeline.dataset) + transformed_data = pipeline.transformation.transform(loaded_data) + test_data = transformed_data.test + + y_pred, loss = task_from_ckpt.predict(test_data) + y_true = task_from_ckpt.get_y(test_data, task_from_ckpt.y_type, task_from_ckpt.y_dictionary) + results_from_ckpt = pipeline.evaluator.evaluate({"y_pred": y_pred, "y_true": y_true}) + assert np.array_equal(result["data"]["y_pred"], results_from_ckpt["data"]["y_pred"]) + + path.cleanup() From f7385da34596778e4fb11356e824b275de8ca88f Mon Sep 17 00:00:00 2001 From: djaniak Date: Tue, 12 Apr 2022 17:17:30 +0200 Subject: [PATCH 21/22] misc: update tutorial notebook --- .../validate_lightning_models_inference.ipynb | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tutorials/validate_lightning_models_inference.ipynb b/tutorials/validate_lightning_models_inference.ipynb index d9326b1f..ee0cab56 100644 --- a/tutorials/validate_lightning_models_inference.ipynb +++ b/tutorials/validate_lightning_models_inference.ipynb @@ -115,7 +115,7 @@ " \"learning_rate\": 5e-4,\n", " \"train_batch_size\": 32,\n", " \"eval_batch_size\": 32,\n", - " \"use_scheduler\": False,\n", + " \"use_scheduler\": True,\n", " \"optimizer\": \"AdamW\",\n", " \"adam_epsilon\": 1e-8,\n", " \"warmup_steps\": 100,\n", @@ -165,7 +165,7 @@ "metadata": {}, "outputs": [], "source": [ - "ckpt_path = output_path / \"checkpoints\" / \"epoch=0-step=1.ckpt\"\n", + "ckpt_path = output_path / \"checkpoints\" / \"last.ckpt\"\n", "ckpt_path" ] }, @@ -231,11 +231,13 @@ "cell_type": "code", "execution_count": null, "id": "4ad7b9b0-823a-4c8e-aac5-61a333558ed1", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "test_dataloader = pipeline.datamodule.test_dataloader()\n", - "preds = task_from_ckpt.predict(test_dataloader, return_names=False)\n", + "preds = task_from_ckpt.predict(test_dataloader)\n", "preds" ] }, @@ -282,9 +284,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:clarinpl-embeddings]", + "display_name": "Python [conda env:embeddings]", "language": "python", - "name": "conda-env-clarinpl-embeddings-py" + "name": "conda-env-embeddings-py" }, "language_info": { "codemirror_mode": { From 2566140117d5015a83e90ba770962faeb5e28e57 
Mon Sep 17 00:00:00 2001 From: djaniak Date: Wed, 13 Apr 2022 14:29:55 +0200 Subject: [PATCH 22/22] refactor: pr issues --- .../lightning_module/sequence_labeling.py | 71 ++----------------- .../task/lightning_task/sequence_labeling.py | 6 +- .../lightning_task/text_classification.py | 5 +- 3 files changed, 11 insertions(+), 71 deletions(-) diff --git a/embeddings/model/lightning_module/sequence_labeling.py b/embeddings/model/lightning_module/sequence_labeling.py index d6b38cfd..0e1e25fd 100644 --- a/embeddings/model/lightning_module/sequence_labeling.py +++ b/embeddings/model/lightning_module/sequence_labeling.py @@ -1,6 +1,7 @@ -from typing import Any, Dict, Iterable, Optional, Tuple, Union +from typing import Any, Dict, Optional, Tuple import torch +from datasets import ClassLabel from pytorch_lightning.utilities.types import STEP_OUTPUT from torchmetrics import MetricCollection from transformers import AutoModelForTokenClassification @@ -35,18 +36,13 @@ def __init__( task_model_kwargs=task_model_kwargs, ) self.ignore_index = ignore_index - self._str2int: Optional[Dict[str, int]] = None - self._int2str: Optional[Dict[int, str]] = None + self.class_label: Optional[ClassLabel] = None def setup(self, stage: Optional[str] = None) -> None: if stage in ("fit", None): assert self.trainer is not None - self._int2str = ( - self.trainer.datamodule.dataset["train"].features["labels"].feature._int2str - ) - self._str2int = ( - self.trainer.datamodule.dataset["train"].features["labels"].feature._str2int - ) + self.class_label = self.trainer.datamodule.dataset["train"].features["labels"].feature + assert isinstance(self.class_label, ClassLabel) super().setup(stage=stage) def shared_step(self, **batch: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: @@ -89,63 +85,10 @@ def test_step(self, *args: Any, **kwargs: Any) -> Optional[STEP_OUTPUT]: _logger.warning("Missing labels for the test data") return None - def str2int(self, values: Union[str, Iterable[Any]]) -> Union[int, Iterable[Any]]: - """Conversion class name string => integer duplicated from huggingface ClassLabel.""" - assert isinstance(values, str) or isinstance( - values, Iterable - ), f"Values {values} should be a string or an Iterable (list, numpy array, pytorch, tensorflow tensors)" - return_list = True - if isinstance(values, str): - values = [values] - return_list = False - - output = [] - for value in values: - if self._str2int: - # strip key if not in dict - if value not in self._str2int: - value = str(value).strip() - output.append(self._str2int[str(value)]) - else: - # No names provided, try to integerize - failed_parse = False - try: - output.append(int(value)) - if not 0 <= int(value) < self.hparams.num_classes: - failed_parse = True - except ValueError: - failed_parse = True - if failed_parse: - raise ValueError(f"Invalid string class label {value}") - return output if return_list else output[0] - - def int2str(self, values: Union[int, Iterable[Any]]) -> Union[str, Iterable[Any]]: - """Conversion integer => class name string duplicated from huggingface ClassLabel.""" - assert isinstance(values, int) or isinstance( - values, Iterable - ), f"Values {values} should be an integer or an Iterable (list, numpy array, pytorch, tensorflow tensors)" - return_list = True - if isinstance(values, int): - values = [values] - return_list = False - - for v in values: - if not 0 <= v < self.hparams.num_classes: - raise ValueError(f"Invalid integer class label {v:d}") - - if self._int2str: - output = [self._int2str[int(v)] for v in values] 
- else: - # No names provided, return str(values) - output = [str(v) for v in values] - return output if return_list else output[0] - def on_save_checkpoint(self, checkpoint: Dict[str, Any]) -> None: - checkpoint["_int2str"] = self._int2str - checkpoint["_str2int"] = self._str2int + checkpoint["class_label"] = self.class_label super().on_save_checkpoint(checkpoint=checkpoint) def on_load_checkpoint(self, checkpoint: Dict[str, Any]) -> None: - self._int2str = checkpoint["_int2str"] - self._str2int = checkpoint["_str2int"] + self.class_label = checkpoint["class_label"] super().on_load_checkpoint(checkpoint=checkpoint) diff --git a/embeddings/task/lightning_task/sequence_labeling.py b/embeddings/task/lightning_task/sequence_labeling.py index c31efa04..9e08da99 100644 --- a/embeddings/task/lightning_task/sequence_labeling.py +++ b/embeddings/task/lightning_task/sequence_labeling.py @@ -59,9 +59,8 @@ def predict( "y_pred": np.array(predictions, dtype=object), "y_true": np.array(ground_truth, dtype=object), "y_probabilities": np.array(probabilities, dtype=object), + "names": np.array(self.model.target_names), } - if return_names: - results["names"] = np.array(self.model.target_names) return results def _map_filter_data( @@ -69,7 +68,8 @@ def _map_filter_data( ) -> List[str]: assert self.model is not None return [ - self.model.int2str(x.item()) for x in data[ground_truth_data != self.model.ignore_index] + self.model.class_label.int2str(x.item()) + for x in data[ground_truth_data != self.model.ignore_index] ] @classmethod diff --git a/embeddings/task/lightning_task/text_classification.py b/embeddings/task/lightning_task/text_classification.py index c852a126..442e9406 100644 --- a/embeddings/task/lightning_task/text_classification.py +++ b/embeddings/task/lightning_task/text_classification.py @@ -45,10 +45,7 @@ def predict( ) -> Dict[str, nptyping.NDArray[Any]]: assert self.model is not None results = self.model.predict(dataloader=dataloader) - if return_names: - assert self.trainer is not None - assert hasattr(self.trainer, "datamodule") - results["names"] = np.array(self.model.target_names) + results["names"] = np.array(self.model.target_names) return results @classmethod
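
Taken together, the patches above make a trained Lightning task restorable from its checkpoint: target_names and the sequence-labeling ClassLabel mapping are written into the checkpoint, ModelCheckpoint now keeps a last.ckpt, and predict reads label names from the restored module instead of the trainer's datamodule. The sketch below is an illustrative usage example rather than part of the patch series, mirroring what the updated pipeline tests exercise; the checkpoint and output paths, and the pre-built datamodule, are placeholders you would supply yourself.

    # Usage sketch (not part of the patches): restore a task from the last
    # checkpoint and run inference without re-fitting a trainer.
    from embeddings.task.lightning_task.text_classification import TextClassificationTask

    task = TextClassificationTask.from_checkpoint(
        checkpoint_path="output/checkpoints/last.ckpt",  # placeholder path
        output_path="output",                            # placeholder path
    )

    # Label names come from the target_names stored in the checkpoint, so no
    # fitted datamodule is needed to interpret the outputs.
    predictions = task.predict(datamodule.test_dataloader())  # `datamodule` assumed to be built elsewhere
    print(predictions["y_probabilities"], predictions["names"])

SequenceLabelingTask.from_checkpoint follows the same pattern and additionally restores the ClassLabel feature saved in on_save_checkpoint, so predicted label ids can be mapped back to tag names via class_label.int2str.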