From ee1cfce349405d383dc9f2ff13fd20b77ac24de0 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 9 Sep 2022 17:37:15 +0200
Subject: [PATCH 01/96] move sample generation to datamodule instead of dataset

---
 anomalib/data/folder.py | 78 +++++++++++++++--------------------------
 anomalib/data/mvtec.py  | 52 +++++++++++++--------------
 2 files changed, 52 insertions(+), 78 deletions(-)

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index 0f3b47adbd..ed2357619d 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -82,7 +82,6 @@ def make_dataset(
     abnormal_dir: Union[str, Path],
     normal_test_dir: Optional[Union[str, Path]] = None,
     mask_dir: Optional[Union[str, Path]] = None,
-    split: Optional[str] = None,
     split_ratio: float = 0.2,
     seed: Optional[int] = None,
     create_validation_set: bool = True,
@@ -120,9 +119,10 @@ def make_dataset(
         dirs = {**dirs, **{"normal_test": normal_test_dir}}
 
     for dir_type, path in dirs.items():
-        filename, label = _prepare_files_labels(path, dir_type, extensions)
-        filenames += filename
-        labels += label
+        if path is not None:
+            filename, label = _prepare_files_labels(path, dir_type, extensions)
+            filenames += filename
+            labels += label
 
     samples = DataFrame({"image_path": filenames, "label": labels})
 
@@ -158,11 +158,6 @@ def make_dataset(
     if create_validation_set:
         samples = create_validation_set_from_test_set(samples, seed=seed, normal_label="normal")
 
-    # Get the data frame for the split.
-    if split is not None and split in ["train", "val", "test"]:
-        samples = samples[samples.split == split]
-        samples = samples.reset_index(drop=True)
-
     return samples
 
 
@@ -171,19 +166,13 @@ class FolderDataset(Dataset):
 
     def __init__(
         self,
-        normal_dir: Union[Path, str],
-        abnormal_dir: Union[Path, str],
+        samples: DataFrame,
         split: str,
         pre_process: PreProcessor,
-        normal_test_dir: Optional[Union[Path, str]] = None,
-        split_ratio: float = 0.2,
         mask_dir: Optional[Union[Path, str]] = None,
-        extensions: Optional[Tuple[str, ...]] = None,
         task: Optional[str] = None,
-        seed: Optional[int] = None,
-        create_validation_set: bool = False,
     ) -> None:
-        """Create Folder Folder Dataset.
+        """Create Folder Dataset.
 
         Args:
             normal_dir (Union[str, Path]): Path to the directory containing normal images.
@@ -232,17 +221,7 @@ def __init__(
         self.task = task
         self.pre_process = pre_process
 
-        self.samples = make_dataset(
-            normal_dir=normal_dir,
-            abnormal_dir=abnormal_dir,
-            normal_test_dir=normal_test_dir,
-            mask_dir=mask_dir,
-            split=split,
-            split_ratio=split_ratio,
-            seed=seed,
-            create_validation_set=create_validation_set,
-            extensions=extensions,
-        )
+        self.samples = samples
 
     def __len__(self) -> int:
         """Get length of the dataset."""
@@ -423,7 +402,7 @@ def __init__(
 
         self.root = _check_and_convert_path(root)
         self.normal_dir = self.root / normal_dir
-        self.abnormal_dir = self.root / abnormal_dir
+        self.abnormal_dir = self.root / abnormal_dir if abnormal_dir is not None else None
         self.normal_test = normal_test_dir
         if normal_test_dir:
             self.normal_test = self.root / normal_test_dir
@@ -461,6 +440,17 @@ def __init__(
         self.val_data: Dataset
         self.inference_data: Dataset
 
+        self.samples = make_dataset(
+            normal_dir=self.normal_dir,
+            abnormal_dir=self.abnormal_dir,
+            normal_test_dir=self.normal_test,
+            mask_dir=mask_dir,
+            split_ratio=split_ratio,
+            seed=seed,
+            create_validation_set=create_validation_set,
+            extensions=extensions,
+        )
+
     def setup(self, stage: Optional[str] = None) -> None:
         """Setup train, validation and test data.
@@ -470,47 +460,35 @@ def setup(self, stage: Optional[str] = None) -> None:
         """
         logger.info("Setting up train, validation, test and prediction datasets.")
         if stage in (None, "fit"):
+            train_samples = self.samples[self.samples.split == "train"]
+            train_samples = train_samples.reset_index(drop=True)
             self.train_data = FolderDataset(
-                normal_dir=self.normal_dir,
-                abnormal_dir=self.abnormal_dir,
-                normal_test_dir=self.normal_test,
+                samples=train_samples,
                 split="train",
-                split_ratio=self.split_ratio,
                 mask_dir=self.mask_dir,
                 pre_process=self.pre_process_train,
-                extensions=self.extensions,
                 task=self.task,
-                seed=self.seed,
-                create_validation_set=self.create_validation_set,
             )
 
         if self.create_validation_set:
+            val_samples = self.samples[self.samples.split == "val"]
+            val_samples = val_samples.reset_index(drop=True)
             self.val_data = FolderDataset(
-                normal_dir=self.normal_dir,
-                abnormal_dir=self.abnormal_dir,
-                normal_test_dir=self.normal_test,
+                samples=val_samples,
                 split="val",
-                split_ratio=self.split_ratio,
                 mask_dir=self.mask_dir,
                 pre_process=self.pre_process_val,
-                extensions=self.extensions,
                 task=self.task,
-                seed=self.seed,
-                create_validation_set=self.create_validation_set,
             )
 
+        test_samples = self.samples[self.samples.split == "test"]
+        test_samples = test_samples.reset_index(drop=True)
         self.test_data = FolderDataset(
-            normal_dir=self.normal_dir,
-            abnormal_dir=self.abnormal_dir,
+            samples=test_samples,
             split="test",
-            normal_test_dir=self.normal_test,
-            split_ratio=self.split_ratio,
             mask_dir=self.mask_dir,
             pre_process=self.pre_process_val,
-            extensions=self.extensions,
             task=self.task,
-            seed=self.seed,
-            create_validation_set=self.create_validation_set,
         )
 
         if stage == "predict":

diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py
index 9b45699d64..8aa52af1d4 100644
--- a/anomalib/data/mvtec.py
+++ b/anomalib/data/mvtec.py
@@ -60,7 +60,6 @@
 def make_mvtec_dataset(
     path: Path,
-    split: Optional[str] = None,
     split_ratio: float = 0.1,
     seed: Optional[int] = None,
     create_validation_set: bool = False,
@@ -110,6 +109,13 @@
     Returns:
         DataFrame: an output dataframe containing samples for the requested split (ie., train or test)
     """
+    if seed is None:
+        warnings.warn(
+            "seed is None."
+            " When seed is not set, images from the normal directory are split between training and test dir."
+            " This will lead to inconsistency between runs."
+        )
+
     samples_list = [(str(path),) + filename.parts[-3:] for filename in path.glob("**/*.png")]
     if len(samples_list) == 0:
         raise RuntimeError(f"Found 0 images in {path}")
@@ -147,11 +153,6 @@
     if create_validation_set:
         samples = create_validation_set_from_test_set(samples, seed=seed)
 
-    # Get the data frame for the split.
-    if split is not None and split in ["train", "val", "test"]:
-        samples = samples[samples.split == split]
-        samples = samples.reset_index(drop=True)
-
     return samples
 
 
@@ -160,13 +161,12 @@ class MVTecDataset(VisionDataset):
 
     def __init__(
         self,
+        samples: DataFrame,
         root: Union[Path, str],
         category: str,
         pre_process: PreProcessor,
         split: str,
         task: str = "segmentation",
-        seed: Optional[int] = None,
-        create_validation_set: bool = False,
     ) -> None:
         """Mvtec AD Dataset class.
@@ -211,26 +211,13 @@ def __init__(
         """
         super().__init__(root)
 
-        if seed is None:
-            warnings.warn(
-                "seed is None."
-                " When seed is not set, images from the normal directory are split between training and test dir."
-                " This will lead to inconsistency between runs."
-            )
-
         self.root = Path(root) if isinstance(root, str) else root
         self.category: str = category
         self.split = split
         self.task = task
 
         self.pre_process = pre_process
-
-        self.samples = make_mvtec_dataset(
-            path=self.root / category,
-            split=self.split,
-            seed=seed,
-            create_validation_set=create_validation_set,
-        )
+        self.samples = samples
 
     def __len__(self) -> int:
         """Get length of the dataset."""
@@ -368,6 +355,12 @@ def __init__(
         self.val_data: Dataset
         self.inference_data: Dataset
 
+        self.samples = make_mvtec_dataset(
+            path=self.root / category,
+            seed=seed,
+            create_validation_set=create_validation_set,
+        )
+
     def prepare_data(self) -> None:
         """Download the dataset if not available."""
         if (self.root / self.category).is_dir():
@@ -404,35 +397,38 @@ def setup(self, stage: Optional[str] = None) -> None:
         """
         logger.info("Setting up train, validation, test and prediction datasets.")
         if stage in (None, "fit"):
+            train_samples = self.samples[self.samples.split == "train"]
+            train_samples = train_samples.reset_index(drop=True)
             self.train_data = MVTecDataset(
+                samples=train_samples,
                 root=self.root,
                 category=self.category,
                 pre_process=self.pre_process_train,
                 split="train",
                 task=self.task,
-                seed=self.seed,
-                create_validation_set=self.create_validation_set,
             )
 
         if self.create_validation_set:
+            val_samples = self.samples[self.samples.split == "val"]
+            val_samples = val_samples.reset_index(drop=True)
             self.val_data = MVTecDataset(
+                samples=val_samples,
                 root=self.root,
                 category=self.category,
                 pre_process=self.pre_process_val,
                 split="val",
                 task=self.task,
-                seed=self.seed,
-                create_validation_set=self.create_validation_set,
             )
 
+        test_samples = self.samples[self.samples.split == "test"]
+        test_samples = test_samples.reset_index(drop=True)
         self.test_data = MVTecDataset(
+            samples=test_samples,
            root=self.root,
            category=self.category,
            pre_process=self.pre_process_val,
            split="test",
            task=self.task,
-            seed=self.seed,
-            create_validation_set=self.create_validation_set,
         )
 
         if stage == "predict":

From ec5199ec89765e975fd9b8c62b6b4badb09b9051 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Mon, 12 Sep 2022 14:15:47 +0200
Subject: [PATCH 02/96] move sample generation from init to setup

---
 anomalib/data/folder.py | 28 ++++++++++++++--------------
 anomalib/data/mvtec.py  | 18 +++++++++---------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index ed2357619d..3ed88b6be2 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -440,17 +440,6 @@ def __init__(
         self.val_data: Dataset
         self.inference_data: Dataset
 
-        self.samples = make_dataset(
-            normal_dir=self.normal_dir,
-            abnormal_dir=self.abnormal_dir,
-            normal_test_dir=self.normal_test,
-            mask_dir=mask_dir,
-            split_ratio=split_ratio,
-            seed=seed,
-            create_validation_set=create_validation_set,
-            extensions=extensions,
-        )
-
     def setup(self, stage: Optional[str] = None) -> None:
         """Setup train, validation and test data.
 
@@ -458,9 +447,20 @@ def setup(self, stage: Optional[str] = None) -> None:
             stage: Optional[str]: Train/Val/Test stages. (Default value = None)
 
         """
+        samples = make_dataset(
+            normal_dir=self.normal_dir,
+            abnormal_dir=self.abnormal_dir,
+            normal_test_dir=self.normal_test,
+            mask_dir=self.mask_dir,
+            split_ratio=self.split_ratio,
+            seed=self.seed,
+            create_validation_set=self.create_validation_set,
+            extensions=self.extensions,
+        )
+
         logger.info("Setting up train, validation, test and prediction datasets.")
         if stage in (None, "fit"):
-            train_samples = self.samples[self.samples.split == "train"]
+            train_samples = samples[samples.split == "train"]
             train_samples = train_samples.reset_index(drop=True)
             self.train_data = FolderDataset(
                 samples=train_samples,
@@ -471,7 +471,7 @@ def setup(self, stage: Optional[str] = None) -> None:
             )
 
         if self.create_validation_set:
-            val_samples = self.samples[self.samples.split == "val"]
+            val_samples = samples[samples.split == "val"]
             val_samples = val_samples.reset_index(drop=True)
             self.val_data = FolderDataset(
                 samples=val_samples,
@@ -481,7 +481,7 @@ def setup(self, stage: Optional[str] = None) -> None:
                 task=self.task,
             )
 
-        test_samples = self.samples[self.samples.split == "test"]
+        test_samples = samples[samples.split == "test"]
         test_samples = test_samples.reset_index(drop=True)
         self.test_data = FolderDataset(
             samples=test_samples,

diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py
index 8aa52af1d4..c8b3244d63 100644
--- a/anomalib/data/mvtec.py
+++ b/anomalib/data/mvtec.py
@@ -355,12 +355,6 @@ def __init__(
         self.val_data: Dataset
         self.inference_data: Dataset
 
-        self.samples = make_mvtec_dataset(
-            path=self.root / category,
-            seed=seed,
-            create_validation_set=create_validation_set,
-        )
-
     def prepare_data(self) -> None:
         """Download the dataset if not available."""
         if (self.root / self.category).is_dir():
@@ -395,9 +389,15 @@ def setup(self, stage: Optional[str] = None) -> None:
             stage: Optional[str]: Train/Val/Test stages. (Default value = None)
 
         """
+        samples = make_mvtec_dataset(
+            path=self.root / self.category,
+            seed=self.seed,
+            create_validation_set=self.create_validation_set,
+        )
+
         logger.info("Setting up train, validation, test and prediction datasets.")
         if stage in (None, "fit"):
-            train_samples = self.samples[self.samples.split == "train"]
+            train_samples = samples[samples.split == "train"]
             train_samples = train_samples.reset_index(drop=True)
             self.train_data = MVTecDataset(
                 samples=train_samples,
@@ -409,7 +409,7 @@ def setup(self, stage: Optional[str] = None) -> None:
             )
 
         if self.create_validation_set:
-            val_samples = self.samples[self.samples.split == "val"]
+            val_samples = samples[samples.split == "val"]
             val_samples = val_samples.reset_index(drop=True)
             self.val_data = MVTecDataset(
                 samples=val_samples,
@@ -420,7 +420,7 @@ def setup(self, stage: Optional[str] = None) -> None:
                 task=self.task,
             )
 
-        test_samples = self.samples[self.samples.split == "test"]
+        test_samples = samples[samples.split == "test"]
         test_samples = test_samples.reset_index(drop=True)
         self.test_data = MVTecDataset(
             samples=test_samples,

From 9f0a35ee4d05f24afefe33451198758df42a4d3a Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Tue, 13 Sep 2022 13:11:57 +0200
Subject: [PATCH 03/96] remove inference stage and add base classes

---
 anomalib/data/base.py   | 33 +++++++++++++++++++++++++++++++++
 anomalib/data/folder.py | 28 +++++-----------------------
 anomalib/data/mvtec.py  | 29 ++++------------------------
 3 files changed, 42 insertions(+), 48 deletions(-)
 create mode 100644 anomalib/data/base.py

diff --git a/anomalib/data/base.py b/anomalib/data/base.py
new file mode 100644
index 0000000000..722aedfe37
--- /dev/null
+++ b/anomalib/data/base.py
@@ -0,0 +1,33 @@
+"""Anomalib dataset and datamodule base classes."""
+
+# Copyright (C) 2022 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import ABC
+from typing import Optional
+
+from pandas import DataFrame
+from pytorch_lightning import LightningDataModule
+from torch.utils.data import Dataset
+
+
+class AnomalibDataset(Dataset, ABC):
+    """Base Anomalib dataset."""
+
+    def __init__(self, samples: DataFrame):
+        super().__init__()
+        self.samples = samples
+
+    def contains_anomalous_images(self):
+        """Check if the dataset contains any anomalous images."""
+        return "anomalous" in list(self.samples.label)
+
+
+class AnomalibDataModule(LightningDataModule):
+    """Base Anomalib data module."""
+
+    def __init__(self):
+        super().__init__()
+        self.train_data: Optional[AnomalibDataset] = None
+        self.val_data: Optional[AnomalibDataset] = None
+        self.test_data: Optional[AnomalibDataset] = None

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index 3ed88b6be2..75e40ae981 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -15,14 +15,13 @@
 import cv2
 import numpy as np
 from pandas.core.frame import DataFrame
-from pytorch_lightning.core.datamodule import LightningDataModule
 from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY
 from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
 from torch import Tensor
-from torch.utils.data import DataLoader, Dataset
+from torch.utils.data import DataLoader
 from torchvision.datasets.folder import IMG_EXTENSIONS
 
-from anomalib.data.inference import InferenceDataset
+from anomalib.data.base import AnomalibDataModule, AnomalibDataset
 from anomalib.data.utils import read_image
 from anomalib.data.utils.split import (
     create_validation_set_from_test_set,
@@ -161,11 +157,6 @@ def make_dataset(
     return samples
 
 
-class FolderDataset(Dataset):
+class FolderDataset(AnomalibDataset):
     """Folder Dataset."""
 
     def __init__(
@@ -199,6 +198,7 @@ def __init__(
             provided, `task` should be set to `segmentation`.
 
         """
+        super().__init__(samples)
         self.split = split
 
         if task == "segmentation" and mask_dir is None:
@@ -221,7 +221,6 @@ def __init__(
             self.task = task
 
         self.pre_process = pre_process
-        self.samples = samples
 
     def __len__(self) -> int:
         """Get length of the dataset."""
@@ -271,7 +270,7 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
 
 
 @DATAMODULE_REGISTRY
-class Folder(LightningDataModule):
+class Folder(AnomalibDataModule):
     """Folder Lightning Data Module."""
 
     def __init__(
@@ -434,12 +433,6 @@ def __init__(
         self.create_validation_set = create_validation_set
         self.seed = seed
 
-        self.train_data: Dataset
-        self.test_data: Dataset
-        if create_validation_set:
-            self.val_data: Dataset
-        self.inference_data: Dataset
-
     def setup(self, stage: Optional[str] = None) -> None:
         """Setup train, validation and test data.
 
@@ -491,11 +484,6 @@ def setup(self, stage: Optional[str] = None) -> None:
                 task=self.task,
             )
 
-        if stage == "predict":
-            self.inference_data = InferenceDataset(
-                path=self.root, image_size=self.image_size, transform_config=self.transform_config_val
-            )
-
     def train_dataloader(self) -> TRAIN_DATALOADERS:
         """Get train dataloader."""
         return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers)
@@ -508,9 +496,3 @@ def test_dataloader(self) -> EVAL_DATALOADERS:
         """Get test dataloader."""
         return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers)
-
-    def predict_dataloader(self) -> EVAL_DATALOADERS:
-        """Get predict dataloader."""
-        return DataLoader(
-            self.inference_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers
-        )

diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py
index c8b3244d63..c5df312486 100644
--- a/anomalib/data/mvtec.py
+++ b/anomalib/data/mvtec.py
@@ -39,15 +39,12 @@
 import numpy as np
 import pandas as pd
 from pandas.core.frame import DataFrame
-from pytorch_lightning.core.datamodule import LightningDataModule
 from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY
 from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
 from torch import Tensor
 from torch.utils.data import DataLoader
-from torch.utils.data.dataset import Dataset
-from torchvision.datasets.folder import VisionDataset
 
-from anomalib.data.inference import InferenceDataset
+from anomalib.data.base import AnomalibDataModule, AnomalibDataset
 from anomalib.data.utils import DownloadProgressBar, hash_check, read_image
 from anomalib.data.utils.split import (
     create_validation_set_from_test_set,
@@ -153,7 +150,7 @@ def make_mvtec_dataset(
     return samples
 
 
-class MVTecDataset(VisionDataset):
+class MVTecDataset(AnomalibDataset):
     """MVTec AD PyTorch Dataset."""
 
     def __init__(
@@ -209,7 +206,7 @@ def __init__(
         >>> dataset[0]["image"].shape, dataset[0]["mask"].shape
         (torch.Size([3, 256, 256]), torch.Size([256, 256]))
         """
-        super().__init__(root)
+        super().__init__(samples)
 
         self.root = Path(root) if isinstance(root, str) else root
         self.category: str = category
@@ -267,14 +264,13 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
 
 
 @DATAMODULE_REGISTRY
-class MVTec(LightningDataModule):
+class MVTec(AnomalibDataModule):
     """MVTec AD Lightning Data Module."""
 
     def __init__(
         self,
         root: str,
         category: str,
-        # TODO: Remove default values. IAAALD-211
         image_size: Optional[Union[int, Tuple[int, int]]] = None,
         train_batch_size: int = 32,
         test_batch_size: int = 32,
@@ -349,12 +345,6 @@ def __init__(
         self.task = task
         self.seed = seed
 
-        self.train_data: Dataset
-        self.test_data: Dataset
-        if create_validation_set:
-            self.val_data: Dataset
-        self.inference_data: Dataset
-
     def prepare_data(self) -> None:
         """Download the dataset if not available."""
         if (self.root / self.category).is_dir():
@@ -431,11 +421,6 @@ def setup(self, stage: Optional[str] = None) -> None:
                 task=self.task,
             )
 
-        if stage == "predict":
-            self.inference_data = InferenceDataset(
-                path=self.root, image_size=self.image_size, transform_config=self.transform_config_val
-            )
-
     def train_dataloader(self) -> TRAIN_DATALOADERS:
         """Get train dataloader."""
         return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers)
@@ -448,9 +433,3 @@ def test_dataloader(self) -> EVAL_DATALOADERS:
         """Get test dataloader."""
         return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers)
-
-    def predict_dataloader(self) -> EVAL_DATALOADERS:
-        """Get predict dataloader."""
-        return DataLoader(
-            self.inference_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers
-        )

From dea176fcc186b3b4ee1a64da63b1274b7fe54bae Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Tue, 13 Sep 2022 14:47:47 +0200
Subject: [PATCH 04/96] replace dataset classes with AnomalibDataset

---
 anomalib/data/base.py   |  63 ++++++++++++++++--
 anomalib/data/folder.py | 138 ++++------------------------------
 anomalib/data/mvtec.py  | 131 ++------------------------------
 3 files changed, 76 insertions(+), 256 deletions(-)

diff --git a/anomalib/data/base.py b/anomalib/data/base.py
index 722aedfe37..ad5fd14bf8 100644
--- a/anomalib/data/base.py
+++ b/anomalib/data/base.py
@@ -4,26 +4,79 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from abc import ABC
-from typing import Optional
+from typing import Dict, Optional, Union
 
+import cv2
+import numpy as np
 from pandas import DataFrame
 from pytorch_lightning import LightningDataModule
+from torch import Tensor
 from torch.utils.data import Dataset
 
+from anomalib.data.utils import read_image
+from anomalib.pre_processing import PreProcessor
 
-class AnomalibDataset(Dataset, ABC):
-    """Base Anomalib dataset."""
 
-    def __init__(self, samples: DataFrame):
+class AnomalibDataset(Dataset):
+    """Anomalib dataset."""
+
+    def __init__(self, samples: DataFrame, task: str, split: str, pre_process: PreProcessor):
         super().__init__()
         self.samples = samples
+        self.task = task
+        self.split = split
+        self.pre_process = pre_process
 
     def contains_anomalous_images(self):
         """Check if the dataset contains any anomalous images."""
         return "anomalous" in list(self.samples.label)
 
+    def __len__(self) -> int:
+        """Get length of the dataset."""
+        return len(self.samples)
+
+    def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
+        """Get dataset item for the index ``index``.
+
+        Args:
+            index (int): Index to get the item.
+
+        Returns:
+            Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
+            Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
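+
+        Example (illustrative only; a dataset with at least one sample is assumed):
+            >>> item = dataset[0]
+            >>> sorted(item.keys())  # during training only the image tensor is returned
+            ['image']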
+ """ + image_path = self.samples.image_path[index] + image = read_image(image_path) + + pre_processed = self.pre_process(image=image) + item = {"image": pre_processed["image"]} + + if self.split in ["val", "test"]: + label_index = self.samples.label_index[index] + + item["image_path"] = image_path + item["label"] = label_index + + if self.task == "segmentation": + mask_path = self.samples.mask_path[index] + + # Only Anomalous (1) images has masks in MVTec AD dataset. + # Therefore, create empty mask for Normal (0) images. + if label_index == 0: + mask = np.zeros(shape=image.shape[:2]) + else: + mask = cv2.imread(mask_path, flags=0) / 255.0 + + pre_processed = self.pre_process(image=image, mask=mask) + + item["mask_path"] = mask_path + item["image"] = pre_processed["image"] + item["mask"] = pre_processed["mask"] + + return item + -class AnomalibDataModule(LightningDataModule): +class AnomalibDataModule(LightningDataModule, ABC): """Base Anomalib data module.""" def __init__(self): diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 75e40ae981..e883c0da37 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -9,20 +9,16 @@ import logging import warnings from pathlib import Path -from typing import Dict, Optional, Tuple, Union +from typing import Optional, Tuple, Union import albumentations as A -import cv2 -import numpy as np from pandas.core.frame import DataFrame from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from torch import Tensor from torch.utils.data import DataLoader from torchvision.datasets.folder import IMG_EXTENSIONS from anomalib.data.base import AnomalibDataModule, AnomalibDataset -from anomalib.data.utils import read_image from anomalib.data.utils.split import ( create_validation_set_from_test_set, split_normal_images_in_train_set, @@ -160,115 +156,6 @@ def make_dataset( return samples -class FolderDataset(AnomalibDataset): - """Folder Dataset.""" - - def __init__( - self, - samples: DataFrame, - split: str, - pre_process: PreProcessor, - mask_dir: Optional[Union[Path, str]] = None, - task: Optional[str] = None, - ) -> None: - """Create Folder Dataset. - - Args: - normal_dir (Union[str, Path]): Path to the directory containing normal images. - abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images. - split (Optional[str], optional): Dataset split (ie., either train or test). Defaults to None. - pre_process (Optional[PreProcessor], optional): Image Pro-processor to apply transform. - Defaults to None. - normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing - normal images for the test dataset. Defaults to None. - split_ratio (float, optional): Ratio to split normal training images and add to the - test set in case test set doesn't contain any normal images. - Defaults to 0.2. - mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing - the mask annotations. Defaults to None. - extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the - directory. - task (Optional[str], optional): Task type. (classification or segmentation) Defaults to None. - seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0. - create_validation_set (bool, optional):Boolean to create a validation set from the test set. - Those wanting to create a validation set could set this flag to ``True``. 
-
-        Raises:
-            ValueError: When task is set to classification and `mask_dir` is provided. When `mask_dir` is
-                provided, `task` should be set to `segmentation`.
-
-        """
-        super().__init__(samples)
-        self.split = split
-
-        if task == "segmentation" and mask_dir is None:
-            warnings.warn(
-                "Segmentation task is requested, but mask directory is not provided. "
-                "Classification is to be chosen if mask directory is not provided."
-            )
-            self.task = "classification"
-
-        if task == "classification" and mask_dir:
-            warnings.warn(
-                "Classification task is requested, but mask directory is provided. "
-                "Segmentation task is to be chosen if mask directory is provided."
-            )
-            self.task = "segmentation"
-
-        if task is None or mask_dir is None:
-            self.task = "classification"
-        else:
-            self.task = task
-
-        self.pre_process = pre_process
-
-    def __len__(self) -> int:
-        """Get length of the dataset."""
-        return len(self.samples)
-
-    def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
-        """Get dataset item for the index ``index``.
-
-        Args:
-            index (int): Index to get the item.
-
-        Returns:
-            Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
-            Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
-        """
-        item: Dict[str, Union[str, Tensor]] = {}
-
-        image_path = self.samples.image_path[index]
-        image = read_image(image_path)
-
-        pre_processed = self.pre_process(image=image)
-        item = {"image": pre_processed["image"]}
-
-        if self.split in ["val", "test"]:
-            label_index = self.samples.label_index[index]
-
-            item["image_path"] = image_path
-            item["label"] = label_index
-
-            if self.task == "segmentation":
-                mask_path = self.samples.mask_path[index]
-
-                # Only Anomalous (1) images has masks in MVTec AD dataset.
-                # Therefore, create empty mask for Normal (0) images.
-                if label_index == 0:
-                    mask = np.zeros(shape=image.shape[:2])
-                else:
-                    mask = cv2.imread(mask_path, flags=0) / 255.0
-
-                pre_processed = self.pre_process(image=image, mask=mask)
-
-                item["mask_path"] = mask_path
-                item["image"] = pre_processed["image"]
-                item["mask"] = pre_processed["mask"]
-
-        return item
-
-
 @DATAMODULE_REGISTRY
 class Folder(AnomalibDataModule):
     """Folder Lightning Data Module."""
 
     def __init__(
@@ -409,13 +296,15 @@ def __init__(
         self.extensions = extensions
         self.split_ratio = split_ratio
 
-        if task == "classification" and mask_dir is not None:
-            raise ValueError(
-                "Classification type is set but mask_dir provided. "
-                "If mask_dir is provided task type must be segmentation. "
-                "Check your configuration."
+        if task == "segmentation" and mask_dir is None:
+            warnings.warn(
+                "Segmentation task is requested, but mask directory is not provided. "
+                "Classification is to be chosen if mask directory is not provided."
             )
-        self.task = task
+            self.task = "classification"
+        else:
+            self.task = task
+
         self.transform_config_train = transform_config_train
         self.transform_config_val = transform_config_val
         self.image_size = image_size
@@ -455,10 +344,9 @@ def setup(self, stage: Optional[str] = None) -> None:
         if stage in (None, "fit"):
             train_samples = samples[samples.split == "train"]
             train_samples = train_samples.reset_index(drop=True)
-            self.train_data = FolderDataset(
+            self.train_data = AnomalibDataset(
                 samples=train_samples,
                 split="train",
-                mask_dir=self.mask_dir,
                 pre_process=self.pre_process_train,
                 task=self.task,
             )
 
         if self.create_validation_set:
             val_samples = samples[samples.split == "val"]
             val_samples = val_samples.reset_index(drop=True)
-            self.val_data = FolderDataset(
+            self.val_data = AnomalibDataset(
                 samples=val_samples,
                 split="val",
-                mask_dir=self.mask_dir,
                 pre_process=self.pre_process_val,
                 task=self.task,
             )
 
         test_samples = samples[samples.split == "test"]
         test_samples = test_samples.reset_index(drop=True)
-        self.test_data = FolderDataset(
+        self.test_data = AnomalibDataset(
             samples=test_samples,
             split="test",
-            mask_dir=self.mask_dir,
             pre_process=self.pre_process_val,
             task=self.task,
         )

diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py
index c5df312486..af7d186a59 100644
--- a/anomalib/data/mvtec.py
+++ b/anomalib/data/mvtec.py
@@ -31,21 +31,18 @@
 import tarfile
 import warnings
 from pathlib import Path
-from typing import Dict, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 from urllib.request import urlretrieve
 
 import albumentations as A
-import cv2
-import numpy as np
 import pandas as pd
 from pandas.core.frame import DataFrame
 from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY
 from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
-from torch import Tensor
 from torch.utils.data import DataLoader
 
 from anomalib.data.base import AnomalibDataModule, AnomalibDataset
-from anomalib.data.utils import DownloadProgressBar, hash_check, read_image
+from anomalib.data.utils import DownloadProgressBar, hash_check
 from anomalib.data.utils.split import (
     create_validation_set_from_test_set,
     split_normal_images_in_train_set,
@@ -153,116 +150,6 @@ def make_mvtec_dataset(
 
-class MVTecDataset(AnomalibDataset):
-    """MVTec AD PyTorch Dataset."""
-
-    def __init__(
-        self,
-        samples: DataFrame,
-        root: Union[Path, str],
-        category: str,
-        pre_process: PreProcessor,
-        split: str,
-        task: str = "segmentation",
-    ) -> None:
-        """Mvtec AD Dataset class.
-
-        Args:
-            root: Path to the MVTec AD dataset
-            category: Name of the MVTec AD category.
-            pre_process: List of pre_processing object containing albumentation compose.
-            split: 'train', 'val' or 'test'
-            task: ``classification`` or ``segmentation``
-            seed: seed used for the random subset splitting
-            create_validation_set: Create a validation subset in addition to the train and test subsets
-
-        Examples:
-            >>> from anomalib.data.mvtec import MVTecDataset
-            >>> from anomalib.data.transforms import PreProcessor
-            >>> pre_process = PreProcessor(image_size=256)
-            >>> dataset = MVTecDataset(
-            ...     root='./datasets/MVTec',
-            ...     category='leather',
-            ...     pre_process=pre_process,
-            ...     task="classification",
-            ...     is_train=True,
-            ... )
-            >>> dataset[0].keys()
-            dict_keys(['image'])
-
-            >>> dataset.split = "test"
-            >>> dataset[0].keys()
-            dict_keys(['image', 'image_path', 'label'])
-
-            >>> dataset.task = "segmentation"
-            >>> dataset.split = "train"
-            >>> dataset[0].keys()
-            dict_keys(['image'])
-
-            >>> dataset.split = "test"
-            >>> dataset[0].keys()
-            dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask'])
-
-            >>> dataset[0]["image"].shape, dataset[0]["mask"].shape
-            (torch.Size([3, 256, 256]), torch.Size([256, 256]))
-        """
-        super().__init__(samples)
-
-        self.root = Path(root) if isinstance(root, str) else root
-        self.category: str = category
-        self.split = split
-        self.task = task
-
-        self.pre_process = pre_process
-        self.samples = samples
-
-    def __len__(self) -> int:
-        """Get length of the dataset."""
-        return len(self.samples)
-
-    def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
-        """Get dataset item for the index ``index``.
-
-        Args:
-            index (int): Index to get the item.
-
-        Returns:
-            Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
-            Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
-        """
-        item: Dict[str, Union[str, Tensor]] = {}
-
-        image_path = self.samples.image_path[index]
-        image = read_image(image_path)
-
-        pre_processed = self.pre_process(image=image)
-        item = {"image": pre_processed["image"]}
-
-        if self.split in ["val", "test"]:
-            label_index = self.samples.label_index[index]
-
-            item["image_path"] = image_path
-            item["label"] = label_index
-
-            if self.task == "segmentation":
-                mask_path = self.samples.mask_path[index]
-
-                # Only Anomalous (1) images has masks in MVTec AD dataset.
-                # Therefore, create empty mask for Normal (0) images.
-                if label_index == 0:
-                    mask = np.zeros(shape=image.shape[:2])
-                else:
-                    mask = cv2.imread(mask_path, flags=0) / 255.0
-
-                pre_processed = self.pre_process(image=image, mask=mask)
-
-                item["mask_path"] = mask_path
-                item["image"] = pre_processed["image"]
-                item["mask"] = pre_processed["mask"]
-
-        return item
-
-
 @DATAMODULE_REGISTRY
 class MVTec(AnomalibDataModule):
     """MVTec AD Lightning Data Module."""
 
@@ -370,7 +257,7 @@ def prepare_data(self) -> None:
             tar_file.extractall(self.root)
 
         logger.info("Cleaning the tar file")
-        (zip_filename).unlink()
+        zip_filename.unlink()
 
     def setup(self, stage: Optional[str] = None) -> None:
         """Setup train, validation and test data.
@@ -397,32 +265,28 @@ def setup(self, stage: Optional[str] = None) -> None:
         logger.info("Setting up train, validation, test and prediction datasets.")
         if stage in (None, "fit"):
             train_samples = samples[samples.split == "train"]
             train_samples = train_samples.reset_index(drop=True)
-            self.train_data = MVTecDataset(
+            self.train_data = AnomalibDataset(
                 samples=train_samples,
-                root=self.root,
-                category=self.category,
                 pre_process=self.pre_process_train,
                 split="train",
                 task=self.task,
             )
 
         if self.create_validation_set:
             val_samples = samples[samples.split == "val"]
             val_samples = val_samples.reset_index(drop=True)
-            self.val_data = MVTecDataset(
+            self.val_data = AnomalibDataset(
                 samples=val_samples,
-                root=self.root,
-                category=self.category,
                 pre_process=self.pre_process_val,
                 split="val",
                 task=self.task,
             )
 
         test_samples = samples[samples.split == "test"]
         test_samples = test_samples.reset_index(drop=True)
-        self.test_data = MVTecDataset(
+        self.test_data = AnomalibDataset(
             samples=test_samples,
-            root=self.root,
-            category=self.category,
             pre_process=self.pre_process_val,
             split="test",
             task=self.task,
         )

From 62a04f868854cabe306c8f87c96eb7e0aca99744 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Tue, 13 Sep 2022 18:18:27 +0200
Subject: [PATCH 05/96] move setup to base class, create samples as class method

---
 anomalib/data/base.py   |  68 ++++++++++++-
 anomalib/data/folder.py | 215 ++++++++++++++--------------------------
 anomalib/data/mvtec.py  | 206 ++++++++++++--------------------------
 3 files changed, 203 insertions(+), 286 deletions(-)

diff --git a/anomalib/data/base.py b/anomalib/data/base.py
index ad5fd14bf8..20bf9f52a9 100644
--- a/anomalib/data/base.py
+++ b/anomalib/data/base.py
@@ -3,9 +3,11 @@
 # Copyright (C) 2022 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-from abc import ABC
-from typing import Dict, Optional, Union
+import logging
+from abc import ABC, abstractmethod
+from typing import Dict, Optional, Tuple, Union
 
+import albumentations as A
 import cv2
 import numpy as np
 from pandas import DataFrame
@@ -16,6 +18,8 @@
 from anomalib.data.utils import read_image
 from anomalib.pre_processing import PreProcessor
 
+logger = logging.getLogger(__name__)
+
 
 class AnomalibDataset(Dataset):
     """Anomalib dataset."""
@@ -79,8 +83,66 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
 
 class AnomalibDataModule(LightningDataModule, ABC):
     """Base Anomalib data module."""
 
-    def __init__(self):
+    def __init__(
+        self,
+        task: str,
+        transform_config_train: Optional[Union[str, A.Compose]] = None,
+        transform_config_val: Optional[Union[str, A.Compose]] = None,
+        image_size: Optional[Union[int, Tuple[int, int]]] = None,
+        create_validation_set: bool = False,
+    ):
         super().__init__()
+        self.task = task
+        self.create_validation_set = create_validation_set
+
+        if transform_config_train is not None and transform_config_val is None:
+            transform_config_val = transform_config_train
+        self.pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size)
+        self.pre_process_val = PreProcessor(config=transform_config_val, image_size=image_size)
+
         self.train_data: Optional[AnomalibDataset] = None
         self.val_data: Optional[AnomalibDataset] = None
         self.test_data: Optional[AnomalibDataset] = None
+
+    @abstractmethod
+    def _create_samples(self) -> DataFrame:
+        """To be implemented in subclass."""
+
+    def setup(self, stage: Optional[str] = None) -> None:
+        """Setup train, validation and test data.
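+
+        The train, validation and test subsets are built from the dataframe returned by
+        ``self._create_samples()``, so subclasses only need to implement that method.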
+
+        Args:
+            stage: Optional[str]: Train/Val/Test stages. (Default value = None)
+
+        """
+        samples = self._create_samples()
+
+        logger.info("Setting up train, validation, test and prediction datasets.")
+        if stage in (None, "fit"):
+            train_samples = samples[samples.split == "train"]
+            train_samples = train_samples.reset_index(drop=True)
+            self.train_data = AnomalibDataset(
+                samples=train_samples,
+                split="train",
+                task=self.task,
+                pre_process=self.pre_process_train,
+            )
+
+        if self.create_validation_set:
+            val_samples = samples[samples.split == "val"]
+            val_samples = val_samples.reset_index(drop=True)
+            self.val_data = AnomalibDataset(
+                samples=val_samples,
+                split="val",
+                task=self.task,
+                pre_process=self.pre_process_val,
+            )
+
+        test_samples = samples[samples.split == "test"]
+        test_samples = test_samples.reset_index(drop=True)
+        self.test_data = AnomalibDataset(
+            samples=test_samples,
+            split="test",
+            task=self.task,
+            pre_process=self.pre_process_val,
+        )

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index e883c0da37..12770ce2aa 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -18,12 +18,11 @@
 from torch.utils.data import DataLoader
 from torchvision.datasets.folder import IMG_EXTENSIONS
 
-from anomalib.data.base import AnomalibDataModule, AnomalibDataset
+from anomalib.data.base import AnomalibDataModule
 from anomalib.data.utils.split import (
     create_validation_set_from_test_set,
     split_normal_images_in_train_set,
 )
-from anomalib.pre_processing import PreProcessor
 
 logger = logging.getLogger(__name__)
 
@@ -72,90 +71,6 @@ def _prepare_files_labels(
     return filenames, labels
 
 
-def make_dataset(
-    normal_dir: Union[str, Path],
-    abnormal_dir: Union[str, Path],
-    normal_test_dir: Optional[Union[str, Path]] = None,
-    mask_dir: Optional[Union[str, Path]] = None,
-    split_ratio: float = 0.2,
-    seed: Optional[int] = None,
-    create_validation_set: bool = True,
-    extensions: Optional[Tuple[str, ...]] = None,
-):
-    """Make Folder Dataset.
-
-    Args:
-        normal_dir (Union[str, Path]): Path to the directory containing normal images.
-        abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images.
-        normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing
-            normal images for the test dataset. Normal test images will be a split of `normal_dir`
-            if `None`. Defaults to None.
-        mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing
-            the mask annotations. Defaults to None.
-        split (Optional[str], optional): Dataset split (ie., either train or test). Defaults to None.
-        split_ratio (float, optional): Ratio to split normal training images and add to the
-            test set in case test set doesn't contain any normal images.
-            Defaults to 0.2.
-        seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0.
-        create_validation_set (bool, optional):Boolean to create a validation set from the test set.
-            Those wanting to create a validation set could set this flag to ``True``.
-        extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the
- - Returns: - DataFrame: an output dataframe containing samples for the requested split (ie., train or test) - """ - - filenames = [] - labels = [] - dirs = {"normal": normal_dir, "abnormal": abnormal_dir} - - if normal_test_dir: - dirs = {**dirs, **{"normal_test": normal_test_dir}} - - for dir_type, path in dirs.items(): - if path is not None: - filename, label = _prepare_files_labels(path, dir_type, extensions) - filenames += filename - labels += label - - samples = DataFrame({"image_path": filenames, "label": labels}) - - # Create label index for normal (0) and abnormal (1) images. - samples.loc[(samples.label == "normal") | (samples.label == "normal_test"), "label_index"] = 0 - samples.loc[(samples.label == "abnormal"), "label_index"] = 1 - samples.label_index = samples.label_index.astype(int) - - # If a path to mask is provided, add it to the sample dataframe. - if mask_dir is not None: - mask_dir = _check_and_convert_path(mask_dir) - samples["mask_path"] = "" - for index, row in samples.iterrows(): - if row.label_index == 1: - samples.loc[index, "mask_path"] = str(mask_dir / row.image_path.name) - - # Ensure the pathlib objects are converted to str. - # This is because torch dataloader doesn't like pathlib. - samples = samples.astype({"image_path": "str"}) - - # Create train/test split. - # By default, all the normal samples are assigned as train. - # and all the abnormal samples are test. - samples.loc[(samples.label == "normal"), "split"] = "train" - samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test" - - if not normal_test_dir: - samples = split_normal_images_in_train_set( - samples=samples, split_ratio=split_ratio, seed=seed, normal_label="normal" - ) - - # If `create_validation_set` is set to True, the test set is split into half. - if create_validation_set: - samples = create_validation_set_from_test_set(samples, seed=seed, normal_label="normal") - - return samples - - @DATAMODULE_REGISTRY class Folder(AnomalibDataModule): """Folder Lightning Data Module.""" @@ -277,7 +192,13 @@ def __init__( torch.Size([12, 3, 256, 256]) torch.Size([12, 256, 256]) """ - super().__init__() + super().__init__( + task=task, + transform_config_train=transform_config_train, + transform_config_val=transform_config_val, + image_size=image_size, + create_validation_set=create_validation_set, + ) if seed is None and normal_test_dir is None: raise ValueError( @@ -286,16 +207,6 @@ def __init__( " This will lead to inconsistency between runs." ) - self.root = _check_and_convert_path(root) - self.normal_dir = self.root / normal_dir - self.abnormal_dir = self.root / abnormal_dir if abnormal_dir is not None else None - self.normal_test = normal_test_dir - if normal_test_dir: - self.normal_test = self.root / normal_test_dir - self.mask_dir = mask_dir - self.extensions = extensions - self.split_ratio = split_ratio - if task == "segmentation" and mask_dir is None: warnings.warn( "Segmentation task is requested, but mask directory is not provided. 
" @@ -305,16 +216,20 @@ def __init__( else: self.task = task + self.root = _check_and_convert_path(root) + self.normal_dir = self.root / normal_dir + self.abnormal_dir = self.root / abnormal_dir if abnormal_dir is not None else None + self.normal_test_dir = normal_test_dir + if normal_test_dir: + self.normal_test_dir = self.root / normal_test_dir + self.mask_dir = mask_dir + self.extensions = extensions + self.split_ratio = split_ratio + self.transform_config_train = transform_config_train self.transform_config_val = transform_config_val self.image_size = image_size - if self.transform_config_train is not None and self.transform_config_val is None: - self.transform_config_val = self.transform_config_train - - self.pre_process_train = PreProcessor(config=self.transform_config_train, image_size=self.image_size) - self.pre_process_val = PreProcessor(config=self.transform_config_val, image_size=self.image_size) - self.train_batch_size = train_batch_size self.test_batch_size = test_batch_size self.num_workers = num_workers @@ -322,53 +237,69 @@ def __init__( self.create_validation_set = create_validation_set self.seed = seed - def setup(self, stage: Optional[str] = None) -> None: - """Setup train, validation and test data. + def _create_samples(self): + """Create the dataframe with samples for the Folder dataset. - Args: - stage: Optional[str]: Train/Val/Test stages. (Default value = None) + This function creates a dataframe to store the parsed information based on the following format: + |---|-------------------|--------|-------------|------------------|-------| + | | image_path | label | label_index | mask_path | split | + |---|-------------------|--------|-------------|------------------|-------| + | 0 | path/to/image.png | normal | 0 | path/to/mask.png | train | + |---|-------------------|--------|-------------|------------------|-------| + + Returns: + DataFrame: an output dataframe containing the samples of the dataset. """ - samples = make_dataset( - normal_dir=self.normal_dir, - abnormal_dir=self.abnormal_dir, - normal_test_dir=self.normal_test, - mask_dir=self.mask_dir, - split_ratio=self.split_ratio, - seed=self.seed, - create_validation_set=self.create_validation_set, - extensions=self.extensions, - ) - logger.info("Setting up train, validation, test and prediction datasets.") - if stage in (None, "fit"): - train_samples = samples[samples.split == "train"] - train_samples = train_samples.reset_index(drop=True) - self.train_data = AnomalibDataset( - samples=train_samples, - split="train", - pre_process=self.pre_process_train, - task=self.task, + filenames = [] + labels = [] + dirs = {"normal": self.normal_dir, "abnormal": self.abnormal_dir} + + if self.normal_test_dir: + dirs = {**dirs, **{"normal_test": self.normal_test_dir}} + + for dir_type, path in dirs.items(): + if path is not None: + filename, label = _prepare_files_labels(path, dir_type, self.extensions) + filenames += filename + labels += label + + samples = DataFrame({"image_path": filenames, "label": labels}) + + # Create label index for normal (0) and abnormal (1) images. + samples.loc[(samples.label == "normal") | (samples.label == "normal_test"), "label_index"] = 0 + samples.loc[(samples.label == "abnormal"), "label_index"] = 1 + samples.label_index = samples.label_index.astype(int) + + # If a path to mask is provided, add it to the sample dataframe. 
+        if self.mask_dir is not None:
+            self.mask_dir = _check_and_convert_path(self.mask_dir)
+            samples["mask_path"] = ""
+            for index, row in samples.iterrows():
+                if row.label_index == 1:
+                    samples.loc[index, "mask_path"] = str(self.mask_dir / row.image_path.name)
+
+        # Ensure the pathlib objects are converted to str.
+        # This is because torch dataloader doesn't like pathlib.
+        samples = samples.astype({"image_path": "str"})
+
+        # Create train/test split.
+        # By default, all the normal samples are assigned as train.
+        # and all the abnormal samples are test.
+        samples.loc[(samples.label == "normal"), "split"] = "train"
+        samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test"
+
+        if not self.normal_test_dir:
+            samples = split_normal_images_in_train_set(
+                samples=samples, split_ratio=self.split_ratio, seed=self.seed, normal_label="normal"
             )
 
+        # If `create_validation_set` is set to True, the test set is split into half.
         if self.create_validation_set:
-            val_samples = samples[samples.split == "val"]
-            val_samples = val_samples.reset_index(drop=True)
-            self.val_data = AnomalibDataset(
-                samples=val_samples,
-                split="val",
-                pre_process=self.pre_process_val,
-                task=self.task,
-            )
+            samples = create_validation_set_from_test_set(samples, seed=self.seed, normal_label="normal")
 
-        test_samples = samples[samples.split == "test"]
-        test_samples = test_samples.reset_index(drop=True)
-        self.test_data = AnomalibDataset(
-            samples=test_samples,
-            split="test",
-            pre_process=self.pre_process_val,
-            task=self.task,
-        )
+        return samples
 
     def train_dataloader(self) -> TRAIN_DATALOADERS:
         """Get train dataloader."""

diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py
index af7d186a59..466420eebd 100644
--- a/anomalib/data/mvtec.py
+++ b/anomalib/data/mvtec.py
@@ -41,115 +41,16 @@
 from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
 from torch.utils.data import DataLoader
 
-from anomalib.data.base import AnomalibDataModule, AnomalibDataset
+from anomalib.data.base import AnomalibDataModule
 from anomalib.data.utils import DownloadProgressBar, hash_check
 from anomalib.data.utils.split import (
     create_validation_set_from_test_set,
     split_normal_images_in_train_set,
 )
-from anomalib.pre_processing import PreProcessor
 
 logger = logging.getLogger(__name__)
 
 
-def make_mvtec_dataset(
-    path: Path,
-    split_ratio: float = 0.1,
-    seed: Optional[int] = None,
-    create_validation_set: bool = False,
-) -> DataFrame:
-    """Create MVTec AD samples by parsing the MVTec AD data file structure.
-
-    The files are expected to follow the structure:
-        path/to/dataset/split/category/image_filename.png
-        path/to/dataset/ground_truth/category/mask_filename.png
-
-    This function creates a dataframe to store the parsed information based on the following format:
-    |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
-    |   | path          | split | label   | image_path    | mask_path                             | label_index |
-    |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
-    | 0 | datasets/name | test  | defect  | filename.png  | ground_truth/defect/filename_mask.png | 1           |
-    |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
-
-    Args:
-        path (Path): Path to dataset
-        split (str, optional): Dataset split (ie., either train or test). Defaults to None.
-        split_ratio (float, optional): Ratio to split normal training images and add to the
-            test set in case test set doesn't contain any normal images.
-            Defaults to 0.1.
-        seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0.
-        create_validation_set (bool, optional): Boolean to create a validation set from the test set.
-            MVTec AD dataset does not contain a validation set. Those wanting to create a validation set
-            could set this flag to ``True``.
-
-    Example:
-        The following example shows how to get training samples from MVTec AD bottle category:
-
-        >>> root = Path('./MVTec')
-        >>> category = 'bottle'
-        >>> path = root / category
-        >>> path
-        PosixPath('MVTec/bottle')
-
-        >>> samples = make_mvtec_dataset(path, split='train', split_ratio=0.1, seed=0)
-        >>> samples.head()
-           path         split label image_path                       mask_path                                    label_index
-        0  MVTec/bottle train good  MVTec/bottle/train/good/105.png MVTec/bottle/ground_truth/good/105_mask.png 0
-        1  MVTec/bottle train good  MVTec/bottle/train/good/017.png MVTec/bottle/ground_truth/good/017_mask.png 0
-        2  MVTec/bottle train good  MVTec/bottle/train/good/137.png MVTec/bottle/ground_truth/good/137_mask.png 0
-        3  MVTec/bottle train good  MVTec/bottle/train/good/152.png MVTec/bottle/ground_truth/good/152_mask.png 0
-        4  MVTec/bottle train good  MVTec/bottle/train/good/109.png MVTec/bottle/ground_truth/good/109_mask.png 0
-
-    Returns:
-        DataFrame: an output dataframe containing samples for the requested split (ie., train or test)
-    """
-    if seed is None:
-        warnings.warn(
-            "seed is None."
-            " When seed is not set, images from the normal directory are split between training and test dir."
-            " This will lead to inconsistency between runs."
-        )
-
-    samples_list = [(str(path),) + filename.parts[-3:] for filename in path.glob("**/*.png")]
-    if len(samples_list) == 0:
-        raise RuntimeError(f"Found 0 images in {path}")
-
-    samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"])
-    samples = samples[samples.split != "ground_truth"]
-
-    # Create mask_path column
-    samples["mask_path"] = (
-        samples.path
-        + "/ground_truth/"
-        + samples.label
-        + "/"
-        + samples.image_path.str.rstrip("png").str.rstrip(".")
-        + "_mask.png"
-    )
-
-    # Modify image_path column by converting to absolute path
-    samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path
-
-    # Split the normal images in training set if test set doesn't
-    # contain any normal images. This is needed because AUC score
-    # cannot be computed based on 1-class
-    if sum((samples.split == "test") & (samples.label == "good")) == 0:
-        samples = split_normal_images_in_train_set(samples, split_ratio, seed)
-
-    # Good images don't have mask
-    samples.loc[(samples.split == "test") & (samples.label == "good"), "mask_path"] = ""
-
-    # Create label index for normal (0) and anomalous (1) images.
-    samples.loc[(samples.label == "good"), "label_index"] = 0
-    samples.loc[(samples.label != "good"), "label_index"] = 1
-    samples.label_index = samples.label_index.astype(int)
-
-    if create_validation_set:
-        samples = create_validation_set_from_test_set(samples, seed=seed)
-
-    return samples
-
-
 @DATAMODULE_REGISTRY
 class MVTec(AnomalibDataModule):
     """MVTec AD Lightning Data Module."""
 
@@ -165,6 +66,7 @@ def __init__(
         task: str = "segmentation",
         transform_config_train: Optional[Union[str, A.Compose]] = None,
         transform_config_val: Optional[Union[str, A.Compose]] = None,
+        split_ratio: float = 0.2,
         seed: Optional[int] = None,
         create_validation_set: bool = False,
     ) -> None:
@@ -209,7 +111,13 @@ def __init__(
         >>> data["image"].shape, data["mask"].shape
         (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256]))
         """
-        super().__init__()
+        super().__init__(
+            task=task,
+            transform_config_train=transform_config_train,
+            transform_config_val=transform_config_val,
+            image_size=image_size,
+            create_validation_set=create_validation_set,
+        )
 
         self.root = root if isinstance(root, Path) else Path(root)
         self.category = category
@@ -218,12 +126,6 @@ def __init__(
         self.transform_config_val = transform_config_val
         self.image_size = image_size
 
-        if self.transform_config_train is not None and self.transform_config_val is None:
-            self.transform_config_val = self.transform_config_train
-
-        self.pre_process_train = PreProcessor(config=self.transform_config_train, image_size=self.image_size)
-        self.pre_process_val = PreProcessor(config=self.transform_config_val, image_size=self.image_size)
-
         self.train_batch_size = train_batch_size
         self.test_batch_size = test_batch_size
         self.num_workers = num_workers
@@ -231,6 +133,7 @@ def __init__(
         self.create_validation_set = create_validation_set
         self.task = task
         self.seed = seed
+        self.split_ratio = split_ratio
 
     def prepare_data(self) -> None:
         """Download the dataset if not available."""
@@ -259,48 +162,69 @@ def prepare_data(self) -> None:
         logger.info("Cleaning the tar file")
         zip_filename.unlink()
 
-    def setup(self, stage: Optional[str] = None) -> None:
-        """Setup train, validation and test data.
+    def _create_samples(self) -> DataFrame:
+        """Create MVTec AD samples by parsing the MVTec AD data file structure.
 
-        Args:
-            stage: Optional[str]: Train/Val/Test stages. (Default value = None)
+        The files are expected to follow the structure:
+            path/to/dataset/split/category/image_filename.png
+            path/to/dataset/ground_truth/category/mask_filename.png
 
+        This function creates a dataframe to store the parsed information based on the following format:
+        |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
+        |   | path          | split | label   | image_path    | mask_path                             | label_index |
+        |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
+        | 0 | datasets/name | test  | defect  | filename.png  | ground_truth/defect/filename_mask.png | 1           |
+        |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
+
+        Returns:
+            DataFrame: an output dataframe containing the samples of the dataset.
         """
+        if self.seed is None:
+            warnings.warn(
+                "seed is None."
+                " When seed is not set, images from the normal directory are split between training and test dir."
+                " This will lead to inconsistency between runs."
+ ) + + path = self.root / self.category + samples_list = [(str(path),) + filename.parts[-3:] for filename in path.glob("**/*.png")] + if len(samples_list) == 0: + raise RuntimeError(f"Found 0 images in {path}") + + samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"]) + samples = samples[samples.split != "ground_truth"] + + # Create mask_path column + samples["mask_path"] = ( + samples.path + + "/ground_truth/" + + samples.label + + "/" + + samples.image_path.str.rstrip("png").str.rstrip(".") + + "_mask.png" ) - logger.info("Setting up train, validation, test and prediction datasets.") - if stage in (None, "fit"): - train_samples = samples[samples.split == "train"] - train_samples = train_samples.reset_index(drop=True) - self.train_data = AnomalibDataset( - samples=train_samples, - pre_process=self.pre_process_train, - split="train", - task=self.task, - ) + # Modify image_path column by converting to absolute path + samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path + + # Split the normal images in training set if test set doesn't + # contain any normal images. This is needed because AUC score + # cannot be computed based on 1-class + if sum((samples.split == "test") & (samples.label == "good")) == 0: + samples = split_normal_images_in_train_set(samples, self.split_ratio, self.seed) + + # Good images don't have mask + samples.loc[(samples.split == "test") & (samples.label == "good"), "mask_path"] = "" + + # Create label index for normal (0) and anomalous (1) images. + samples.loc[(samples.label == "good"), "label_index"] = 0 + samples.loc[(samples.label != "good"), "label_index"] = 1 + samples.label_index = samples.label_index.astype(int) if self.create_validation_set: - val_samples = samples[samples.split == "val"] - val_samples = val_samples.reset_index(drop=True) - self.val_data = AnomalibDataset( - samples=val_samples, - pre_process=self.pre_process_val, - split="val", - task=self.task, - ) + samples = create_validation_set_from_test_set(samples, seed=self.seed) - test_samples = samples[samples.split == "test"] - test_samples = test_samples.reset_index(drop=True) - self.test_data = AnomalibDataset( - samples=test_samples, - pre_process=self.pre_process_val, - split="test", - task=self.task, - ) + return samples def train_dataloader(self) -> TRAIN_DATALOADERS: """Get train dataloader.""" From e91afad8e9e0187d0e2fa2a5fed1bb038e1bc7db Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Tue, 13 Sep 2022 19:32:23 +0200 Subject: [PATCH 06/96] update docstrings --- anomalib/data/base.py | 19 ++++++++++++++++++- anomalib/data/folder.py | 6 +++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 20bf9f52a9..f053fa7532 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -106,7 +106,24 @@ def __init__( @abstractmethod def _create_samples(self) -> DataFrame: - """To be implemented in subclass.""" + """This method should be implemented in the subclass. + + This method should return a dataframe that contains the information needed by the dataloader to load each of + the dataset items into memory. The dataframe must at least contain the following columns: + split - The subset to which the dataset item is assigned. + image_path - Path to file system location where the image is stored. + label_index - Index of the anomaly label, typically 0 for "normal" and 1 for "anomalous". 
+ + Additionally, when the task type is segmentation, the dataframe must have the mask_path column, which contains + the path the ground truth masks (for the anomalous images only). + + Example of a dataframe returned by calling this method from a concrete class: + |---|-------------------|-----------|-------------|------------------|-------| + | | image_path | label | label_index | mask_path | split | + |---|-------------------|-----------|-------------|------------------|-------| + | 0 | path/to/image.png | anomalous | 0 | path/to/mask.png | train | + |---|-------------------|-----------|-------------|------------------|-------| + """ def setup(self, stage: Optional[str] = None) -> None: """Setup train, validation and test data. diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 12770ce2aa..aea65a4d64 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -240,6 +240,11 @@ def __init__( def _create_samples(self): """Create the dataframe with samples for the Folder dataset. + The files are expected to follow the structure: + path/to/dataset/normal_folder_name/normal_image_name.png + path/to/dataset/abnormal_folder_name/abnormal_image_name.png + + This function creates a dataframe to store the parsed information based on the following format: |---|-------------------|--------|-------------|------------------|-------| | | image_path | label | label_index | mask_path | split | @@ -249,7 +254,6 @@ def _create_samples(self): Returns: DataFrame: an output dataframe containing the samples of the dataset. - """ filenames = [] From df4a805d7f59814da9a1f0e3466c99c527d1e5e0 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 14 Sep 2022 12:37:11 +0200 Subject: [PATCH 07/96] refactor btech to new format --- anomalib/data/btech.py | 352 ++++++++--------------------------------- 1 file changed, 66 insertions(+), 286 deletions(-) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 8b6bac792b..9f746f0e5c 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -11,260 +11,38 @@ import logging import shutil -import warnings import zipfile from pathlib import Path -from typing import Dict, Optional, Tuple, Union +from typing import Optional, Tuple, Union from urllib.request import urlretrieve import albumentations as A import cv2 -import numpy as np import pandas as pd from pandas.core.frame import DataFrame -from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from torch import Tensor from torch.utils.data import DataLoader -from torch.utils.data.dataset import Dataset -from torchvision.datasets.folder import VisionDataset from tqdm import tqdm -from anomalib.data.inference import InferenceDataset -from anomalib.data.utils import DownloadProgressBar, hash_check, read_image +from anomalib.data.base import AnomalibDataModule +from anomalib.data.utils import DownloadProgressBar, hash_check from anomalib.data.utils.split import ( create_validation_set_from_test_set, split_normal_images_in_train_set, ) -from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) -def make_btech_dataset( - path: Path, - split: Optional[str] = None, - split_ratio: float = 0.1, - seed: Optional[int] = None, - create_validation_set: bool = False, -) -> DataFrame: - """Create BTech samples by parsing the BTech data file structure. 
- - The files are expected to follow the structure: - path/to/dataset/split/category/image_filename.png - path/to/dataset/ground_truth/category/mask_filename.png - - Args: - path (Path): Path to dataset - split (str, optional): Dataset split (ie., either train or test). Defaults to None. - split_ratio (float, optional): Ratio to split normal training images and add to the - test set in case test set doesn't contain any normal images. - Defaults to 0.1. - seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0. - create_validation_set (bool, optional): Boolean to create a validation set from the test set. - BTech dataset does not contain a validation set. Those wanting to create a validation set - could set this flag to ``True``. - - Example: - The following example shows how to get training samples from BTech 01 category: - - >>> root = Path('./BTech') - >>> category = '01' - >>> path = root / category - >>> path - PosixPath('BTech/01') - - >>> samples = make_btech_dataset(path, split='train', split_ratio=0.1, seed=0) - >>> samples.head() - path split label image_path mask_path label_index - 0 BTech/01 train 01 BTech/01/train/ok/105.bmp BTech/01/ground_truth/ok/105.png 0 - 1 BTech/01 train 01 BTech/01/train/ok/017.bmp BTech/01/ground_truth/ok/017.png 0 - ... - - Returns: - DataFrame: an output dataframe containing samples for the requested split (ie., train or test) - """ - samples_list = [ - (str(path),) + filename.parts[-3:] for filename in path.glob("**/*") if filename.suffix in (".bmp", ".png") - ] - if len(samples_list) == 0: - raise RuntimeError(f"Found 0 images in {path}") - - samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"]) - samples = samples[samples.split != "ground_truth"] - - # Create mask_path column - samples["mask_path"] = ( - samples.path - + "/ground_truth/" - + samples.label - + "/" - + samples.image_path.str.rstrip("png").str.rstrip(".") - + ".png" - ) - - # Modify image_path column by converting to absolute path - samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path - - # Split the normal images in training set if test set doesn't - # contain any normal images. This is needed because AUC score - # cannot be computed based on 1-class - if sum((samples.split == "test") & (samples.label == "ok")) == 0: - samples = split_normal_images_in_train_set(samples, split_ratio, seed) - - # Good images don't have mask - samples.loc[(samples.split == "test") & (samples.label == "ok"), "mask_path"] = "" - - # Create label index for normal (0) and anomalous (1) images. - samples.loc[(samples.label == "ok"), "label_index"] = 0 - samples.loc[(samples.label != "ok"), "label_index"] = 1 - samples.label_index = samples.label_index.astype(int) - - if create_validation_set: - samples = create_validation_set_from_test_set(samples, seed=seed) - - # Get the data frame for the split. - if split is not None and split in ["train", "val", "test"]: - samples = samples[samples.split == split] - samples = samples.reset_index(drop=True) - - return samples - - -class BTechDataset(VisionDataset): - """BTech PyTorch Dataset.""" - - def __init__( - self, - root: Union[Path, str], - category: str, - pre_process: PreProcessor, - split: str, - task: str = "segmentation", - seed: Optional[int] = None, - create_validation_set: bool = False, - ) -> None: - """Btech Dataset class. - - Args: - root: Path to the BTech dataset - category: Name of the BTech category. 
- pre_process: List of pre_processing object containing albumentation compose. - split: 'train', 'val' or 'test' - task: ``classification`` or ``segmentation`` - seed: seed used for the random subset splitting - create_validation_set: Create a validation subset in addition to the train and test subsets - - Examples: - >>> from anomalib.data.btech import BTechDataset - >>> from anomalib.data.transforms import PreProcessor - >>> pre_process = PreProcessor(image_size=256) - >>> dataset = BTechDataset( - ... root='./datasets/BTech', - ... category='leather', - ... pre_process=pre_process, - ... task="classification", - ... is_train=True, - ... ) - >>> dataset[0].keys() - dict_keys(['image']) - - >>> dataset.split = "test" - >>> dataset[0].keys() - dict_keys(['image', 'image_path', 'label']) - - >>> dataset.task = "segmentation" - >>> dataset.split = "train" - >>> dataset[0].keys() - dict_keys(['image']) - - >>> dataset.split = "test" - >>> dataset[0].keys() - dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask']) - - >>> dataset[0]["image"].shape, dataset[0]["mask"].shape - (torch.Size([3, 256, 256]), torch.Size([256, 256])) - """ - super().__init__(root) - - if seed is None: - warnings.warn( - "seed is None." - " When seed is not set, images from the normal directory are split between training and test dir." - " This will lead to inconsistency between runs." - ) - - self.root = Path(root) if isinstance(root, str) else root - self.category: str = category - self.split = split - self.task = task - - self.pre_process = pre_process - - self.samples = make_btech_dataset( - path=self.root / category, - split=self.split, - seed=seed, - create_validation_set=create_validation_set, - ) - - def __len__(self) -> int: - """Get length of the dataset.""" - return len(self.samples) - - def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: - """Get dataset item for the index ``index``. - - Args: - index (int): Index to get the item. - - Returns: - Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training. - Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box. - """ - item: Dict[str, Union[str, Tensor]] = {} - - image_path = self.samples.image_path[index] - image = read_image(image_path) - - pre_processed = self.pre_process(image=image) - item = {"image": pre_processed["image"]} - - if self.split in ["val", "test"]: - label_index = self.samples.label_index[index] - - item["image_path"] = image_path - item["label"] = label_index - - if self.task == "segmentation": - mask_path = self.samples.mask_path[index] - - # Only Anomalous (1) images has masks in BTech dataset. - # Therefore, create empty mask for Normal (0) images. - if label_index == 0: - mask = np.zeros(shape=image.shape[:2]) - else: - mask = cv2.imread(mask_path, flags=0) / 255.0 - - pre_processed = self.pre_process(image=image, mask=mask) - - item["mask_path"] = mask_path - item["image"] = pre_processed["image"] - item["mask"] = pre_processed["mask"] - - return item - - @DATAMODULE_REGISTRY -class BTech(LightningDataModule): +class BTech(AnomalibDataModule): """BTechDataModule Lightning Data Module.""" def __init__( self, root: str, category: str, - # TODO: Remove default values. 
IAAALD-211 image_size: Optional[Union[int, Tuple[int, int]]] = None, train_batch_size: int = 32, test_batch_size: int = 32, @@ -272,6 +50,7 @@ def __init__( task: str = "segmentation", transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_val: Optional[Union[str, A.Compose]] = None, + split_ratio: float = 0.2, seed: Optional[int] = None, create_validation_set: bool = False, ) -> None: @@ -316,21 +95,21 @@ def __init__( >>> data["image"].shape, data["mask"].shape (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256])) """ - super().__init__() + super().__init__( + task=task, + transform_config_train=transform_config_train, + transform_config_val=transform_config_val, + image_size=image_size, + create_validation_set=create_validation_set, + ) self.root = root if isinstance(root, Path) else Path(root) self.category = category - self.dataset_path = self.root / self.category + self.path = self.root / self.category self.transform_config_train = transform_config_train self.transform_config_val = transform_config_val self.image_size = image_size - if self.transform_config_train is not None and self.transform_config_val is None: - self.transform_config_val = self.transform_config_train - - self.pre_process_train = PreProcessor(config=self.transform_config_train, image_size=self.image_size) - self.pre_process_val = PreProcessor(config=self.transform_config_val, image_size=self.image_size) - self.train_batch_size = train_batch_size self.test_batch_size = test_batch_size self.num_workers = num_workers @@ -338,12 +117,7 @@ def __init__( self.create_validation_set = create_validation_set self.task = task self.seed = seed - - self.train_data: Dataset - self.test_data: Dataset - if create_validation_set: - self.val_data: Dataset - self.inference_data: Dataset + self.split_ratio = split_ratio def prepare_data(self) -> None: """Download the dataset if not available.""" @@ -386,53 +160,65 @@ def prepare_data(self) -> None: logger.info("Cleaning the tar file") zip_filename.unlink() - def setup(self, stage: Optional[str] = None) -> None: - """Setup train, validation and test data. + def _create_samples(self) -> DataFrame: + """Create BTech samples by parsing the BTech data file structure. - BTech dataset uses BTech dataset structure, which is the reason for - using `anomalib.data.btech.BTech` class to get the dataset items. + The files are expected to follow the structure: + path/to/dataset/category/split/[ok|ko]/image_filename.bmp + path/to/dataset/category/ground_truth/ko/mask_filename.png - Args: - stage: Optional[str]: Train/Val/Test stages. (Default value = None) + This function creates a dataframe to store the parsed information based on the following format: + |---|---------------|-------|---------|---------------|---------------------------------------|-------------| + | | path | split | label | image_path | mask_path | label_index | + |---|---------------|-------|---------|---------------|---------------------------------------|-------------| + | 0 | datasets/name | test | ko | filename.png | ground_truth/ko/filename_mask.png | 1 | + |---|---------------|-------|---------|---------------|---------------------------------------|-------------| + Returns: + DataFrame: an output dataframe containing the samples of the dataset. 
""" - logger.info("Setting up train, validation, test and prediction datasets.") - if stage in (None, "fit"): - self.train_data = BTechDataset( - root=self.root, - category=self.category, - pre_process=self.pre_process_train, - split="train", - task=self.task, - seed=self.seed, - create_validation_set=self.create_validation_set, - ) + samples_list = [ + (str(self.path),) + filename.parts[-3:] + for filename in self.path.glob("**/*") + if filename.suffix in (".bmp", ".png") + ] + if len(samples_list) == 0: + raise RuntimeError(f"Found 0 images in {self.path}") + + samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"]) + samples = samples[samples.split != "ground_truth"] + + # Create mask_path column + samples["mask_path"] = ( + samples.path + + "/ground_truth/" + + samples.label + + "/" + + samples.image_path.str.rstrip("bmp").str.rstrip(".") + + ".png" + ) + + # Modify image_path column by converting to absolute path + samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path + + # Split the normal images in training set if test set doesn't + # contain any normal images. This is needed because AUC score + # cannot be computed based on 1-class + if sum((samples.split == "test") & (samples.label == "ok")) == 0: + samples = split_normal_images_in_train_set(samples, self.split_ratio, self.seed) + + # Good images don't have mask + samples.loc[(samples.split == "test") & (samples.label == "ok"), "mask_path"] = "" + + # Create label index for normal (0) and anomalous (1) images. + samples.loc[(samples.label == "ok"), "label_index"] = 0 + samples.loc[(samples.label != "ok"), "label_index"] = 1 + samples.label_index = samples.label_index.astype(int) if self.create_validation_set: - self.val_data = BTechDataset( - root=self.root, - category=self.category, - pre_process=self.pre_process_val, - split="val", - task=self.task, - seed=self.seed, - create_validation_set=self.create_validation_set, - ) - - self.test_data = BTechDataset( - root=self.root, - category=self.category, - pre_process=self.pre_process_val, - split="test", - task=self.task, - seed=self.seed, - create_validation_set=self.create_validation_set, - ) + samples = create_validation_set_from_test_set(samples, seed=self.seed) - if stage == "predict": - self.inference_data = InferenceDataset( - path=self.root, image_size=self.image_size, transform_config=self.transform_config_val - ) + return samples def train_dataloader(self) -> TRAIN_DATALOADERS: """Get train dataloader.""" @@ -446,9 +232,3 @@ def val_dataloader(self) -> EVAL_DATALOADERS: def test_dataloader(self) -> EVAL_DATALOADERS: """Get test dataloader.""" return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) - - def predict_dataloader(self) -> EVAL_DATALOADERS: - """Get predict dataloader.""" - return DataLoader( - self.inference_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers - ) From c225a835ac7295162d6f3a4c42b7f7158f98a4c2 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 14 Sep 2022 17:14:58 +0200 Subject: [PATCH 08/96] allow training with no anomalous data --- anomalib/data/__init__.py | 7 ++++--- anomalib/utils/metrics/adaptive_threshold.py | 10 ++++++++++ tools/train.py | 7 +++++-- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 8c295a1061..f1691620f5 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ 
-7,7 +7,8 @@ from typing import Union from omegaconf import DictConfig, ListConfig -from pytorch_lightning import LightningDataModule + +from anomalib.data.base import AnomalibDataModule from .btech import BTech from .folder import Folder @@ -17,7 +18,7 @@ logger = logging.getLogger(__name__) -def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule: +def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: """Get Anomaly Datamodule. Args: @@ -28,7 +29,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule """ logger.info("Loading the datamodule") - datamodule: LightningDataModule + datamodule: AnomalibDataModule if config.dataset.format.lower() == "mvtec": datamodule = MVTec( diff --git a/anomalib/utils/metrics/adaptive_threshold.py b/anomalib/utils/metrics/adaptive_threshold.py index fd112433f1..868c6e2ad6 100644 --- a/anomalib/utils/metrics/adaptive_threshold.py +++ b/anomalib/utils/metrics/adaptive_threshold.py @@ -3,6 +3,8 @@ # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import warnings + import torch from torchmetrics import PrecisionRecallCurve @@ -33,6 +35,14 @@ def compute(self) -> torch.Tensor: recall: torch.Tensor thresholds: torch.Tensor + if not any(1 in batch for batch in self.target): + warnings.warn( + "The validation set does not contain any anomalous images. As a result, the adaptive threshold will " + "take the value of the highest anomaly score observed in the normal validation images, which may lead " + "to poor predictions. For a more reliable adaptive threshold computation, please add some anomalous " + "images to the validation set." + ) + precision, recall, thresholds = super().compute() f1_score = (2 * precision * recall) / (precision + recall + 1e-10) if thresholds.dim() == 0: diff --git a/tools/train.py b/tools/train.py index 0e5daa3b10..b1f176a591 100644 --- a/tools/train.py +++ b/tools/train.py @@ -63,8 +63,11 @@ def train(): load_model_callback = LoadModelCallback(weights_path=trainer.checkpoint_callback.best_model_path) trainer.callbacks.insert(0, load_model_callback) - logger.info("Testing the model.") - trainer.test(model=model, datamodule=datamodule) + if datamodule.test_data.contains_anomalous_images(): + logger.info("Testing the model.") + trainer.test(model=model, datamodule=datamodule) + else: + logger.info("No anomalous images found in dataset. Skipping test stage.") if __name__ == "__main__": From ac0dc8a939ec3c01d683218d895d12a3c7cf8d6c Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 15 Sep 2022 10:56:38 +0200 Subject: [PATCH 09/96] remove MVTec name from comment --- anomalib/data/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index f053fa7532..4561f30365 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -64,7 +64,7 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: if self.task == "segmentation": mask_path = self.samples.mask_path[index] - # Only Anomalous (1) images has masks in MVTec AD dataset. + # Only Anomalous (1) images have masks in anomaly datasets # Therefore, create empty mask for Normal (0) images. 
if label_index == 0: mask = np.zeros(shape=image.shape[:2]) From 5d90209cb036e3d6464fece4c1deac021efe598e Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 15 Sep 2022 10:58:07 +0200 Subject: [PATCH 10/96] raise NotImplementedError in base class --- anomalib/data/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 4561f30365..b02b1f7d3c 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -124,6 +124,7 @@ def _create_samples(self) -> DataFrame: | 0 | path/to/image.png | anomalous | 0 | path/to/mask.png | train | |---|-------------------|-----------|-------------|------------------|-------| """ + raise NotImplementedError def setup(self, stage: Optional[str] = None) -> None: """Setup train, validation and test data. From c1e6724f4c3cf516c086bfe2058c188a7d446f07 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 15 Sep 2022 14:12:15 +0200 Subject: [PATCH 11/96] allow both png and bmp images for btech --- anomalib/data/btech.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 9f746f0e5c..270dcc09bf 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -194,7 +194,7 @@ def _create_samples(self) -> DataFrame: + "/ground_truth/" + samples.label + "/" - + samples.image_path.str.rstrip("bmp").str.rstrip(".") + + samples.image_path.str.rstrip("bmp|png").str.rstrip(".") + ".png" ) From 2d70d89dfee713bdd235e095f1b00ab79a24399b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 16 Sep 2022 10:24:43 +0200 Subject: [PATCH 12/96] use label_index to check if dataset contains anomalous images --- anomalib/data/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index b02b1f7d3c..1d853c783c 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -33,7 +33,7 @@ def __init__(self, samples: DataFrame, task: str, split: str, pre_process: PrePr def contains_anomalous_images(self): """Check if the dataset contains any anomalous images.""" - return "anomalous" in list(self.samples.label) + return 1 in list(self.samples.label_index) def __len__(self) -> int: """Get length of the dataset.""" From f5f17db19a3f23c174cfda725b28fd1da7cea355 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 16 Sep 2022 15:46:42 +0200 Subject: [PATCH 13/96] refactor getitem in dataset class --- anomalib/data/base.py | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 1d853c783c..3503ed164e 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -51,31 +51,29 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: """ image_path = self.samples.image_path[index] image = read_image(image_path) + label_index = self.samples.label_index[index] - pre_processed = self.pre_process(image=image) - item = {"image": pre_processed["image"]} + item = dict(image_path=image_path, label=label_index) - if self.split in ["val", "test"]: - label_index = self.samples.label_index[index] + if self.task == "classification": + pre_processed = self.pre_process(image=image) + elif self.task == "segmentation": + mask_path = self.samples.mask_path[index] - item["image_path"] = image_path - item["label"] = label_index + # Only Anomalous (1) images have masks in anomaly datasets + # Therefore, create empty mask for Normal (0) images. 
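+            # A zero-valued mask marks every pixel as normal; anomalous masks are
+            # loaded as grayscale below (flags=0) and rescaled from 8-bit values to [0, 1].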
+ if label_index == 0: + mask = np.zeros(shape=image.shape[:2]) + else: + mask = cv2.imread(mask_path, flags=0) / 255.0 - if self.task == "segmentation": - mask_path = self.samples.mask_path[index] + pre_processed = self.pre_process(image=image, mask=mask) - # Only Anomalous (1) images have masks in anomaly datasets - # Therefore, create empty mask for Normal (0) images. - if label_index == 0: - mask = np.zeros(shape=image.shape[:2]) - else: - mask = cv2.imread(mask_path, flags=0) / 255.0 - - pre_processed = self.pre_process(image=image, mask=mask) - - item["mask_path"] = mask_path - item["image"] = pre_processed["image"] - item["mask"] = pre_processed["mask"] + item["mask_path"] = mask_path + item["mask"] = pre_processed["mask"] + else: + raise ValueError(f"Unknown task type: {self.task}") + item["image"] = pre_processed["image"] return item From f02065f50ba3792fbbf1101d2fad249f8bebd456 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 16 Sep 2022 15:53:13 +0200 Subject: [PATCH 14/96] use iloc for indexing --- anomalib/data/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 3503ed164e..cb56b9359e 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -49,16 +49,16 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training. Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box. """ - image_path = self.samples.image_path[index] + image_path = self.samples.iloc[index].image_path image = read_image(image_path) - label_index = self.samples.label_index[index] + label_index = self.samples.iloc[index].label_index item = dict(image_path=image_path, label=label_index) if self.task == "classification": pre_processed = self.pre_process(image=image) elif self.task == "segmentation": - mask_path = self.samples.mask_path[index] + mask_path = self.samples.iloc[index].mask_path # Only Anomalous (1) images have masks in anomaly datasets # Therefore, create empty mask for Normal (0) images. 
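The switch to iloc in the patch above is a one-line change, but the motivation is easy to miss: plain bracket indexing on a pandas column looks up rows by index label, which only coincides with position when the dataframe carries a clean RangeIndex, whereas iloc is always positional. A minimal sketch of the difference (the dataframe here is illustrative, not part of anomalib):

import pandas as pd

df = pd.DataFrame({"image_path": ["a.png", "b.png", "c.png"]})
subset = df[df.image_path != "b.png"]  # remaining index labels: [0, 2]

# Label-based lookup fails: no row carries the label 1 after filtering.
# subset.image_path[1]  # -> KeyError
# Positional lookup works for any 0 <= index < len(subset):
print(subset.iloc[1].image_path)  # prints "c.png"

Since the datamodule resets the index when it slices the per-split dataframes, the two forms currently coincide; the change reads as defensive hardening for any caller that passes in an unfiltered or re-ordered samples dataframe.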
From 9cba9da174594d7b4388bc9b57130871dbbdf511 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 16 Sep 2022 16:43:30 +0200 Subject: [PATCH 15/96] move dataloader getters to base class --- anomalib/data/base.py | 22 +++++++++++++++++++++- anomalib/data/btech.py | 26 +++----------------------- anomalib/data/folder.py | 26 +++----------------------- anomalib/data/mvtec.py | 33 ++++++--------------------------- 4 files changed, 33 insertions(+), 74 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index cb56b9359e..aa5b5083c0 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -12,8 +12,9 @@ import numpy as np from pandas import DataFrame from pytorch_lightning import LightningDataModule +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch import Tensor -from torch.utils.data import Dataset +from torch.utils.data import DataLoader, Dataset from anomalib.data.utils import read_image from anomalib.pre_processing import PreProcessor @@ -84,6 +85,9 @@ class AnomalibDataModule(LightningDataModule, ABC): def __init__( self, task: str, + train_batch_size: int, + test_batch_size: int, + num_workers: int, transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_val: Optional[Union[str, A.Compose]] = None, image_size: Optional[Union[int, Tuple[int, int]]] = None, @@ -91,6 +95,9 @@ def __init__( ): super().__init__() self.task = task + self.train_batch_size = train_batch_size + self.test_batch_size = test_batch_size + self.num_workers = num_workers self.create_validation_set = create_validation_set if transform_config_train is not None and transform_config_val is None: @@ -162,3 +169,16 @@ def setup(self, stage: Optional[str] = None) -> None: task=self.task, pre_process=self.pre_process_val, ) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + """Get train dataloader.""" + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) + + def val_dataloader(self) -> EVAL_DATALOADERS: + """Get validation dataloader.""" + dataset = self.val_data if self.create_validation_set else self.test_data + return DataLoader(dataset=dataset, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + + def test_dataloader(self) -> EVAL_DATALOADERS: + """Get test dataloader.""" + return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 270dcc09bf..f1125ce0ff 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -21,8 +21,6 @@ import pandas as pd from pandas.core.frame import DataFrame from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from torch.utils.data import DataLoader from tqdm import tqdm from anomalib.data.base import AnomalibDataModule @@ -97,6 +95,9 @@ def __init__( """ super().__init__( task=task, + train_batch_size=train_batch_size, + test_batch_size=test_batch_size, + num_workers=num_workers, transform_config_train=transform_config_train, transform_config_val=transform_config_val, image_size=image_size, @@ -106,16 +107,8 @@ def __init__( self.root = root if isinstance(root, Path) else Path(root) self.category = category self.path = self.root / self.category - self.transform_config_train = transform_config_train - self.transform_config_val = transform_config_val - self.image_size = image_size - - 
self.train_batch_size = train_batch_size - self.test_batch_size = test_batch_size - self.num_workers = num_workers self.create_validation_set = create_validation_set - self.task = task self.seed = seed self.split_ratio = split_ratio @@ -219,16 +212,3 @@ def _create_samples(self) -> DataFrame: samples = create_validation_set_from_test_set(samples, seed=self.seed) return samples - - def train_dataloader(self) -> TRAIN_DATALOADERS: - """Get train dataloader.""" - return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) - - def val_dataloader(self) -> EVAL_DATALOADERS: - """Get validation dataloader.""" - dataset = self.val_data if self.create_validation_set else self.test_data - return DataLoader(dataset=dataset, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) - - def test_dataloader(self) -> EVAL_DATALOADERS: - """Get test dataloader.""" - return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index aea65a4d64..e485863771 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -14,8 +14,6 @@ import albumentations as A from pandas.core.frame import DataFrame from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from torch.utils.data import DataLoader from torchvision.datasets.folder import IMG_EXTENSIONS from anomalib.data.base import AnomalibDataModule @@ -194,6 +192,9 @@ def __init__( """ super().__init__( task=task, + train_batch_size=train_batch_size, + test_batch_size=test_batch_size, + num_workers=num_workers, transform_config_train=transform_config_train, transform_config_val=transform_config_val, image_size=image_size, @@ -226,14 +227,6 @@ def __init__( self.extensions = extensions self.split_ratio = split_ratio - self.transform_config_train = transform_config_train - self.transform_config_val = transform_config_val - self.image_size = image_size - - self.train_batch_size = train_batch_size - self.test_batch_size = test_batch_size - self.num_workers = num_workers - self.create_validation_set = create_validation_set self.seed = seed @@ -304,16 +297,3 @@ def _create_samples(self): samples = create_validation_set_from_test_set(samples, seed=self.seed, normal_label="normal") return samples - - def train_dataloader(self) -> TRAIN_DATALOADERS: - """Get train dataloader.""" - return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) - - def val_dataloader(self) -> EVAL_DATALOADERS: - """Get validation dataloader.""" - dataset = self.val_data if self.create_validation_set else self.test_data - return DataLoader(dataset=dataset, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) - - def test_dataloader(self) -> EVAL_DATALOADERS: - """Get test dataloader.""" - return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 466420eebd..0f59961182 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -38,8 +38,6 @@ import pandas as pd from pandas.core.frame import DataFrame from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from torch.utils.data import DataLoader from anomalib.data.base 
import AnomalibDataModule from anomalib.data.utils import DownloadProgressBar, hash_check @@ -113,6 +111,9 @@ def __init__( """ super().__init__( task=task, + train_batch_size=train_batch_size, + test_batch_size=test_batch_size, + num_workers=num_workers, transform_config_train=transform_config_train, transform_config_val=transform_config_val, image_size=image_size, @@ -121,17 +122,9 @@ def __init__( self.root = root if isinstance(root, Path) else Path(root) self.category = category - self.dataset_path = self.root / self.category - self.transform_config_train = transform_config_train - self.transform_config_val = transform_config_val - self.image_size = image_size - - self.train_batch_size = train_batch_size - self.test_batch_size = test_batch_size - self.num_workers = num_workers + self.path = self.root / self.category self.create_validation_set = create_validation_set - self.task = task self.seed = seed self.split_ratio = split_ratio @@ -186,10 +179,9 @@ def _create_samples(self) -> DataFrame: " This will lead to inconsistency between runs." ) - path = self.root / self.category - samples_list = [(str(path),) + filename.parts[-3:] for filename in path.glob("**/*.png")] + samples_list = [(str(self.path),) + filename.parts[-3:] for filename in self.path.glob("**/*.png")] if len(samples_list) == 0: - raise RuntimeError(f"Found 0 images in {path}") + raise RuntimeError(f"Found 0 images in {self.path}") samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"]) samples = samples[samples.split != "ground_truth"] @@ -225,16 +217,3 @@ def _create_samples(self) -> DataFrame: samples = create_validation_set_from_test_set(samples, seed=self.seed) return samples - - def train_dataloader(self) -> TRAIN_DATALOADERS: - """Get train dataloader.""" - return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) - - def val_dataloader(self) -> EVAL_DATALOADERS: - """Get validation dataloader.""" - dataset = self.val_data if self.create_validation_set else self.test_data - return DataLoader(dataset=dataset, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) - - def test_dataloader(self) -> EVAL_DATALOADERS: - """Get test dataloader.""" - return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) From 5b3e8410f11a0472703429b5b9fce5fe5f20c94a Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 16 Sep 2022 18:31:19 +0200 Subject: [PATCH 16/96] refactor to add validate stage in setup --- anomalib/data/base.py | 69 +++++++++++++++++++++++++++-------------- anomalib/data/btech.py | 16 +++++----- anomalib/data/folder.py | 22 ++++++------- anomalib/data/mvtec.py | 16 +++++----- tools/train.py | 2 +- 5 files changed, 74 insertions(+), 51 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index aa5b5083c0..ed61a83c9c 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -32,10 +32,6 @@ def __init__(self, samples: DataFrame, task: str, split: str, pre_process: PrePr self.split = split self.pre_process = pre_process - def contains_anomalous_images(self): - """Check if the dataset contains any anomalous images.""" - return 1 in list(self.samples.label_index) - def __len__(self) -> int: """Get length of the dataset.""" return len(self.samples) @@ -109,6 +105,8 @@ def __init__( self.val_data: Optional[AnomalibDataset] = None self.test_data: Optional[AnomalibDataset] = None + self._samples: Optional[DataFrame] = None + 
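+    # Implemented by each concrete datamodule; setup() caches the returned
+    # dataframe in self._samples, which get_samples() then filters per split.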
@abstractmethod def _create_samples(self) -> DataFrame: """This method should be implemented in the subclass. @@ -131,44 +129,70 @@ def _create_samples(self) -> DataFrame: """ raise NotImplementedError + def get_samples(self, split: Optional[str] = None) -> DataFrame: + """Retrieve the samples of the full dataset or one of the splits (train, val, test). + + Args: + split: (str): The split for which we want to retrieve the samples ("train", "val" or "test"). When + left empty, all samples will be returned. + + Returns: + DataFrame: A dataframe containing the samples of the split or full dataset. + """ + assert self._samples is not None, "Samples have not been created yet." + if split is None: + return self._samples + samples = self._samples[self._samples.split == split] + return samples.reset_index(drop=True) + def setup(self, stage: Optional[str] = None) -> None: """Setup train, validation and test data. Args: stage: Optional[str]: Train/Val/Test stages. (Default value = None) - """ - samples = self._create_samples() + self._samples = self._create_samples() logger.info("Setting up train, validation, test and prediction datasets.") if stage in (None, "fit"): - train_samples = samples[samples.split == "train"] - train_samples = train_samples.reset_index(drop=True) + samples = self.get_samples("train") self.train_data = AnomalibDataset( - samples=train_samples, + samples=samples, split="train", task=self.task, pre_process=self.pre_process_train, ) - if self.create_validation_set: - val_samples = samples[samples.split == "val"] - val_samples = val_samples.reset_index(drop=True) + if stage in (None, "fit", "validate"): + samples = self.get_samples("val") if self.create_validation_set else self.get_samples("test") self.val_data = AnomalibDataset( - samples=val_samples, + samples=samples, split="val", task=self.task, pre_process=self.pre_process_val, ) - test_samples = samples[samples.split == "test"] - test_samples = test_samples.reset_index(drop=True) - self.test_data = AnomalibDataset( - samples=test_samples, - split="test", - task=self.task, - pre_process=self.pre_process_val, - ) + if stage in (None, "test"): + samples = self.get_samples("test") + self.test_data = AnomalibDataset( + samples=samples, + split="test", + task=self.task, + pre_process=self.pre_process_val, + ) + + def contains_anomalous_images(self, split: Optional[str] = None) -> bool: + """Check if the dataset or the specified subset contains any anomalous images. + + Args: + split (str): the subset of interest ("train", "val" or "test"). When left empty, the full dataset will be + checked. + + Returns: + bool: Boolean indicating if any anomalous images have been assigned to the dataset or subset. 
+ """ + samples = self.get_samples(split) + return 1 in list(samples.label_index) def train_dataloader(self) -> TRAIN_DATALOADERS: """Get train dataloader.""" @@ -176,8 +200,7 @@ def train_dataloader(self) -> TRAIN_DATALOADERS: def val_dataloader(self) -> EVAL_DATALOADERS: """Get validation dataloader.""" - dataset = self.val_data if self.create_validation_set else self.test_data - return DataLoader(dataset=dataset, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + return DataLoader(self.val_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) def test_dataloader(self) -> EVAL_DATALOADERS: """Get test dataloader.""" diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index f1125ce0ff..489841ab94 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -93,6 +93,14 @@ def __init__( >>> data["image"].shape, data["mask"].shape (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256])) """ + self.root = root if isinstance(root, Path) else Path(root) + self.category = category + self.path = self.root / self.category + + self.create_validation_set = create_validation_set + self.seed = seed + self.split_ratio = split_ratio + super().__init__( task=task, train_batch_size=train_batch_size, @@ -104,14 +112,6 @@ def __init__( create_validation_set=create_validation_set, ) - self.root = root if isinstance(root, Path) else Path(root) - self.category = category - self.path = self.root / self.category - - self.create_validation_set = create_validation_set - self.seed = seed - self.split_ratio = split_ratio - def prepare_data(self) -> None: """Download the dataset if not available.""" if (self.root / self.category).is_dir(): diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index e485863771..22f6257308 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -190,17 +190,6 @@ def __init__( torch.Size([12, 3, 256, 256]) torch.Size([12, 256, 256]) """ - super().__init__( - task=task, - train_batch_size=train_batch_size, - test_batch_size=test_batch_size, - num_workers=num_workers, - transform_config_train=transform_config_train, - transform_config_val=transform_config_val, - image_size=image_size, - create_validation_set=create_validation_set, - ) - if seed is None and normal_test_dir is None: raise ValueError( "Both seed and normal_test_dir cannot be None." @@ -230,6 +219,17 @@ def __init__( self.create_validation_set = create_validation_set self.seed = seed + super().__init__( + task=task, + train_batch_size=train_batch_size, + test_batch_size=test_batch_size, + num_workers=num_workers, + transform_config_train=transform_config_train, + transform_config_val=transform_config_val, + image_size=image_size, + create_validation_set=create_validation_set, + ) + def _create_samples(self): """Create the dataframe with samples for the Folder dataset. 
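With the stage-aware setup() above, the datamodule can also be exercised by hand, outside a Lightning Trainer. A rough sketch against the Folder datamodule as it stands at this point in the series (directory names and the seed are placeholders, not anomalib defaults):

from anomalib.data.folder import Folder

datamodule = Folder(
    root="./datasets/my_dataset",
    normal_dir="good",
    abnormal_dir="defect",
    image_size=256,
    seed=42,  # required here because no dedicated normal_test_dir is given
)
datamodule.setup(stage="fit")   # builds the samples dataframe, then the train/val datasets
datamodule.setup(stage="test")  # rebuilds the samples, then the test dataset

print(datamodule.get_samples("train").head())        # train rows only, index reset
print(datamodule.contains_anomalous_images("test"))  # True if any test label_index == 1

Under a Trainer the same hooks fire automatically: trainer.fit() triggers setup("fit"), trainer.test() triggers setup("test"), and trainer.validate() now hits the new "validate" branch introduced by this commit.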
diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 0f59961182..1772baf4f1 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -109,6 +109,14 @@ def __init__( >>> data["image"].shape, data["mask"].shape (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256])) """ + self.root = root if isinstance(root, Path) else Path(root) + self.category = category + self.path = self.root / self.category + + self.create_validation_set = create_validation_set + self.seed = seed + self.split_ratio = split_ratio + super().__init__( task=task, train_batch_size=train_batch_size, @@ -120,14 +128,6 @@ def __init__( create_validation_set=create_validation_set, ) - self.root = root if isinstance(root, Path) else Path(root) - self.category = category - self.path = self.root / self.category - - self.create_validation_set = create_validation_set - self.seed = seed - self.split_ratio = split_ratio - def prepare_data(self) -> None: """Download the dataset if not available.""" if (self.root / self.category).is_dir(): diff --git a/tools/train.py b/tools/train.py index b1f176a591..33952a7e20 100644 --- a/tools/train.py +++ b/tools/train.py @@ -63,7 +63,7 @@ def train(): load_model_callback = LoadModelCallback(weights_path=trainer.checkpoint_callback.best_model_path) trainer.callbacks.insert(0, load_model_callback) - if datamodule.test_data.contains_anomalous_images(): + if datamodule.contains_anomalous_images("test"): logger.info("Testing the model.") trainer.test(model=model, datamodule=datamodule) else: From f652227e377c8389e3bf481d39a571cfd17db6a8 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 21 Sep 2022 14:08:20 +0200 Subject: [PATCH 17/96] implement alternative datamodules solution --- anomalib/data/base.py | 161 +++++++++++++++++++----------------- anomalib/data/folder.py | 175 ++++++++++++++++++++++++---------------- 2 files changed, 193 insertions(+), 143 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index ed61a83c9c..e7ce591d00 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -5,6 +5,7 @@ import logging from abc import ABC, abstractmethod +from enum import Enum from typing import Dict, Optional, Tuple, Union import albumentations as A @@ -22,20 +23,91 @@ logger = logging.getLogger(__name__) +class Subset(str, Enum): + FULL = "full" + TRAIN = "train" + VAL = "val" + TEST = "test" + + class AnomalibDataset(Dataset): """Anomalib dataset.""" - def __init__(self, samples: DataFrame, task: str, split: str, pre_process: PreProcessor): + def __init__( + self, + task: str, + pre_process: PreProcessor, + split: Subset = Subset.FULL, + samples: Optional[DataFrame] = None, + seed: Optional[int] = None, + ): super().__init__() - self.samples = samples self.task = task self.split = split self.pre_process = pre_process + self.seed = seed + if samples is None: + self.samples = self._create_samples() + else: + self.samples = samples + self.samples = self.get_samples(self.split) def __len__(self) -> int: """Get length of the dataset.""" return len(self.samples) + @abstractmethod + def _create_samples(self) -> DataFrame: + """This method should be implemented in the subclass. + + This method should return a dataframe that contains the information needed by the dataloader to load each of + the dataset items into memory. The dataframe must at least contain the following columns: + split - The subset to which the dataset item is assigned. + image_path - Path to file system location where the image is stored. 
+ label_index - Index of the anomaly label, typically 0 for "normal" and 1 for "anomalous". + + Additionally, when the task type is segmentation, the dataframe must have the mask_path column, which contains + the path the ground truth masks (for the anomalous images only). + + Example of a dataframe returned by calling this method from a concrete class: + |---|-------------------|-----------|-------------|------------------|-------| + | | image_path | label | label_index | mask_path | split | + |---|-------------------|-----------|-------------|------------------|-------| + | 0 | path/to/image.png | anomalous | 0 | path/to/mask.png | train | + |---|-------------------|-----------|-------------|------------------|-------| + """ + raise NotImplementedError + + def _get_subset(self, split: Subset): + samples = self.get_samples(split) + return AnomalibDataset( + task=self.task, pre_process=self.pre_process, split=split, samples=samples, seed=self.seed + ) + + def train_subset(self): + return self._get_subset(Subset.TRAIN) + + def val_subset(self): + return self._get_subset(Subset.VAL) + + def test_subset(self): + return self._get_subset(Subset.TEST) + + def get_samples(self, split: Subset): + """Retrieve the samples of the full dataset or one of the splits (train, val, test). + + Args: + split: (str): The split for which we want to retrieve the samples ("train", "val" or "test"). When + left empty, all samples will be returned. + + Returns: + DataFrame: A dataframe containing the samples of the split or full dataset. + """ + if split == Subset.FULL: + return self.samples + samples = self.samples[self.samples.split == split] + return samples.reset_index(drop=True) + def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: """Get dataset item for the index ``index``. @@ -107,92 +179,31 @@ def __init__( self._samples: Optional[DataFrame] = None - @abstractmethod - def _create_samples(self) -> DataFrame: - """This method should be implemented in the subclass. - - This method should return a dataframe that contains the information needed by the dataloader to load each of - the dataset items into memory. The dataframe must at least contain the following columns: - split - The subset to which the dataset item is assigned. - image_path - Path to file system location where the image is stored. - label_index - Index of the anomaly label, typically 0 for "normal" and 1 for "anomalous". - - Additionally, when the task type is segmentation, the dataframe must have the mask_path column, which contains - the path the ground truth masks (for the anomalous images only). + self.data: Optional[AnomalibDataset] = None - Example of a dataframe returned by calling this method from a concrete class: - |---|-------------------|-----------|-------------|------------------|-------| - | | image_path | label | label_index | mask_path | split | - |---|-------------------|-----------|-------------|------------------|-------| - | 0 | path/to/image.png | anomalous | 0 | path/to/mask.png | train | - |---|-------------------|-----------|-------------|------------------|-------| - """ + @abstractmethod + def create_dataset(self) -> AnomalibDataset: raise NotImplementedError - def get_samples(self, split: Optional[str] = None) -> DataFrame: - """Retrieve the samples of the full dataset or one of the splits (train, val, test). - - Args: - split: (str): The split for which we want to retrieve the samples ("train", "val" or "test"). When - left empty, all samples will be returned. 
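+    # Note: Lightning calls prepare_data only once per node, and state assigned
+    # here is not broadcast to the other workers under DDP, so building
+    # self.data in this hook assumes a single-process setting.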
+ def prepare_data(self) -> None: + self.data = self.create_dataset() - Returns: - DataFrame: A dataframe containing the samples of the split or full dataset. - """ - assert self._samples is not None, "Samples have not been created yet." - if split is None: - return self._samples - samples = self._samples[self._samples.split == split] - return samples.reset_index(drop=True) + def contains_anomalous_images(self, split): + samples = self.data.get_samples(split) + return 1 in list(samples.label_index) - def setup(self, stage: Optional[str] = None) -> None: + def setup(self, stage: Optional[str] = None): """Setup train, validation and test data. Args: stage: Optional[str]: Train/Val/Test stages. (Default value = None) """ - self._samples = self._create_samples() - - logger.info("Setting up train, validation, test and prediction datasets.") if stage in (None, "fit"): - samples = self.get_samples("train") - self.train_data = AnomalibDataset( - samples=samples, - split="train", - task=self.task, - pre_process=self.pre_process_train, - ) - + self.train_data = self.data.train_subset() if stage in (None, "fit", "validate"): - samples = self.get_samples("val") if self.create_validation_set else self.get_samples("test") - self.val_data = AnomalibDataset( - samples=samples, - split="val", - task=self.task, - pre_process=self.pre_process_val, - ) - + self.val_data = self.data.val_subset() if self.create_validation_set else self.data.test_subset() if stage in (None, "test"): - samples = self.get_samples("test") - self.test_data = AnomalibDataset( - samples=samples, - split="test", - task=self.task, - pre_process=self.pre_process_val, - ) - - def contains_anomalous_images(self, split: Optional[str] = None) -> bool: - """Check if the dataset or the specified subset contains any anomalous images. - - Args: - split (str): the subset of interest ("train", "val" or "test"). When left empty, the full dataset will be - checked. - - Returns: - bool: Boolean indicating if any anomalous images have been assigned to the dataset or subset. - """ - samples = self.get_samples(split) - return 1 in list(samples.label_index) + self.test_data = self.data.test_subset() def train_dataloader(self) -> TRAIN_DATALOADERS: """Get train dataloader.""" diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 22f6257308..8a95280b9b 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -16,7 +16,7 @@ from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY from torchvision.datasets.folder import IMG_EXTENSIONS -from anomalib.data.base import AnomalibDataModule +from anomalib.data.base import AnomalibDataModule, AnomalibDataset from anomalib.data.utils.split import ( create_validation_set_from_test_set, split_normal_images_in_train_set, @@ -69,6 +69,99 @@ def _prepare_files_labels( return filenames, labels +class FolderDataset(AnomalibDataset): + def __init__( + self, + normal_dir, + abnormal_dir, + normal_test_dir, + mask_dir, + extensions, + split_ratio, + seed, + create_validation_set, + *args, + **kwargs, + ): + self.normal_dir = normal_dir + self.abnormal_dir = abnormal_dir + self.normal_test_dir = normal_test_dir + self.extensions = extensions + self.mask_dir = mask_dir + self.split_ratio = split_ratio + self.seed = seed + self.create_validation_set = create_validation_set + super().__init__(*args, **kwargs) + + def _create_samples(self): + """Create the dataframe with samples for the Folder dataset. 
+ + The files are expected to follow the structure: + path/to/dataset/normal_folder_name/normal_image_name.png + path/to/dataset/abnormal_folder_name/abnormal_image_name.png + + + This function creates a dataframe to store the parsed information based on the following format: + |---|-------------------|--------|-------------|------------------|-------| + | | image_path | label | label_index | mask_path | split | + |---|-------------------|--------|-------------|------------------|-------| + | 0 | path/to/image.png | normal | 0 | path/to/mask.png | train | + |---|-------------------|--------|-------------|------------------|-------| + + Returns: + DataFrame: an output dataframe containing the samples of the dataset. + """ + + filenames = [] + labels = [] + dirs = {"normal": self.normal_dir, "abnormal": self.abnormal_dir} + + if self.normal_test_dir: + dirs = {**dirs, **{"normal_test": self.normal_test_dir}} + + for dir_type, path in dirs.items(): + if path is not None: + filename, label = _prepare_files_labels(path, dir_type, self.extensions) + filenames += filename + labels += label + + samples = DataFrame({"image_path": filenames, "label": labels}) + + # Create label index for normal (0) and abnormal (1) images. + samples.loc[(samples.label == "normal") | (samples.label == "normal_test"), "label_index"] = 0 + samples.loc[(samples.label == "abnormal"), "label_index"] = 1 + samples.label_index = samples.label_index.astype(int) + + # If a path to mask is provided, add it to the sample dataframe. + if self.mask_dir is not None: + self.mask_dir = _check_and_convert_path(self.mask_dir) + samples["mask_path"] = "" + for index, row in samples.iterrows(): + if row.label_index == 1: + samples.loc[index, "mask_path"] = str(self.mask_dir / row.image_path.name) + + # Ensure the pathlib objects are converted to str. + # This is because torch dataloader doesn't like pathlib. + samples = samples.astype({"image_path": "str"}) + + # Create train/test split. + # By default, all the normal samples are assigned as train. + # and all the abnormal samples are test. + samples.loc[(samples.label == "normal"), "split"] = "train" + samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test" + + if not self.normal_test_dir: + samples = split_normal_images_in_train_set( + samples=samples, split_ratio=self.split_ratio, seed=self.seed, normal_label="normal" + ) + + # If `create_validation_set` is set to True, the test set is split into half. + if self.create_validation_set: + samples = create_validation_set_from_test_set(samples, seed=self.seed, normal_label="normal") + + return samples + + @DATAMODULE_REGISTRY class Folder(AnomalibDataModule): """Folder Lightning Data Module.""" @@ -230,70 +323,16 @@ def __init__( create_validation_set=create_validation_set, ) - def _create_samples(self): - """Create the dataframe with samples for the Folder dataset. 
- - The files are expected to follow the structure: - path/to/dataset/normal_folder_name/normal_image_name.png - path/to/dataset/abnormal_folder_name/abnormal_image_name.png - - - This function creates a dataframe to store the parsed information based on the following format: - |---|-------------------|--------|-------------|------------------|-------| - | | image_path | label | label_index | mask_path | split | - |---|-------------------|--------|-------------|------------------|-------| - | 0 | path/to/image.png | normal | 0 | path/to/mask.png | train | - |---|-------------------|--------|-------------|------------------|-------| - - Returns: - DataFrame: an output dataframe containing the samples of the dataset. - """ - - filenames = [] - labels = [] - dirs = {"normal": self.normal_dir, "abnormal": self.abnormal_dir} - - if self.normal_test_dir: - dirs = {**dirs, **{"normal_test": self.normal_test_dir}} - - for dir_type, path in dirs.items(): - if path is not None: - filename, label = _prepare_files_labels(path, dir_type, self.extensions) - filenames += filename - labels += label - - samples = DataFrame({"image_path": filenames, "label": labels}) - - # Create label index for normal (0) and abnormal (1) images. - samples.loc[(samples.label == "normal") | (samples.label == "normal_test"), "label_index"] = 0 - samples.loc[(samples.label == "abnormal"), "label_index"] = 1 - samples.label_index = samples.label_index.astype(int) - - # If a path to mask is provided, add it to the sample dataframe. - if self.mask_dir is not None: - self.mask_dir = _check_and_convert_path(self.mask_dir) - samples["mask_path"] = "" - for index, row in samples.iterrows(): - if row.label_index == 1: - samples.loc[index, "mask_path"] = str(self.mask_dir / row.image_path.name) - - # Ensure the pathlib objects are converted to str. - # This is because torch dataloader doesn't like pathlib. - samples = samples.astype({"image_path": "str"}) - - # Create train/test split. - # By default, all the normal samples are assigned as train. - # and all the abnormal samples are test. - samples.loc[(samples.label == "normal"), "split"] = "train" - samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test" - - if not self.normal_test_dir: - samples = split_normal_images_in_train_set( - samples=samples, split_ratio=self.split_ratio, seed=self.seed, normal_label="normal" - ) - - # If `create_validation_set` is set to True, the test set is split into half. 
- if self.create_validation_set: - samples = create_validation_set_from_test_set(samples, seed=self.seed, normal_label="normal") - - return samples + def create_dataset(self): + return FolderDataset( + normal_dir=self.normal_dir, + abnormal_dir=self.abnormal_dir, + normal_test_dir=self.normal_test_dir, + mask_dir=self.mask_dir, + extensions=self.extensions, + split_ratio=self.split_ratio, + seed=self.seed, + create_validation_set=self.create_validation_set, + task=self.task, + pre_process=self.pre_process_train, + ) From 0e565a42260f55f649ceb8ffe6b61632161b1983 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 21 Sep 2022 17:07:15 +0200 Subject: [PATCH 18/96] small improvements --- anomalib/data/base.py | 30 +++++++++++++++--------------- anomalib/data/folder.py | 21 ++++++++++----------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index e7ce591d00..cb8c3a7f22 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -78,20 +78,19 @@ def _create_samples(self) -> DataFrame: """ raise NotImplementedError - def _get_subset(self, split: Subset): + def _get_subset(self, split: Subset, pre_process: Optional[PreProcessor] = None): samples = self.get_samples(split) - return AnomalibDataset( - task=self.task, pre_process=self.pre_process, split=split, samples=samples, seed=self.seed - ) + pre_process = self.pre_process if pre_process is None else pre_process + return AnomalibDataset(task=self.task, pre_process=pre_process, split=split, samples=samples, seed=self.seed) - def train_subset(self): - return self._get_subset(Subset.TRAIN) + def train_subset(self, pre_process: Optional[PreProcessor] = None): + return self._get_subset(Subset.TRAIN, pre_process=pre_process) - def val_subset(self): - return self._get_subset(Subset.VAL) + def val_subset(self, pre_process: Optional[PreProcessor] = None): + return self._get_subset(Subset.VAL, pre_process=pre_process) - def test_subset(self): - return self._get_subset(Subset.TEST) + def test_subset(self, pre_process: Optional[PreProcessor] = None): + return self._get_subset(Subset.TEST, pre_process=pre_process) def get_samples(self, split: Subset): """Retrieve the samples of the full dataset or one of the splits (train, val, test). @@ -159,14 +158,12 @@ def __init__( transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_val: Optional[Union[str, A.Compose]] = None, image_size: Optional[Union[int, Tuple[int, int]]] = None, - create_validation_set: bool = False, ): super().__init__() self.task = task self.train_batch_size = train_batch_size self.test_batch_size = test_batch_size self.num_workers = num_workers - self.create_validation_set = create_validation_set if transform_config_train is not None and transform_config_val is None: transform_config_val = transform_config_train @@ -199,11 +196,14 @@ def setup(self, stage: Optional[str] = None): stage: Optional[str]: Train/Val/Test stages. 
(Default value = None) """ if stage in (None, "fit"): - self.train_data = self.data.train_subset() + self.train_data = self.data.train_subset(pre_process=self.pre_process_train) if stage in (None, "fit", "validate"): - self.val_data = self.data.val_subset() if self.create_validation_set else self.data.test_subset() + if self.contains_anomalous_images("val"): + self.val_data = self.data.val_subset(pre_process=self.pre_process_val) + else: + self.val_data = self.data.test_subset(pre_process=self.pre_process_val) if stage in (None, "test"): - self.test_data = self.data.test_subset() + self.test_data = self.data.test_subset(pre_process=self.pre_process_val) def train_dataloader(self) -> TRAIN_DATALOADERS: """Get train dataloader.""" diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 8a95280b9b..29ba0dedf1 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -283,6 +283,16 @@ def __init__( torch.Size([12, 3, 256, 256]) torch.Size([12, 256, 256]) """ + super().__init__( + task=task, + train_batch_size=train_batch_size, + test_batch_size=test_batch_size, + num_workers=num_workers, + transform_config_train=transform_config_train, + transform_config_val=transform_config_val, + image_size=image_size, + ) + if seed is None and normal_test_dir is None: raise ValueError( "Both seed and normal_test_dir cannot be None." @@ -312,17 +322,6 @@ def __init__( self.create_validation_set = create_validation_set self.seed = seed - super().__init__( - task=task, - train_batch_size=train_batch_size, - test_batch_size=test_batch_size, - num_workers=num_workers, - transform_config_train=transform_config_train, - transform_config_val=transform_config_val, - image_size=image_size, - create_validation_set=create_validation_set, - ) - def create_dataset(self): return FolderDataset( normal_dir=self.normal_dir, From 297195a032bb182a69560b3a8c2bc0288f7aec5b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 7 Oct 2022 12:20:03 +0200 Subject: [PATCH 19/96] improve design --- anomalib/data/__init__.py | 14 +- anomalib/data/base.py | 177 ++++++------- anomalib/data/folder.py | 409 ++++++++++++------------------ anomalib/data/mvtec.py | 278 +++++++------------- anomalib/data/utils/split.py | 47 +++- anomalib/models/padim/config.yaml | 2 +- tools/train.py | 2 +- 7 files changed, 388 insertions(+), 541 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index f1691620f5..d1da8af375 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -11,9 +11,9 @@ from anomalib.data.base import AnomalibDataModule from .btech import BTech -from .folder import Folder +from .folder import FolderDataModule from .inference import InferenceDataset -from .mvtec import MVTec +from .mvtec import MVTecDataModule logger = logging.getLogger(__name__) @@ -32,7 +32,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: datamodule: AnomalibDataModule if config.dataset.format.lower() == "mvtec": - datamodule = MVTec( + datamodule = MVTecDataModule( # TODO: Remove config values. 
IAAALD-211 root=config.dataset.path, category=config.dataset.category, @@ -40,11 +40,10 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: train_batch_size=config.dataset.train_batch_size, test_batch_size=config.dataset.test_batch_size, num_workers=config.dataset.num_workers, - seed=config.project.seed, task=config.dataset.task, transform_config_train=config.dataset.transform_config.train, transform_config_val=config.dataset.transform_config.val, - create_validation_set=config.dataset.create_validation_set, + val_split_mode=config.dataset.validation_split_mode, ) elif config.dataset.format.lower() == "btech": datamodule = BTech( @@ -62,7 +61,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: create_validation_set=config.dataset.create_validation_set, ) elif config.dataset.format.lower() == "folder": - datamodule = Folder( + datamodule = FolderDataModule( root=config.dataset.path, normal_dir=config.dataset.normal_dir, abnormal_dir=config.dataset.abnormal_dir, @@ -71,14 +70,13 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: mask_dir=config.dataset.mask, extensions=config.dataset.extensions, split_ratio=config.dataset.split_ratio, - seed=config.project.seed, image_size=(config.dataset.image_size[0], config.dataset.image_size[1]), train_batch_size=config.dataset.train_batch_size, test_batch_size=config.dataset.test_batch_size, num_workers=config.dataset.num_workers, transform_config_train=config.dataset.transform_config.train, transform_config_val=config.dataset.transform_config.val, - create_validation_set=config.dataset.create_validation_set, + val_split_mode=config.dataset.validation_split_mode, ) else: raise ValueError( diff --git a/anomalib/data/base.py b/anomalib/data/base.py index cb8c3a7f22..9c5ab99ed5 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -3,14 +3,16 @@ # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import logging from abc import ABC, abstractmethod from enum import Enum -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional, Union -import albumentations as A import cv2 import numpy as np +import pandas as pd from pandas import DataFrame from pytorch_lightning import LightningDataModule from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS @@ -23,89 +25,54 @@ logger = logging.getLogger(__name__) -class Subset(str, Enum): +class Split(str, Enum): FULL = "full" TRAIN = "train" VAL = "val" TEST = "test" +class ValSplitMode(str, Enum): + SAME_AS_TEST = "same_as_test" + FROM_TEST = "from_test" + + class AnomalibDataset(Dataset): """Anomalib dataset.""" - def __init__( - self, - task: str, - pre_process: PreProcessor, - split: Subset = Subset.FULL, - samples: Optional[DataFrame] = None, - seed: Optional[int] = None, - ): + def __init__(self, task: str, pre_process: PreProcessor, samples: Optional[DataFrame] = None): super().__init__() self.task = task - self.split = split self.pre_process = pre_process - self.seed = seed - if samples is None: - self.samples = self._create_samples() - else: - self.samples = samples - self.samples = self.get_samples(self.split) + self._samples = samples def __len__(self) -> int: """Get length of the dataset.""" - return len(self.samples) - - @abstractmethod - def _create_samples(self) -> DataFrame: - """This method should be implemented in the subclass. 
-
-        This method should return a dataframe that contains the information needed by the dataloader to load each of
-        the dataset items into memory. The dataframe must at least contain the following columns:
-            split - The subset to which the dataset item is assigned.
-            image_path - Path to file system location where the image is stored.
-            label_index - Index of the anomaly label, typically 0 for "normal" and 1 for "anomalous".
-
-        Additionally, when the task type is segmentation, the dataframe must have the mask_path column, which contains
-        the path the ground truth masks (for the anomalous images only).
-
-        Example of a dataframe returned by calling this method from a concrete class:
-        |---|-------------------|-----------|-------------|------------------|-------|
-        |   | image_path        | label     | label_index | mask_path        | split |
-        |---|-------------------|-----------|-------------|------------------|-------|
-        | 0 | path/to/image.png | anomalous | 0           | path/to/mask.png | train |
-        |---|-------------------|-----------|-------------|------------------|-------|
-        """
-        raise NotImplementedError
-
-    def _get_subset(self, split: Subset, pre_process: Optional[PreProcessor] = None):
-        samples = self.get_samples(split)
-        pre_process = self.pre_process if pre_process is None else pre_process
-        return AnomalibDataset(task=self.task, pre_process=pre_process, split=split, samples=samples, seed=self.seed)
+        assert isinstance(self._samples, DataFrame)
+        return len(self._samples)

-    def train_subset(self, pre_process: Optional[PreProcessor] = None):
-        return self._get_subset(Subset.TRAIN, pre_process=pre_process)
+    def subsample(self, indices):
+        """Subsample the dataset, keeping only the samples at the given indices."""
+        return AnomalibDataset(task=self.task, pre_process=self.pre_process, samples=self.samples.iloc[indices])

-    def val_subset(self, pre_process: Optional[PreProcessor] = None):
-        return self._get_subset(Subset.VAL, pre_process=pre_process)
+    @property
+    def is_setup(self) -> bool:
+        """Has setup() been called?"""
+        return isinstance(self._samples, DataFrame)

-    def test_subset(self, pre_process: Optional[PreProcessor] = None):
-        return self._get_subset(Subset.TEST, pre_process=pre_process)
+    @property
+    def samples(self) -> DataFrame:
+        """Get the samples dataframe. Raises a RuntimeError if setup() has not been called yet."""
+        if not self.is_setup:
+            raise RuntimeError("Dataset is not setup yet. Call setup() first.")
+        return self._samples

-    def get_samples(self, split: Subset):
-        """Retrieve the samples of the full dataset or one of the splits (train, val, test).
+    @property
+    def has_normal(self) -> bool:
+        """Check if the dataset contains any normal (label_index == 0) images."""
+        return 0 in list(self.samples.label_index)

-        Args:
-            split: (str): The split for which we want to retrieve the samples ("train", "val" or "test"). When
-                left empty, all samples will be returned.
+    @property
+    def has_anomalous(self) -> bool:
+        """Check if the dataset contains any anomalous (label_index == 1) images."""
+        return 1 in list(self.samples.label_index)

-        Returns:
-            DataFrame: A dataframe containing the samples of the split or full dataset.
-        """
-        if split == Subset.FULL:
-            return self.samples
-        samples = self.samples[self.samples.split == split]
-        return samples.reset_index(drop=True)

     def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
         """Get dataset item for the index ``index``.

         Args:
             index (int): Index to get the item.

         Returns:
             Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
                 Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
""" - image_path = self.samples.iloc[index].image_path + assert isinstance(self._samples, DataFrame) + + image_path = self._samples.iloc[index].image_path image = read_image(image_path) - label_index = self.samples.iloc[index].label_index + label_index = self._samples.iloc[index].label_index item = dict(image_path=image_path, label=label_index) if self.task == "classification": pre_processed = self.pre_process(image=image) elif self.task == "segmentation": - mask_path = self.samples.iloc[index].mask_path + mask_path = self._samples.iloc[index].mask_path # Only Anomalous (1) images have masks in anomaly datasets # Therefore, create empty mask for Normal (0) images. @@ -145,6 +114,35 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: return item + def __add__(self, other_dataset: AnomalibDataset): + assert self.is_setup and other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first." + samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True) + return AnomalibDataset(self.task, self.pre_process, samples) + + def setup(self) -> None: + """Load data/metadata into memory""" + if not self.is_setup: + self._setup() + assert self.is_setup, "setup() should set self._samples" + + def _setup(self) -> DataFrame: + """previous _create_samples() + This method should return a dataframe that contains the information needed by the dataloader to load each of + the dataset items into memory. + The dataframe must at least contain the following columns: + split: the subset to which the dataset item is assigned. + image_path: path to file system location where the image is stored. + label_index: index of the anomaly label, typically 0 for "normal" and 1 for "anomalous". + mask_path (if task == "segmentation"): path to the ground truth masks (for the anomalous images only). 
+ Example: + |---|-------------------|-----------|-------------|------------------|-------| + | | image_path | label | label_index | mask_path | split | + |---|-------------------|-----------|-------------|------------------|-------| + | 0 | path/to/image.png | anomalous | 0 | path/to/mask.png | train | + |---|-------------------|-----------|-------------|------------------|-------| + """ + pass + class AnomalibDataModule(LightningDataModule, ABC): """Base Anomalib data module.""" @@ -155,20 +153,14 @@ def __init__( train_batch_size: int, test_batch_size: int, num_workers: int, - transform_config_train: Optional[Union[str, A.Compose]] = None, - transform_config_val: Optional[Union[str, A.Compose]] = None, - image_size: Optional[Union[int, Tuple[int, int]]] = None, + val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, ): super().__init__() self.task = task self.train_batch_size = train_batch_size self.test_batch_size = test_batch_size self.num_workers = num_workers - - if transform_config_train is not None and transform_config_val is None: - transform_config_val = transform_config_train - self.pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) - self.pre_process_val = PreProcessor(config=transform_config_val, image_size=image_size) + self.val_split_mode = val_split_mode self.train_data: Optional[AnomalibDataset] = None self.val_data: Optional[AnomalibDataset] = None @@ -178,32 +170,25 @@ def __init__( self.data: Optional[AnomalibDataset] = None - @abstractmethod - def create_dataset(self) -> AnomalibDataset: - raise NotImplementedError - - def prepare_data(self) -> None: - self.data = self.create_dataset() - - def contains_anomalous_images(self, split): - samples = self.data.get_samples(split) - return 1 in list(samples.label_index) - def setup(self, stage: Optional[str] = None): """Setup train, validation and test data. Args: stage: Optional[str]: Train/Val/Test stages. 
(Default value = None) """ - if stage in (None, "fit"): - self.train_data = self.data.train_subset(pre_process=self.pre_process_train) - if stage in (None, "fit", "validate"): - if self.contains_anomalous_images("val"): - self.val_data = self.data.val_subset(pre_process=self.pre_process_val) - else: - self.val_data = self.data.test_subset(pre_process=self.pre_process_val) - if stage in (None, "test"): - self.test_data = self.data.test_subset(pre_process=self.pre_process_val) + if not self.is_setup: + self._setup(stage) + assert self.is_setup + + @abstractmethod + def _setup(self, _stage: Optional[str] = None) -> None: + pass + + @property + def is_setup(self): + if self.train_data is None or self.val_data is None or self.test_data is None: + return False + return self.train_data.is_setup and self.val_data.is_setup and self.test_data.is_setup def train_dataloader(self) -> TRAIN_DATALOADERS: """Get train dataloader.""" diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 29ba0dedf1..cebba3ea46 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -6,23 +6,15 @@ # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import logging -import warnings from pathlib import Path from typing import Optional, Tuple, Union -import albumentations as A -from pandas.core.frame import DataFrame -from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY +from pandas import DataFrame from torchvision.datasets.folder import IMG_EXTENSIONS -from anomalib.data.base import AnomalibDataModule, AnomalibDataset -from anomalib.data.utils.split import ( - create_validation_set_from_test_set, - split_normal_images_in_train_set, -) - -logger = logging.getLogger(__name__) +from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode +from anomalib.data.utils.split import split_normals_and_anomalous +from anomalib.pre_processing.pre_process import PreProcessor def _check_and_convert_path(path: Union[str, Path]) -> Path: @@ -69,269 +61,194 @@ def _prepare_files_labels( return filenames, labels -class FolderDataset(AnomalibDataset): - def __init__( - self, - normal_dir, - abnormal_dir, - normal_test_dir, - mask_dir, - extensions, - split_ratio, - seed, - create_validation_set, - *args, - **kwargs, - ): - self.normal_dir = normal_dir - self.abnormal_dir = abnormal_dir - self.normal_test_dir = normal_test_dir - self.extensions = extensions - self.mask_dir = mask_dir - self.split_ratio = split_ratio - self.seed = seed - self.create_validation_set = create_validation_set - super().__init__(*args, **kwargs) +def make_folder_dataset( + normal_dir: Union[str, Path], + abnormal_dir: Union[str, Path], + normal_test_dir: Optional[Union[str, Path]] = None, + mask_dir: Optional[Union[str, Path]] = None, + split: Optional[str] = None, + extensions: Optional[Tuple[str, ...]] = None, +): + """Make Folder Dataset. - def _create_samples(self): - """Create the dataframe with samples for the Folder dataset. + Args: + normal_dir (Union[str, Path]): Path to the directory containing normal images. + abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images. + normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing + normal images for the test dataset. Normal test images will be a split of `normal_dir` + if `None`. Defaults to None. + mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing + the mask annotations. Defaults to None. 
+        split (Optional[str], optional): Dataset split (i.e., either train or test). Defaults to None.
+        extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the
+            directory.
+
+    Returns:
+        DataFrame: an output dataframe containing samples for the requested split (i.e., train or test)
+    """
+
+    filenames = []
+    labels = []
+    dirs = {"normal": normal_dir, "abnormal": abnormal_dir}
+
+    if normal_test_dir:
+        dirs = {**dirs, **{"normal_test": normal_test_dir}}
+
+    for dir_type, path in dirs.items():
+        filename, label = _prepare_files_labels(path, dir_type, extensions)
+        filenames += filename
+        labels += label
+
+    samples = DataFrame({"image_path": filenames, "label": labels})
+
+    # Create label index for normal (0) and abnormal (1) images.
+    samples.loc[(samples.label == "normal") | (samples.label == "normal_test"), "label_index"] = 0
+    samples.loc[(samples.label == "abnormal"), "label_index"] = 1
+    samples.label_index = samples.label_index.astype(int)
+
+    # If a path to mask is provided, add it to the sample dataframe.
+    if mask_dir is not None:
+        mask_dir = _check_and_convert_path(mask_dir)
+        samples["mask_path"] = ""
+        for index, row in samples.iterrows():
+            if row.label_index == 1:
+                samples.loc[index, "mask_path"] = str(mask_dir / row.image_path.name)
+
+    # Ensure the pathlib objects are converted to str.
+    # This is because torch dataloader doesn't like pathlib.
+    samples = samples.astype({"image_path": "str"})
+
+    # Create train/test split.
+    # By default, all normal samples are assigned to the train split, and all
+    # abnormal and normal_test samples to the test split.
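+    # For example (illustrative numbers): 100 images in normal_dir and 20 in
+    # abnormal_dir yield a 100-image train split and a 20-image test split at this
+    # point; images from normal_test_dir, if provided, also go to the test split.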
+ samples.loc[(samples.label == "normal"), "split"] = "train" + samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test" - # If a path to mask is provided, add it to the sample dataframe. - if self.mask_dir is not None: - self.mask_dir = _check_and_convert_path(self.mask_dir) - samples["mask_path"] = "" - for index, row in samples.iterrows(): - if row.label_index == 1: - samples.loc[index, "mask_path"] = str(self.mask_dir / row.image_path.name) + # Get the data frame for the split. + if split != Split.FULL: + samples = samples[samples.split == split] + samples = samples.reset_index(drop=True) - # Ensure the pathlib objects are converted to str. - # This is because torch dataloader doesn't like pathlib. - samples = samples.astype({"image_path": "str"}) + return samples - # Create train/test split. - # By default, all the normal samples are assigned as train. - # and all the abnormal samples are test. - samples.loc[(samples.label == "normal"), "split"] = "train" - samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test" - if not self.normal_test_dir: - samples = split_normal_images_in_train_set( - samples=samples, split_ratio=self.split_ratio, seed=self.seed, normal_label="normal" - ) +class Folder(AnomalibDataset): + def __init__( + self, + task: str, + pre_process: PreProcessor, + split: Split, + # + normal_dir: Union[str, Path], + abnormal_dir: Union[str, Path], + normal_test_dir: Optional[Union[str, Path]] = None, + mask_dir: Optional[Union[str, Path]] = None, + val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, + extensions=None, + samples=None, + ) -> None: + super().__init__(task, pre_process, samples=samples) - # If `create_validation_set` is set to True, the test set is split into half. - if self.create_validation_set: - samples = create_validation_set_from_test_set(samples, seed=self.seed, normal_label="normal") + self.split = split - return samples + self.normal_dir = normal_dir + self.abnormal_dir = abnormal_dir + self.normal_test_dir = normal_test_dir + self.mask_dir = mask_dir + self.extensions = extensions + self.val_split_mode = val_split_mode -@DATAMODULE_REGISTRY -class Folder(AnomalibDataModule): - """Folder Lightning Data Module.""" + def _setup(self): + self._samples = make_folder_dataset( + normal_dir=self.normal_dir, + abnormal_dir=self.abnormal_dir, + normal_test_dir=self.normal_test_dir, + mask_dir=self.mask_dir, + split=self.split, + extensions=self.extensions, + ) + +class FolderDataModule(AnomalibDataModule): def __init__( self, - root: Union[str, Path], - normal_dir: str = "normal", - abnormal_dir: str = "abnormal", - task: str = "classification", - normal_test_dir: Optional[Union[Path, str]] = None, - mask_dir: Optional[Union[Path, str]] = None, - extensions: Optional[Tuple[str, ...]] = None, - split_ratio: float = 0.2, - seed: Optional[int] = None, - image_size: Optional[Union[int, Tuple[int, int]]] = None, - train_batch_size: int = 32, - test_batch_size: int = 32, - num_workers: int = 8, - transform_config_train: Optional[Union[str, A.Compose]] = None, - transform_config_val: Optional[Union[str, A.Compose]] = None, - create_validation_set: bool = False, - ) -> None: - """Folder Dataset PL Datamodule. - - Args: - root (Union[str, Path]): Path to the root folder containing normal and abnormal dirs. - normal_dir (str, optional): Name of the directory containing normal images. - Defaults to "normal". - abnormal_dir (str, optional): Name of the directory containing abnormal images. 
- Defaults to "abnormal". - task (str, optional): Task type. Could be either classification or segmentation. - Defaults to "classification". - normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing - normal images for the test dataset. Defaults to None. - mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing - the mask annotations. Defaults to None. - extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the - directory. Defaults to None. - split_ratio (float, optional): Ratio to split normal training images and add to the - test set in case test set doesn't contain any normal images. - Defaults to 0.2. - seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0. - image_size (Optional[Union[int, Tuple[int, int]]], optional): Size of the input image. - Defaults to None. - train_batch_size (int, optional): Training batch size. Defaults to 32. - test_batch_size (int, optional): Test batch size. Defaults to 32. - num_workers (int, optional): Number of workers. Defaults to 8. - transform_config_train (Optional[Union[str, A.Compose]], optional): Config for pre-processing - during training. - Defaults to None. - transform_config_val (Optional[Union[str, A.Compose]], optional): Config for pre-processing - during validation. - Defaults to None. - create_validation_set (bool, optional):Boolean to create a validation set from the test set. - Those wanting to create a validation set could set this flag to ``True``. - - Examples: - Assume that we use Folder Dataset for the MVTec/bottle/broken_large category. We would do: - >>> from anomalib.data import Folder - >>> datamodule = Folder( - ... root="./datasets/MVTec/bottle/test", - ... normal="good", - ... abnormal="broken_large", - ... image_size=256 - ... ) - >>> datamodule.setup() - >>> i, data = next(enumerate(datamodule.train_dataloader())) - >>> data["image"].shape - torch.Size([16, 3, 256, 256]) - - >>> i, test_data = next(enumerate(datamodule.test_dataloader())) - >>> test_data.keys() - dict_keys(['image']) - - We could also create a Folder DataModule for datasets containing mask annotations. - The dataset expects that mask annotation filenames must be same as the original filename. - To this end, we modified mask filenames in MVTec AD bottle category. - Now we could try folder data module using the mvtec bottle broken large category - >>> datamodule = Folder( - ... root="./datasets/bottle/test", - ... normal="good", - ... abnormal="broken_large", - ... mask_dir="./datasets/bottle/ground_truth/broken_large", - ... image_size=256 - ... ) - - >>> i , train_data = next(enumerate(datamodule.train_dataloader())) - >>> train_data.keys() - dict_keys(['image']) - >>> train_data["image"].shape - torch.Size([16, 3, 256, 256]) - - >>> i, test_data = next(enumerate(datamodule.test_dataloader())) - dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask']) - >>> print(test_data["image"].shape, test_data["mask"].shape) - torch.Size([24, 3, 256, 256]) torch.Size([24, 256, 256]) - - By default, Folder Data Module does not create a validation set. If a validation set - is needed it could be set as follows: - - >>> datamodule = Folder( - ... root="./datasets/bottle/test", - ... normal="good", - ... abnormal="broken_large", - ... mask_dir="./datasets/bottle/ground_truth/broken_large", - ... image_size=256, - ... create_validation_set=True, - ... 
) - - >>> i, val_data = next(enumerate(datamodule.val_dataloader())) - >>> val_data.keys() - dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask']) - >>> print(val_data["image"].shape, val_data["mask"].shape) - torch.Size([12, 3, 256, 256]) torch.Size([12, 256, 256]) - - >>> i, test_data = next(enumerate(datamodule.test_dataloader())) - >>> print(test_data["image"].shape, test_data["mask"].shape) - torch.Size([12, 3, 256, 256]) torch.Size([12, 256, 256]) - - """ + root, + task, + train_batch_size, + test_batch_size, + image_size, + num_workers, + val_split_mode, + # + normal_dir, + abnormal_dir, + normal_test_dir, + mask_dir, + split_ratio, + transform_config_train=None, + transform_config_val=None, + extensions=None, + ): super().__init__( task=task, train_batch_size=train_batch_size, test_batch_size=test_batch_size, num_workers=num_workers, - transform_config_train=transform_config_train, - transform_config_val=transform_config_val, - image_size=image_size, + val_split_mode=val_split_mode, ) - if seed is None and normal_test_dir is None: - raise ValueError( - "Both seed and normal_test_dir cannot be None." - " When seed is not set, images from the normal directory are split between training and test dir." - " This will lead to inconsistency between runs." - ) - - if task == "segmentation" and mask_dir is None: - warnings.warn( - "Segmentation task is requested, but mask directory is not provided. " - "Classification is to be chosen if mask directory is not provided." - ) - self.task = "classification" - else: - self.task = task - - self.root = _check_and_convert_path(root) - self.normal_dir = self.root / normal_dir - self.abnormal_dir = self.root / abnormal_dir if abnormal_dir is not None else None - self.normal_test_dir = normal_test_dir - if normal_test_dir: - self.normal_test_dir = self.root / normal_test_dir - self.mask_dir = mask_dir - self.extensions = extensions self.split_ratio = split_ratio - self.create_validation_set = create_validation_set - self.seed = seed + pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) + pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) - def create_dataset(self): - return FolderDataset( - normal_dir=self.normal_dir, - abnormal_dir=self.abnormal_dir, - normal_test_dir=self.normal_test_dir, - mask_dir=self.mask_dir, - extensions=self.extensions, - split_ratio=self.split_ratio, - seed=self.seed, - create_validation_set=self.create_validation_set, - task=self.task, - pre_process=self.pre_process_train, + normal_dir = Path(root) / Path(normal_dir) + abnormal_dir = Path(root) / Path(abnormal_dir) + + self.train_data = Folder( + task=task, + pre_process=pre_process_train, + split=Split.TRAIN, + normal_dir=normal_dir, + abnormal_dir=abnormal_dir, + normal_test_dir=normal_test_dir, + mask_dir=mask_dir, + extensions=extensions, ) + + self.test_data = Folder( + task=task, + pre_process=pre_process_infer, + split=Split.TEST, + normal_dir=normal_dir, + abnormal_dir=abnormal_dir, + normal_test_dir=normal_test_dir, + mask_dir=mask_dir, + extensions=extensions, + ) + + def _setup(self, _stage: Optional[str] = None): + + assert self.train_data is not None + assert self.test_data is not None + + self.train_data.setup() + self.test_data.setup() + + if not self.test_data.has_normal: + self.train_data, normal_test_data = split_normals_and_anomalous(self.train_data, self.split_ratio) + self.test_data += normal_test_data + + if self.val_split_mode == ValSplitMode.FROM_TEST: + self.val_data, 
self.test_data = split_normals_and_anomalous(self.test_data, 0.5) + elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: + self.val_data = self.test_data + else: + raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 1772baf4f1..5c26a87f59 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -1,58 +1,77 @@ -"""MVTec AD Dataset (CC BY-NC-SA 4.0). - -Description: - This script contains PyTorch Dataset, Dataloader and PyTorch - Lightning DataModule for the MVTec AD dataset. - - If the dataset is not on the file system, the script downloads and - extracts the dataset and create PyTorch data objects. - -License: - MVTec AD dataset is released under the Creative Commons - Attribution-NonCommercial-ShareAlike 4.0 International License - (CC BY-NC-SA 4.0)(https://creativecommons.org/licenses/by-nc-sa/4.0/). - -Reference: - - Paul Bergmann, Kilian Batzner, Michael Fauser, David Sattlegger, Carsten Steger: - The MVTec Anomaly Detection Dataset: A Comprehensive Real-World Dataset for - Unsupervised Anomaly Detection; in: International Journal of Computer Vision - 129(4):1038-1059, 2021, DOI: 10.1007/s11263-020-01400-4. - - - Paul Bergmann, Michael Fauser, David Sattlegger, Carsten Steger: MVTec AD — - A Comprehensive Real-World Dataset for Unsupervised Anomaly Detection; - in: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), - 9584-9592, 2019, DOI: 10.1109/CVPR.2019.00982. -""" - -# Copyright (C) 2022 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import logging -import tarfile -import warnings from pathlib import Path from typing import Optional, Tuple, Union -from urllib.request import urlretrieve import albumentations as A -import pandas as pd -from pandas.core.frame import DataFrame -from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY +from pandas import DataFrame -from anomalib.data.base import AnomalibDataModule -from anomalib.data.utils import DownloadProgressBar, hash_check -from anomalib.data.utils.split import ( - create_validation_set_from_test_set, - split_normal_images_in_train_set, -) +from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode +from anomalib.data.utils.split import split_normals_and_anomalous +from anomalib.pre_processing import PreProcessor -logger = logging.getLogger(__name__) +def make_mvtec_dataset(root: Union[str, Path], split: Split = Split.FULL) -> DataFrame: + """Create MVTec AD samples by parsing the MVTec AD data file structure. -@DATAMODULE_REGISTRY -class MVTec(AnomalibDataModule): - """MVTec AD Lightning Data Module.""" + The files are expected to follow the structure: + path/to/dataset/split/category/image_filename.png + path/to/dataset/ground_truth/category/mask_filename.png + This function creates a dataframe to store the parsed information based on the following format: + |---|---------------|-------|---------|---------------|---------------------------------------|-------------| + | | path | split | label | image_path | mask_path | label_index | + |---|---------------|-------|---------|---------------|---------------------------------------|-------------| + | 0 | datasets/name | test | defect | filename.png | ground_truth/defect/filename_mask.png | 1 | + |---|---------------|-------|---------|---------------|---------------------------------------|-------------| + + Returns: + DataFrame: an output dataframe containing the samples of the dataset. 
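+
+    Example:
+        Assuming the MVTec AD dataset has been extracted under ./datasets/MVTec (the
+        path is shown for illustration only), the test samples of a category can be
+        gathered as follows:
+
+        >>> root = Path("./datasets/MVTec/bottle")
+        >>> samples = make_mvtec_dataset(root, split=Split.TEST)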
+ """ + samples_list = [(str(root),) + filename.parts[-3:] for filename in Path(root).glob("**/*.png")] + if len(samples_list) == 0: + raise RuntimeError(f"Found 0 images in {root}") + + samples = DataFrame(samples_list, columns=["path", "split", "label", "image_path"]) + samples = samples[samples.split != "ground_truth"] + + # Create mask_path column + samples["mask_path"] = ( + samples.path + + "/ground_truth/" + + samples.label + + "/" + + samples.image_path.str.rstrip("png").str.rstrip(".") + + "_mask.png" + ) + + # Modify image_path column by converting to absolute path + samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path + + # Good images don't have mask + samples.loc[(samples.split == "test") & (samples.label == "good"), "mask_path"] = "" + + # Create label index for normal (0) and anomalous (1) images. + samples.loc[(samples.label == "good"), "label_index"] = 0 + samples.loc[(samples.label != "good"), "label_index"] = 1 + samples.label_index = samples.label_index.astype(int) + + if split != Split.FULL: + samples = samples[samples.split == split].reset_index(drop=True) + + return samples + + +class MVTec(AnomalibDataset): + def __init__(self, task: str, pre_process: PreProcessor, split: Split, root, category, samples=None) -> None: + super().__init__(task=task, pre_process=pre_process, samples=samples) + + self.root_category = Path(root) / Path(category) + self.split = split + + def _setup(self): + self._samples = make_mvtec_dataset(self.root_category, split=self.split) + + +class MVTecDataModule(AnomalibDataModule): def __init__( self, root: str, @@ -64,156 +83,39 @@ def __init__( task: str = "segmentation", transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_val: Optional[Union[str, A.Compose]] = None, - split_ratio: float = 0.2, - seed: Optional[int] = None, - create_validation_set: bool = False, - ) -> None: - """Mvtec AD Lightning Data Module. - - Args: - root: Path to the MVTec AD dataset - category: Name of the MVTec AD category. - image_size: Variable to which image is resized. - train_batch_size: Training batch size. - test_batch_size: Testing batch size. - num_workers: Number of workers. - task: ``classification`` or ``segmentation`` - transform_config_train: Config for pre-processing during training. - transform_config_val: Config for pre-processing during validation. - seed: seed used for the random subset splitting - create_validation_set: Create a validation subset in addition to the train and test subsets - - Examples - >>> from anomalib.data import MVTec - >>> datamodule = MVTec( - ... root="./datasets/MVTec", - ... category="leather", - ... image_size=256, - ... train_batch_size=32, - ... test_batch_size=32, - ... num_workers=8, - ... transform_config_train=None, - ... transform_config_val=None, - ... 
) - >>> datamodule.setup() - - >>> i, data = next(enumerate(datamodule.train_dataloader())) - >>> data.keys() - dict_keys(['image']) - >>> data["image"].shape - torch.Size([32, 3, 256, 256]) - - >>> i, data = next(enumerate(datamodule.val_dataloader())) - >>> data.keys() - dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask']) - >>> data["image"].shape, data["mask"].shape - (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256])) - """ - self.root = root if isinstance(root, Path) else Path(root) - self.category = category - self.path = self.root / self.category - - self.create_validation_set = create_validation_set - self.seed = seed - self.split_ratio = split_ratio - + val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, + ): super().__init__( task=task, train_batch_size=train_batch_size, test_batch_size=test_batch_size, num_workers=num_workers, - transform_config_train=transform_config_train, - transform_config_val=transform_config_val, - image_size=image_size, - create_validation_set=create_validation_set, - ) - - def prepare_data(self) -> None: - """Download the dataset if not available.""" - if (self.root / self.category).is_dir(): - logger.info("Found the dataset.") - else: - self.root.mkdir(parents=True, exist_ok=True) - - logger.info("Downloading the Mvtec AD dataset.") - url = "https://www.mydrive.ch/shares/38536/3830184030e49fe74747669442f0f282/download/420938113-1629952094" - dataset_name = "mvtec_anomaly_detection.tar.xz" - zip_filename = self.root / dataset_name - with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc="MVTec AD") as progress_bar: - urlretrieve( - url=f"{url}/{dataset_name}", - filename=zip_filename, - reporthook=progress_bar.update_to, - ) - logger.info("Checking hash") - hash_check(zip_filename, "eefca59f2cede9c3fc5b6befbfec275e") - - logger.info("Extracting the dataset.") - with tarfile.open(zip_filename) as tar_file: - tar_file.extractall(self.root) - - logger.info("Cleaning the tar file") - zip_filename.unlink() - - def _create_samples(self) -> DataFrame: - """Create MVTec AD samples by parsing the MVTec AD data file structure. - - The files are expected to follow the structure: - path/to/dataset/split/category/image_filename.png - path/to/dataset/ground_truth/category/mask_filename.png - - This function creates a dataframe to store the parsed information based on the following format: - |---|---------------|-------|---------|---------------|---------------------------------------|-------------| - | | path | split | label | image_path | mask_path | label_index | - |---|---------------|-------|---------|---------------|---------------------------------------|-------------| - | 0 | datasets/name | test | defect | filename.png | ground_truth/defect/filename_mask.png | 1 | - |---|---------------|-------|---------|---------------|---------------------------------------|-------------| - - Returns: - DataFrame: an output dataframe containing the samples of the dataset. - """ - if self.seed is None: - warnings.warn( - "seed is None." - " When seed is not set, images from the normal directory are split between training and test dir." - " This will lead to inconsistency between runs." 
-            )
-
-        samples_list = [(str(self.path),) + filename.parts[-3:] for filename in self.path.glob("**/*.png")]
-        if len(samples_list) == 0:
-            raise RuntimeError(f"Found 0 images in {self.path}")
-
-        samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"])
-        samples = samples[samples.split != "ground_truth"]
-
-        # Create mask_path column
-        samples["mask_path"] = (
-            samples.path
-            + "/ground_truth/"
-            + samples.label
-            + "/"
-            + samples.image_path.str.rstrip("png").str.rstrip(".")
-            + "_mask.png"
-        )
-
-        # Modify image_path column by converting to absolute path
-        samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path
-
-        # Split the normal images in training set if test set doesn't
-        # contain any normal images. This is needed because AUC score
-        # cannot be computed based on 1-class
-        if sum((samples.split == "test") & (samples.label == "good")) == 0:
-            samples = split_normal_images_in_train_set(samples, self.split_ratio, self.seed)
-
-        # Good images don't have mask
-        samples.loc[(samples.split == "test") & (samples.label == "good"), "mask_path"] = ""
-
-        # Create label index for normal (0) and anomalous (1) images.
-        samples.loc[(samples.label == "good"), "label_index"] = 0
-        samples.loc[(samples.label != "good"), "label_index"] = 1
-        samples.label_index = samples.label_index.astype(int)
-
-        if self.create_validation_set:
-            samples = create_validation_set_from_test_set(samples, seed=self.seed)
-
-        return samples
+
+        self.val_split_mode = val_split_mode
+
+        pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size)
+        pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size)
+
+        self.train_data = MVTec(
+            task=task, pre_process=pre_process_train, split=Split.TRAIN, root=root, category=category
+        )
+        self.test_data = MVTec(task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category)
+
+    def _setup(self, _stage: Optional[str] = None) -> None:
+        """Set up the datasets and perform dynamic subset splitting if necessary.
+
+        This method may be overridden in subclasses for custom splitting behaviour.
+        """
+        assert self.train_data is not None
+        assert self.test_data is not None
+
+        self.train_data.setup()
+        self.test_data.setup()
+        if self.val_split_mode == ValSplitMode.FROM_TEST:
+            self.val_data, self.test_data = split_normals_and_anomalous(self.test_data, 0.5)
+        elif self.val_split_mode == ValSplitMode.SAME_AS_TEST:
+            self.val_data = self.test_data
+        else:
+            raise ValueError(f"Unknown validation split mode: {self.val_split_mode}")
diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py
index 311928bb6b..ba47dbdaa4 100644
--- a/anomalib/data/utils/split.py
+++ b/anomalib/data/utils/split.py
@@ -12,9 +12,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import random
-from typing import Optional
+from typing import Optional, Tuple
 
 from pandas.core.frame import DataFrame
+from torch.utils.data import Subset
+
+from anomalib.data.base import AnomalibDataset
 
 
 def split_normal_images_in_train_set(
@@ -84,3 +87,45 @@ def create_validation_set_from_test_set(
         samples.loc[indices_to_sample, "split"] = "val"
 
     return samples
+
+
+def split_normals_and_anomalous(
+    dataset: "AnomalibDataset", split_ratio: float, seed: Optional[int] = None
+) -> Tuple[Subset, Subset]:
+    """Randomly split a dataset into two non-overlapping subsets, sampling normals and anomalous separately.
+    Args:
+        dataset (AnomalibDataset): AnomalibDataset object.
+        split_ratio (float): Fraction (between 0 and 1) of the images that goes to the NEW split.
+        seed (int): Random seed to ensure reproducibility.
+    Returns:
+        Tuple[AnomalibDataset, AnomalibDataset]: (new split, old split).
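+    Example:
+        A sketch of carving a validation set out of an existing test set (assuming
+        ``dataset`` is an AnomalibDataset that has already been set up):
+
+        >>> val_data, test_data = split_normals_and_anomalous(dataset, split_ratio=0.5, seed=42)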
+    """
+
+    assert 0 < split_ratio < 1, "Split ratio must be between 0 and 1."
+    if seed is not None:
+        assert seed >= 0, "Seed must be non-negative."
+        random.seed(seed)
+
+    # get the indices of the normal/anomalous images in the dataset
+    normals_indices = dataset.samples.index[dataset.samples.label_index == 0].to_list()
+    anomalous_indices = dataset.samples.index[dataset.samples.label_index == 1].to_list()
+
+    # get the number of normal/anomalous images that will go to the new split
+    new_split_n_normals = int(len(normals_indices) * split_ratio)
+    new_split_n_anomalous = int(len(anomalous_indices) * split_ratio)
+
+    # randomly sample the indices of the normal/anomalous images that will go to the new split
+    new_split_normals_indices = random.sample(population=normals_indices, k=new_split_n_normals)
+    new_split_anomalous_indices = random.sample(population=anomalous_indices, k=new_split_n_anomalous)
+
+    # indices that remain in the original split
+    old_split_normals_indices = list(set(normals_indices) - set(new_split_normals_indices))
+    old_split_anomalous_indices = list(set(anomalous_indices) - set(new_split_anomalous_indices))
+
+    # create the new split and the (reduced) original split
+    new_split = dataset.subsample(new_split_normals_indices + new_split_anomalous_indices)
+    old_split = dataset.subsample(old_split_normals_indices + old_split_anomalous_indices)
+
+    return new_split, old_split
diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml
index 92e66618dc..a12d1d7a25 100644
--- a/anomalib/models/padim/config.yaml
+++ b/anomalib/models/padim/config.yaml
@@ -11,7 +11,7 @@ dataset:
   transform_config:
     train: null
     val: null
-  create_validation_set: false
+  validation_split_mode: from_test
 tiling:
   apply: false
   tile_size: null
diff --git a/tools/train.py b/tools/train.py
index 33952a7e20..37b894af79 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -63,7 +63,7 @@ def train():
         load_model_callback = LoadModelCallback(weights_path=trainer.checkpoint_callback.best_model_path)
         trainer.callbacks.insert(0, load_model_callback)
 
-    if datamodule.contains_anomalous_images("test"):
+    if datamodule.test_data.has_anomalous:
         logger.info("Testing the model.")
         trainer.test(model=model, datamodule=datamodule)
     else:
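A rough sketch of how the new validation_split_mode is consumed end to end (the path,
category and batch sizes are illustrative, and the MVTec AD data is assumed to already
be available under root):

    from anomalib.data.base import ValSplitMode
    from anomalib.data.mvtec import MVTecDataModule

    datamodule = MVTecDataModule(
        root="./datasets/MVTec",
        category="bottle",
        image_size=256,
        train_batch_size=32,
        test_batch_size=32,
        num_workers=8,
        val_split_mode=ValSplitMode.FROM_TEST,
    )
    datamodule.setup()
    # FROM_TEST: val_data and test_data hold disjoint halves of the test set.
    # SAME_AS_TEST: val_data is simply the test set itself.

Because ValSplitMode is a str Enum, get_datamodule can pass the raw config string
("from_test" or "same_as_test") straight into this argument.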
From 94cabb7ba637478cf6f2c97fe5c1a42ce248ad39 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 7 Oct 2022 18:00:28 +0200
Subject: [PATCH 20/96] remove unused constructor arguments

---
 anomalib/data/base.py        | 6 ------
 anomalib/data/folder.py      | 6 ++++--
 anomalib/data/mvtec.py       | 2 --
 anomalib/data/utils/split.py | 2 +-
 4 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/anomalib/data/base.py b/anomalib/data/base.py
index 9c5ab99ed5..0b25f5aa66 100644
--- a/anomalib/data/base.py
+++ b/anomalib/data/base.py
@@ -149,18 +149,14 @@ class AnomalibDataModule(LightningDataModule, ABC):
 
     def __init__(
         self,
-        task: str,
         train_batch_size: int,
         test_batch_size: int,
         num_workers: int,
-        val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST,
     ):
         super().__init__()
-        self.task = task
         self.train_batch_size = train_batch_size
         self.test_batch_size = test_batch_size
         self.num_workers = num_workers
-        self.val_split_mode = val_split_mode
 
         self.train_data: Optional[AnomalibDataset] = None
         self.val_data: Optional[AnomalibDataset] = None
@@ -168,8 +164,6 @@ def __init__(
 
         self._samples: Optional[DataFrame] = None
 
-        self.data: Optional[AnomalibDataset] = None
-
     def setup(self, stage: Optional[str] = None):
         """Setup train, validation and test data.
 
diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index cebba3ea46..be20425269 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -197,13 +197,12 @@ def __init__(
         extensions=None,
     ):
         super().__init__(
-            task=task,
             train_batch_size=train_batch_size,
             test_batch_size=test_batch_size,
             num_workers=num_workers,
-            val_split_mode=val_split_mode,
         )
 
+        self.val_split_mode = val_split_mode
         self.split_ratio = split_ratio
 
         pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size)
@@ -242,11 +241,14 @@ def _setup(self, _stage: Optional[str] = None):
 
         self.train_data.setup()
         self.test_data.setup()
 
+        # add some normal images to the test set
         if not self.test_data.has_normal:
             self.train_data, normal_test_data = split_normals_and_anomalous(self.train_data, self.split_ratio)
             self.test_data += normal_test_data
 
+        # split validation set from test set
         if self.val_split_mode == ValSplitMode.FROM_TEST:
+            assert self.test_data is not None
             self.val_data, self.test_data = split_normals_and_anomalous(self.test_data, 0.5)
         elif self.val_split_mode == ValSplitMode.SAME_AS_TEST:
             self.val_data = self.test_data
diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py
index 5c26a87f59..72408a2eee 100644
--- a/anomalib/data/mvtec.py
+++ b/anomalib/data/mvtec.py
@@ -86,11 +86,9 @@ def __init__(
         val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST,
     ):
         super().__init__(
-            task=task,
             train_batch_size=train_batch_size,
             test_batch_size=test_batch_size,
             num_workers=num_workers,
-            val_split_mode=val_split_mode,
         )
 
         self.val_split_mode = val_split_mode
diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py
index ba47dbdaa4..c8ed8a5e90 100644
--- a/anomalib/data/utils/split.py
+++ b/anomalib/data/utils/split.py
@@ -91,7 +91,7 @@ def create_validation_set_from_test_set(
 
 def split_normals_and_anomalous(
     dataset: "AnomalibDataset", split_ratio: float, seed: Optional[int] = None
-) -> Tuple[Subset, Subset]:
+) -> Tuple[AnomalibDataset, AnomalibDataset]:
     """Randomly split a dataset into two non-overlapping subsets, sampling normals and anomalous separately.
     Args:
         dataset (AnomalibDataset): AnomalibDataset object.
From 1ee8a962fa0686f0eb3aa5fb5d8bfc3b06889cbb Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 7 Oct 2022 18:11:30 +0200
Subject: [PATCH 21/96] adapt btech to new design

---
 anomalib/data/__init__.py |  15 +--
 anomalib/data/btech.py    | 254 +++++++++++++++++++++++++-------------
 anomalib/data/mvtec.py    |  52 +++++++-
 3 files changed, 221 insertions(+), 100 deletions(-)

diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py
index d1da8af375..7ba61dc5c2 100644
--- a/anomalib/data/__init__.py
+++ b/anomalib/data/__init__.py
@@ -10,7 +10,7 @@
 
 from anomalib.data.base import AnomalibDataModule
 
-from .btech import BTech
+from .btech import BTechDataModule
 from .folder import FolderDataModule
 from .inference import InferenceDataset
 from .mvtec import MVTecDataModule
@@ -33,7 +33,6 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule
 
     if config.dataset.format.lower() == "mvtec":
         datamodule = MVTecDataModule(
-            # TODO: Remove config values. IAAALD-211
IAAALD-211
            root=config.dataset.path,
            category=config.dataset.category,
            image_size=(config.dataset.image_size[0], config.dataset.image_size[1]),
@@ -46,19 +45,17 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule:
            val_split_mode=config.dataset.validation_split_mode,
        )
    elif config.dataset.format.lower() == "btech":
-        datamodule = BTech(
-            # TODO: Remove config values. IAAALD-211
+        datamodule = BTechDataModule(
            root=config.dataset.path,
            category=config.dataset.category,
            image_size=(config.dataset.image_size[0], config.dataset.image_size[1]),
            train_batch_size=config.dataset.train_batch_size,
            test_batch_size=config.dataset.test_batch_size,
            num_workers=config.dataset.num_workers,
-            seed=config.project.seed,
            task=config.dataset.task,
            transform_config_train=config.dataset.transform_config.train,
            transform_config_val=config.dataset.transform_config.val,
-            create_validation_set=config.dataset.create_validation_set,
+            val_split_mode=config.dataset.validation_split_mode,
        )
    elif config.dataset.format.lower() == "folder":
        datamodule = FolderDataModule(
@@ -90,8 +87,8 @@

 __all__ = [
     "get_datamodule",
-    "BTech",
-    "Folder",
+    "BTechDataModule",
+    "FolderDataModule",
     "InferenceDataset",
-    "MVTec",
+    "MVTecDataModule",
 ]
diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py
index 489841ab94..fa799cc6e7 100644
--- a/anomalib/data/btech.py
+++ b/anomalib/data/btech.py
@@ -23,18 +23,153 @@
 from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY
 from tqdm import tqdm

-from anomalib.data.base import AnomalibDataModule
+from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode
 from anomalib.data.utils import DownloadProgressBar, hash_check
-from anomalib.data.utils.split import (
-    create_validation_set_from_test_set,
-    split_normal_images_in_train_set,
-)
+from anomalib.data.utils.split import split_normals_and_anomalous
+from anomalib.pre_processing import PreProcessor

 logger = logging.getLogger(__name__)


+def make_btech_dataset(path: Path, split: Optional[str] = None) -> DataFrame:
+    """Create BTech samples by parsing the BTech data file structure.
+
+    The files are expected to follow the structure:
+        path/to/dataset/split/category/image_filename.png
+        path/to/dataset/ground_truth/category/mask_filename.png
+
+    Args:
+        path (Path): Path to dataset
+        split (str, optional): Dataset split (ie., either train or test). Defaults to None.
+
+    Example:
+        The following example shows how to get training samples from BTech 01 category:
+
+        >>> root = Path('./BTech')
+        >>> category = '01'
+        >>> path = root / category
+        >>> path
+        PosixPath('BTech/01')
+
+        >>> samples = make_btech_dataset(path, split='train')
+        >>> samples.head()
+               path  split label image_path                 mask_path                        label_index
+        0  BTech/01  train 01    BTech/01/train/ok/105.bmp  BTech/01/ground_truth/ok/105.png 0
+        1  BTech/01  train 01    BTech/01/train/ok/017.bmp  BTech/01/ground_truth/ok/017.png 0
+        ...
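+
+        The resulting ``label_index`` column encodes normal ("ok") images as 0 and anomalous images as 1.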
+
+    Returns:
+        DataFrame: an output dataframe containing samples for the requested split (ie., train or test)
+    """
+    samples_list = [
+        (str(path),) + filename.parts[-3:] for filename in path.glob("**/*") if filename.suffix in (".bmp", ".png")
+    ]
+    if len(samples_list) == 0:
+        raise RuntimeError(f"Found 0 images in {path}")
+
+    samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"])
+    samples = samples[samples.split != "ground_truth"]
+
+    # Create mask_path column
+    samples["mask_path"] = (
+        samples.path
+        + "/ground_truth/"
+        + samples.label
+        + "/"
+        + samples.image_path.str.rstrip("bmp|png").str.rstrip(".")
+        + ".png"
+    )
+
+    # Modify image_path column by converting to absolute path
+    samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path
+
+    # Good images don't have mask
+    samples.loc[(samples.split == "test") & (samples.label == "ok"), "mask_path"] = ""
+
+    # Create label index for normal (0) and anomalous (1) images.
+    samples.loc[(samples.label == "ok"), "label_index"] = 0
+    samples.loc[(samples.label != "ok"), "label_index"] = 1
+    samples.label_index = samples.label_index.astype(int)
+
+    # Get the data frame for the split.
+    if split != Split.FULL:
+        samples = samples[samples.split == split]
+        samples = samples.reset_index(drop=True)
+
+    return samples
+
+
+class BTech(AnomalibDataset):
+    """BTech PyTorch Dataset."""
+
+    def __init__(
+        self,
+        root: Union[Path, str],
+        category: str,
+        pre_process: PreProcessor,
+        split: Split,
+        task: str = "segmentation",
+        samples: Optional[DataFrame] = None,
+    ) -> None:
+        """BTech Dataset class.
+
+        Args:
+            root: Path to the BTech dataset
+            category: Name of the BTech category.
+            pre_process: Pre-processor object containing the albumentations compose.
+            split: 'train', 'val' or 'test'
+            task: ``classification`` or ``segmentation``
+            samples: Optionally provide a prepared samples dataframe; when omitted, the samples
+                are created in ``_setup``.
+
+        Examples:
+            >>> from anomalib.data.btech import BTech
+            >>> from anomalib.pre_processing import PreProcessor
+            >>> pre_process = PreProcessor(image_size=256)
+            >>> dataset = BTech(
+            ...     root='./datasets/BTech',
+            ...     category='01',
+            ...     pre_process=pre_process,
+            ...     task="classification",
+            ...     split="train",
+            ... )
+            >>> dataset[0].keys()
+            dict_keys(['image'])
+
+            >>> dataset.split = "test"
+            >>> dataset[0].keys()
+            dict_keys(['image', 'image_path', 'label'])
+
+            >>> dataset.task = "segmentation"
+            >>> dataset.split = "train"
+            >>> dataset[0].keys()
+            dict_keys(['image'])
+
+            >>> dataset.split = "test"
+            >>> dataset[0].keys()
+            dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask'])
+
+            >>> dataset[0]["image"].shape, dataset[0]["mask"].shape
+            (torch.Size([3, 256, 256]), torch.Size([256, 256]))
+        """
+        super().__init__(task, pre_process, samples)
+
+        self.root_category = Path(root) / Path(category)
+        self.split = split
+
+    def _setup(self):
+        self._samples = make_btech_dataset(path=self.root_category, split=self.split)
+
+
 @DATAMODULE_REGISTRY
-class BTech(AnomalibDataModule):
+class BTechDataModule(AnomalibDataModule):
     """BTechDataModule Lightning Data Module."""

     def __init__(
@@ -48,9 +183,7 @@ def __init__(
         task: str = "segmentation",
         transform_config_train: Optional[Union[str, A.Compose]] = None,
         transform_config_val: Optional[Union[str, A.Compose]] = None,
-        split_ratio: float = 0.2,
-        seed: Optional[int] = None,
-        create_validation_set: bool = False,
+        val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST,
     ) -> None:
         """Instantiate BTech Lightning Data Module.

@@ -67,7 +200,7 @@ def __init__(
-            seed: seed used for the random subset splitting
-            create_validation_set: Create a validation subset in addition to the train and test subsets
+            val_split_mode: Setting that determines how the validation subset is obtained

-        Examples
+        Examples:
-            >>> from anomalib.data import BTech
-            >>> datamodule = BTech(
+            >>> from anomalib.data import BTechDataModule
+            >>> datamodule = BTechDataModule(
             ...     root="./datasets/BTech",
             ...     category="01",
@@ -93,24 +226,19 @@ def __init__(
             >>> data["image"].shape, data["mask"].shape
             (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256]))
         """
-        self.root = root if isinstance(root, Path) else Path(root)
-        self.category = category
-        self.path = self.root / self.category
-
-        self.create_validation_set = create_validation_set
-        self.seed = seed
-        self.split_ratio = split_ratio
-
-        super().__init__(
-            task=task,
-            train_batch_size=train_batch_size,
-            test_batch_size=test_batch_size,
-            num_workers=num_workers,
-            transform_config_train=transform_config_train,
-            transform_config_val=transform_config_val,
-            image_size=image_size,
-            create_validation_set=create_validation_set,
+        super().__init__(train_batch_size, test_batch_size, num_workers)
+
+        self.root = Path(root)
+        self.category = Path(category)
+        self.val_split_mode = val_split_mode
+
+        pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size)
+        pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size)
+
+        self.train_data = BTech(
+            task=task, pre_process=pre_process_train, split=Split.TRAIN, root=root, category=category
         )
+        self.test_data = BTech(task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category)

     def prepare_data(self) -> None:
         """Download the dataset if not available."""
@@ -153,62 +281,16 @@ def prepare_data(self) -> None:
         logger.info("Cleaning the tar file")
         zip_filename.unlink()

-    def _create_samples(self) -> DataFrame:
-        """Create BTech samples by parsing the BTech data file structure.
-
-        The files are expected to follow the structure:
-            path/to/dataset/category/split/[ok|ko]/image_filename.bmp
-            path/to/dataset/category/ground_truth/ko/mask_filename.png
-
-        This function creates a dataframe to store the parsed information based on the following format:
-        |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
-        |   | path          | split | label   | image_path    | mask_path                             | label_index |
-        |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
-        | 0 | datasets/name | test  | ko      | filename.png  | ground_truth/ko/filename_mask.png     | 1           |
-        |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
-
-        Returns:
-            DataFrame: an output dataframe containing the samples of the dataset.
-        """
-        samples_list = [
-            (str(self.path),) + filename.parts[-3:]
-            for filename in self.path.glob("**/*")
-            if filename.suffix in (".bmp", ".png")
-        ]
-        if len(samples_list) == 0:
-            raise RuntimeError(f"Found 0 images in {self.path}")
-
-        samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"])
-        samples = samples[samples.split != "ground_truth"]
-
-        # Create mask_path column
-        samples["mask_path"] = (
-            samples.path
-            + "/ground_truth/"
-            + samples.label
-            + "/"
-            + samples.image_path.str.rstrip("bmp|png").str.rstrip(".")
-            + ".png"
-        )
-
-        # Modify image_path column by converting to absolute path
-        samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path
-
-        # Split the normal images in training set if test set doesn't
-        # contain any normal images. This is needed because AUC score
-        # cannot be computed based on 1-class
-        if sum((samples.split == "test") & (samples.label == "ok")) == 0:
-            samples = split_normal_images_in_train_set(samples, self.split_ratio, self.seed)
-
-        # Good images don't have mask
-        samples.loc[(samples.split == "test") & (samples.label == "ok"), "mask_path"] = ""
-
-        # Create label index for normal (0) and anomalous (1) images.
-        samples.loc[(samples.label == "ok"), "label_index"] = 0
-        samples.loc[(samples.label != "ok"), "label_index"] = 1
-        samples.label_index = samples.label_index.astype(int)
-
-        if self.create_validation_set:
-            samples = create_validation_set_from_test_set(samples, seed=self.seed)
-
-        return samples
+    def _setup(self, _stage: Optional[str] = None):
+        """Set up the datasets and perform dynamic subset splitting."""
+        assert self.train_data is not None
+        assert self.test_data is not None
+
+        self.train_data.setup()
+        self.test_data.setup()
+        if self.val_split_mode == ValSplitMode.FROM_TEST:
+            self.val_data, self.test_data = split_normals_and_anomalous(self.test_data, 0.5)
+        elif self.val_split_mode == ValSplitMode.SAME_AS_TEST:
+            self.val_data = self.test_data
+        else:
+            raise ValueError(f"Unknown validation split mode: {self.val_split_mode}")
diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py
index 72408a2eee..6e2ba99e5d 100644
--- a/anomalib/data/mvtec.py
+++ b/anomalib/data/mvtec.py
@@ -1,3 +1,28 @@
+"""MVTec AD Dataset (CC BY-NC-SA 4.0).
+
+Description:
+    This script contains PyTorch Dataset, Dataloader and PyTorch
+    Lightning DataModule for the MVTec AD dataset.
+    If the dataset is not on the file system, the script downloads and
+    extracts the dataset and creates PyTorch data objects.
+License:
+    MVTec AD dataset is released under the Creative Commons
+    Attribution-NonCommercial-ShareAlike 4.0 International License
+    (CC BY-NC-SA 4.0)(https://creativecommons.org/licenses/by-nc-sa/4.0/).
+Reference:
+    - Paul Bergmann, Kilian Batzner, Michael Fauser, David Sattlegger, Carsten Steger:
+      The MVTec Anomaly Detection Dataset: A Comprehensive Real-World Dataset for
+      Unsupervised Anomaly Detection; in: International Journal of Computer Vision
+      129(4):1038-1059, 2021, DOI: 10.1007/s11263-020-01400-4.
+    - Paul Bergmann, Michael Fauser, David Sattlegger, Carsten Steger: MVTec AD —
+      A Comprehensive Real-World Dataset for Unsupervised Anomaly Detection;
+      in: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR),
+      9584-9592, 2019, DOI: 10.1109/CVPR.2019.00982.
+"""
+
+# Copyright (C) 2022 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 from pathlib import Path
 from typing import Optional, Tuple, Union

@@ -61,7 +86,25 @@ def make_mvtec_dataset(root: Union[str, Path], split: Split = Split.FULL) -> Dat

 class MVTec(AnomalibDataset):
-    def __init__(self, task: str, pre_process: PreProcessor, split: Split, root, category, samples=None) -> None:
+    """MVTec dataset class.
+
+    Args:
+        task (str): Task type, either 'classification' or 'segmentation'
+        pre_process (PreProcessor): Pre-processor object
+        split (Split): Split of the dataset, usually Split.TRAIN or Split.TEST
+        root (str): Path to the root of the dataset
+        category (str): Sub-category of the dataset, e.g. 'bottle'
+    """
+
+    def __init__(
+        self,
+        task: str,
+        pre_process: PreProcessor,
+        split: Split,
+        root: str,
+        category: str,
+        samples: Optional[DataFrame] = None,
+    ) -> None:
         super().__init__(task=task, pre_process=pre_process, samples=samples)

         self.root_category = Path(root) / Path(category)
@@ -72,6 +115,8 @@


 class MVTecDataModule(AnomalibDataModule):
+    """MVTec Datamodule."""
+
     def __init__(
         self,
         root: str,
@@ -102,10 +147,7 @@ def __init__(
         self.test_data = MVTec(task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category)

     def _setup(self, _stage: Optional[str] = None) -> None:
-        """Set up the datasets and perform dynamic subset splitting if necessary.
-
-        This method may be overridden in subclasses for custom splitting behaviour.
- """ + """Set up the datasets and perform dynamic subset splitting.""" assert self.train_data is not None assert self.test_data is not None From 7fc5483ef57cd1d1eaf85dcacf719fa03bbded94 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 7 Oct 2022 18:17:20 +0200 Subject: [PATCH 22/96] add prepare_data method for mvtec --- anomalib/data/mvtec.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 6e2ba99e5d..8e14587162 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -23,16 +23,22 @@ # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import logging +import tarfile from pathlib import Path from typing import Optional, Tuple, Union +from urllib.request import urlretrieve import albumentations as A from pandas import DataFrame from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode +from anomalib.data.utils import DownloadProgressBar, hash_check from anomalib.data.utils.split import split_normals_and_anomalous from anomalib.pre_processing import PreProcessor +logger = logging.getLogger(__name__) + def make_mvtec_dataset(root: Union[str, Path], split: Split = Split.FULL) -> DataFrame: """Create MVTec AD samples by parsing the MVTec AD data file structure. @@ -136,6 +142,8 @@ def __init__( num_workers=num_workers, ) + self.root = Path(root) + self.category = Path(category) self.val_split_mode = val_split_mode pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) @@ -146,6 +154,33 @@ def __init__( ) self.test_data = MVTec(task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category) + def prepare_data(self) -> None: + """Download the dataset if not available.""" + if (self.root / self.category).is_dir(): + logger.info("Found the dataset.") + else: + self.root.mkdir(parents=True, exist_ok=True) + + logger.info("Downloading the Mvtec AD dataset.") + url = "https://www.mydrive.ch/shares/38536/3830184030e49fe74747669442f0f282/download/420938113-1629952094" + dataset_name = "mvtec_anomaly_detection.tar.xz" + zip_filename = self.root / dataset_name + with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc="MVTec AD") as progress_bar: + urlretrieve( + url=f"{url}/{dataset_name}", + filename=zip_filename, + reporthook=progress_bar.update_to, + ) + logger.info("Checking hash") + hash_check(zip_filename, "eefca59f2cede9c3fc5b6befbfec275e") + + logger.info("Extracting the dataset.") + with tarfile.open(zip_filename) as tar_file: + tar_file.extractall(self.root) + + logger.info("Cleaning the tar file") + (zip_filename).unlink() + def _setup(self, _stage: Optional[str] = None) -> None: """Set up the datasets and perform dynamic subset splitting.""" assert self.train_data is not None From 1ac7c652d9b66e7e49429138d170dc3a47165eae Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 10 Oct 2022 16:08:04 +0200 Subject: [PATCH 23/96] implement more generic random splitting function --- anomalib/data/base.py | 9 +- anomalib/data/btech.py | 4 +- anomalib/data/folder.py | 6 +- anomalib/data/mvtec.py | 5 +- anomalib/data/utils/split.py | 156 +++++++++++------------------------ 5 files changed, 63 insertions(+), 117 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 0b25f5aa66..392fa32438 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -52,7 +52,8 @@ def __len__(self) -> int: return len(self._samples) def subsample(self, indices): - 
return AnomalibDataset(task=self.task, pre_process=self.pre_process, samples=self.samples.iloc[indices]) + samples = self.samples.iloc[indices].reset_index(drop=True) + return AnomalibDataset(task=self.task, pre_process=self.pre_process, samples=samples) @property def is_setup(self) -> bool: @@ -119,6 +120,12 @@ def __add__(self, other_dataset: AnomalibDataset): samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True) return AnomalibDataset(self.task, self.pre_process, samples) + def __radd__(self, other): + if other == 0: + return self + else: + return self.__add__(other) + def setup(self) -> None: """Load data/metadata into memory""" if not self.is_setup: diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index fa799cc6e7..f7d03ac19b 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -25,7 +25,7 @@ from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode from anomalib.data.utils import DownloadProgressBar, hash_check -from anomalib.data.utils.split import split_normals_and_anomalous +from anomalib.data.utils.split import random_split from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) @@ -289,7 +289,7 @@ def _setup(self, _stage: Optional[str] = None): self.train_data.setup() self.test_data.setup() if self.val_split_mode == ValSplitMode.FROM_TEST: - self.val_data, self.test_data = split_normals_and_anomalous(self.test_data, 0.5) + self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data else: diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index be20425269..eef498735d 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -13,7 +13,7 @@ from torchvision.datasets.folder import IMG_EXTENSIONS from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode -from anomalib.data.utils.split import split_normals_and_anomalous +from anomalib.data.utils.split import random_split from anomalib.pre_processing.pre_process import PreProcessor @@ -243,13 +243,13 @@ def _setup(self, _stage: Optional[str] = None): # add some normal images to the test set if not self.test_data.has_normal: - self.train_data, normal_test_data = split_normals_and_anomalous(self.train_data, self.split_ratio) + self.train_data, normal_test_data = random_split(self.train_data, self.split_ratio) self.test_data += normal_test_data # split validation set from test set if self.val_split_mode == ValSplitMode.FROM_TEST: assert self.test_data is not None - self.val_data, self.test_data = split_normals_and_anomalous(self.test_data, 0.5) + self.val_data, self.test_data = random_split(self.train_data, [0.5, 0.5], label_aware=True) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data else: diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 70fca5c1fe..f5655090e9 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -34,7 +34,7 @@ from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode from anomalib.data.utils import DownloadProgressBar, hash_check -from anomalib.data.utils.split import split_normals_and_anomalous +from anomalib.data.utils.split import random_split from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) @@ -175,6 +175,7 @@ def __init__( self.category = Path(category) self.val_split_mode = val_split_mode + # TODO: Get 
rid of PreProcessor by passing transform directly pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) @@ -218,7 +219,7 @@ def _setup(self, _stage: Optional[str] = None) -> None: self.train_data.setup() self.test_data.setup() if self.val_split_mode == ValSplitMode.FROM_TEST: - self.val_data, self.test_data = split_normals_and_anomalous(self.test_data, 0.5) + self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data else: diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index c8ed8a5e90..b45fc23ea8 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -11,121 +11,59 @@ # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import random -from typing import Optional, Tuple +import math +import warnings +from typing import Sequence, Union -from pandas.core.frame import DataFrame -from torch.utils.data import Subset +from torch import randperm, split from anomalib.data.base import AnomalibDataset -def split_normal_images_in_train_set( - samples: DataFrame, split_ratio: float = 0.1, seed: Optional[int] = None, normal_label: str = "good" -) -> DataFrame: - """Split normal images in train set. - - This function splits the normal images in training set and assigns the - values to the test set. This is particularly useful especially when the - test set does not contain any normal images. - - This is important because when the test set doesn't have any normal images, - AUC computation fails due to having single class. +def random_split( + dataset: AnomalibDataset, split_ratio: Union[float, Sequence[float]], label_aware: bool = False +) -> Sequence[AnomalibDataset]: + """Perform a random split of a dataset. Args: - samples (DataFrame): Dataframe containing dataset info such as filenames, splits etc. - split_ratio (float, optional): Train-Test normal image split ratio. Defaults to 0.1. - seed (int, optional): Random seed to ensure reproducibility. Defaults to 0. - normal_label (str): Name of the normal label. For MVTec AD, for instance, this is normal_label. - - Returns: - DataFrame: Output dataframe where the part of the training set is assigned to test set. + dataset (AnomalibDataset): Source dataset + split_ratio (Union[float, Sequence[float]]): Fractions of the splits that will be produced. The values in the + sequence must sum to 1. If a single value is passed, the ratio will be converted to + [1-split_ratio, split_ratio]. + label_aware (bool): When True, the relative occurrence of the different class labels of the source dataset will + be maintained in each of the subsets. """ - if seed is not None: - random.seed(seed) - - normal_train_image_indices = samples.index[(samples.split == "train") & (samples.label == normal_label)].to_list() - num_normal_train_images = len(normal_train_image_indices) - num_normal_valid_images = int(num_normal_train_images * split_ratio) - - indices_to_split_from_train_set = random.sample(population=normal_train_image_indices, k=num_normal_valid_images) - samples.loc[indices_to_split_from_train_set, "split"] = "test" - - return samples - - -def create_validation_set_from_test_set( - samples: DataFrame, seed: Optional[int] = None, normal_label: str = "good" -) -> DataFrame: - """Craete Validation Set from Test Set. 
-
-    This function creates a validation set from test set by splitting both
-    normal and abnormal samples to two.
-
-    Args:
-        samples (DataFrame): Dataframe containing dataset info such as filenames, splits etc.
-        seed (int, optional): Random seed to ensure reproducibility. Defaults to 0.
-        normal_label (str): Name of the normal label. For MVTec AD, for instance, this is normal_label.
-    """
-
-    if seed is not None:
-        random.seed(seed)
-
-    # Split normal images.
-    normal_test_image_indices = samples.index[(samples.split == "test") & (samples.label == normal_label)].to_list()
-    num_normal_valid_images = len(normal_test_image_indices) // 2
-
-    indices_to_sample = random.sample(population=normal_test_image_indices, k=num_normal_valid_images)
-    samples.loc[indices_to_sample, "split"] = "val"
-
-    # Split abnormal images.
-    abnormal_test_image_indices = samples.index[(samples.split == "test") & (samples.label != normal_label)].to_list()
-    num_abnormal_valid_images = len(abnormal_test_image_indices) // 2
-
-    indices_to_sample = random.sample(population=abnormal_test_image_indices, k=num_abnormal_valid_images)
-    samples.loc[indices_to_sample, "split"] = "val"
-
-    return samples
-
-
-def split_normals_and_anomalous(
-    dataset: "AnomalibDataset", split_ratio: float, seed: Optional[int] = None
-) -> Tuple[AnomalibDataset, AnomalibDataset]:
-    """Wrap dataset wit torch.utils.data.Subset twice to create two (non-overlaping) subsets.
-    Args:
-        dataset (AnomalibDataset): AnomalibDataset object.
-        split_ratio (float): Split ratio (0 to 1) that goes to the NEW split.
-        seed (int): Random seed to ensure reproducibility.
-    Returns:
-        Tuple[AnomalibDataset, AnomalibDataset]: (new split, old split).
-    """
-
-    assert 0 < split_ratio < 1, "Split ratio must be between 0 and 1."
-    if seed is not None:
-        assert seed >= 0, "Seed must be non-negative."
-        random.seed(seed)
-
-    # get the indices of the normal/anomalous images in the dataset
-    normals_indices = dataset.samples.index[dataset.samples.label_index == 0].to_list()
-    anomalous_indices = dataset.samples.index[dataset.samples.label_index == 1].to_list()
-
-    # get the number of normal/anomalous images that will go to the new split
-    new_split_n_normals = int(len(normals_indices) * split_ratio)
-    new_split_n_anomalous = int(len(anomalous_indices) * split_ratio)
-
-    # randomly sample the indices of the normal/anomalous images that will go to the new split
-    new_split_normals_indices = random.sample(population=normals_indices, k=new_split_n_normals)
-    new_split_anomalous_indices = random.sample(population=anomalous_indices, k=new_split_n_anomalous)
-
-    # indices that remain in the original split
-    old_split_normals_indices = list(set(normals_indices) - set(new_split_normals_indices))
-    old_split_anomalous_indices = list(set(anomalous_indices) - set(new_split_anomalous_indices))
-
-    # create the new split and the (reduced) original split
-    new_split = dataset.subsample(new_split_normals_indices + new_split_anomalous_indices)
-    old_split = dataset.subsample(old_split_normals_indices + old_split_anomalous_indices)
-
-    return new_split, old_split
+    if isinstance(split_ratio, float):
+        split_ratio = [1 - split_ratio, split_ratio]
+
+    assert math.isclose(sum(split_ratio), 1) and sum(split_ratio) <= 1, "split ratios must sum to 1."
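+    # every ratio must lie strictly between 0 and 1; a ratio of 0 or 1 would produce an empty subset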
+    assert all(0 < ratio < 1 for ratio in split_ratio), "all split ratios must be between 0 and 1."
+
+    # create list of source data
+    if label_aware:
+        indices_per_label = [group.index for _, group in dataset.samples.groupby("label_index")]
+        datasets = [dataset.subsample(indices) for indices in indices_per_label]
+    else:
+        datasets = [dataset]
+
+    # split each (label-aware) subset of source data
+    subsets = []
+    for dataset in datasets:
+        # get subset lengths
+        subset_lengths = []
+        for ratio in split_ratio:
+            subset_lengths.append(int(math.floor(len(dataset) * ratio)))
+        for i in range(len(dataset) - sum(subset_lengths)):
+            subset_idx = i % sum(subset_lengths)
+            subset_lengths[subset_idx] += 1
+        for index, length in enumerate(subset_lengths):
+            if length == 0:
+                warnings.warn(f"Length of subset at index {index} is 0.")
+        # perform random subsampling
+        indices = randperm(len(dataset))
+        subsets.append([dataset.subsample(subset_indices) for subset_indices in split(indices, subset_lengths)])
+
+    # concatenate and return
+    subsets = list(map(list, zip(*subsets)))
+    return tuple(sum(subset) for subset in subsets)

From 965ea949b8c22816583128ab1ca6396f94f8bcfc Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Mon, 10 Oct 2022 17:15:47 +0200
Subject: [PATCH 24/96] update docstrings for folder module

---
 anomalib/data/folder.py | 118 +++++++++++++++++++++++++++++-----------
 1 file changed, 86 insertions(+), 32 deletions(-)

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index eef498735d..4e88b22db4 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -9,6 +9,7 @@
 from pathlib import Path
 from typing import Optional, Tuple, Union

+import albumentations as A
 from pandas import DataFrame
 from torchvision.datasets.folder import IMG_EXTENSIONS

@@ -79,13 +80,7 @@ def make_folder_dataset(
             if `None`. Defaults to None.
         mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing
             the mask annotations. Defaults to None.
-        split (Optional[str], optional): Dataset split (ie., either train or test). Defaults to None.
-        split_ratio (float, optional): Ratio to split normal training images and add to the
-            test set in case test set doesn't contain any normal images.
-            Defaults to 0.2.
-        seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0.
-        create_validation_set (bool, optional):Boolean to create a validation set from the test set.
-            Those wanting to create a validation set could set this flag to ``True``.
+        split (Optional[Split], optional): Dataset split (ie., Split.FULL, Split.TRAIN or Split.TEST). Defaults to None.

         extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the
             directory.
@@ -139,13 +134,39 @@

 class Folder(AnomalibDataset):
+    """Folder dataset.
+
+    Args:
+        task (str): Task type (classification or segmentation).
+        pre_process (PreProcessor): Image Pre-processor to apply transform.
+        split (Split): Fixed subset split that follows from folder structure on file system. Choose from
+            [Split.FULL, Split.TRAIN, Split.TEST]
+
+        root (Union[str, Path]): Root folder of the dataset.
+        normal_dir (Union[str, Path]): Path to the directory containing normal images.
+        abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images.
+ normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing + normal images for the test dataset. Defaults to None. + mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing + the mask annotations. Defaults to None. + + extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the + directory. + val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + + Raises: + ValueError: When task is set to classification and `mask_dir` is provided. When `mask_dir` is + provided, `task` should be set to `segmentation`. + """ + def __init__( self, task: str, pre_process: PreProcessor, split: Split, # + root: Union[str, Path], normal_dir: Union[str, Path], abnormal_dir: Union[str, Path], normal_test_dir: Optional[Union[str, Path]] = None, mask_dir: Optional[Union[str, Path]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, - extensions=None, - samples=None, + extensions: Optional[Tuple[str]] = None, + samples: DataFrame = None, ) -> None: super().__init__(task, pre_process, samples=samples) self.split = split - - self.normal_dir = normal_dir - self.abnormal_dir = abnormal_dir + self.normal_dir = Path(root) / Path(normal_dir) + self.abnormal_dir = Path(root) / Path(abnormal_dir) self.normal_test_dir = normal_test_dir self.mask_dir = mask_dir self.extensions = extensions @@ -166,6 +187,7 @@ def __init__( self.val_split_mode = val_split_mode def _setup(self): + """Assign samples.""" self._samples = make_folder_dataset( normal_dir=self.normal_dir, abnormal_dir=self.abnormal_dir, @@ -177,24 +199,57 @@ def _setup(self): class FolderDataModule(AnomalibDataModule): + """Folder DataModule. + + Args: + root (Union[str, Path]): Path to the root folder containing normal and abnormal dirs. + normal_dir (Union[str, Path]): Name of the directory containing normal images. + Defaults to "normal". + abnormal_dir (str, optional): Name of the directory containing abnormal images. + Defaults to "abnormal". + normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing + normal images for the test dataset. Defaults to None. + mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing + the mask annotations. Defaults to None. + split_ratio (float, optional): Ratio to split normal training images and add to the + test set in case test set doesn't contain any normal images. + Defaults to 0.2. + extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the + directory. Defaults to None. + image_size (Optional[Union[int, Tuple[int, int]]], optional): Size of the input image. + Defaults to None. + train_batch_size (int, optional): Training batch size. Defaults to 32. + test_batch_size (int, optional): Test batch size. Defaults to 32. + num_workers (int, optional): Number of workers. Defaults to 8. + task (str, optional): Task type. Could be either classification or segmentation. + Defaults to "classification". + transform_config_train (Optional[Union[str, A.Compose]], optional): Config for pre-processing + during training. + Defaults to None. + transform_config_val (Optional[Union[str, A.Compose]], optional): Config for pre-processing + during validation. + Defaults to None. + val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. 
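+
+    Example:
+        A minimal, illustrative sketch (the dataset location and directory names below are
+        placeholder assumptions, not part of this module):
+
+        >>> datamodule = FolderDataModule(
+        ...     root="./datasets/bottle",
+        ...     normal_dir="good",
+        ...     abnormal_dir="broken_large",
+        ...     normal_test_dir=None,
+        ...     mask_dir=None,
+        ...     split_ratio=0.2,
+        ...     task="classification",
+        ...     image_size=256,
+        ... )
+        >>> datamodule.setup()
+        >>> batch = next(iter(datamodule.train_dataloader()))
+        >>> batch["image"].shape
+        torch.Size([32, 3, 256, 256])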
+ """ + def __init__( self, - root, - task, - train_batch_size, - test_batch_size, - image_size, - num_workers, - val_split_mode, + root: Union[str, Path], + normal_dir: Union[str, Path], + abnormal_dir: Union[str, Path], + normal_test_dir: Union[str, Path], + mask_dir: Union[str, Path], + split_ratio: float, + extensions: Optional[Tuple[str]] = None, # - normal_dir, - abnormal_dir, - normal_test_dir, - mask_dir, - split_ratio, - transform_config_train=None, - transform_config_val=None, - extensions=None, + image_size: Optional[Union[int, Tuple[int, int]]] = None, + train_batch_size: int = 32, + test_batch_size: int = 32, + num_workers: int = 8, + task: str = "segmentation", + transform_config_train: Optional[Union[str, A.Compose]] = None, + transform_config_val: Optional[Union[str, A.Compose]] = None, + val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, ): super().__init__( train_batch_size=train_batch_size, @@ -208,13 +263,11 @@ def __init__( pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) - normal_dir = Path(root) / Path(normal_dir) - abnormal_dir = Path(root) / Path(abnormal_dir) - self.train_data = Folder( task=task, pre_process=pre_process_train, split=Split.TRAIN, + root=root, normal_dir=normal_dir, abnormal_dir=abnormal_dir, normal_test_dir=normal_test_dir, @@ -226,6 +279,7 @@ def __init__( task=task, pre_process=pre_process_infer, split=Split.TEST, + root=root, normal_dir=normal_dir, abnormal_dir=abnormal_dir, normal_test_dir=normal_test_dir, @@ -234,7 +288,7 @@ def __init__( ) def _setup(self, _stage: Optional[str] = None): - + """Set up the datasets for the Folder Data Module.""" assert self.train_data is not None assert self.test_data is not None @@ -249,7 +303,7 @@ def _setup(self, _stage: Optional[str] = None): # split validation set from test set if self.val_split_mode == ValSplitMode.FROM_TEST: assert self.test_data is not None - self.val_data, self.test_data = random_split(self.train_data, [0.5, 0.5], label_aware=True) + self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data else: From 2a9f6f8a18158350c1a1e214f1249a85f58d261e Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 10 Oct 2022 19:06:45 +0200 Subject: [PATCH 25/96] ensure type consistency when performing operations on dataset --- anomalib/data/base.py | 68 ++++++++++++++++++++++++++---------- anomalib/data/utils/split.py | 21 +++++++++-- 2 files changed, 67 insertions(+), 22 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 392fa32438..268f82782f 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -5,10 +5,11 @@ from __future__ import annotations +import copy import logging from abc import ABC, abstractmethod from enum import Enum -from typing import Dict, Optional, Union +from typing import Dict, Optional, Sequence, Union import cv2 import numpy as np @@ -26,6 +27,8 @@ class Split(str, Enum): + """Split of a subset.""" + FULL = "full" TRAIN = "train" VAL = "val" @@ -33,11 +36,13 @@ class Split(str, Enum): class ValSplitMode(str, Enum): + """Splitting mode used to obtain validation subset.""" + SAME_AS_TEST = "same_as_test" FROM_TEST = "from_test" -class AnomalibDataset(Dataset): +class AnomalibDataset(Dataset, ABC): """Anomalib dataset.""" def __init__(self, task: str, pre_process: PreProcessor, samples: 
Optional[DataFrame] = None):
@@ -51,28 +56,45 @@ def __len__(self) -> int:
         assert isinstance(self._samples, DataFrame)
         return len(self._samples)

-    def subsample(self, indices):
-        samples = self.samples.iloc[indices].reset_index(drop=True)
-        return AnomalibDataset(task=self.task, pre_process=self.pre_process, samples=samples)
+    def subsample(self, indices: Sequence[int], inplace=False) -> AnomalibDataset:
+        """Subsamples the dataset at the provided indices.
+
+        Args:
+            indices (Sequence[int]): Indices at which the dataset is to be subsampled.
+            inplace (bool): When true, the subsampling will be performed on the instance itself.
+        """
+        dataset = self if inplace else copy.deepcopy(self)
+        dataset.assign_samples(self.samples.iloc[indices].reset_index(drop=True))
+        return dataset

     @property
     def is_setup(self) -> bool:
-        """Has setup() been called?"""
+        """Checks if setup() has been called."""
         return isinstance(self._samples, DataFrame)

     @property
     def samples(self) -> DataFrame:
-        """TODO"""
+        """Get the samples dataframe."""
         if not self.is_setup:
             raise RuntimeError("Dataset is not setup yet. Call setup() first.")
         return self._samples

+    def assign_samples(self, samples: DataFrame):
+        """Overwrite the samples with a new dataframe.
+
+        Args:
+            samples (DataFrame): DataFrame with new samples.
+        """
+        self._samples = samples
+
     @property
     def has_normal(self) -> bool:
+        """Check if the dataset contains any normal samples."""
         return 0 in list(self.samples.label_index)

     @property
     def has_anomalous(self) -> bool:
+        """Check if the dataset contains any anomalous samples."""
         return 1 in list(self.samples.label_index)

     def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
@@ -115,25 +137,24 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:

         return item

-    def __add__(self, other_dataset: AnomalibDataset):
+    def __add__(self, other_dataset: AnomalibDataset) -> AnomalibDataset:
+        """Concatenate this dataset with another dataset."""
+        assert isinstance(other_dataset, self.__class__), "Cannot concatenate datasets that are not of same type."
         assert self.is_setup and other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first."
-        samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True)
-        return AnomalibDataset(self.task, self.pre_process, samples)
-
-    def __radd__(self, other):
-        if other == 0:
-            return self
-        else:
-            return self.__add__(other)
+        dataset = copy.deepcopy(self)
+        dataset.assign_samples(pd.concat([self.samples, other_dataset.samples], ignore_index=True))
+        return dataset

     def setup(self) -> None:
-        """Load data/metadata into memory"""
+        """Load data/metadata into memory."""
         if not self.is_setup:
             self._setup()
         assert self.is_setup, "setup() should set self._samples"

+    @abstractmethod
     def _setup(self) -> DataFrame:
-        """previous _create_samples()
+        """Set up the dataset.
+
         This method should return a dataframe that contains the information needed by the dataloader to load each of
         the dataset items into memory. The dataframe must at least contain the following columns:
         split: the subset to which the dataset item is assigned.
         image_path: path to file system location where the image is stored.
         label_index: index of the anomaly label, typically 0 for "normal" and 1 for "anomalous".
         mask_path (if task == "segmentation"): path to the ground truth masks (for the anomalous images only).
+
         Example:
         |---|-------------------|-----------|-------------|------------------|-------|
         |   | image_path        | label     | label_index | mask_path        | split |
         |---|-------------------|-----------|-------------|------------------|-------|
         | 0 | path/to/image.png | anomalous | 0           | path/to/mask.png | train |
         |---|-------------------|-----------|-------------|------------------|-------|
         """
         pass


 class AnomalibDataModule(LightningDataModule, ABC):
-    """Base Anomalib data module."""
+    """Base Anomalib data module.
+
+    Args:
+        train_batch_size (int): Batch size used by the train dataloader.
+        test_batch_size (int): Batch size used by the val and test dataloaders.
+        num_workers (int): Number of workers used by the train, val and test dataloaders.
+    """

     def __init__(
         self,
@@ -211,7 +212,7 @@

     @abstractmethod
     def _setup(self, _stage: Optional[str] = None) -> None:
+        """To be implemented in concrete subclass."""
         pass

     @property
     def is_setup(self):
+        """Checks if setup() has been called."""
         if self.train_data is None or self.val_data is None or self.test_data is None:
             return False
         return self.train_data.is_setup and self.val_data.is_setup and self.test_data.is_setup
diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py
index b45fc23ea8..367359bdf2 100644
--- a/anomalib/data/utils/split.py
+++ b/anomalib/data/utils/split.py
@@ -13,16 +13,31 @@
 import math
 import warnings
-from typing import Sequence, Union
+from typing import List, Sequence, Union

 from torch import randperm, split

 from anomalib.data.base import AnomalibDataset


+def concatenate_datasets(datasets: Sequence[AnomalibDataset]) -> AnomalibDataset:
+    """Concatenate multiple datasets into a single dataset object.
+
+    Args:
+        datasets (Sequence[AnomalibDataset]): Sequence of at least two datasets.
+
+    Returns:
+        AnomalibDataset: Dataset that contains the combined samples of all input datasets.
+    """
+    concat_dataset = datasets[0]
+    for dataset in datasets[1:]:
+        concat_dataset += dataset
+    return concat_dataset
+
+
 def random_split(
     dataset: AnomalibDataset, split_ratio: Union[float, Sequence[float]], label_aware: bool = False
-) -> Sequence[AnomalibDataset]:
+) -> List[AnomalibDataset]:
     """Perform a random split of a dataset.
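+
+    A single float ``split_ratio`` is shorthand for ``[1 - split_ratio, split_ratio]``; for example,
+    ``random_split(dataset, 0.25)`` returns two subsets holding roughly 75% and 25% of the samples.
+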
Args: @@ -66,4 +81,4 @@ def random_split( # concatenate and return subsets = list(map(list, zip(*subsets))) - return tuple(sum(subset) for subset in subsets) + return [concatenate_datasets(subset) for subset in subsets] From 84997b9d9d6cc637ab8e70a4d91ffb104c792beb Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 10 Oct 2022 19:28:00 +0200 Subject: [PATCH 26/96] change imports --- anomalib/data/utils/split.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 367359bdf2..2b11e32475 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -15,7 +15,7 @@ import warnings from typing import List, Sequence, Union -from torch import randperm, split +import torch from anomalib.data.base import AnomalibDataset @@ -76,8 +76,8 @@ def random_split( if length == 0: warnings.warn(f"Length of subset at index {index} is 0.") # perform random subsampling - indices = randperm(len(dataset)) - subsets.append([dataset.subsample(subset_indices) for subset_indices in split(indices, subset_lengths)]) + indices = torch.randperm(len(dataset)) + subsets.append([dataset.subsample(subset_indices) for subset_indices in torch.split(indices, subset_lengths)]) # concatenate and return subsets = list(map(list, zip(*subsets))) From f21c652c933c636b6ead107a44d36c768a925ebd Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 10 Oct 2022 19:34:10 +0200 Subject: [PATCH 27/96] change variable names --- anomalib/data/utils/split.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 2b11e32475..266583d907 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -58,26 +58,28 @@ def random_split( # create list of source data if label_aware: indices_per_label = [group.index for _, group in dataset.samples.groupby("label_index")] - datasets = [dataset.subsample(indices) for indices in indices_per_label] + per_label_datasets = [dataset.subsample(indices) for indices in indices_per_label] else: - datasets = [dataset] + per_label_datasets = [dataset] # split each (label-aware) subset of source data subsets = [] - for dataset in datasets: + for label_dataset in per_label_datasets: # get subset lengths subset_lengths = [] for ratio in split_ratio: - subset_lengths.append(int(math.floor(len(dataset) * ratio))) - for i in range(len(dataset) - sum(subset_lengths)): + subset_lengths.append(int(math.floor(len(label_dataset) * ratio))) + for i in range(len(label_dataset) - sum(subset_lengths)): subset_idx = i % sum(subset_lengths) subset_lengths[subset_idx] += 1 for index, length in enumerate(subset_lengths): if length == 0: warnings.warn(f"Length of subset at index {index} is 0.") # perform random subsampling - indices = torch.randperm(len(dataset)) - subsets.append([dataset.subsample(subset_indices) for subset_indices in torch.split(indices, subset_lengths)]) + indices = torch.randperm(len(label_dataset)) + subsets.append( + [label_dataset.subsample(subset_indices) for subset_indices in torch.split(indices, subset_lengths)] + ) # concatenate and return subsets = list(map(list, zip(*subsets))) From ab7d0ff61693767eee17aca49ba5f1834e17f3e1 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 10 Oct 2022 19:34:48 +0200 Subject: [PATCH 28/96] replace pass with NotImplementedError --- anomalib/data/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anomalib/data/base.py 
b/anomalib/data/base.py
index 268f82782f..d6dcea8b17 100644
--- a/anomalib/data/base.py
+++ b/anomalib/data/base.py
@@ -170,7 +170,7 @@ def _setup(self) -> DataFrame:
         | 0 | path/to/image.png | anomalous | 0           | path/to/mask.png | train |
         |---|-------------------|-----------|-------------|------------------|-------|
         """
-        pass
+        raise NotImplementedError


 class AnomalibDataModule(LightningDataModule, ABC):
@@ -212,7 +212,7 @@
     def _setup(self, _stage: Optional[str] = None) -> None:
         """To be implemented in concrete subclass."""
-        pass
+        raise NotImplementedError

     @property
     def is_setup(self):

From d7e47a942195c72122849696ecde74e33fd2f770 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Tue, 11 Oct 2022 14:05:39 +0200
Subject: [PATCH 29/96] allow training on folder without test images

---
 anomalib/data/folder.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index 4e88b22db4..c059129191 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -64,7 +64,7 @@ def _prepare_files_labels(

 def make_folder_dataset(
     normal_dir: Union[str, Path],
-    abnormal_dir: Union[str, Path],
+    abnormal_dir: Optional[Union[str, Path]] = None,
     normal_test_dir: Optional[Union[str, Path]] = None,
     mask_dir: Optional[Union[str, Path]] = None,
     split: Optional[str] = None,
@@ -90,7 +90,10 @@
     filenames = []
     labels = []

-    dirs = {"normal": normal_dir, "abnormal": abnormal_dir}
+    dirs = {"normal": normal_dir}
+
+    if abnormal_dir:
+        dirs = {**dirs, **{"abnormal": abnormal_dir}}

     if normal_test_dir:
         dirs = {**dirs, **{"normal_test": normal_test_dir}}
@@ -168,7 +171,7 @@ def __init__(
         #
         root: Union[str, Path],
         normal_dir: Union[str, Path],
-        abnormal_dir: Union[str, Path],
+        abnormal_dir: Optional[Union[str, Path]] = None,
         normal_test_dir: Optional[Union[str, Path]] = None,
         mask_dir: Optional[Union[str, Path]] = None,
         val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST,
@@ -179,7 +182,7 @@

         self.split = split
         self.normal_dir = Path(root) / Path(normal_dir)
-        self.abnormal_dir = Path(root) / Path(abnormal_dir)
+        self.abnormal_dir = Path(root) / Path(abnormal_dir) if abnormal_dir else None
         self.normal_test_dir = normal_test_dir
         self.mask_dir = mask_dir
         self.extensions = extensions

From da851c6869dd31712359828749be2b205592d4b5 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Tue, 11 Oct 2022 14:23:37 +0200
Subject: [PATCH 30/96] use relative path for normal_test_dir

---
 anomalib/data/folder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index c059129191..01f6f1f031 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -183,7 +183,7 @@ def __init__(
         self.split = split
         self.normal_dir = Path(root) / Path(normal_dir)
         self.abnormal_dir = Path(root) / Path(abnormal_dir) if abnormal_dir else None
-        self.normal_test_dir = normal_test_dir
+        self.normal_test_dir = Path(root) / Path(normal_test_dir) if normal_test_dir else None
         self.mask_dir = mask_dir
         self.extensions = extensions

From f3e38ba384ad4ec716201798f097ee74ada734bb Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Tue, 11 Oct 2022 14:43:00 +0200
Subject: [PATCH 31/96] fix dataset tests

---
 anomalib/data/folder.py                  |  8 ++++----
 tests/pre_merge/datasets/test_dataset.py | 15 +++++++++------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index
01f6f1f031..e0fe0c16dc 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -240,9 +240,9 @@ def __init__( root: Union[str, Path], normal_dir: Union[str, Path], abnormal_dir: Union[str, Path], - normal_test_dir: Union[str, Path], - mask_dir: Union[str, Path], - split_ratio: float, + normal_test_dir: Optional[Union[str, Path]] = None, + mask_dir: Optional[Union[str, Path]] = None, + split_ratio: float = 0.2, extensions: Optional[Tuple[str]] = None, # image_size: Optional[Union[int, Tuple[int, int]]] = None, @@ -252,7 +252,7 @@ def __init__( task: str = "segmentation", transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_val: Optional[Union[str, A.Compose]] = None, - val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, + val_split_mode: ValSplitMode = ValSplitMode.FROM_TEST, ): super().__init__( train_batch_size=train_batch_size, diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index 06d9629b45..789b833ff8 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -6,7 +6,12 @@ import pytest from anomalib.config import update_input_size_config -from anomalib.data import BTech, Folder, MVTec, get_datamodule +from anomalib.data import ( + BTechDataModule, + FolderDataModule, + MVTecDataModule, + get_datamodule, +) from anomalib.pre_processing.transforms import Denormalize, ToNumpy from tests.helpers.config import get_test_configurable_parameters from tests.helpers.dataset import TestDataset, get_dataset_path @@ -14,7 +19,7 @@ @pytest.fixture(autouse=True) def mvtec_data_module(): - datamodule = MVTec( + datamodule = MVTecDataModule( root=get_dataset_path(dataset="MVTec"), category="leather", image_size=(256, 256), @@ -31,7 +36,7 @@ def mvtec_data_module(): @pytest.fixture(autouse=True) def btech_data_module(): """Create BTech Data Module.""" - datamodule = BTech( + datamodule = BTechDataModule( root=get_dataset_path(dataset="BTech"), category="01", image_size=(256, 256), @@ -49,19 +54,17 @@ def btech_data_module(): def folder_data_module(): """Create Folder Data Module.""" root = get_dataset_path(dataset="bottle") - datamodule = Folder( + datamodule = FolderDataModule( root=root, normal_dir="good", abnormal_dir="broken_large", mask_dir=os.path.join(root, "ground_truth/broken_large"), task="segmentation", split_ratio=0.2, - seed=0, image_size=(256, 256), train_batch_size=32, test_batch_size=32, num_workers=8, - create_validation_set=True, ) datamodule.setup() From f4719f2f7dad840beb7f1926fa4084a4baa4aca8 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Tue, 11 Oct 2022 15:00:05 +0200 Subject: [PATCH 32/96] update validation set parameter in configs --- anomalib/config/config.py | 7 +++++++ anomalib/models/cflow/config.yaml | 2 +- anomalib/models/dfkde/config.yaml | 2 +- anomalib/models/dfm/config.yaml | 2 +- anomalib/models/draem/config.yaml | 2 +- anomalib/models/fastflow/config.yaml | 2 +- anomalib/models/ganomaly/config.yaml | 2 +- anomalib/models/patchcore/config.yaml | 2 +- anomalib/models/reverse_distillation/config.yaml | 2 +- anomalib/models/stfpm/config.yaml | 2 +- 10 files changed, 16 insertions(+), 9 deletions(-) diff --git a/anomalib/config/config.py b/anomalib/config/config.py index 6312c1012f..9b174bc162 100644 --- a/anomalib/config/config.py +++ b/anomalib/config/config.py @@ -136,6 +136,13 @@ def get_configurable_parameters( if "format" not in config.dataset.keys(): config.dataset.format = "mvtec" + if "create_validation_set" in 
config.dataset.keys(): + warn( + "The 'create_validation_set' parameter is deprecated and will be removed in v0.4.0. Please use " + "validation_split_mode instead." + ) + config.dataset.validation_split_mode = "from_test" if config.dataset.create_validation_set else "same_as_test" + config = update_input_size_config(config) # Project Configs diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index be166c2417..0a8eec5a65 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -13,7 +13,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test model: name: cflow diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml index 070f0c2456..538a806bc6 100644 --- a/anomalib/models/dfkde/config.yaml +++ b/anomalib/models/dfkde/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test model: name: dfkde diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml index 47db50fb4e..104f7a2a07 100755 --- a/anomalib/models/dfm/config.yaml +++ b/anomalib/models/dfm/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test model: name: dfm diff --git a/anomalib/models/draem/config.yaml b/anomalib/models/draem/config.yaml index 9f3326daa1..05ab67360c 100644 --- a/anomalib/models/draem/config.yaml +++ b/anomalib/models/draem/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: ./anomalib/models/draem/transform_config.yaml val: ./anomalib/models/draem/transform_config.yaml - create_validation_set: false + validation_split_mode: same_as_test tiling: apply: false tile_size: null diff --git a/anomalib/models/fastflow/config.yaml b/anomalib/models/fastflow/config.yaml index b02f430fd9..cef97d58fb 100644 --- a/anomalib/models/fastflow/config.yaml +++ b/anomalib/models/fastflow/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test tiling: apply: false tile_size: null diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml index 2e5dfb6bba..3e0a0cc677 100644 --- a/anomalib/models/ganomaly/config.yaml +++ b/anomalib/models/ganomaly/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test tiling: apply: true tile_size: 64 diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml index 31567ad530..9f98f7604a 100644 --- a/anomalib/models/patchcore/config.yaml +++ b/anomalib/models/patchcore/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test tiling: apply: false tile_size: null diff --git a/anomalib/models/reverse_distillation/config.yaml b/anomalib/models/reverse_distillation/config.yaml index d30f3baedf..cc474091e4 100644 --- a/anomalib/models/reverse_distillation/config.yaml +++ b/anomalib/models/reverse_distillation/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test tiling: apply: false tile_size: 64 diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml index fe3637bf27..524e58e42b 100644 
--- a/anomalib/models/stfpm/config.yaml +++ b/anomalib/models/stfpm/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test tiling: apply: false tile_size: null From e25a587f034981cf93aaf7ebc4920e76ea9bb3cd Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Tue, 11 Oct 2022 15:26:57 +0200 Subject: [PATCH 33/96] change default argument --- anomalib/models/padim/config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml index a12d1d7a25..058e78cd25 100644 --- a/anomalib/models/padim/config.yaml +++ b/anomalib/models/padim/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: from_test + validation_split_mode: same_as_test tiling: apply: false tile_size: null @@ -58,7 +58,7 @@ logging: log_graph: false # Logs the model graph to respective logger. optimization: - export_mode: null #options: onnx, openvino + export_mode: openvino #options: onnx, openvino # PL Trainer Args. Don't add extra parameter here. trainer: From fb84cd1d032140170ec776cf306df1c13ca6fbe7 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 12 Oct 2022 10:53:10 +0200 Subject: [PATCH 34/96] use setter for samples --- anomalib/data/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index d6dcea8b17..2f9076d7de 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -64,7 +64,7 @@ def subsample(self, indices: Sequence[int], inplace=False) -> AnomalibDataset: inplace (bool): When true, the subsampling will be performed on the instance itself. """ dataset = self if inplace else copy.deepcopy(self) - dataset.assign_samples(self.samples.iloc[indices].reset_index(drop=True)) + dataset.samples = self.samples.iloc[indices].reset_index(drop=True) return dataset @property @@ -79,7 +79,8 @@ def samples(self) -> DataFrame: raise RuntimeError("Dataset is not setup yet. Call setup() first.") return self._samples - def assign_samples(self, samples: DataFrame): + @samples.setter + def samples(self, samples: DataFrame): """Overwrite the samples with a new dataframe. Args: @@ -142,7 +143,7 @@ def __add__(self, other_dataset: AnomalibDataset) -> AnomalibDataset: assert isinstance(other_dataset, self.__class__), "Cannot concatenate datasets that are not of same type." assert self.is_setup and other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first." 
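The samples property/setter pair introduced above is what lets subsample() and __add__() use plain assignments such as dataset.samples = ... instead of a dedicated mutator. A stripped-down sketch of the pattern with simplified types (DatasetSketch and its members are stand-ins, not the real class):

    from typing import Optional

    from pandas import DataFrame


    class DatasetSketch:
        """Minimal stand-in for the samples property/setter pair."""

        def __init__(self) -> None:
            self._samples: Optional[DataFrame] = None

        @property
        def samples(self) -> DataFrame:
            # Fail loudly when setup() never populated the frame.
            if self._samples is None:
                raise RuntimeError("Dataset is not setup yet. Call setup() first.")
            return self._samples

        @samples.setter
        def samples(self, samples: DataFrame) -> None:
            self._samples = samples


    dataset = DatasetSketch()
    dataset.samples = DataFrame({"image_path": ["a.png", "b.png"]})
    print(len(dataset.samples))  # 2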
dataset = copy.deepcopy(self) - dataset.assign_samples(pd.concat([self.samples, other_dataset.samples], ignore_index=True)) + dataset.samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True) return dataset def setup(self) -> None: From cfa4f52aca25d32a82b28eaa47e5acc0688a921b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 12 Oct 2022 10:56:26 +0200 Subject: [PATCH 35/96] hint options for val_split_mode --- anomalib/models/cflow/config.yaml | 2 +- anomalib/models/dfkde/config.yaml | 2 +- anomalib/models/dfm/config.yaml | 2 +- anomalib/models/draem/config.yaml | 2 +- anomalib/models/fastflow/config.yaml | 2 +- anomalib/models/ganomaly/config.yaml | 2 +- anomalib/models/padim/config.yaml | 4 ++-- anomalib/models/patchcore/config.yaml | 2 +- anomalib/models/reverse_distillation/config.yaml | 2 +- anomalib/models/stfpm/config.yaml | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index 0a8eec5a65..725589b8e1 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -13,7 +13,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] model: name: cflow diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml index 538a806bc6..5fc7b53861 100644 --- a/anomalib/models/dfkde/config.yaml +++ b/anomalib/models/dfkde/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] model: name: dfkde diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml index 104f7a2a07..34256daa32 100755 --- a/anomalib/models/dfm/config.yaml +++ b/anomalib/models/dfm/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] model: name: dfm diff --git a/anomalib/models/draem/config.yaml b/anomalib/models/draem/config.yaml index 05ab67360c..510e661d15 100644 --- a/anomalib/models/draem/config.yaml +++ b/anomalib/models/draem/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: ./anomalib/models/draem/transform_config.yaml val: ./anomalib/models/draem/transform_config.yaml - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null diff --git a/anomalib/models/fastflow/config.yaml b/anomalib/models/fastflow/config.yaml index cef97d58fb..59d2a12aa5 100644 --- a/anomalib/models/fastflow/config.yaml +++ b/anomalib/models/fastflow/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml index 3e0a0cc677..c8d09276f2 100644 --- a/anomalib/models/ganomaly/config.yaml +++ b/anomalib/models/ganomaly/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: true tile_size: 64 diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml index 
058e78cd25..b857d6d692 100644 --- a/anomalib/models/padim/config.yaml +++ b/anomalib/models/padim/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null @@ -58,7 +58,7 @@ logging: log_graph: false # Logs the model graph to respective logger. optimization: - export_mode: openvino #options: onnx, openvino + export_mode: null #options: onnx, openvino # PL Trainer Args. Don't add extra parameter here. trainer: diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml index 9f98f7604a..c97603ea2e 100644 --- a/anomalib/models/patchcore/config.yaml +++ b/anomalib/models/patchcore/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null diff --git a/anomalib/models/reverse_distillation/config.yaml b/anomalib/models/reverse_distillation/config.yaml index cc474091e4..8dcdd0fe9c 100644 --- a/anomalib/models/reverse_distillation/config.yaml +++ b/anomalib/models/reverse_distillation/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: 64 diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml index 524e58e42b..4a2b251173 100644 --- a/anomalib/models/stfpm/config.yaml +++ b/anomalib/models/stfpm/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null From 624e5229ed4b387547a34c5826001878954fbcea Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 12 Oct 2022 10:58:25 +0200 Subject: [PATCH 36/96] update assert message and docstring --- anomalib/data/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 2f9076d7de..dd3b0ec2b7 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -140,7 +140,7 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: def __add__(self, other_dataset: AnomalibDataset) -> AnomalibDataset: """Concatenate this dataset with another dataset.""" - assert isinstance(other_dataset, self.__class__), "Cannot concatenate datasets that are not of same type." + assert isinstance(other_dataset, self.__class__), "Cannot concatenate datasets that are not of the same type." assert self.is_setup and other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first." 
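The "# options: [same_as_test, from_test]" hints added across the configs in PATCH 35 correspond to members of the ValSplitMode enum seen earlier in this series. A small sketch of how the YAML string resolves to the enum; the member names are taken from the diffs, while this standalone enum is only a mirror of the real one in anomalib.data.base:

    from enum import Enum


    class ValSplitMode(str, Enum):
        """Mirrors the values documented in the config files."""

        SAME_AS_TEST = "same_as_test"
        FROM_TEST = "from_test"


    # A (str, Enum) subclass converts directly from the YAML string value:
    assert ValSplitMode("from_test") is ValSplitMode.FROM_TEST
    assert ValSplitMode("same_as_test") is ValSplitMode.SAME_AS_TEST

Using (str, Enum) keeps the config values plain strings while still giving the datamodules a typed set of options to dispatch on.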
dataset = copy.deepcopy(self) dataset.samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True) @@ -168,7 +168,7 @@ def _setup(self) -> DataFrame: |---|-------------------|-----------|-------------|------------------|-------| | | image_path | label | label_index | mask_path | split | |---|-------------------|-----------|-------------|------------------|-------| - | 0 | path/to/image.png | anomalous | 0 | path/to/mask.png | train | + | 0 | path/to/image.png | anomalous | 1 | path/to/mask.png | train | |---|-------------------|-----------|-------------|------------------|-------| """ raise NotImplementedError From 0bd77f9325f1df8c10780106e2b8a7ca338b9c61 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 12 Oct 2022 15:51:20 +0200 Subject: [PATCH 37/96] revert name change dataset vs datamodule --- anomalib/data/__init__.py | 18 +++++++++--------- anomalib/data/btech.py | 10 ++++++---- anomalib/data/folder.py | 8 ++++---- anomalib/data/mvtec.py | 10 ++++++---- tests/pre_merge/datasets/test_dataset.py | 13 ++++--------- 5 files changed, 29 insertions(+), 30 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 7ba61dc5c2..6e77606aba 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -10,10 +10,10 @@ from anomalib.data.base import AnomalibDataModule -from .btech import BTechDataModule -from .folder import FolderDataModule +from .btech import BTech +from .folder import Folder from .inference import InferenceDataset -from .mvtec import MVTecDataModule +from .mvtec import MVTec logger = logging.getLogger(__name__) @@ -32,7 +32,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: datamodule: AnomalibDataModule if config.dataset.format.lower() == "mvtec": - datamodule = MVTecDataModule( + datamodule = MVTec( root=config.dataset.path, category=config.dataset.category, image_size=(config.dataset.image_size[0], config.dataset.image_size[1]), @@ -45,7 +45,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: val_split_mode=config.dataset.validation_split_mode, ) elif config.dataset.format.lower() == "btech": - datamodule = BTechDataModule( + datamodule = BTech( root=config.dataset.path, category=config.dataset.category, image_size=(config.dataset.image_size[0], config.dataset.image_size[1]), @@ -58,7 +58,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: val_split_mode=config.dataset.validation_split_mode, ) elif config.dataset.format.lower() == "folder": - datamodule = FolderDataModule( + datamodule = Folder( root=config.dataset.path, normal_dir=config.dataset.normal_dir, abnormal_dir=config.dataset.abnormal_dir, @@ -87,8 +87,8 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: __all__ = [ "get_datamodule", - "BTechDataModule", - "FolderDataModule", + "BTech", + "Folder", "InferenceDataset", - "MVTecDataModule", + "MVTec", ] diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index f7d03ac19b..e0a45d5d62 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -106,7 +106,7 @@ def make_btech_dataset(path: Path, split: Optional[str] = None) -> DataFrame: return samples -class BTech(AnomalibDataset): +class BTechDataset(AnomalibDataset): """BTech PyTorch Dataset.""" def __init__( @@ -169,7 +169,7 @@ def _setup(self): @DATAMODULE_REGISTRY -class BTechDataModule(AnomalibDataModule): +class BTech(AnomalibDataModule): """BTechDataModule Lightning Data Module.""" def __init__( @@ 
-235,10 +235,12 @@ def __init__( pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) - self.train_data = BTech( + self.train_data = BTechDataset( task=task, pre_process=pre_process_train, split=Split.TRAIN, root=root, category=category ) - self.test_data = BTech(task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category) + self.test_data = BTechDataset( + task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category + ) def prepare_data(self) -> None: """Download the dataset if not available.""" diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index e0fe0c16dc..a34a96f0d4 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -136,7 +136,7 @@ def make_folder_dataset( return samples -class Folder(AnomalibDataset): +class FolderDataset(AnomalibDataset): """Folder dataset. Args: @@ -201,7 +201,7 @@ def _setup(self): ) -class FolderDataModule(AnomalibDataModule): +class Folder(AnomalibDataModule): """Folder DataModule. Args: @@ -266,7 +266,7 @@ def __init__( pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) - self.train_data = Folder( + self.train_data = FolderDataset( task=task, pre_process=pre_process_train, split=Split.TRAIN, @@ -278,7 +278,7 @@ def __init__( extensions=extensions, ) - self.test_data = Folder( + self.test_data = FolderDataset( task=task, pre_process=pre_process_infer, split=Split.TEST, diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index f5655090e9..21ce00e622 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -120,7 +120,7 @@ def make_mvtec_dataset(root: Union[str, Path], split: Split = Split.FULL) -> Dat return samples -class MVTec(AnomalibDataset): +class MVTecDataset(AnomalibDataset): """MVTec dataset class. 
Args: @@ -149,7 +149,7 @@ def _setup(self): self._samples = make_mvtec_dataset(self.root_category, split=self.split) -class MVTecDataModule(AnomalibDataModule): +class MVTec(AnomalibDataModule): """MVTec Datamodule.""" def __init__( @@ -179,10 +179,12 @@ def __init__( pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) - self.train_data = MVTec( + self.train_data = MVTecDataset( task=task, pre_process=pre_process_train, split=Split.TRAIN, root=root, category=category ) - self.test_data = MVTec(task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category) + self.test_data = MVTecDataset( + task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category + ) def prepare_data(self) -> None: """Download the dataset if not available.""" diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index 789b833ff8..39707bb69a 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -6,12 +6,7 @@ import pytest from anomalib.config import update_input_size_config -from anomalib.data import ( - BTechDataModule, - FolderDataModule, - MVTecDataModule, - get_datamodule, -) +from anomalib.data import BTech, Folder, MVTec, get_datamodule from anomalib.pre_processing.transforms import Denormalize, ToNumpy from tests.helpers.config import get_test_configurable_parameters from tests.helpers.dataset import TestDataset, get_dataset_path @@ -19,7 +14,7 @@ @pytest.fixture(autouse=True) def mvtec_data_module(): - datamodule = MVTecDataModule( + datamodule = MVTec( root=get_dataset_path(dataset="MVTec"), category="leather", image_size=(256, 256), @@ -36,7 +31,7 @@ def mvtec_data_module(): @pytest.fixture(autouse=True) def btech_data_module(): """Create BTech Data Module.""" - datamodule = BTechDataModule( + datamodule = BTech( root=get_dataset_path(dataset="BTech"), category="01", image_size=(256, 256), @@ -54,7 +49,7 @@ def btech_data_module(): def folder_data_module(): """Create Folder Data Module.""" root = get_dataset_path(dataset="bottle") - datamodule = FolderDataModule( + datamodule = Folder( root=root, normal_dir="good", abnormal_dir="broken_large", From 6bed98f39d4c616d34a3d90223d306cd026b9673 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 12 Oct 2022 15:57:16 +0200 Subject: [PATCH 38/96] typing and docstrings --- anomalib/data/folder.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index a34a96f0d4..c935f48550 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -67,20 +67,21 @@ def make_folder_dataset( abnormal_dir: Optional[Union[str, Path]] = None, normal_test_dir: Optional[Union[str, Path]] = None, mask_dir: Optional[Union[str, Path]] = None, - split: Optional[str] = None, + split: Optional[Union[Split, str]] = None, extensions: Optional[Tuple[str, ...]] = None, ): """Make Folder Dataset. Args: normal_dir (Union[str, Path]): Path to the directory containing normal images. - abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images. + abnormal_dir (Optional[Union[str, Path]], optional): Path to the directory containing abnormal images. normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing normal images for the test dataset. Normal test images will be a split of `normal_dir` if `None`. 
Defaults to None. mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing the mask annotations. Defaults to None. - split (Optional[Split], optional): Dataset split (ie., Split.FULL, Split.TRAIN or Split.TEST). Defaults to None. + split (Optional[Union[Split, str]], optional): Dataset split (ie., Split.FULL, Split.TRAIN or Split.TEST). + Defaults to None. extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the directory. @@ -147,8 +148,7 @@ class FolderDataset(AnomalibDataset): root (Union[str, Path]): Root folder of the dataset. normal_dir (Union[str, Path]): Path to the directory containing normal images. - abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images. - split (Optional[str], optional): Dataset split (ie., either train or test). Defaults to None. + abnormal_dir (Optional[Union[str, Path]], optional): Path to the directory containing abnormal images. normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing normal images for the test dataset. Defaults to None. mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing @@ -175,7 +175,7 @@ def __init__( normal_test_dir: Optional[Union[str, Path]] = None, mask_dir: Optional[Union[str, Path]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, - extensions: Optional[Tuple[str]] = None, + extensions: Optional[Tuple[str, ...]] = None, samples: DataFrame = None, ) -> None: super().__init__(task, pre_process, samples=samples) @@ -208,7 +208,7 @@ class Folder(AnomalibDataModule): root (Union[str, Path]): Path to the root folder containing normal and abnormal dirs. normal_dir (Union[str, Path]): Name of the directory containing normal images. Defaults to "normal". - abnormal_dir (str, optional): Name of the directory containing abnormal images. + abnormal_dir (Union[str, Path]): Name of the directory containing abnormal images. Defaults to "abnormal". normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing normal images for the test dataset. Defaults to None. From fc34f8eb9763755667a49521bfcce181be225481 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 12 Oct 2022 16:00:24 +0200 Subject: [PATCH 39/96] remove samples argument from dataset constructor --- anomalib/data/base.py | 4 ++-- anomalib/data/btech.py | 3 +-- anomalib/data/folder.py | 3 +-- anomalib/data/mvtec.py | 3 +-- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index dd3b0ec2b7..f79847292d 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -45,11 +45,11 @@ class ValSplitMode(str, Enum): class AnomalibDataset(Dataset, ABC): """Anomalib dataset.""" - def __init__(self, task: str, pre_process: PreProcessor, samples: Optional[DataFrame] = None): + def __init__(self, task: str, pre_process: PreProcessor): super().__init__() self.task = task self.pre_process = pre_process - self._samples = samples + self._samples = None def __len__(self) -> int: """Get length of the dataset.""" diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index e0a45d5d62..0bdab70d6a 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -116,7 +116,6 @@ def __init__( pre_process: PreProcessor, split: Split, task: str = "segmentation", - samples: Optional[DataFrame] = None, ) -> None: """Btech Dataset class. 
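With the samples argument removed from the constructors, a dataset is now created empty and builds its own samples frame in setup(). A usage sketch mirroring the Folder fixtures added later in this series; the dataset root is a hypothetical path:

    from anomalib.data.folder import FolderDataset
    from anomalib.pre_processing import PreProcessor

    dataset = FolderDataset(
        task="classification",
        pre_process=PreProcessor(image_size=(256, 256)),
        split="train",
        root="datasets/bottle",  # hypothetical dataset location
        normal_dir="good",
        abnormal_dir="broken_large",
    )
    dataset.setup()  # populates dataset.samples via make_folder_dataset()
    print(len(dataset))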
@@ -159,7 +158,7 @@ def __init__( >>> dataset[0]["image"].shape, dataset[0]["mask"].shape (torch.Size([3, 256, 256]), torch.Size([256, 256])) """ - super().__init__(task, pre_process, samples) + super().__init__(task, pre_process) self.root_category = Path(root) / Path(category) self.split = split diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index c935f48550..b2fa9ab3d7 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -176,9 +176,8 @@ def __init__( mask_dir: Optional[Union[str, Path]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, extensions: Optional[Tuple[str, ...]] = None, - samples: DataFrame = None, ) -> None: - super().__init__(task, pre_process, samples=samples) + super().__init__(task, pre_process) self.split = split self.normal_dir = Path(root) / Path(normal_dir) diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 21ce00e622..6b67c0bb33 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -138,9 +138,8 @@ def __init__( split: Split, root: str, category: str, - samples: Optional[DataFrame] = None, ) -> None: - super().__init__(task=task, pre_process=pre_process, samples=samples) + super().__init__(task=task, pre_process=pre_process) self.root_category = Path(root) / Path(category) self.split = split From 1482c138f5735cc40ea50d74f01c64ab35d75a3b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 12 Oct 2022 16:40:51 +0200 Subject: [PATCH 40/96] val/test -> eval --- anomalib/config/config.py | 16 +++++++++++++++- anomalib/data/__init__.py | 12 ++++++------ anomalib/data/base.py | 8 ++++---- anomalib/data/btech.py | 10 +++++----- anomalib/data/folder.py | 10 +++++----- anomalib/data/mvtec.py | 10 +++++----- anomalib/models/cflow/config.yaml | 4 ++-- anomalib/models/dfkde/config.yaml | 4 ++-- anomalib/models/dfm/config.yaml | 4 ++-- anomalib/models/draem/config.yaml | 4 ++-- anomalib/models/fastflow/config.yaml | 4 ++-- anomalib/models/ganomaly/config.yaml | 4 ++-- anomalib/models/padim/config.yaml | 4 ++-- anomalib/models/patchcore/config.yaml | 4 ++-- anomalib/models/reverse_distillation/config.yaml | 4 ++-- anomalib/models/stfpm/config.yaml | 4 ++-- tests/pre_merge/datasets/test_dataset.py | 6 +++--- 17 files changed, 63 insertions(+), 49 deletions(-) diff --git a/anomalib/config/config.py b/anomalib/config/config.py index 9b174bc162..38d9e5d531 100644 --- a/anomalib/config/config.py +++ b/anomalib/config/config.py @@ -139,10 +139,24 @@ def get_configurable_parameters( if "create_validation_set" in config.dataset.keys(): warn( "The 'create_validation_set' parameter is deprecated and will be removed in v0.4.0. Please use " - "validation_split_mode instead." + "'validation_split_mode' instead." ) config.dataset.validation_split_mode = "from_test" if config.dataset.create_validation_set else "same_as_test" + if "test_batch_size" in config.dataset.keys(): + warn( + "The 'test_batch_size' parameter is deprecated and will be removed in v0.4.0. Please use " + "'eval_batch_size' instead." + ) + config.dataset.eval_batch_size = config.dataset.test_batch_size + + if "transform_config" in config.dataset.keys() and "val" in config.dataset.transform_config.keys(): + warn( + "The 'transform_config.val' parameter is deprecated and will be removed in v0.4.0. Please use " + "'transform_config.eval' instead." 
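The same deprecation pattern covers the renamed evaluation keys. A sketch of its effect on an old-style config, again assuming a hand-built OmegaConf object:

    from omegaconf import OmegaConf

    config = OmegaConf.create(
        {
            "dataset": {
                "test_batch_size": 16,
                "transform_config": {"train": None, "val": None},
            }
        }
    )

    # Old keys are copied onto their new names, so the rest of the code base
    # only ever reads 'eval_batch_size' and 'transform_config.eval'.
    if "test_batch_size" in config.dataset.keys():
        config.dataset.eval_batch_size = config.dataset.test_batch_size
    if "val" in config.dataset.transform_config.keys():
        config.dataset.transform_config.eval = config.dataset.transform_config.val

    assert config.dataset.eval_batch_size == 16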
+ ) + config.dataset.transform_config.eval = config.dataset.transform_config.val + config = update_input_size_config(config) # Project Configs diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 6e77606aba..98ee0394e2 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -37,11 +37,11 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: category=config.dataset.category, image_size=(config.dataset.image_size[0], config.dataset.image_size[1]), train_batch_size=config.dataset.train_batch_size, - test_batch_size=config.dataset.test_batch_size, + eval_batch_size=config.dataset.eval_batch_size, num_workers=config.dataset.num_workers, task=config.dataset.task, transform_config_train=config.dataset.transform_config.train, - transform_config_val=config.dataset.transform_config.val, + transform_config_eval=config.dataset.transform_config.eval, val_split_mode=config.dataset.validation_split_mode, ) elif config.dataset.format.lower() == "btech": @@ -50,11 +50,11 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: category=config.dataset.category, image_size=(config.dataset.image_size[0], config.dataset.image_size[1]), train_batch_size=config.dataset.train_batch_size, - test_batch_size=config.dataset.test_batch_size, + eval_batch_size=config.dataset.eval_batch_size, num_workers=config.dataset.num_workers, task=config.dataset.task, transform_config_train=config.dataset.transform_config.train, - transform_config_val=config.dataset.transform_config.val, + transform_config_eval=config.dataset.transform_config.eval, val_split_mode=config.dataset.validation_split_mode, ) elif config.dataset.format.lower() == "folder": @@ -69,10 +69,10 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: split_ratio=config.dataset.split_ratio, image_size=(config.dataset.image_size[0], config.dataset.image_size[1]), train_batch_size=config.dataset.train_batch_size, - test_batch_size=config.dataset.test_batch_size, + eval_batch_size=config.dataset.eval_batch_size, num_workers=config.dataset.num_workers, transform_config_train=config.dataset.transform_config.train, - transform_config_val=config.dataset.transform_config.val, + transform_config_eval=config.dataset.transform_config.eval, val_split_mode=config.dataset.validation_split_mode, ) else: diff --git a/anomalib/data/base.py b/anomalib/data/base.py index f79847292d..55ed39b5a9 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -186,12 +186,12 @@ class AnomalibDataModule(LightningDataModule, ABC): def __init__( self, train_batch_size: int, - test_batch_size: int, + eval_batch_size: int, num_workers: int, ): super().__init__() self.train_batch_size = train_batch_size - self.test_batch_size = test_batch_size + self.eval_batch_size = eval_batch_size self.num_workers = num_workers self.train_data: Optional[AnomalibDataset] = None @@ -228,8 +228,8 @@ def train_dataloader(self) -> TRAIN_DATALOADERS: def val_dataloader(self) -> EVAL_DATALOADERS: """Get validation dataloader.""" - return DataLoader(self.val_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + return DataLoader(self.val_data, shuffle=False, batch_size=self.eval_batch_size, num_workers=self.num_workers) def test_dataloader(self) -> EVAL_DATALOADERS: """Get test dataloader.""" - return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + return DataLoader(self.test_data, shuffle=False, 
batch_size=self.eval_batch_size, num_workers=self.num_workers) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 0bdab70d6a..e501275390 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -177,11 +177,11 @@ def __init__( category: str, image_size: Optional[Union[int, Tuple[int, int]]] = None, train_batch_size: int = 32, - test_batch_size: int = 32, + eval_batch_size: int = 32, num_workers: int = 8, task: str = "segmentation", transform_config_train: Optional[Union[str, A.Compose]] = None, - transform_config_val: Optional[Union[str, A.Compose]] = None, + transform_config_eval: Optional[Union[str, A.Compose]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, ) -> None: """Instantiate BTech Lightning Data Module. @@ -225,20 +225,20 @@ def __init__( >>> data["image"].shape, data["mask"].shape (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256])) """ - super().__init__(train_batch_size, test_batch_size, num_workers) + super().__init__(train_batch_size, eval_batch_size, num_workers) self.root = Path(root) self.category = Path(category) self.val_split_mode = val_split_mode pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) - pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) + pre_process_eval = PreProcessor(config=transform_config_eval, image_size=image_size) self.train_data = BTechDataset( task=task, pre_process=pre_process_train, split=Split.TRAIN, root=root, category=category ) self.test_data = BTechDataset( - task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category + task=task, pre_process=pre_process_eval, split=Split.TEST, root=root, category=category ) def prepare_data(self) -> None: diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index b2fa9ab3d7..2f31c5cfb8 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -246,16 +246,16 @@ def __init__( # image_size: Optional[Union[int, Tuple[int, int]]] = None, train_batch_size: int = 32, - test_batch_size: int = 32, + eval_batch_size: int = 32, num_workers: int = 8, task: str = "segmentation", transform_config_train: Optional[Union[str, A.Compose]] = None, - transform_config_val: Optional[Union[str, A.Compose]] = None, + transform_config_eval: Optional[Union[str, A.Compose]] = None, val_split_mode: ValSplitMode = ValSplitMode.FROM_TEST, ): super().__init__( train_batch_size=train_batch_size, - test_batch_size=test_batch_size, + eval_batch_size=eval_batch_size, num_workers=num_workers, ) @@ -263,7 +263,7 @@ def __init__( self.split_ratio = split_ratio pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) - pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) + pre_process_eval = PreProcessor(config=transform_config_eval, image_size=image_size) self.train_data = FolderDataset( task=task, @@ -279,7 +279,7 @@ def __init__( self.test_data = FolderDataset( task=task, - pre_process=pre_process_infer, + pre_process=pre_process_eval, split=Split.TEST, root=root, normal_dir=normal_dir, diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 6b67c0bb33..252c556b80 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -157,16 +157,16 @@ def __init__( category: str, image_size: Optional[Union[int, Tuple[int, int]]] = None, train_batch_size: int = 32, - test_batch_size: int = 32, + eval_batch_size: int = 32, num_workers: int = 8, task: str = "segmentation", transform_config_train: 
Optional[Union[str, A.Compose]] = None, - transform_config_val: Optional[Union[str, A.Compose]] = None, + transform_config_eval: Optional[Union[str, A.Compose]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, ): super().__init__( train_batch_size=train_batch_size, - test_batch_size=test_batch_size, + eval_batch_size=eval_batch_size, num_workers=num_workers, ) @@ -176,13 +176,13 @@ def __init__( # TODO: Get rid of PreProcessor by passing transform directly pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) - pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) + pre_process_eval = PreProcessor(config=transform_config_eval, image_size=image_size) self.train_data = MVTecDataset( task=task, pre_process=pre_process_train, split=Split.TRAIN, root=root, category=category ) self.test_data = MVTecDataset( - task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category + task=task, pre_process=pre_process_eval, split=Split.TEST, root=root, category=category ) def prepare_data(self) -> None: diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index 725589b8e1..2a823620ba 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -6,13 +6,13 @@ dataset: task: segmentation image_size: 256 train_batch_size: 16 - test_batch_size: 16 + eval_batch_size: 16 inference_batch_size: 16 fiber_batch_size: 64 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] model: diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml index 5fc7b53861..7e9961f660 100644 --- a/anomalib/models/dfkde/config.yaml +++ b/anomalib/models/dfkde/config.yaml @@ -6,11 +6,11 @@ dataset: task: classification image_size: 256 train_batch_size: 32 - test_batch_size: 32 + eval_batch_size: 32 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] model: diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml index 34256daa32..807f39e5db 100755 --- a/anomalib/models/dfm/config.yaml +++ b/anomalib/models/dfm/config.yaml @@ -6,11 +6,11 @@ dataset: task: classification image_size: 256 train_batch_size: 32 - test_batch_size: 32 + eval_batch_size: 32 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] model: diff --git a/anomalib/models/draem/config.yaml b/anomalib/models/draem/config.yaml index 510e661d15..5f225e4cff 100644 --- a/anomalib/models/draem/config.yaml +++ b/anomalib/models/draem/config.yaml @@ -6,11 +6,11 @@ dataset: task: segmentation image_size: 256 train_batch_size: 8 - test_batch_size: 32 + eval_batch_size: 32 num_workers: 8 transform_config: train: ./anomalib/models/draem/transform_config.yaml - val: ./anomalib/models/draem/transform_config.yaml + eval: ./anomalib/models/draem/transform_config.yaml validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false diff --git a/anomalib/models/fastflow/config.yaml b/anomalib/models/fastflow/config.yaml index 59d2a12aa5..d1ad0d6eae 100644 --- a/anomalib/models/fastflow/config.yaml +++ b/anomalib/models/fastflow/config.yaml @@ -6,11 +6,11 @@ dataset: category: bottle image_size: 256 # options: [256, 256, 448, 384] - for each supported backbone train_batch_size: 32 - 
test_batch_size: 32 + eval_batch_size: 32 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml index c8d09276f2..542f117df1 100644 --- a/anomalib/models/ganomaly/config.yaml +++ b/anomalib/models/ganomaly/config.yaml @@ -6,12 +6,12 @@ dataset: task: classification image_size: 256 train_batch_size: 32 - test_batch_size: 32 + eval_batch_size: 32 inference_batch_size: 32 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: true diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml index b857d6d692..bb08d58ab5 100644 --- a/anomalib/models/padim/config.yaml +++ b/anomalib/models/padim/config.yaml @@ -6,11 +6,11 @@ dataset: task: segmentation image_size: 256 train_batch_size: 32 - test_batch_size: 32 + eval_batch_size: 32 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml index c97603ea2e..38fc14bb38 100644 --- a/anomalib/models/patchcore/config.yaml +++ b/anomalib/models/patchcore/config.yaml @@ -6,11 +6,11 @@ dataset: category: bottle image_size: 224 train_batch_size: 32 - test_batch_size: 1 + eval_batch_size: 1 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false diff --git a/anomalib/models/reverse_distillation/config.yaml b/anomalib/models/reverse_distillation/config.yaml index 8dcdd0fe9c..1a6a697f36 100644 --- a/anomalib/models/reverse_distillation/config.yaml +++ b/anomalib/models/reverse_distillation/config.yaml @@ -6,12 +6,12 @@ dataset: task: segmentation image_size: 256 train_batch_size: 32 - test_batch_size: 32 + eval_batch_size: 32 inference_batch_size: 32 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml index 4a2b251173..a25a558f41 100644 --- a/anomalib/models/stfpm/config.yaml +++ b/anomalib/models/stfpm/config.yaml @@ -6,12 +6,12 @@ dataset: task: segmentation image_size: 256 train_batch_size: 32 - test_batch_size: 32 + eval_batch_size: 32 inference_batch_size: 32 num_workers: 36 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index 39707bb69a..b625621b35 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -19,7 +19,7 @@ def mvtec_data_module(): category="leather", image_size=(256, 256), train_batch_size=1, - test_batch_size=1, + eval_batch_size=1, num_workers=0, ) datamodule.prepare_data() @@ -36,7 +36,7 @@ def btech_data_module(): category="01", image_size=(256, 256), train_batch_size=1, - test_batch_size=1, + eval_batch_size=1, num_workers=0, ) datamodule.prepare_data() @@ -58,7 +58,7 @@ def folder_data_module(): split_ratio=0.2, image_size=(256, 256), train_batch_size=32, - test_batch_size=32, + 
eval_batch_size=32, num_workers=8, ) datamodule.setup() From e16816333ac4b272187cfdbb18d8124315ea3876 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 13 Oct 2022 13:21:43 +0200 Subject: [PATCH 41/96] remove Split.Full from enum --- anomalib/data/base.py | 1 - anomalib/data/btech.py | 8 ++++---- anomalib/data/folder.py | 9 ++++----- anomalib/data/mvtec.py | 10 +++++----- tests/pre_merge/datasets/test_dataset.py | 3 +++ 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 55ed39b5a9..b5ce95716d 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -29,7 +29,6 @@ class Split(str, Enum): """Split of a subset.""" - FULL = "full" TRAIN = "train" VAL = "val" TEST = "test" diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index e501275390..934b5d57eb 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -31,7 +31,7 @@ logger = logging.getLogger(__name__) -def make_btech_dataset(path: Path, split: Optional[str] = None) -> DataFrame: +def make_btech_dataset(path: Path, split: Optional[Union[Split, str]] = None) -> DataFrame: """Create BTech samples by parsing the BTech data file structure. The files are expected to follow the structure: @@ -40,7 +40,7 @@ def make_btech_dataset(path: Path, split: Optional[str] = None) -> DataFrame: Args: path (Path): Path to dataset - split (str, optional): Dataset split (ie., either train or test). Defaults to None. + split (Optional[Union[Split, str]], optional): Dataset split (ie., either train or test). Defaults to None. split_ratio (float, optional): Ratio to split normal training images and add to the test set in case test set doesn't contain any normal images. Defaults to 0.1. @@ -99,7 +99,7 @@ def make_btech_dataset(path: Path, split: Optional[str] = None) -> DataFrame: samples.label_index = samples.label_index.astype(int) # Get the data frame for the split. - if split != Split.FULL: + if split: samples = samples[samples.split == split] samples = samples.reset_index(drop=True) @@ -114,7 +114,7 @@ def __init__( root: Union[Path, str], category: str, pre_process: PreProcessor, - split: Split, + split: Optional[Union[Split, str]] = None, task: str = "segmentation", ) -> None: """Btech Dataset class. diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 2f31c5cfb8..db42d6ef6b 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -130,7 +130,7 @@ def make_folder_dataset( samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test" # Get the data frame for the split. - if split != Split.FULL: + if split: samples = samples[samples.split == split] samples = samples.reset_index(drop=True) @@ -143,8 +143,8 @@ class FolderDataset(AnomalibDataset): Args: task (str): Task type. (classification or segmentation). pre_process (PreProcessor): Image Pre-processor to apply transform. - split (Split): Fixed subset split that follows from folder structure on file system. Choose from - [Split.FULL, Split.TRAIN, Split.TEST] + split (Optional[Union[Split, str]]): Fixed subset split that follows from folder structure on file system. + Choose from [Split.FULL, Split.TRAIN, Split.TEST] root (Union[str, Path]): Root folder of the dataset. normal_dir (Union[str, Path]): Path to the directory containing normal images. 
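One inconsistency worth flagging in this commit: the FolderDataset docstring above still lists Split.FULL among the choices, although the same commit deletes FULL from the Split enum (which now holds only TRAIN, VAL and TEST); requesting the full dataset is now expressed by passing split=None. The following sketch illustrates the new optional-split filtering on a toy samples frame (filter_split is an illustrative helper, pandas assumed):

    from typing import Optional

    from pandas import DataFrame

    samples = DataFrame(
        {"image_path": ["a.png", "b.png", "c.png"], "split": ["train", "train", "test"]}
    )


    def filter_split(samples: DataFrame, split: Optional[str] = None) -> DataFrame:
        """Return the requested subset, or the full frame when split is falsy."""
        if split:
            samples = samples[samples.split == split].reset_index(drop=True)
        return samples


    assert len(filter_split(samples)) == 3           # no split -> full dataset
    assert len(filter_split(samples, "train")) == 2  # subset only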
@@ -167,13 +167,12 @@ def __init__( self, task: str, pre_process: PreProcessor, - split: Split, - # root: Union[str, Path], normal_dir: Union[str, Path], abnormal_dir: Optional[Union[str, Path]] = None, normal_test_dir: Optional[Union[str, Path]] = None, mask_dir: Optional[Union[str, Path]] = None, + split: Optional[Union[Split, str]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, extensions: Optional[Tuple[str, ...]] = None, ) -> None: diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 252c556b80..704c8f6626 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -40,7 +40,7 @@ logger = logging.getLogger(__name__) -def make_mvtec_dataset(root: Union[str, Path], split: Split = Split.FULL) -> DataFrame: +def make_mvtec_dataset(root: Union[str, Path], split: Optional[Union[Split, str]] = None) -> DataFrame: """Create MVTec AD samples by parsing the MVTec AD data file structure. The files are expected to follow the structure: @@ -56,7 +56,7 @@ def make_mvtec_dataset(root: Union[str, Path], split: Split = Split.FULL) -> Dat Args: path (Path): Path to dataset - split (str, optional): Dataset split (ie., either train or test). Defaults to None. + split (Optional[Union[Split, str]], optional): Dataset split (ie., either train or test). Defaults to None. split_ratio (float, optional): Ratio to split normal training images and add to the test set in case test set doesn't contain any normal images. Defaults to 0.1. @@ -114,7 +114,7 @@ def make_mvtec_dataset(root: Union[str, Path], split: Split = Split.FULL) -> Dat samples.loc[(samples.label != "good"), "label_index"] = 1 samples.label_index = samples.label_index.astype(int) - if split != Split.FULL: + if split: samples = samples[samples.split == split].reset_index(drop=True) return samples @@ -126,7 +126,7 @@ class MVTecDataset(AnomalibDataset): Args: task (str): Task type, either 'classification' or 'segmentation' pre_process (PreProcessor): Pre-processor object - split (Split): Split of the dataset, usually Split.TRAIN or Split. TEST + split (Optional[Union[Split, str]]): Split of the dataset, usually Split.TRAIN or Split.TEST root (str): Path to the root of the dataset category (str): Sub-category of the dataset, e.g. 
'bottle' """ @@ -135,9 +135,9 @@ def __init__( self, task: str, pre_process: PreProcessor, - split: Split, root: str, category: str, + split: Optional[Union[Split, str]] = None, ) -> None: super().__init__(task=task, pre_process=pre_process) diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index b625621b35..a893c01478 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -21,6 +21,7 @@ def mvtec_data_module(): train_batch_size=1, eval_batch_size=1, num_workers=0, + val_split_mode="from_test", ) datamodule.prepare_data() datamodule.setup() @@ -38,6 +39,7 @@ def btech_data_module(): train_batch_size=1, eval_batch_size=1, num_workers=0, + val_split_mode="from_test", ) datamodule.prepare_data() datamodule.setup() @@ -60,6 +62,7 @@ def folder_data_module(): train_batch_size=32, eval_batch_size=32, num_workers=8, + val_split_mode="from_test", ) datamodule.setup() From 5071dcf96abeed9f44bebdf95d50ed358fcaca8a Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 13 Oct 2022 14:35:37 +0200 Subject: [PATCH 42/96] sort samples when setting --- anomalib/data/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index b5ce95716d..53d0c571c4 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -85,7 +85,7 @@ def samples(self, samples: DataFrame): Args: samples (DataFrame): DataFrame with new samples. """ - self._samples = samples + self._samples = samples.sort_values(by="image_path", ignore_index=True) @property def has_normal(self) -> bool: From e175d7d0309975ca13e374874341c7916aa9d23a Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 13 Oct 2022 14:36:50 +0200 Subject: [PATCH 43/96] update warn message --- anomalib/data/utils/split.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 266583d907..e4c356af92 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -72,9 +72,9 @@ def random_split( for i in range(len(label_dataset) - sum(subset_lengths)): subset_idx = i % sum(subset_lengths) subset_lengths[subset_idx] += 1 - for index, length in enumerate(subset_lengths): - if length == 0: - warnings.warn(f"Length of subset at index {index} is 0.") + if 0 in subset_lengths: + warnings.warn("Zero subset length encountered during splitting. This means one of your subsets might be" + " empty or devoid of either normal or anomalous images.") # perform random subsampling indices = torch.randperm(len(label_dataset)) subsets.append( From 03773b0211cd68e0f27d2f1f7a20e77feed232ff Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 13 Oct 2022 14:37:49 +0200 Subject: [PATCH 44/96] formatting --- anomalib/data/utils/split.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index e4c356af92..824a27f594 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -73,8 +73,10 @@ def random_split( subset_idx = i % sum(subset_lengths) subset_lengths[subset_idx] += 1 if 0 in subset_lengths: - warnings.warn("Zero subset length encountered during splitting. This means one of your subsets might be" - " empty or devoid of either normal or anomalous images.") + warnings.warn( + "Zero subset length encountered during splitting. This means one of your subsets might be" + " empty or devoid of either normal or anomalous images." 
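The rewritten warning fires when the requested fractions leave a subset without any images. How the fractions become integer lengths can be read off the surrounding random_split() code; below is a simplified, self-contained version of that arithmetic (resolve_subset_lengths is an illustrative helper, and the round-robin indexing is slightly simplified relative to the original):

    import math
    from typing import List


    def resolve_subset_lengths(total: int, fractions: List[float]) -> List[int]:
        """Simplified version of the length computation in random_split()."""
        lengths = [math.floor(total * fraction) for fraction in fractions]
        # Hand the images lost to flooring back out, one per subset, round-robin.
        for i in range(total - sum(lengths)):
            lengths[i % len(lengths)] += 1
        return lengths


    print(resolve_subset_lengths(10, [0.4, 0.35, 0.25]))  # [5, 3, 2]
    print(resolve_subset_lengths(2, [0.5, 0.3, 0.2]))     # [2, 0, 0] -> triggers the warning

The warning matters most for label-aware splits of small test sets, where a class with few images can easily end up with a zero-length subset.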
+ ) # perform random subsampling indices = torch.randperm(len(label_dataset)) subsets.append( From 3910c32766441a6e59b677bf5c37ad3db05b2b35 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 13 Oct 2022 14:40:40 +0200 Subject: [PATCH 45/96] use setter when creating samples in dataset classes --- anomalib/data/btech.py | 2 +- anomalib/data/folder.py | 2 +- anomalib/data/mvtec.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 934b5d57eb..b7f913750e 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -164,7 +164,7 @@ def __init__( self.split = split def _setup(self): - self._samples = make_btech_dataset(path=self.root_category, split=self.split) + self.samples = make_btech_dataset(path=self.root_category, split=self.split) @DATAMODULE_REGISTRY diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index db42d6ef6b..b67d7abe4d 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -189,7 +189,7 @@ def __init__( def _setup(self): """Assign samples.""" - self._samples = make_folder_dataset( + self.samples = make_folder_dataset( normal_dir=self.normal_dir, abnormal_dir=self.abnormal_dir, normal_test_dir=self.normal_test_dir, diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 704c8f6626..445ba48440 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -145,7 +145,7 @@ def __init__( self.split = split def _setup(self): - self._samples = make_mvtec_dataset(self.root_category, split=self.split) + self.samples = make_mvtec_dataset(self.root_category, split=self.split) class MVTec(AnomalibDataModule): From 894ef123e7bc885498d5d5e4955057d79c57097a Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 13 Oct 2022 14:41:47 +0200 Subject: [PATCH 46/96] add tests for new dataset class --- tests/pre_merge/datasets/test_datamodule.py | 244 ++++++++++++++++++ tests/pre_merge/datasets/test_dataset.py | 272 ++++---------------- 2 files changed, 289 insertions(+), 227 deletions(-) create mode 100644 tests/pre_merge/datasets/test_datamodule.py diff --git a/tests/pre_merge/datasets/test_datamodule.py b/tests/pre_merge/datasets/test_datamodule.py new file mode 100644 index 0000000000..a893c01478 --- /dev/null +++ b/tests/pre_merge/datasets/test_datamodule.py @@ -0,0 +1,244 @@ +"""Test Dataset.""" + +import os + +import numpy as np +import pytest + +from anomalib.config import update_input_size_config +from anomalib.data import BTech, Folder, MVTec, get_datamodule +from anomalib.pre_processing.transforms import Denormalize, ToNumpy +from tests.helpers.config import get_test_configurable_parameters +from tests.helpers.dataset import TestDataset, get_dataset_path + + +@pytest.fixture(autouse=True) +def mvtec_data_module(): + datamodule = MVTec( + root=get_dataset_path(dataset="MVTec"), + category="leather", + image_size=(256, 256), + train_batch_size=1, + eval_batch_size=1, + num_workers=0, + val_split_mode="from_test", + ) + datamodule.prepare_data() + datamodule.setup() + + return datamodule + + +@pytest.fixture(autouse=True) +def btech_data_module(): + """Create BTech Data Module.""" + datamodule = BTech( + root=get_dataset_path(dataset="BTech"), + category="01", + image_size=(256, 256), + train_batch_size=1, + eval_batch_size=1, + num_workers=0, + val_split_mode="from_test", + ) + datamodule.prepare_data() + datamodule.setup() + + return datamodule + + +@pytest.fixture(autouse=True) +def folder_data_module(): + """Create Folder Data Module.""" + root = 
get_dataset_path(dataset="bottle") + datamodule = Folder( + root=root, + normal_dir="good", + abnormal_dir="broken_large", + mask_dir=os.path.join(root, "ground_truth/broken_large"), + task="segmentation", + split_ratio=0.2, + image_size=(256, 256), + train_batch_size=32, + eval_batch_size=32, + num_workers=8, + val_split_mode="from_test", + ) + datamodule.setup() + + return datamodule + + +@pytest.fixture(autouse=True) +def data_sample(mvtec_data_module): + _, data = next(enumerate(mvtec_data_module.train_dataloader())) + return data + + +class TestMVTecDataModule: + """Test MVTec AD Data Module.""" + + def test_batch_size(self, mvtec_data_module): + """test_mvtec_datamodule [summary]""" + _, train_data_sample = next(enumerate(mvtec_data_module.train_dataloader())) + _, val_data_sample = next(enumerate(mvtec_data_module.val_dataloader())) + assert train_data_sample["image"].shape[0] == 1 + assert val_data_sample["image"].shape[0] == 1 + + def test_val_and_test_dataloaders_has_mask_and_gt(self, mvtec_data_module): + """Test Validation and Test dataloaders should return filenames, image, mask and label.""" + _, val_data = next(enumerate(mvtec_data_module.val_dataloader())) + _, test_data = next(enumerate(mvtec_data_module.test_dataloader())) + + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) + + def test_non_overlapping_splits(self, mvtec_data_module): + """This test ensures that the train and test splits generated are non-overlapping.""" + assert ( + len( + set(mvtec_data_module.test_data.samples["image_path"].values).intersection( + set(mvtec_data_module.train_data.samples["image_path"].values) + ) + ) + == 0 + ), "Found train and test split contamination" + + +class TestBTechDataModule: + """Test BTech Data Module.""" + + def test_batch_size(self, btech_data_module): + """Test batch size.""" + _, train_data_sample = next(enumerate(btech_data_module.train_dataloader())) + _, val_data_sample = next(enumerate(btech_data_module.val_dataloader())) + assert train_data_sample["image"].shape[0] == 1 + assert val_data_sample["image"].shape[0] == 1 + + def test_val_and_test_dataloaders_has_mask_and_gt(self, btech_data_module): + """Test Validation and Test dataloaders should return filenames, image, mask and label.""" + _, val_data = next(enumerate(btech_data_module.val_dataloader())) + _, test_data = next(enumerate(btech_data_module.test_dataloader())) + + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) + + def test_non_overlapping_splits(self, btech_data_module): + """This test ensures that the train and test splits generated are non-overlapping.""" + assert ( + len( + set(btech_data_module.test_data.samples["image_path"].values).intersection( + set(btech_data_module.train_data.samples["image_path"].values) + ) + ) + == 0 + ), "Found train and test split contamination" + + +class TestFolderDataModule: + """Test Folder Data Module.""" + + def test_batch_size(self, folder_data_module): + """Test batch size.""" + _, train_data_sample = next(enumerate(folder_data_module.train_dataloader())) + _, val_data_sample = next(enumerate(folder_data_module.val_dataloader())) + assert train_data_sample["image"].shape[0] == 16 + assert val_data_sample["image"].shape[0] == 12 + + def 
test_val_and_test_dataloaders_has_mask_and_gt(self, folder_data_module): + """Test Validation and Test dataloaders should return filenames, image, mask and label.""" + _, val_data = next(enumerate(folder_data_module.val_dataloader())) + _, test_data = next(enumerate(folder_data_module.test_dataloader())) + + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) + + def test_non_overlapping_splits(self, folder_data_module): + """This test ensures that the train and test splits generated are non-overlapping.""" + assert ( + len( + set(folder_data_module.test_data.samples["image_path"].values).intersection( + set(folder_data_module.train_data.samples["image_path"].values) + ) + ) + == 0 + ), "Found train and test split contamination" + + +class TestDenormalize: + """Test Denormalize Util.""" + + def test_denormalize_image_pixel_values(self, data_sample): + """Test Denormalize denormalizes tensor into [0, 256] range.""" + denormalized_sample = Denormalize().__call__(data_sample["image"].squeeze()) + assert denormalized_sample.min() >= 0 and denormalized_sample.max() <= 256 + + def test_denormalize_return_numpy(self, data_sample): + """Denormalize should return a numpy array.""" + denormalized_sample = Denormalize()(data_sample["image"].squeeze()) + assert isinstance(denormalized_sample, np.ndarray) + + def test_denormalize_channel_order(self, data_sample): + """Denormalize should return a numpy array of order [HxWxC]""" + denormalized_sample = Denormalize().__call__(data_sample["image"].squeeze()) + assert len(denormalized_sample.shape) == 3 and denormalized_sample.shape[-1] == 3 + + def test_representation(self): + """Test Denormalize representation should return string + Denormalize()""" + assert str(Denormalize()) == "Denormalize()" + + +class TestToNumpy: + """Test ToNumpy whether it properly converts tensor into numpy array.""" + + def test_to_numpy_image_pixel_values(self, data_sample): + """Test ToNumpy should return an array whose pixels in the range of [0, + 256]""" + array = ToNumpy()(data_sample["image"]) + assert array.min() >= 0 and array.max() <= 256 + + def test_to_numpy_converts_tensor_to_np_array(self, data_sample): + """ToNumpy returns a numpy array.""" + array = ToNumpy()(data_sample["image"]) + assert isinstance(array, np.ndarray) + + def test_to_numpy_channel_order(self, data_sample): + """ToNumpy() should return a numpy array of order [HxWxC]""" + array = ToNumpy()(data_sample["image"]) + assert len(array.shape) == 3 and array.shape[-1] == 3 + + def test_one_channel_images(self, data_sample): + """One channel tensor should be converted to HxW np array.""" + data = data_sample["image"][:, 0, :, :].unsqueeze(0) + array = ToNumpy()(data) + assert len(array.shape) == 2 + + def test_representation(self): + """Test ToNumpy() representation should return string `ToNumpy()`""" + assert str(ToNumpy()) == "ToNumpy()" + + +class TestConfigToDataModule: + """Tests that check if the dataset parameters in the config achieve the desired effect.""" + + @pytest.mark.parametrize( + ["input_size", "effective_image_size"], + [ + (512, (512, 512)), + ((245, 276), (245, 276)), + ((263, 134), (263, 134)), + ((267, 267), (267, 267)), + ], + ) + @TestDataset(num_train=20, num_test=10) + def test_image_size(self, input_size, effective_image_size, category="shapes", path=None): + """Test if the image size parameter works as expected.""" + 
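The non-overlapping-splits assertions repeated in the three test classes above boil down to one set intersection on image paths. The same check as a small reusable helper; a sketch only, where the datamodule argument is any of the fixtures defined in this file:

    def assert_no_split_contamination(datamodule) -> None:
        """Fail if any image ends up in both the train and the test subset."""
        train_paths = set(datamodule.train_data.samples["image_path"].values)
        test_paths = set(datamodule.test_data.samples["image_path"].values)
        assert not train_paths & test_paths, "Found train and test split contamination"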
configurable_parameters = get_test_configurable_parameters(dataset_path=path, model_name="stfpm") + configurable_parameters.dataset.category = category + configurable_parameters.dataset.image_size = input_size + configurable_parameters = update_input_size_config(configurable_parameters) + + data_module = get_datamodule(configurable_parameters) + data_module.setup() + assert iter(data_module.train_dataloader()).__next__()["image"].shape[-2:] == effective_image_size diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index a893c01478..bc06286ec8 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -1,244 +1,62 @@ -"""Test Dataset.""" +"""Test the AnomalibDataset class.""" -import os +import random -import numpy as np import pytest -from anomalib.config import update_input_size_config -from anomalib.data import BTech, Folder, MVTec, get_datamodule -from anomalib.pre_processing.transforms import Denormalize, ToNumpy -from tests.helpers.config import get_test_configurable_parameters -from tests.helpers.dataset import TestDataset, get_dataset_path +from anomalib.data.folder import FolderDataset +from anomalib.data.utils.split import concatenate_datasets, random_split +from anomalib.pre_processing import PreProcessor +from tests.helpers.dataset import get_dataset_path @pytest.fixture(autouse=True) -def mvtec_data_module(): - datamodule = MVTec( - root=get_dataset_path(dataset="MVTec"), - category="leather", - image_size=(256, 256), - train_batch_size=1, - eval_batch_size=1, - num_workers=0, - val_split_mode="from_test", +def folder_dataset(): + """Create Folder Dataset.""" + root = get_dataset_path(dataset="bottle") + pre_process = PreProcessor(image_size=(256, 256)) + dataset = FolderDataset( + task="classification", + pre_process=pre_process, + root=root, + normal_dir="good", + abnormal_dir="broken_large", ) - datamodule.prepare_data() - datamodule.setup() + dataset.setup() - return datamodule + return dataset -@pytest.fixture(autouse=True) -def btech_data_module(): - """Create BTech Data Module.""" - datamodule = BTech( - root=get_dataset_path(dataset="BTech"), - category="01", - image_size=(256, 256), - train_batch_size=1, - eval_batch_size=1, - num_workers=0, - val_split_mode="from_test", - ) - datamodule.prepare_data() - datamodule.setup() +class TestAnomalibDataset: + def test_subsample(self, folder_dataset): + """Test the subsample functionality.""" - return datamodule + sample_size = int(0.5 * len(folder_dataset)) + indices = random.sample(range(len(folder_dataset)), sample_size) + subset = folder_dataset.subsample(indices) + # check if the dataset has been subsampled to correct size + assert len(subset) == sample_size + # check if index has been reset + assert subset.samples.index.start == 0 + assert subset.samples.index.stop == sample_size -@pytest.fixture(autouse=True) -def folder_data_module(): - """Create Folder Data Module.""" - root = get_dataset_path(dataset="bottle") - datamodule = Folder( - root=root, - normal_dir="good", - abnormal_dir="broken_large", - mask_dir=os.path.join(root, "ground_truth/broken_large"), - task="segmentation", - split_ratio=0.2, - image_size=(256, 256), - train_batch_size=32, - eval_batch_size=32, - num_workers=8, - val_split_mode="from_test", - ) - datamodule.setup() + def test_random_split(self, folder_dataset): + """Test the random subset splitting.""" - return datamodule + # split the dataset + subsets = random_split(folder_dataset, [0.4, 0.35, 0.25], 
label_aware=True) + # check if subset splitting has been performed correctly + assert len(subsets) == 3 -@pytest.fixture(autouse=True) -def data_sample(mvtec_data_module): - _, data = next(enumerate(mvtec_data_module.train_dataloader())) - return data - - -class TestMVTecDataModule: - """Test MVTec AD Data Module.""" - - def test_batch_size(self, mvtec_data_module): - """test_mvtec_datamodule [summary]""" - _, train_data_sample = next(enumerate(mvtec_data_module.train_dataloader())) - _, val_data_sample = next(enumerate(mvtec_data_module.val_dataloader())) - assert train_data_sample["image"].shape[0] == 1 - assert val_data_sample["image"].shape[0] == 1 - - def test_val_and_test_dataloaders_has_mask_and_gt(self, mvtec_data_module): - """Test Validation and Test dataloaders should return filenames, image, mask and label.""" - _, val_data = next(enumerate(mvtec_data_module.val_dataloader())) - _, test_data = next(enumerate(mvtec_data_module.test_dataloader())) - - assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) - assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) - - def test_non_overlapping_splits(self, mvtec_data_module): - """This test ensures that the train and test splits generated are non-overlapping.""" - assert ( - len( - set(mvtec_data_module.test_data.samples["image_path"].values).intersection( - set(mvtec_data_module.train_data.samples["image_path"].values) - ) - ) - == 0 - ), "Found train and test split contamination" - - -class TestBTechDataModule: - """Test BTech Data Module.""" - - def test_batch_size(self, btech_data_module): - """Test batch size.""" - _, train_data_sample = next(enumerate(btech_data_module.train_dataloader())) - _, val_data_sample = next(enumerate(btech_data_module.val_dataloader())) - assert train_data_sample["image"].shape[0] == 1 - assert val_data_sample["image"].shape[0] == 1 - - def test_val_and_test_dataloaders_has_mask_and_gt(self, btech_data_module): - """Test Validation and Test dataloaders should return filenames, image, mask and label.""" - _, val_data = next(enumerate(btech_data_module.val_dataloader())) - _, test_data = next(enumerate(btech_data_module.test_dataloader())) - - assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) - assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) - - def test_non_overlapping_splits(self, btech_data_module): - """This test ensures that the train and test splits generated are non-overlapping.""" - assert ( - len( - set(btech_data_module.test_data.samples["image_path"].values).intersection( - set(btech_data_module.train_data.samples["image_path"].values) - ) - ) - == 0 - ), "Found train and test split contamination" - - -class TestFolderDataModule: - """Test Folder Data Module.""" - - def test_batch_size(self, folder_data_module): - """Test batch size.""" - _, train_data_sample = next(enumerate(folder_data_module.train_dataloader())) - _, val_data_sample = next(enumerate(folder_data_module.val_dataloader())) - assert train_data_sample["image"].shape[0] == 16 - assert val_data_sample["image"].shape[0] == 12 - - def test_val_and_test_dataloaders_has_mask_and_gt(self, folder_data_module): - """Test Validation and Test dataloaders should return filenames, image, mask and label.""" - _, val_data = next(enumerate(folder_data_module.val_dataloader())) - _, test_data = next(enumerate(folder_data_module.test_dataloader())) - - assert 
sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) - assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) - - def test_non_overlapping_splits(self, folder_data_module): - """This test ensures that the train and test splits generated are non-overlapping.""" - assert ( - len( - set(folder_data_module.test_data.samples["image_path"].values).intersection( - set(folder_data_module.train_data.samples["image_path"].values) - ) - ) - == 0 - ), "Found train and test split contamination" - - -class TestDenormalize: - """Test Denormalize Util.""" - - def test_denormalize_image_pixel_values(self, data_sample): - """Test Denormalize denormalizes tensor into [0, 256] range.""" - denormalized_sample = Denormalize().__call__(data_sample["image"].squeeze()) - assert denormalized_sample.min() >= 0 and denormalized_sample.max() <= 256 - - def test_denormalize_return_numpy(self, data_sample): - """Denormalize should return a numpy array.""" - denormalized_sample = Denormalize()(data_sample["image"].squeeze()) - assert isinstance(denormalized_sample, np.ndarray) - - def test_denormalize_channel_order(self, data_sample): - """Denormalize should return a numpy array of order [HxWxC]""" - denormalized_sample = Denormalize().__call__(data_sample["image"].squeeze()) - assert len(denormalized_sample.shape) == 3 and denormalized_sample.shape[-1] == 3 - - def test_representation(self): - """Test Denormalize representation should return string - Denormalize()""" - assert str(Denormalize()) == "Denormalize()" - - -class TestToNumpy: - """Test ToNumpy whether it properly converts tensor into numpy array.""" - - def test_to_numpy_image_pixel_values(self, data_sample): - """Test ToNumpy should return an array whose pixels in the range of [0, - 256]""" - array = ToNumpy()(data_sample["image"]) - assert array.min() >= 0 and array.max() <= 256 - - def test_to_numpy_converts_tensor_to_np_array(self, data_sample): - """ToNumpy returns a numpy array.""" - array = ToNumpy()(data_sample["image"]) - assert isinstance(array, np.ndarray) - - def test_to_numpy_channel_order(self, data_sample): - """ToNumpy() should return a numpy array of order [HxWxC]""" - array = ToNumpy()(data_sample["image"]) - assert len(array.shape) == 3 and array.shape[-1] == 3 - - def test_one_channel_images(self, data_sample): - """One channel tensor should be converted to HxW np array.""" - data = data_sample["image"][:, 0, :, :].unsqueeze(0) - array = ToNumpy()(data) - assert len(array.shape) == 2 - - def test_representation(self): - """Test ToNumpy() representation should return string `ToNumpy()`""" - assert str(ToNumpy()) == "ToNumpy()" - - -class TestConfigToDataModule: - """Tests that check if the dataset parameters in the config achieve the desired effect.""" - - @pytest.mark.parametrize( - ["input_size", "effective_image_size"], - [ - (512, (512, 512)), - ((245, 276), (245, 276)), - ((263, 134), (263, 134)), - ((267, 267), (267, 267)), - ], - ) - @TestDataset(num_train=20, num_test=10) - def test_image_size(self, input_size, effective_image_size, category="shapes", path=None): - """Test if the image size parameter works as expected.""" - configurable_parameters = get_test_configurable_parameters(dataset_path=path, model_name="stfpm") - configurable_parameters.dataset.category = category - configurable_parameters.dataset.image_size = input_size - configurable_parameters = update_input_size_config(configurable_parameters) - - data_module = 
get_datamodule(configurable_parameters) - data_module.setup() - assert iter(data_module.train_dataloader()).__next__()["image"].shape[-2:] == effective_image_size + # reconstruct the original dataset by concatenating the subsets + reconstructed_dataset = concatenate_datasets(subsets) + + # check if reconstructed dataset is equal to original dataset + assert folder_dataset.samples.equals(reconstructed_dataset.samples) + + # check if warning raised when one of the subsets is empty + split_ratios = [1 - (1 / (len(folder_dataset) + 1)), 1 / (len(folder_dataset) + 1)] + with pytest.warns(): + subsets = random_split(folder_dataset, split_ratios) From 44009e27e193d5a20132d92addeacfd35a42bb02 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 13 Oct 2022 15:17:13 +0200 Subject: [PATCH 47/96] add test case for label aware random split --- tests/pre_merge/datasets/test_dataset.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index bc06286ec8..81daad062c 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -2,6 +2,7 @@ import random +import pandas as pd import pytest from anomalib.data.folder import FolderDataset @@ -44,15 +45,12 @@ def test_subsample(self, folder_dataset): def test_random_split(self, folder_dataset): """Test the random subset splitting.""" - # split the dataset - subsets = random_split(folder_dataset, [0.4, 0.35, 0.25], label_aware=True) - + # subset splitting + subsets = random_split(folder_dataset, [0.4, 0.35, 0.25]) # check if subset splitting has been performed correctly assert len(subsets) == 3 - # reconstruct the original dataset by concatenating the subsets reconstructed_dataset = concatenate_datasets(subsets) - # check if reconstructed dataset is equal to original dataset assert folder_dataset.samples.equals(reconstructed_dataset.samples) @@ -60,3 +58,17 @@ def test_random_split(self, folder_dataset): split_ratios = [1 - (1 / (len(folder_dataset) + 1)), 1 / (len(folder_dataset) + 1)] with pytest.warns(): subsets = random_split(folder_dataset, split_ratios) + + # label-aware subset splitting + samples = folder_dataset.samples + normal_samples = samples[samples["label_index"] == 0] + anomalous_samples = samples[samples["label_index"] == 1] + samples = pd.concat([normal_samples, anomalous_samples[0:5]]) + folder_dataset.samples = samples + + subsets = random_split(folder_dataset, [0.4, 0.4, 0.2], label_aware=True) + + # 5 anomalous images in total, so the first two subsets should each have 2, and the last subset 1 + assert len(subsets[0].samples[subsets[0].samples["label_index"] == 1]) == 2 + assert len(subsets[1].samples[subsets[1].samples["label_index"] == 1]) == 2 + assert len(subsets[2].samples[subsets[2].samples["label_index"] == 1]) == 1 From 012ed479b2fd79889a39442ab176782f27248d6b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 11:00:55 +0200 Subject: [PATCH 48/96] update parameter name in inferencers --- anomalib/deploy/inferencers/openvino_inferencer.py | 2 +- anomalib/deploy/inferencers/torch_inferencer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/anomalib/deploy/inferencers/openvino_inferencer.py b/anomalib/deploy/inferencers/openvino_inferencer.py index 804abb52d0..e68ef0f294 100644 --- a/anomalib/deploy/inferencers/openvino_inferencer.py +++ b/anomalib/deploy/inferencers/openvino_inferencer.py @@ -94,7 +94,7 @@ def pre_process(self, image: 
np.ndarray) -> np.ndarray: np.ndarray: pre-processed image. """ transform_config = ( - self.config.dataset.transform_config.val if "transform_config" in self.config.dataset.keys() else None + self.config.dataset.transform_config.eval if "transform_config" in self.config.dataset.keys() else None ) image_size = tuple(self.config.dataset.image_size) pre_processor = PreProcessor(transform_config, image_size) diff --git a/anomalib/deploy/inferencers/torch_inferencer.py b/anomalib/deploy/inferencers/torch_inferencer.py index 795149e6c6..ff2d8813a6 100644 --- a/anomalib/deploy/inferencers/torch_inferencer.py +++ b/anomalib/deploy/inferencers/torch_inferencer.py @@ -96,7 +96,7 @@ def pre_process(self, image: np.ndarray) -> Tensor: Tensor: pre-processed image. """ transform_config = ( - self.config.dataset.transform_config.val if "transform_config" in self.config.dataset.keys() else None + self.config.dataset.transform_config.eval if "transform_config" in self.config.dataset.keys() else None ) image_size = tuple(self.config.dataset.image_size) pre_processor = PreProcessor(transform_config, image_size) From 62b176e53c4c72afbbec349c347b5b756c85a4ec Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 11:44:19 +0200 Subject: [PATCH 49/96] move _setup implementation to base class --- anomalib/data/base.py | 20 ++++++++++++++++---- anomalib/data/btech.py | 15 --------------- anomalib/data/folder.py | 9 +-------- anomalib/data/mvtec.py | 15 --------------- anomalib/data/utils/__init__.py | 3 +++ anomalib/data/utils/split.py | 7 +++++-- 6 files changed, 25 insertions(+), 44 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 53d0c571c4..10b7c2ecf4 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -20,7 +20,7 @@ from torch import Tensor from torch.utils.data import DataLoader, Dataset -from anomalib.data.utils import read_image +from anomalib.data.utils import random_split, read_image from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) @@ -209,10 +209,22 @@ def setup(self, stage: Optional[str] = None): self._setup(stage) assert self.is_setup - @abstractmethod def _setup(self, _stage: Optional[str] = None) -> None: - """To be implemented in conrete subclass.""" - raise NotImplementedError + """Set up the datasets and perform dynamic subset splitting. + + May be overridden in subclass for custom splitting behaviour. 
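+
+        The default implementation splits the validation subset off the test set, as configured by
+        ``self.val_split_mode``. A rough sketch of how a subclass can combine custom splitting with
+        this default behaviour (``MyDataModule`` is purely illustrative; ``Folder`` below follows
+        the same pattern):
+
+        >>> class MyDataModule(AnomalibDataModule):
+        ...     def _setup(self, _stage=None):
+        ...         # perform any custom subset handling first ...
+        ...         super()._setup()  # ... then delegate the validation split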
+ """ + assert self.train_data is not None + assert self.test_data is not None + + self.train_data.setup() + self.test_data.setup() + if self.val_split_mode == ValSplitMode.FROM_TEST: + self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) + elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: + self.val_data = self.test_data + else: + raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") @property def is_setup(self): diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index b7f913750e..76be5ca087 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -25,7 +25,6 @@ from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode from anomalib.data.utils import DownloadProgressBar, hash_check -from anomalib.data.utils.split import random_split from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) @@ -281,17 +280,3 @@ def prepare_data(self) -> None: logger.info("Cleaning the tar file") zip_filename.unlink() - - def _setup(self, _stage: Optional[str] = None): - """Set up the datasets and perform dynamic subset splitting.""" - assert self.train_data is not None - assert self.test_data is not None - - self.train_data.setup() - self.test_data.setup() - if self.val_split_mode == ValSplitMode.FROM_TEST: - self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) - elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: - self.val_data = self.test_data - else: - raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index b67d7abe4d..2a8f63b79b 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -301,11 +301,4 @@ def _setup(self, _stage: Optional[str] = None): self.train_data, normal_test_data = random_split(self.train_data, self.split_ratio) self.test_data += normal_test_data - # split validation set from test set - if self.val_split_mode == ValSplitMode.FROM_TEST: - assert self.test_data is not None - self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) - elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: - self.val_data = self.test_data - else: - raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") + super()._setup() diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 445ba48440..e52d361c91 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -34,7 +34,6 @@ from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode from anomalib.data.utils import DownloadProgressBar, hash_check -from anomalib.data.utils.split import random_split from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) @@ -211,17 +210,3 @@ def prepare_data(self) -> None: logger.info("Cleaning the tar file") (zip_filename).unlink() - - def _setup(self, _stage: Optional[str] = None) -> None: - """Set up the datasets and perform dynamic subset splitting.""" - assert self.train_data is not None - assert self.test_data is not None - - self.train_data.setup() - self.test_data.setup() - if self.val_split_mode == ValSplitMode.FROM_TEST: - self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) - elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: - self.val_data = self.test_data - else: - raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") diff --git 
a/anomalib/data/utils/__init__.py b/anomalib/data/utils/__init__.py index 5059b51c06..52b21b8fcf 100644 --- a/anomalib/data/utils/__init__.py +++ b/anomalib/data/utils/__init__.py @@ -11,6 +11,7 @@ get_image_height_and_width, read_image, ) +from .split import concatenate_datasets, random_split __all__ = [ "generate_output_image_filename", @@ -20,4 +21,6 @@ "random_2d_perlin", "read_image", "DownloadProgressBar", + "random_split", + "concatenate_datasets", ] diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 824a27f594..5ab5f6074e 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -11,13 +11,16 @@ # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import math import warnings -from typing import List, Sequence, Union +from typing import TYPE_CHECKING, List, Sequence, Union import torch -from anomalib.data.base import AnomalibDataset +if TYPE_CHECKING: + from anomalib.data.base import AnomalibDataset def concatenate_datasets(datasets: Sequence[AnomalibDataset]) -> AnomalibDataset: From 7e957b690d751c1ca302de9d6b2fd9525cbd8810 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 11:44:36 +0200 Subject: [PATCH 50/96] address codacy issues --- tests/pre_merge/datasets/test_datamodule.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pre_merge/datasets/test_datamodule.py b/tests/pre_merge/datasets/test_datamodule.py index a893c01478..7d9000fc17 100644 --- a/tests/pre_merge/datasets/test_datamodule.py +++ b/tests/pre_merge/datasets/test_datamodule.py @@ -180,7 +180,7 @@ def test_denormalize_return_numpy(self, data_sample): def test_denormalize_channel_order(self, data_sample): """Denormalize should return a numpy array of order [HxWxC]""" - denormalized_sample = Denormalize().__call__(data_sample["image"].squeeze()) + denormalized_sample = Denormalize()(data_sample["image"].squeeze()) assert len(denormalized_sample.shape) == 3 and denormalized_sample.shape[-1] == 3 def test_representation(self): @@ -241,4 +241,4 @@ def test_image_size(self, input_size, effective_image_size, category="shapes", p data_module = get_datamodule(configurable_parameters) data_module.setup() - assert iter(data_module.train_dataloader()).__next__()["image"].shape[-2:] == effective_image_size + assert next(iter(data_module.train_dataloader()))["image"].shape[-2:] == effective_image_size From 25f503d26fe6af5796c1e34843d4ae9719913786 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 11:53:17 +0200 Subject: [PATCH 51/96] fix pylint issues --- anomalib/data/base.py | 8 ++------ anomalib/data/btech.py | 3 +-- anomalib/data/folder.py | 2 +- anomalib/data/mvtec.py | 2 +- 4 files changed, 5 insertions(+), 10 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 10b7c2ecf4..0ac402e047 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -182,16 +182,12 @@ class AnomalibDataModule(LightningDataModule, ABC): num_workers (int): Number of workers used by the train, val and test dataloaders. 
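+        val_split_mode (ValSplitMode): Determines how the validation subset is obtained; ``from_test``
+            randomly splits it off the test set, while ``same_as_test`` reuses the test set.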
""" - def __init__( - self, - train_batch_size: int, - eval_batch_size: int, - num_workers: int, - ): + def __init__(self, train_batch_size: int, eval_batch_size: int, num_workers: int, val_split_mode: ValSplitMode): super().__init__() self.train_batch_size = train_batch_size self.eval_batch_size = eval_batch_size self.num_workers = num_workers + self.val_split_mode = val_split_mode self.train_data: Optional[AnomalibDataset] = None self.val_data: Optional[AnomalibDataset] = None diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 76be5ca087..f6a4879245 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -224,11 +224,10 @@ def __init__( >>> data["image"].shape, data["mask"].shape (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256])) """ - super().__init__(train_batch_size, eval_batch_size, num_workers) + super().__init__(train_batch_size, eval_batch_size, num_workers, val_split_mode) self.root = Path(root) self.category = Path(category) - self.val_split_mode = val_split_mode pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) pre_process_eval = PreProcessor(config=transform_config_eval, image_size=image_size) diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 2a8f63b79b..f5b3c415a3 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -256,9 +256,9 @@ def __init__( train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, num_workers=num_workers, + val_split_mode=val_split_mode, ) - self.val_split_mode = val_split_mode self.split_ratio = split_ratio pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index e52d361c91..33353f5eed 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -167,11 +167,11 @@ def __init__( train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, num_workers=num_workers, + val_split_mode=val_split_mode, ) self.root = Path(root) self.category = Path(category) - self.val_split_mode = val_split_mode # TODO: Get rid of PreProcessor by passing transform directly pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) From 12459281440f2b996912d86194d4dbbac9943627 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 12:02:21 +0200 Subject: [PATCH 52/96] codacy --- tests/pre_merge/datasets/test_datamodule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pre_merge/datasets/test_datamodule.py b/tests/pre_merge/datasets/test_datamodule.py index 7d9000fc17..6b41137f69 100644 --- a/tests/pre_merge/datasets/test_datamodule.py +++ b/tests/pre_merge/datasets/test_datamodule.py @@ -170,7 +170,7 @@ class TestDenormalize: def test_denormalize_image_pixel_values(self, data_sample): """Test Denormalize denormalizes tensor into [0, 256] range.""" - denormalized_sample = Denormalize().__call__(data_sample["image"].squeeze()) + denormalized_sample = Denormalize()(data_sample["image"].squeeze()) assert denormalized_sample.min() >= 0 and denormalized_sample.max() <= 256 def test_denormalize_return_numpy(self, data_sample): From 0459a0d5f250311cb083443d08d6fc0e78bd8cbd Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 12:30:49 +0200 Subject: [PATCH 53/96] update example dataset config in docs --- docs/source/how_to_guides/train_custom_data.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/how_to_guides/train_custom_data.rst 
b/docs/source/how_to_guides/train_custom_data.rst index 5974ccffed..9a70fdc88e 100644 --- a/docs/source/how_to_guides/train_custom_data.rst +++ b/docs/source/how_to_guides/train_custom_data.rst @@ -82,12 +82,12 @@ Let's choose `Padim algorithm `_, copy the seed: 0 image_size: 256 train_batch_size: 32 - test_batch_size: 32 + eval_batch_size: 32 num_workers: 8 transform_config: train: null - val: null - create_validation_set: true + eval: null + validation_split_mode: from_test # determines how the validation set is created, options [same_as_test, from_test] tiling: apply: false tile_size: null From 30dc45aafb8495bfbdcf74e59fc98f09a13ff303 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 12:47:32 +0200 Subject: [PATCH 54/96] fix test --- tests/pre_merge/utils/metrics/test_adaptive_threshold.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/pre_merge/utils/metrics/test_adaptive_threshold.py b/tests/pre_merge/utils/metrics/test_adaptive_threshold.py index 1a7eef5b61..607a544c2e 100644 --- a/tests/pre_merge/utils/metrics/test_adaptive_threshold.py +++ b/tests/pre_merge/utils/metrics/test_adaptive_threshold.py @@ -39,6 +39,7 @@ def test_non_adaptive_threshold(): """ config = get_test_configurable_parameters(config_path="anomalib/models/padim/config.yaml") + config.dataset.num_workers = 0 config.model.normalization_method = "none" config.metrics.threshold.adaptive = False config.trainer.fast_dev_run = True From 85c475a70657144d51faa432a2f5e56b600a4004 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 15:32:39 +0200 Subject: [PATCH 55/96] move base classes to separate files (avoid circular import) --- anomalib/data/__init__.py | 5 +- anomalib/data/base.py | 242 -------------------------------- anomalib/data/btech.py | 4 +- anomalib/data/folder.py | 4 +- anomalib/data/mvtec.py | 4 +- anomalib/data/utils/__init__.py | 4 +- anomalib/data/utils/split.py | 18 ++- 7 files changed, 29 insertions(+), 252 deletions(-) delete mode 100644 anomalib/data/base.py diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 98ee0394e2..0cc71d9d0b 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -8,8 +8,7 @@ from omegaconf import DictConfig, ListConfig -from anomalib.data.base import AnomalibDataModule - +from .base import AnomalibDataModule, AnomalibDataset from .btech import BTech from .folder import Folder from .inference import InferenceDataset @@ -86,6 +85,8 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: __all__ = [ + "AnomalibDataset", + "AnomalibDataModule", "get_datamodule", "BTech", "Folder", diff --git a/anomalib/data/base.py b/anomalib/data/base.py deleted file mode 100644 index 0ac402e047..0000000000 --- a/anomalib/data/base.py +++ /dev/null @@ -1,242 +0,0 @@ -"""Anomalib dataset and datamodule base classes.""" - -# Copyright (C) 2022 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import copy -import logging -from abc import ABC, abstractmethod -from enum import Enum -from typing import Dict, Optional, Sequence, Union - -import cv2 -import numpy as np -import pandas as pd -from pandas import DataFrame -from pytorch_lightning import LightningDataModule -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from torch import Tensor -from torch.utils.data import DataLoader, Dataset - -from anomalib.data.utils import random_split, read_image -from anomalib.pre_processing import PreProcessor - -logger = 
logging.getLogger(__name__) - - -class Split(str, Enum): - """Split of a subset.""" - - TRAIN = "train" - VAL = "val" - TEST = "test" - - -class ValSplitMode(str, Enum): - """Splitting mode used to obtain validation subset.""" - - SAME_AS_TEST = "same_as_test" - FROM_TEST = "from_test" - - -class AnomalibDataset(Dataset, ABC): - """Anomalib dataset.""" - - def __init__(self, task: str, pre_process: PreProcessor): - super().__init__() - self.task = task - self.pre_process = pre_process - self._samples = None - - def __len__(self) -> int: - """Get length of the dataset.""" - assert isinstance(self._samples, DataFrame) - return len(self._samples) - - def subsample(self, indices: Sequence[int], inplace=False) -> AnomalibDataset: - """Subsamples the dataset at the provided indices. - - Args: - indices (Sequence[int]): Indices at which the dataset is to be subsampled. - inplace (bool): When true, the subsampling will be performed on the instance itself. - """ - dataset = self if inplace else copy.deepcopy(self) - dataset.samples = self.samples.iloc[indices].reset_index(drop=True) - return dataset - - @property - def is_setup(self) -> bool: - """Checks if setup() been called.""" - return isinstance(self._samples, DataFrame) - - @property - def samples(self) -> DataFrame: - """Get the samples dataframe.""" - if not self.is_setup: - raise RuntimeError("Dataset is not setup yet. Call setup() first.") - return self._samples - - @samples.setter - def samples(self, samples: DataFrame): - """Overwrite the samples with a new dataframe. - - Args: - samples (DataFrame): DataFrame with new samples. - """ - self._samples = samples.sort_values(by="image_path", ignore_index=True) - - @property - def has_normal(self) -> bool: - """Check if the dataset contains any normal samples.""" - return 0 in list(self.samples.label_index) - - @property - def has_anomalous(self) -> bool: - """Check if the dataset contains any anomalous samples.""" - return 1 in list(self.samples.label_index) - - def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: - """Get dataset item for the index ``index``. - - Args: - index (int): Index to get the item. - - Returns: - Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training. - Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box. - """ - assert isinstance(self._samples, DataFrame) - - image_path = self._samples.iloc[index].image_path - image = read_image(image_path) - label_index = self._samples.iloc[index].label_index - - item = dict(image_path=image_path, label=label_index) - - if self.task == "classification": - pre_processed = self.pre_process(image=image) - elif self.task == "segmentation": - mask_path = self._samples.iloc[index].mask_path - - # Only Anomalous (1) images have masks in anomaly datasets - # Therefore, create empty mask for Normal (0) images. - if label_index == 0: - mask = np.zeros(shape=image.shape[:2]) - else: - mask = cv2.imread(mask_path, flags=0) / 255.0 - - pre_processed = self.pre_process(image=image, mask=mask) - - item["mask_path"] = mask_path - item["mask"] = pre_processed["mask"] - else: - raise ValueError(f"Unknown task type: {self.task}") - item["image"] = pre_processed["image"] - - return item - - def __add__(self, other_dataset: AnomalibDataset) -> AnomalibDataset: - """Concatenate this dataset with another dataset.""" - assert isinstance(other_dataset, self.__class__), "Cannot concatenate datasets that are not of the same type." 
- assert self.is_setup and other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first." - dataset = copy.deepcopy(self) - dataset.samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True) - return dataset - - def setup(self) -> None: - """Load data/metadata into memory.""" - if not self.is_setup: - self._setup() - assert self.is_setup, "setup() should set self._samples" - - @abstractmethod - def _setup(self) -> DataFrame: - """Set up the data module. - - This method should return a dataframe that contains the information needed by the dataloader to load each of - the dataset items into memory. - The dataframe must at least contain the following columns: - split: the subset to which the dataset item is assigned. - image_path: path to file system location where the image is stored. - label_index: index of the anomaly label, typically 0 for "normal" and 1 for "anomalous". - mask_path (if task == "segmentation"): path to the ground truth masks (for the anomalous images only). - - Example: - |---|-------------------|-----------|-------------|------------------|-------| - | | image_path | label | label_index | mask_path | split | - |---|-------------------|-----------|-------------|------------------|-------| - | 0 | path/to/image.png | anomalous | 1 | path/to/mask.png | train | - |---|-------------------|-----------|-------------|------------------|-------| - """ - raise NotImplementedError - - -class AnomalibDataModule(LightningDataModule, ABC): - """Base Anomalib data module. - - Args: - train_batch_size (int): Batch size used by the train dataloader. - test_batch_size (int): Batch size used by the val and test dataloaders. - num_workers (int): Number of workers used by the train, val and test dataloaders. - """ - - def __init__(self, train_batch_size: int, eval_batch_size: int, num_workers: int, val_split_mode: ValSplitMode): - super().__init__() - self.train_batch_size = train_batch_size - self.eval_batch_size = eval_batch_size - self.num_workers = num_workers - self.val_split_mode = val_split_mode - - self.train_data: Optional[AnomalibDataset] = None - self.val_data: Optional[AnomalibDataset] = None - self.test_data: Optional[AnomalibDataset] = None - - self._samples: Optional[DataFrame] = None - - def setup(self, stage: Optional[str] = None): - """Setup train, validation and test data. - - Args: - stage: Optional[str]: Train/Val/Test stages. (Default value = None) - """ - if not self.is_setup: - self._setup(stage) - assert self.is_setup - - def _setup(self, _stage: Optional[str] = None) -> None: - """Set up the datasets and perform dynamic subset splitting. - - May be overridden in subclass for custom splitting behaviour. 
- """ - assert self.train_data is not None - assert self.test_data is not None - - self.train_data.setup() - self.test_data.setup() - if self.val_split_mode == ValSplitMode.FROM_TEST: - self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) - elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: - self.val_data = self.test_data - else: - raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") - - @property - def is_setup(self): - """Checks if setup() has been called.""" - if self.train_data is None or self.val_data is None or self.test_data is None: - return False - return self.train_data.is_setup and self.val_data.is_setup and self.test_data.is_setup - - def train_dataloader(self) -> TRAIN_DATALOADERS: - """Get train dataloader.""" - return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) - - def val_dataloader(self) -> EVAL_DATALOADERS: - """Get validation dataloader.""" - return DataLoader(self.val_data, shuffle=False, batch_size=self.eval_batch_size, num_workers=self.num_workers) - - def test_dataloader(self) -> EVAL_DATALOADERS: - """Get test dataloader.""" - return DataLoader(self.test_data, shuffle=False, batch_size=self.eval_batch_size, num_workers=self.num_workers) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index f6a4879245..97bdeabef5 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -23,8 +23,8 @@ from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY from tqdm import tqdm -from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode -from anomalib.data.utils import DownloadProgressBar, hash_check +from anomalib.data import AnomalibDataModule, AnomalibDataset +from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index f5b3c415a3..b0907788fb 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -13,8 +13,8 @@ from pandas import DataFrame from torchvision.datasets.folder import IMG_EXTENSIONS -from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode -from anomalib.data.utils.split import random_split +from anomalib.data import AnomalibDataModule, AnomalibDataset +from anomalib.data.utils import Split, ValSplitMode, random_split from anomalib.pre_processing.pre_process import PreProcessor diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 33353f5eed..7c8bbee9bd 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -32,8 +32,8 @@ import albumentations as A from pandas import DataFrame -from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode -from anomalib.data.utils import DownloadProgressBar, hash_check +from anomalib.data import AnomalibDataModule, AnomalibDataset +from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) diff --git a/anomalib/data/utils/__init__.py b/anomalib/data/utils/__init__.py index 52b21b8fcf..53eb3f8ef2 100644 --- a/anomalib/data/utils/__init__.py +++ b/anomalib/data/utils/__init__.py @@ -11,7 +11,7 @@ get_image_height_and_width, read_image, ) -from .split import concatenate_datasets, random_split +from .split import Split, ValSplitMode, concatenate_datasets, random_split __all__ 
= [ "generate_output_image_filename", @@ -23,4 +23,6 @@ "DownloadProgressBar", "random_split", "concatenate_datasets", + "Split", + "ValSplitMode", ] diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 5ab5f6074e..bec80e3bfd 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -15,12 +15,28 @@ import math import warnings +from enum import Enum from typing import TYPE_CHECKING, List, Sequence, Union import torch if TYPE_CHECKING: - from anomalib.data.base import AnomalibDataset + from anomalib.data import AnomalibDataset + + +class Split(str, Enum): + """Split of a subset.""" + + TRAIN = "train" + VAL = "val" + TEST = "test" + + +class ValSplitMode(str, Enum): + """Splitting mode used to obtain validation subset.""" + + SAME_AS_TEST = "same_as_test" + FROM_TEST = "from_test" def concatenate_datasets(datasets: Sequence[AnomalibDataset]) -> AnomalibDataset: From 0552c1ad8b96529e1db640a1f0a0000ab10e723a Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 16:30:20 +0200 Subject: [PATCH 56/96] add synthetic dataset class --- anomalib/data/synthetic.py | 104 +++++++++++++++++++++++++++++++++++ anomalib/data/utils/split.py | 1 + 2 files changed, 105 insertions(+) create mode 100644 anomalib/data/synthetic.py diff --git a/anomalib/data/synthetic.py b/anomalib/data/synthetic.py new file mode 100644 index 0000000000..53ab7e3a05 --- /dev/null +++ b/anomalib/data/synthetic.py @@ -0,0 +1,104 @@ +"""Dataset that generates synthetic anomalies. + +This dataset can be used when there is a lack of real anomalous data. +""" + +import os +import tempfile +from pathlib import Path +from typing import Union + +import albumentations as A +import cv2 +import pandas as pd +from albumentations.pytorch import ToTensorV2 +from pandas import DataFrame + +from anomalib.data.base.dataset import AnomalibDataset +from anomalib.data.utils import read_image +from anomalib.models.draem.utils import Augmenter +from anomalib.pre_processing import PreProcessor + + +def make_synthetic_dataset(normal_samples: DataFrame, root: Union[Path, str]) -> DataFrame: + """Convert a set of normal samples into a mixed set of normal and synthetic anomalous samples. + + The synthetic images will be saved to the file system in the specified root directory under /images. + For the synthetic anomalous images, the masks will be saved under /ground_truth. + + Args: + normal_samples (DataFrame): DataFrame describing a set of normal images. + root (Union[Path, str]): Root directory to which the image files will be written. 
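+
+    Returns:
+        DataFrame: A samples dataframe describing the generated images and, for the anomalous
+            samples, the corresponding ground truth masks.
+
+    Example:
+        A rough sketch of the intended usage, assuming ``normal_samples`` holds only normal images:
+
+        >>> synthetic_samples = make_synthetic_dataset(normal_samples, "./tmp/synthetic")
+        >>> set(synthetic_samples.label)  # a mix of "normal" and "abnormal" samples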
+ """ + im_dir = Path(root) / "images" + mask_dir = Path(root) / "ground_truth" + os.makedirs(im_dir) + os.makedirs(mask_dir) + + # make fakes + augmenter = Augmenter("./datasets/dtd") + + transform = A.Compose([A.ToFloat(), ToTensorV2()]) + + new_samples_list = [] + for index, sample in normal_samples.iterrows(): + # load image + im = read_image(sample.image_path) + # to tensor + im = transform(image=im)["image"].unsqueeze(0) + # apply rand aug + aug_im, mask = augmenter.augment_batch(im) + # + is_anomalous = mask.max() == 1 + # write image + aug_im = (aug_im.squeeze().permute((1, 2, 0)) * 255).numpy() + aug_im = cv2.cvtColor(aug_im, cv2.COLOR_RGB2BGR) + im_path = im_dir / (str(index).zfill(3) + ".png") + cv2.imwrite(str(im_path), aug_im) + # write mask + if is_anomalous: + mask = (mask.squeeze() * 255).numpy() + mask_path = mask_dir / (str(index).zfill(3) + ".png") + cv2.imwrite(str(mask_path), mask) + # update path in samples + new_samples_list.append( + dict( + image_path=str(im_path), + label="abnormal" if is_anomalous else "normal", + label_index=1 if is_anomalous else 0, + mask_path=str(mask_path) if is_anomalous else "", + split=None, + ) + ) + + return pd.DataFrame(new_samples_list) + + +class SyntheticValidationSet(AnomalibDataset): + """Dataset which reads synthetically generated anomalous images from a temporary folder. + + Args: + task (str): Task type, either "classification" or "segmentation". + pre_process (PreProcessor): Preprocessor object used to transform the input images. + normal_samples (DataFrame): Normal samples to which the anomalous augmentations will be applied. + """ + + def __init__(self, task: str, pre_process: PreProcessor, normal_samples: DataFrame): + super().__init__(task, pre_process) + + self.normal_samples = normal_samples + self.tempfolder = tempfile.TemporaryDirectory(dir="./datasets") + self.setup() + + @classmethod + def from_dataset(cls, dataset): + """Create a synthetic anomaly dataset from an existing dataset of normal images.""" + return cls(task=dataset.task, pre_process=dataset.pre_process, normal_samples=dataset.samples) + + def _setup(self) -> None: + """Create samples dataframe.""" + self.samples = make_synthetic_dataset(self.normal_samples, self.tempfolder.name) + + def __del__(self): + """Make sure the temporary directory is cleaned up when the dataset object is deleted.""" + self.tempfolder.cleanup() diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index bec80e3bfd..1c93d86095 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -37,6 +37,7 @@ class ValSplitMode(str, Enum): SAME_AS_TEST = "same_as_test" FROM_TEST = "from_test" + SYNTHETIC = "synthetic" def concatenate_datasets(datasets: Sequence[AnomalibDataset]) -> AnomalibDataset: From bf4f5372375ff4e9ceadd13dc9db8fffdc504294 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 17:07:09 +0200 Subject: [PATCH 57/96] move augmenter to data directory --- anomalib/data/synthetic.py | 5 +- anomalib/data/utils/__init__.py | 2 + .../{models/draem => data}/utils/augmenter.py | 29 ++++++--- anomalib/models/draem/lightning_model.py | 2 +- anomalib/models/draem/perlin_new.py | 59 +++++++++++++++++++ anomalib/models/draem/utils/__init__.py | 8 --- 6 files changed, 85 insertions(+), 20 deletions(-) rename anomalib/{models/draem => data}/utils/augmenter.py (86%) create mode 100644 anomalib/models/draem/perlin_new.py delete mode 100644 anomalib/models/draem/utils/__init__.py diff --git a/anomalib/data/synthetic.py 
b/anomalib/data/synthetic.py index 53ab7e3a05..6b9e7b7bf8 100644 --- a/anomalib/data/synthetic.py +++ b/anomalib/data/synthetic.py @@ -15,8 +15,7 @@ from pandas import DataFrame from anomalib.data.base.dataset import AnomalibDataset -from anomalib.data.utils import read_image -from anomalib.models.draem.utils import Augmenter +from anomalib.data.utils import Augmenter, read_image from anomalib.pre_processing import PreProcessor @@ -36,7 +35,7 @@ def make_synthetic_dataset(normal_samples: DataFrame, root: Union[Path, str]) -> os.makedirs(mask_dir) # make fakes - augmenter = Augmenter("./datasets/dtd") + augmenter = Augmenter("./datasets/dtd", beta=(0.01, 0.2)) transform = A.Compose([A.ToFloat(), ToTensorV2()]) diff --git a/anomalib/data/utils/__init__.py b/anomalib/data/utils/__init__.py index 53eb3f8ef2..ecb869568f 100644 --- a/anomalib/data/utils/__init__.py +++ b/anomalib/data/utils/__init__.py @@ -3,6 +3,7 @@ # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from .augmenter import Augmenter from .download import DownloadProgressBar, hash_check from .generators import random_2d_perlin from .image import ( @@ -25,4 +26,5 @@ "concatenate_datasets", "Split", "ValSplitMode", + "Augmenter", ] diff --git a/anomalib/models/draem/utils/augmenter.py b/anomalib/data/utils/augmenter.py similarity index 86% rename from anomalib/models/draem/utils/augmenter.py rename to anomalib/data/utils/augmenter.py index 6433c4338c..e9d8e3f4ce 100644 --- a/anomalib/models/draem/utils/augmenter.py +++ b/anomalib/data/utils/augmenter.py @@ -13,7 +13,7 @@ import glob import math import random -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import cv2 import imgaug.augmenters as iaa @@ -22,7 +22,7 @@ from torch import Tensor from torchvision.datasets.folder import IMG_EXTENSIONS -from anomalib.data.utils import random_2d_perlin +from anomalib.data.utils.generators.perlin import random_2d_perlin def nextpow2(value): @@ -38,7 +38,15 @@ class Augmenter: noise. If not specified, random noise will be used instead. 
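+        p_anomalous (float): Probability with which the anomalous perturbation is applied to a given
+            image. Images that are left normal receive an all-zero perturbation and mask.
+        beta (Union[float, Tuple[float, float]]): Blend factor used when overlaying the perturbation
+            on the input image, or a (min, max) range from which the factor is sampled per image.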
""" - def __init__(self, anomaly_source_path: Optional[str] = None): + def __init__( + self, + anomaly_source_path: Optional[str] = None, + p_anomalous: float = 0.5, + beta: Union[float, Tuple[float, float]] = (0.2, 1.0), + ): + + self.p_anomalous = p_anomalous + self.beta = beta self.anomaly_source_paths = [] if anomaly_source_path is not None: @@ -132,7 +140,7 @@ def augment_batch(self, batch: Tensor) -> Tuple[Tensor, Tensor]: perturbations_list = [] masks_list = [] for _ in range(batch_size): - if random.random() > 0.5: # include 50% normal samples + if random.random() < self.p_anomalous: # include 50% normal samples perturbations_list.append(torch.zeros((channels, height, width))) masks_list.append(torch.zeros((1, height, width))) else: @@ -147,9 +155,14 @@ def augment_batch(self, batch: Tensor) -> Tuple[Tensor, Tensor]: masks = torch.stack(masks_list).to(batch.device) # Apply perturbations batch wise - beta = torch.rand(batch_size) * 0.8 - beta = beta.view(batch_size, 1, 1, 1).expand_as(batch).to(batch.device) - - augmented_batch = batch * (1 - masks) + (1 - beta) * perturbations + beta * batch * (masks) + if isinstance(self.beta, float): + beta = self.beta + elif isinstance(self.beta, tuple): + beta = torch.rand(batch_size) * (self.beta[1] - self.beta[0]) + self.beta[0] + beta = beta.view(batch_size, 1, 1, 1).expand_as(batch).to(batch.device) + else: + raise ValueError("Beta must be either float or tuple of floats") + + augmented_batch = batch * (1 - masks) + (beta) * perturbations + (1 - beta) * batch * (masks) return augmented_batch, masks diff --git a/anomalib/models/draem/lightning_model.py b/anomalib/models/draem/lightning_model.py index 18e3d2d41b..e54c8e9bd4 100644 --- a/anomalib/models/draem/lightning_model.py +++ b/anomalib/models/draem/lightning_model.py @@ -14,10 +14,10 @@ from pytorch_lightning.utilities.cli import MODEL_REGISTRY from torch import Tensor, nn +from anomalib.data.utils import Augmenter from anomalib.models.components import AnomalyModule from anomalib.models.draem.loss import DraemLoss from anomalib.models.draem.torch_model import DraemModel -from anomalib.models.draem.utils import Augmenter __all__ = ["Draem", "DraemLightning"] diff --git a/anomalib/models/draem/perlin_new.py b/anomalib/models/draem/perlin_new.py new file mode 100644 index 0000000000..d20e2e6957 --- /dev/null +++ b/anomalib/models/draem/perlin_new.py @@ -0,0 +1,59 @@ +import torch as th +from matplotlib import pyplot as plt + + +def interp(t): + # return 3 * t**2 - 2 * t ** 3 + return 6 * t**5 - 15 * t**4 + 10 * t**3 + + +def fade(t): + return 6 * t**5 - 15 * t**4 + 10 * t**3 + + +def perlin(width, height, scale=10, device=None): + gx, gy = th.randn(2, width + 1, height + 1, 1, 1, device=device) + xs = th.linspace(0, 1, scale + 1)[:-1, None].to(device) + ys = th.linspace(0, 1, scale + 1)[None, :-1].to(device) + + wx = 1 - interp(xs) + wy = 1 - interp(ys) + + dots = 0 + dots += wx * wy * (gx[:-1, :-1] * xs + gy[:-1, :-1] * ys) + dots += (1 - wx) * wy * (-gx[1:, :-1] * (1 - xs) + gy[1:, :-1] * ys) + dots += wx * (1 - wy) * (gx[:-1, 1:] * xs - gy[:-1, 1:] * (1 - ys)) + dots += (1 - wx) * (1 - wy) * (-gx[1:, 1:] * (1 - xs) - gy[1:, 1:] * (1 - ys)) + + return dots.permute(0, 2, 1, 3).contiguous().view(width * scale, height * scale) + + +# def my_perlin(width, height, scale=10): + + +def perlin_ms(octaves=[1, 1, 1, 1], width=2, height=2, device=None): + scale = 2 ** len(octaves) + out = 0 + for oct in octaves: + p = perlin(width, height, scale, device) + out += p * oct + scale //= 2 + width *= 2 
+ height *= 2 + return out + + +if __name__ == "__main__": + perlin = perlin(224, 224, 2) + plt.figure(figsize=(12, 12)) + plt.imshow(perlin) + plt.show() + + plt.figure(figsize=(12, 12)) + for idx, rho in enumerate([1, 2, 4, 8]): + plt.subplot(2, 2, idx + 1) + out = perlin_ms([rho**-i for i in range(4)], 6, 6).cpu().numpy() + # out = perlin(6, 6, 2**rho).cpu().numpy() + plt.imshow(out) + plt.title(f"Decay for finer grids as {rho} ** -scale") + plt.show() diff --git a/anomalib/models/draem/utils/__init__.py b/anomalib/models/draem/utils/__init__.py deleted file mode 100644 index dde7003813..0000000000 --- a/anomalib/models/draem/utils/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -"""Helpers for the DRAEM model implementation.""" - -# Copyright (C) 2022 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from .augmenter import Augmenter - -__all__ = ["Augmenter"] From cc328967952a17d99630665edc262f22e4e05668 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 17:10:38 +0200 Subject: [PATCH 58/96] add base classes --- anomalib/data/base/__init__.py | 10 ++ anomalib/data/base/datamodule.py | 89 ++++++++++++++++++ anomalib/data/base/dataset.py | 155 +++++++++++++++++++++++++++++++ 3 files changed, 254 insertions(+) create mode 100644 anomalib/data/base/__init__.py create mode 100644 anomalib/data/base/datamodule.py create mode 100644 anomalib/data/base/dataset.py diff --git a/anomalib/data/base/__init__.py b/anomalib/data/base/__init__.py new file mode 100644 index 0000000000..afb5a62463 --- /dev/null +++ b/anomalib/data/base/__init__.py @@ -0,0 +1,10 @@ +"""Base classes for custom dataset and datamodules.""" + +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +from .datamodule import AnomalibDataModule +from .dataset import AnomalibDataset + +__all__ = ["AnomalibDataset", "AnomalibDataModule"] diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py new file mode 100644 index 0000000000..b8d8603bdd --- /dev/null +++ b/anomalib/data/base/datamodule.py @@ -0,0 +1,89 @@ +"""Anomalib datamodule base class.""" + +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import logging +from abc import ABC +from typing import Optional + +from pandas import DataFrame +from pytorch_lightning import LightningDataModule +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from torch.utils.data import DataLoader + +from anomalib.data.base.dataset import AnomalibDataset +from anomalib.data.utils import ValSplitMode, random_split + +logger = logging.getLogger(__name__) + + +class AnomalibDataModule(LightningDataModule, ABC): + """Base Anomalib data module. + + Args: + train_batch_size (int): Batch size used by the train dataloader. + test_batch_size (int): Batch size used by the val and test dataloaders. + num_workers (int): Number of workers used by the train, val and test dataloaders. 
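+        val_split_mode (ValSplitMode): Determines how the validation subset is obtained; ``from_test``
+            randomly splits it off the test set, while ``same_as_test`` reuses the test set.
+
+        Example:
+            A rough sketch of the intended usage through one of the concrete subclasses (``MVTec``
+            here); the keyword values are illustrative only:
+
+            >>> datamodule = MVTec(root="./datasets/MVTec", category="bottle", image_size=(256, 256),
+            ...                    train_batch_size=32, eval_batch_size=32, num_workers=8,
+            ...                    val_split_mode="from_test")
+            >>> datamodule.setup()
+            >>> batch = next(iter(datamodule.train_dataloader()))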
+ """ + + def __init__(self, train_batch_size: int, eval_batch_size: int, num_workers: int, val_split_mode: ValSplitMode): + super().__init__() + self.train_batch_size = train_batch_size + self.eval_batch_size = eval_batch_size + self.num_workers = num_workers + self.val_split_mode = val_split_mode + + self.train_data: Optional[AnomalibDataset] = None + self.val_data: Optional[AnomalibDataset] = None + self.test_data: Optional[AnomalibDataset] = None + + self._samples: Optional[DataFrame] = None + + def setup(self, stage: Optional[str] = None): + """Setup train, validation and test data. + + Args: + stage: Optional[str]: Train/Val/Test stages. (Default value = None) + """ + if not self.is_setup: + self._setup(stage) + assert self.is_setup + + def _setup(self, _stage: Optional[str] = None) -> None: + """Set up the datasets and perform dynamic subset splitting. + + May be overridden in subclass for custom splitting behaviour. + """ + assert self.train_data is not None + assert self.test_data is not None + + self.train_data.setup() + self.test_data.setup() + if self.val_split_mode == ValSplitMode.FROM_TEST: + self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) + elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: + self.val_data = self.test_data + else: + raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") + + @property + def is_setup(self): + """Checks if setup() has been called.""" + if self.train_data is None or self.val_data is None or self.test_data is None: + return False + return self.train_data.is_setup and self.val_data.is_setup and self.test_data.is_setup + + def train_dataloader(self) -> TRAIN_DATALOADERS: + """Get train dataloader.""" + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) + + def val_dataloader(self) -> EVAL_DATALOADERS: + """Get validation dataloader.""" + return DataLoader(self.val_data, shuffle=False, batch_size=self.eval_batch_size, num_workers=self.num_workers) + + def test_dataloader(self) -> EVAL_DATALOADERS: + """Get test dataloader.""" + return DataLoader(self.test_data, shuffle=False, batch_size=self.eval_batch_size, num_workers=self.num_workers) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py new file mode 100644 index 0000000000..e5ae8fceaf --- /dev/null +++ b/anomalib/data/base/dataset.py @@ -0,0 +1,155 @@ +"""Anomalib dataset and datamodule base classes.""" + +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import copy +import logging +from abc import ABC, abstractmethod +from typing import Dict, Sequence, Union + +import cv2 +import numpy as np +import pandas as pd +from pandas import DataFrame +from torch import Tensor +from torch.utils.data import Dataset + +from anomalib.data.utils import read_image +from anomalib.pre_processing import PreProcessor + +logger = logging.getLogger(__name__) + + +class AnomalibDataset(Dataset, ABC): + """Anomalib dataset.""" + + def __init__(self, task: str, pre_process: PreProcessor): + super().__init__() + self.task = task + self.pre_process = pre_process + self._samples = None + + def __len__(self) -> int: + """Get length of the dataset.""" + assert isinstance(self._samples, DataFrame) + return len(self._samples) + + def subsample(self, indices: Sequence[int], inplace=False) -> AnomalibDataset: + """Subsamples the dataset at the provided indices. 
+
+        Args:
+            indices (Sequence[int]): Indices at which the dataset is to be subsampled.
+            inplace (bool): When true, the subsampling will be performed on the instance itself.
+        """
+        dataset = self if inplace else copy.deepcopy(self)
+        dataset.samples = self.samples.iloc[indices].reset_index(drop=True)
+        return dataset
+
+    @property
+    def is_setup(self) -> bool:
+        """Checks if setup() has been called."""
+        return isinstance(self._samples, DataFrame)
+
+    @property
+    def samples(self) -> DataFrame:
+        """Get the samples dataframe."""
+        if not self.is_setup:
+            raise RuntimeError("Dataset is not setup yet. Call setup() first.")
+        return self._samples
+
+    @samples.setter
+    def samples(self, samples: DataFrame):
+        """Overwrite the samples with a new dataframe.
+
+        Args:
+            samples (DataFrame): DataFrame with new samples.
+        """
+        self._samples = samples.sort_values(by="image_path", ignore_index=True)
+
+    @property
+    def has_normal(self) -> bool:
+        """Check if the dataset contains any normal samples."""
+        return 0 in list(self.samples.label_index)
+
+    @property
+    def has_anomalous(self) -> bool:
+        """Check if the dataset contains any anomalous samples."""
+        return 1 in list(self.samples.label_index)
+
+    def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
+        """Get dataset item for the index ``index``.
+
+        Args:
+            index (int): Index to get the item.
+
+        Returns:
+            Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
+                Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
+        """
+        assert isinstance(self._samples, DataFrame)
+
+        image_path = self._samples.iloc[index].image_path
+        image = read_image(image_path)
+        label_index = self._samples.iloc[index].label_index
+
+        item = dict(image_path=image_path, label=label_index)
+
+        if self.task == "classification":
+            pre_processed = self.pre_process(image=image)
+        elif self.task == "segmentation":
+            mask_path = self._samples.iloc[index].mask_path
+
+            # Only Anomalous (1) images have masks in anomaly datasets
+            # Therefore, create empty mask for Normal (0) images.
+            if label_index == 0:
+                mask = np.zeros(shape=image.shape[:2])
+            else:
+                mask = cv2.imread(mask_path, flags=0) / 255.0
+
+            pre_processed = self.pre_process(image=image, mask=mask)
+
+            item["mask_path"] = mask_path
+            item["mask"] = pre_processed["mask"]
+        else:
+            raise ValueError(f"Unknown task type: {self.task}")
+        item["image"] = pre_processed["image"]
+
+        return item
+
+    def __add__(self, other_dataset: AnomalibDataset) -> AnomalibDataset:
+        """Concatenate this dataset with another dataset."""
+        assert isinstance(other_dataset, self.__class__), "Cannot concatenate datasets that are not of the same type."
+        assert self.is_setup and other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first."
+        dataset = copy.deepcopy(self)
+        dataset.samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True)
+        return dataset
+
+    def setup(self) -> None:
+        """Load data/metadata into memory."""
+        if not self.is_setup:
+            self._setup()
+        assert self.is_setup, "setup() should set self._samples"
+
+    @abstractmethod
+    def _setup(self) -> DataFrame:
+        """Set up the data module.
+
+        This method should return a dataframe that contains the information needed by the dataloader to load each of
+        the dataset items into memory.
+        The dataframe must at least contain the following columns:
+            split: the subset to which the dataset item is assigned.
+ image_path: path to file system location where the image is stored. + label_index: index of the anomaly label, typically 0 for "normal" and 1 for "anomalous". + mask_path (if task == "segmentation"): path to the ground truth masks (for the anomalous images only). + + Example: + |---|-------------------|-----------|-------------|------------------|-------| + | | image_path | label | label_index | mask_path | split | + |---|-------------------|-----------|-------------|------------------|-------| + | 0 | path/to/image.png | anomalous | 1 | path/to/mask.png | train | + |---|-------------------|-----------|-------------|------------------|-------| + """ + raise NotImplementedError From 23d47666bd15e317c163eef02c26782e9f5142e8 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 17:12:01 +0200 Subject: [PATCH 59/96] update docstring --- anomalib/data/base/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index e5ae8fceaf..c73c06b185 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -1,4 +1,4 @@ -"""Anomalib dataset and datamodule base classes.""" +"""Anomalib dataset base class.""" # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 From 05ba31df773fd4b0c1c59f546f5bbdc716d8ac51 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 17:18:43 +0200 Subject: [PATCH 60/96] use synthetic dataset in base datamodule --- anomalib/data/base/datamodule.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py index b8d8603bdd..576d68bc43 100644 --- a/anomalib/data/base/datamodule.py +++ b/anomalib/data/base/datamodule.py @@ -15,6 +15,7 @@ from torch.utils.data import DataLoader from anomalib.data.base.dataset import AnomalibDataset +from anomalib.data.synthetic import SyntheticValidationSet from anomalib.data.utils import ValSplitMode, random_split logger = logging.getLogger(__name__) @@ -66,6 +67,9 @@ def _setup(self, _stage: Optional[str] = None) -> None: self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data + elif self.val_split_mode == ValSplitMode.SYNTHETIC: + self.train_data, normal_val_data = random_split(self.train_data, 0.5) + self.val_data = SyntheticValidationSet.from_dataset(normal_val_data) else: raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") From e8d7998c669baf9f8f1f0827c008cbafc8473c7e Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 17:21:33 +0200 Subject: [PATCH 61/96] fix imports --- anomalib/data/btech.py | 2 +- anomalib/data/folder.py | 2 +- anomalib/data/mvtec.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 97bdeabef5..1636d5ec9f 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -23,7 +23,7 @@ from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY from tqdm import tqdm -from anomalib.data import AnomalibDataModule, AnomalibDataset +from anomalib.data.base import AnomalibDataModule, AnomalibDataset from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check from anomalib.pre_processing import PreProcessor diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index b0907788fb..bcfd30adf6 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py 
@@ -13,7 +13,7 @@ from pandas import DataFrame from torchvision.datasets.folder import IMG_EXTENSIONS -from anomalib.data import AnomalibDataModule, AnomalibDataset +from anomalib.data.base import AnomalibDataModule, AnomalibDataset from anomalib.data.utils import Split, ValSplitMode, random_split from anomalib.pre_processing.pre_process import PreProcessor diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 7c8bbee9bd..8dd70af4a1 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -32,7 +32,7 @@ import albumentations as A from pandas import DataFrame -from anomalib.data import AnomalibDataModule, AnomalibDataset +from anomalib.data.base import AnomalibDataModule, AnomalibDataset from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check from anomalib.pre_processing import PreProcessor From 26b6b83375483333eda530f1755e7812d371161b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 17 Oct 2022 15:11:09 +0200 Subject: [PATCH 62/96] clean up synthetic anomaly dataset implementation --- anomalib/data/synthetic.py | 128 +++++++++++++++++++++++-------------- 1 file changed, 79 insertions(+), 49 deletions(-) diff --git a/anomalib/data/synthetic.py b/anomalib/data/synthetic.py index 6b9e7b7bf8..3f8956f21b 100644 --- a/anomalib/data/synthetic.py +++ b/anomalib/data/synthetic.py @@ -3,8 +3,9 @@ This dataset can be used when there is a lack of real anomalous data. """ +import math import os -import tempfile +import shutil from pathlib import Path from typing import Union @@ -12,65 +13,81 @@ import cv2 import pandas as pd from albumentations.pytorch import ToTensorV2 -from pandas import DataFrame +from pandas import DataFrame, Series from anomalib.data.base.dataset import AnomalibDataset -from anomalib.data.utils import Augmenter, read_image +from anomalib.data.utils import Augmenter, Split, read_image from anomalib.pre_processing import PreProcessor -def make_synthetic_dataset(normal_samples: DataFrame, root: Union[Path, str]) -> DataFrame: +def make_synthetic_dataset( + source_samples: DataFrame, im_dir: Union[Path, str], mask_dir: Union[Path, str], anomalous_ratio: float = 0.5 +) -> DataFrame: """Convert a set of normal samples into a mixed set of normal and synthetic anomalous samples. The synthetic images will be saved to the file system in the specified root directory under /images. For the synthetic anomalous images, the masks will be saved under /ground_truth. Args: - normal_samples (DataFrame): DataFrame describing a set of normal images. - root (Union[Path, str]): Root directory to which the image files will be written. + source_samples (DataFrame): Normal images that will be used as source for the synthetic anomalous images. + im_dir (Union[Path, str]): Directory to which the synthetic anomalous image files will be written. + mask_dir (Union[Path, str]): Directory to which the ground truth anomaly masks will be written. + anomalous_ratio (float): Fraction of source samples that will be converted into anomalous samples. """ - im_dir = Path(root) / "images" - mask_dir = Path(root) / "ground_truth" - os.makedirs(im_dir) - os.makedirs(mask_dir) - - # make fakes - augmenter = Augmenter("./datasets/dtd", beta=(0.01, 0.2)) - + assert 1 not in source_samples.label_index.values, "All source images must be normal." + assert os.path.isdir(im_dir), f"{im_dir} is not a folder." 
+    assert os.path.isdir(mask_dir), f"{mask_dir} is not a folder"
+
+    # filter relevant columns
+    source_samples = source_samples.filter(["image_path", "label", "label_index", "mask_path", "split"])
+    # randomly select samples for augmentation
+    n_anomalous = int(anomalous_ratio * len(source_samples))
+    anomalous_samples = source_samples.sample(n_anomalous)
+    normal_samples = source_samples.drop(anomalous_samples.index)
+    anomalous_samples = anomalous_samples.reset_index(drop=True)
+
+    # initialize augmenter
+    augmenter = Augmenter("./datasets/dtd", p_anomalous=1.0, beta=(0.01, 0.2))
+
+    # initialize transform for source images
     transform = A.Compose([A.ToFloat(), ToTensorV2()])

-    new_samples_list = []
-    for index, sample in normal_samples.iterrows():
-        # load image
-        im = read_image(sample.image_path)
-        # to tensor
-        im = transform(image=im)["image"].unsqueeze(0)
-        # apply rand aug
-        aug_im, mask = augmenter.augment_batch(im)
-        #
-        is_anomalous = mask.max() == 1
+    def augment(sample: Series) -> Series:
+        """Helper function to apply synthetic anomalous augmentation to a sample from a dataframe.
+
+        Reads an image, applies the augmentations, writes the augmented image and corresponding mask to the file system,
+        and returns a new Series object with the updated labels and file locations.
+
+        Args:
+            sample (Series): DataFrame row containing info about the image that will be augmented.
+
+        Returns:
+            Series: DataFrame row with updated information about the augmented image.
+        """
+        # read and transform image
+        image = read_image(sample.image_path)
+        image = transform(image=image)["image"].unsqueeze(0)
+        # apply anomalous perturbation
+        aug_im, mask = augmenter.augment_batch(image)
+        # target file name with leading zeros
+        file_name = f"{str(sample.name).zfill(int(math.log10(n_anomalous)) + 1)}.png"
         # write image
         aug_im = (aug_im.squeeze().permute((1, 2, 0)) * 255).numpy()
         aug_im = cv2.cvtColor(aug_im, cv2.COLOR_RGB2BGR)
-        im_path = im_dir / (str(index).zfill(3) + ".png")
-        cv2.imwrite(str(im_path), aug_im)
+        im_path = str(Path(im_dir) / file_name)
+        cv2.imwrite(im_path, aug_im)
         # write mask
-        if is_anomalous:
-            mask = (mask.squeeze() * 255).numpy()
-            mask_path = mask_dir / (str(index).zfill(3) + ".png")
-            cv2.imwrite(str(mask_path), mask)
-        # update path in samples
-        new_samples_list.append(
-            dict(
-                image_path=str(im_path),
-                label="abnormal" if is_anomalous else "normal",
-                label_index=1 if is_anomalous else 0,
-                mask_path=str(mask_path) if is_anomalous else "",
-                split=None,
-            )
-        )
-
-    return pd.DataFrame(new_samples_list)
+        mask = (mask.squeeze() * 255).numpy()
+        mask_path = str(Path(mask_dir) / file_name)
+        cv2.imwrite(str(mask_path), mask)
+        out = dict(image_path=im_path, label="abnormal", label_index=1, mask_path=mask_path, split=Split.VAL)
+        return Series(out)
+
+    anomalous_samples = anomalous_samples.apply(augment, axis=1)
+
+    samples = pd.concat([normal_samples, anomalous_samples], ignore_index=True)
+
+    return samples


 class SyntheticValidationSet(AnomalibDataset):
@@ -79,25 +96,38 @@ class SyntheticValidationSet(AnomalibDataset):
     Args:
         task (str): Task type, either "classification" or "segmentation".
         pre_process (PreProcessor): Preprocessor object used to transform the input images.
-        normal_samples (DataFrame): Normal samples to which the anomalous augmentations will be applied.
+        source_samples (DataFrame): Normal samples to which the anomalous augmentations will be applied.
""" - def __init__(self, task: str, pre_process: PreProcessor, normal_samples: DataFrame): + def __init__(self, task: str, pre_process: PreProcessor, source_samples: DataFrame): super().__init__(task, pre_process) - self.normal_samples = normal_samples - self.tempfolder = tempfile.TemporaryDirectory(dir="./datasets") + self.source_samples = source_samples + + # Files will be written to a temporary directory in the workdir, which is cleaned up after code execution + self.root = Path("./.tmp/synthetic_anomaly") + self.im_dir = self.root / "images" + self.mask_dir = self.root / "ground_truth" + + # clean up any existing data that may be left over from previous run + if os.path.exists(self.root): + shutil.rmtree(self.root) + + # create directories + os.makedirs(self.im_dir) + os.makedirs(self.mask_dir) + self.setup() @classmethod def from_dataset(cls, dataset): """Create a synthetic anomaly dataset from an existing dataset of normal images.""" - return cls(task=dataset.task, pre_process=dataset.pre_process, normal_samples=dataset.samples) + return cls(task=dataset.task, pre_process=dataset.pre_process, source_samples=dataset.samples) def _setup(self) -> None: """Create samples dataframe.""" - self.samples = make_synthetic_dataset(self.normal_samples, self.tempfolder.name) + self.samples = make_synthetic_dataset(self.source_samples, self.im_dir, self.mask_dir, 0.5) def __del__(self): """Make sure the temporary directory is cleaned up when the dataset object is deleted.""" - self.tempfolder.cleanup() + shutil.rmtree(self.root) From c32fee94bb656b3e607201852530b89afd8d3446 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 17 Oct 2022 15:35:16 +0200 Subject: [PATCH 63/96] fix mistake in augmenter --- anomalib/data/utils/augmenter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anomalib/data/utils/augmenter.py b/anomalib/data/utils/augmenter.py index e9d8e3f4ce..b08bb11898 100644 --- a/anomalib/data/utils/augmenter.py +++ b/anomalib/data/utils/augmenter.py @@ -140,7 +140,7 @@ def augment_batch(self, batch: Tensor) -> Tuple[Tensor, Tensor]: perturbations_list = [] masks_list = [] for _ in range(batch_size): - if random.random() < self.p_anomalous: # include 50% normal samples + if random.random() > self.p_anomalous: # include normal samples perturbations_list.append(torch.zeros((channels, height, width))) masks_list.append(torch.zeros((1, height, width))) else: @@ -159,7 +159,7 @@ def augment_batch(self, batch: Tensor) -> Tuple[Tensor, Tensor]: beta = self.beta elif isinstance(self.beta, tuple): beta = torch.rand(batch_size) * (self.beta[1] - self.beta[0]) + self.beta[0] - beta = beta.view(batch_size, 1, 1, 1).expand_as(batch).to(batch.device) + beta = beta.view(batch_size, 1, 1, 1).expand_as(batch).to(batch.device) # type: ignore else: raise ValueError("Beta must be either float or tuple of floats") From e1204349c87a630a4b9c8cd23af57a5a98069a7d Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 17 Oct 2022 15:51:57 +0200 Subject: [PATCH 64/96] change default split ratio --- anomalib/data/base/datamodule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py index 576d68bc43..361676a60a 100644 --- a/anomalib/data/base/datamodule.py +++ b/anomalib/data/base/datamodule.py @@ -68,7 +68,7 @@ def _setup(self, _stage: Optional[str] = None) -> None: elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data elif self.val_split_mode == ValSplitMode.SYNTHETIC: - 
self.train_data, normal_val_data = random_split(self.train_data, 0.5) + self.train_data, normal_val_data = random_split(self.train_data, 0.3) self.val_data = SyntheticValidationSet.from_dataset(normal_val_data) else: raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") From 14ee645399651eeeee740d99ef268dd8c7bc76d8 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 17 Oct 2022 17:22:43 +0200 Subject: [PATCH 65/96] remove accidentally added file --- anomalib/models/draem/perlin_new.py | 59 ----------------------------- 1 file changed, 59 deletions(-) delete mode 100644 anomalib/models/draem/perlin_new.py diff --git a/anomalib/models/draem/perlin_new.py b/anomalib/models/draem/perlin_new.py deleted file mode 100644 index d20e2e6957..0000000000 --- a/anomalib/models/draem/perlin_new.py +++ /dev/null @@ -1,59 +0,0 @@ -import torch as th -from matplotlib import pyplot as plt - - -def interp(t): - # return 3 * t**2 - 2 * t ** 3 - return 6 * t**5 - 15 * t**4 + 10 * t**3 - - -def fade(t): - return 6 * t**5 - 15 * t**4 + 10 * t**3 - - -def perlin(width, height, scale=10, device=None): - gx, gy = th.randn(2, width + 1, height + 1, 1, 1, device=device) - xs = th.linspace(0, 1, scale + 1)[:-1, None].to(device) - ys = th.linspace(0, 1, scale + 1)[None, :-1].to(device) - - wx = 1 - interp(xs) - wy = 1 - interp(ys) - - dots = 0 - dots += wx * wy * (gx[:-1, :-1] * xs + gy[:-1, :-1] * ys) - dots += (1 - wx) * wy * (-gx[1:, :-1] * (1 - xs) + gy[1:, :-1] * ys) - dots += wx * (1 - wy) * (gx[:-1, 1:] * xs - gy[:-1, 1:] * (1 - ys)) - dots += (1 - wx) * (1 - wy) * (-gx[1:, 1:] * (1 - xs) - gy[1:, 1:] * (1 - ys)) - - return dots.permute(0, 2, 1, 3).contiguous().view(width * scale, height * scale) - - -# def my_perlin(width, height, scale=10): - - -def perlin_ms(octaves=[1, 1, 1, 1], width=2, height=2, device=None): - scale = 2 ** len(octaves) - out = 0 - for oct in octaves: - p = perlin(width, height, scale, device) - out += p * oct - scale //= 2 - width *= 2 - height *= 2 - return out - - -if __name__ == "__main__": - perlin = perlin(224, 224, 2) - plt.figure(figsize=(12, 12)) - plt.imshow(perlin) - plt.show() - - plt.figure(figsize=(12, 12)) - for idx, rho in enumerate([1, 2, 4, 8]): - plt.subplot(2, 2, idx + 1) - out = perlin_ms([rho**-i for i in range(4)], 6, 6).cpu().numpy() - # out = perlin(6, 6, 2**rho).cpu().numpy() - plt.imshow(out) - plt.title(f"Decay for finer grids as {rho} ** -scale") - plt.show() From 9c4e7bf6e7f05a8b036f3bde5b4579193df861e2 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Tue, 18 Oct 2022 15:35:18 +0200 Subject: [PATCH 66/96] validation_split_mode -> val_split_mode --- anomalib/data/__init__.py | 6 +++--- anomalib/models/cflow/config.yaml | 2 +- anomalib/models/dfkde/config.yaml | 2 +- anomalib/models/dfm/config.yaml | 2 +- anomalib/models/draem/config.yaml | 2 +- anomalib/models/fastflow/config.yaml | 2 +- anomalib/models/ganomaly/config.yaml | 2 +- anomalib/models/padim/config.yaml | 2 +- anomalib/models/patchcore/config.yaml | 2 +- anomalib/models/reverse_distillation/config.yaml | 2 +- anomalib/models/stfpm/config.yaml | 2 +- 11 files changed, 13 insertions(+), 13 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 0cc71d9d0b..55cdd7aa11 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -41,7 +41,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: task=config.dataset.task, transform_config_train=config.dataset.transform_config.train, 
transform_config_eval=config.dataset.transform_config.eval, - val_split_mode=config.dataset.validation_split_mode, + val_split_mode=config.dataset.val_split_mode, ) elif config.dataset.format.lower() == "btech": datamodule = BTech( @@ -54,7 +54,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: task=config.dataset.task, transform_config_train=config.dataset.transform_config.train, transform_config_eval=config.dataset.transform_config.eval, - val_split_mode=config.dataset.validation_split_mode, + val_split_mode=config.dataset.val_split_mode, ) elif config.dataset.format.lower() == "folder": datamodule = Folder( @@ -72,7 +72,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: num_workers=config.dataset.num_workers, transform_config_train=config.dataset.transform_config.train, transform_config_eval=config.dataset.transform_config.eval, - val_split_mode=config.dataset.validation_split_mode, + val_split_mode=config.dataset.val_split_mode, ) else: raise ValueError( diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index 2a823620ba..239bfddfa5 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -13,7 +13,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] model: name: cflow diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml index 7e9961f660..1c0dd3491a 100644 --- a/anomalib/models/dfkde/config.yaml +++ b/anomalib/models/dfkde/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] model: name: dfkde diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml index 807f39e5db..e9ebee1501 100755 --- a/anomalib/models/dfm/config.yaml +++ b/anomalib/models/dfm/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] model: name: dfm diff --git a/anomalib/models/draem/config.yaml b/anomalib/models/draem/config.yaml index 5f225e4cff..2435654923 100644 --- a/anomalib/models/draem/config.yaml +++ b/anomalib/models/draem/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: ./anomalib/models/draem/transform_config.yaml eval: ./anomalib/models/draem/transform_config.yaml - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null diff --git a/anomalib/models/fastflow/config.yaml b/anomalib/models/fastflow/config.yaml index d1ad0d6eae..e7fb76da45 100644 --- a/anomalib/models/fastflow/config.yaml +++ b/anomalib/models/fastflow/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml index 542f117df1..f8ddab8fba 100644 --- a/anomalib/models/ganomaly/config.yaml +++ b/anomalib/models/ganomaly/config.yaml @@ -12,7 +12,7 @@ dataset: 
transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: true tile_size: 64 diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml index bb08d58ab5..91e3cccaf8 100644 --- a/anomalib/models/padim/config.yaml +++ b/anomalib/models/padim/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml index 38fc14bb38..5392e2740e 100644 --- a/anomalib/models/patchcore/config.yaml +++ b/anomalib/models/patchcore/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null diff --git a/anomalib/models/reverse_distillation/config.yaml b/anomalib/models/reverse_distillation/config.yaml index 1a6a697f36..e8d2289ff6 100644 --- a/anomalib/models/reverse_distillation/config.yaml +++ b/anomalib/models/reverse_distillation/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: 64 diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml index a25a558f41..b08bc97387 100644 --- a/anomalib/models/stfpm/config.yaml +++ b/anomalib/models/stfpm/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null From 067d601de090419cf1b3d4010bee5b781ec1e3f3 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 19 Oct 2022 09:01:24 +0200 Subject: [PATCH 67/96] update docs --- docs/source/how_to_guides/train_custom_data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/how_to_guides/train_custom_data.rst b/docs/source/how_to_guides/train_custom_data.rst index 9a70fdc88e..4d1652462f 100644 --- a/docs/source/how_to_guides/train_custom_data.rst +++ b/docs/source/how_to_guides/train_custom_data.rst @@ -87,7 +87,7 @@ Let's choose `Padim algorithm `_, copy the transform_config: train: null eval: null - validation_split_mode: from_test # determines how the validation set is created, options [same_as_test, from_test] + val_split_mode: from_test # determines how the validation set is created, options [same_as_test, from_test] tiling: apply: false tile_size: null From c84c99c9917cd733d30ea5eb191794805d05bd7b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 11:33:56 +0200 Subject: [PATCH 68/96] Update anomalib/data/base/dataset.py Co-authored-by: Joao P C Bertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- anomalib/data/base/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index c73c06b185..2baae26eaf 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -30,7 +30,7 @@ def __init__(self, task: str, 
pre_process: PreProcessor): super().__init__() self.task = task self.pre_process = pre_process - self._samples = None + self._samples: DataFrame = None def __len__(self) -> int: """Get length of the dataset.""" From b680d44ade9409a6627e420968fa57b75f85ab3a Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 11:41:15 +0200 Subject: [PATCH 69/96] get length from self.samples --- anomalib/data/base/dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index 2baae26eaf..daec19c2ee 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -34,8 +34,7 @@ def __init__(self, task: str, pre_process: PreProcessor): def __len__(self) -> int: """Get length of the dataset.""" - assert isinstance(self._samples, DataFrame) - return len(self._samples) + return len(self.samples) def subsample(self, indices: Sequence[int], inplace=False) -> AnomalibDataset: """Subsamples the dataset at the provided indices. From 95c37b004bbd2e0ff4ea9655510a7f1fd43e5171 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 11:48:40 +0200 Subject: [PATCH 70/96] assert unique indices --- anomalib/data/base/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index daec19c2ee..af732ef26c 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -43,6 +43,7 @@ def subsample(self, indices: Sequence[int], inplace=False) -> AnomalibDataset: indices (Sequence[int]): Indices at which the dataset is to be subsampled. inplace (bool): When true, the subsampling will be performed on the instance itself. """ + assert len(set(indices)) == len(indices), "No duplicates allowed in indices." dataset = self if inplace else copy.deepcopy(self) dataset.samples = self.samples.iloc[indices].reset_index(drop=True) return dataset From 3e77014b496068ded588139d4cacf64cc1c0c37b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 11:59:01 +0200 Subject: [PATCH 71/96] check is_setup for individual datasets Co-authored-by: Joao P C Bertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- anomalib/data/base/dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index af732ef26c..d6bfb33af6 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -122,7 +122,8 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: def __add__(self, other_dataset: AnomalibDataset) -> AnomalibDataset: """Concatenate this dataset with another dataset.""" assert isinstance(other_dataset, self.__class__), "Cannot concatenate datasets that are not of the same type." - assert self.is_setup and other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first." + assert self.is_setup, "Cannot concatenate uninitialized datasets. Call setup first." + assert other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first." 
        dataset = copy.deepcopy(self)
        dataset.samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True)
        return dataset

From e8d7998c669baf9f8f1f0827c008cbafc8473c7e Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 21 Oct 2022 12:04:39 +0200
Subject: [PATCH 72/96] remove assert in __getitem__

Co-authored-by: Joao P C Bertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 anomalib/data/base/dataset.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py
index d6bfb33af6..48e0fa3489 100644
--- a/anomalib/data/base/dataset.py
+++ b/anomalib/data/base/dataset.py
@@ -89,7 +89,6 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
             Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
                 Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
         """
-        assert isinstance(self._samples, DataFrame)

         image_path = self._samples.iloc[index].image_path
         image = read_image(image_path)

From f5e2d240dbce0df4610bb3eec265aed7eb1dc6b3 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 21 Oct 2022 12:05:49 +0200
Subject: [PATCH 73/96] Update anomalib/data/btech.py

Co-authored-by: Joao P C Bertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 anomalib/data/btech.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py
index 1636d5ec9f..271ad066ec 100644
--- a/anomalib/data/btech.py
+++ b/anomalib/data/btech.py
@@ -159,7 +159,7 @@ def __init__(
         """
         super().__init__(task, pre_process)

-        self.root_category = Path(root) / Path(category)
+        self.root_category = Path(root) / category
         self.split = split

     def _setup(self):

From d9e136905d24b2b9e46b77330c5cf0a434427f91 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 21 Oct 2022 15:44:20 +0200
Subject: [PATCH 74/96] clearer assert message

---
 anomalib/data/utils/split.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py
index bec80e3bfd..501a271c73 100644
--- a/anomalib/data/utils/split.py
+++ b/anomalib/data/utils/split.py
@@ -71,8 +71,10 @@ def random_split(
     if isinstance(split_ratio, float):
         split_ratio = [1 - split_ratio, split_ratio]

-    assert math.isclose(sum(split_ratio), 1) and sum(split_ratio) <= 1, "split ratios must sum to 1."
-    assert all(0 < ratio < 1 for ratio in split_ratio), "all split ratios must be between 0 and 1."
+ assert ( + math.isclose(sum(split_ratio), 1) and sum(split_ratio) <= 1 + ), f"split ratios must sum to 1, found {sum(split_ratio)}" + assert all(0 < ratio < 1 for ratio in split_ratio), f"all split ratios must be between 0 and 1, found {split_ratio}" # create list of source data if label_aware: @@ -102,6 +104,5 @@ def random_split( [label_dataset.subsample(subset_indices) for subset_indices in torch.split(indices, subset_lengths)] ) - # concatenate and return subsets = list(map(list, zip(*subsets))) return [concatenate_datasets(subset) for subset in subsets] From 2e6bc608c17e8df3e4672706bfd71eb90733ac50 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 15:49:29 +0200 Subject: [PATCH 75/96] clarify list inversion in comment --- anomalib/data/utils/split.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 501a271c73..60087da96a 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -104,5 +104,7 @@ def random_split( [label_dataset.subsample(subset_indices) for subset_indices in torch.split(indices, subset_lengths)] ) + # invert outer/inner lists + # outer list: subsets with the given ratio, inner list: per-label unique subsets = list(map(list, zip(*subsets))) return [concatenate_datasets(subset) for subset in subsets] From af0cd99f9df427335b4f99068254a5ddbac56c35 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 15:51:26 +0200 Subject: [PATCH 76/96] comments and typing --- anomalib/data/utils/split.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 60087da96a..1ce0ab2362 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -83,8 +83,9 @@ def random_split( else: per_label_datasets = [dataset] + # outer list: per-label unique, inner list: random subsets with the given ratio + subsets: List[List[AnomalibDataset]] = [] # split each (label-aware) subset of source data - subsets = [] for label_dataset in per_label_datasets: # get subset lengths subset_lengths = [] From 5ee8480ddf7f7f64f0945d2a056c19a2558d38d0 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 17:50:08 +0200 Subject: [PATCH 77/96] validate contents of samples dataframe before setting --- anomalib/data/base/dataset.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index 48e0fa3489..274955a49e 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -20,6 +20,13 @@ from anomalib.data.utils import read_image from anomalib.pre_processing import PreProcessor +_EXPECTED_COlS_CLASSIFICATION = ["image_path", "label", "label_index", "mask_path", "split"] +_EXPECTED_COLS_SEGMENTATION = _EXPECTED_COlS_CLASSIFICATION + ["mask_path"] +_EXPECTED_COLS_PERTASK = { + "classification": _EXPECTED_COlS_CLASSIFICATION, + "segmentation": _EXPECTED_COLS_SEGMENTATION, +} + logger = logging.getLogger(__name__) @@ -67,6 +74,13 @@ def samples(self, samples: DataFrame): Args: samples (DataFrame): DataFrame with new samples. 
""" + # validate the passed samples by checking the + assert isinstance(samples, DataFrame), f"samples must be a pandas.DataFrame, found {type(samples)}" + expected_columns = _EXPECTED_COLS_PERTASK[self.task] + assert all( + col in samples.columns for col in expected_columns + ), f"samples must have (at least) columns {expected_columns}, found {samples.columns}" + self._samples = samples.sort_values(by="image_path", ignore_index=True) @property From a5e876a4139ed845e56ccc2854dd451d0bc0a67f Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 17:55:22 +0200 Subject: [PATCH 78/96] add file paths check --- anomalib/data/base/dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index 274955a49e..d86194a7de 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -8,6 +8,7 @@ import copy import logging from abc import ABC, abstractmethod +from pathlib import Path from typing import Dict, Sequence, Union import cv2 @@ -80,6 +81,7 @@ def samples(self, samples: DataFrame): assert all( col in samples.columns for col in expected_columns ), f"samples must have (at least) columns {expected_columns}, found {samples.columns}" + assert samples["image_path"].apply(lambda p: Path(p).exists()).all(), "missing file path(s) in samples" self._samples = samples.sort_values(by="image_path", ignore_index=True) From c490e30ddb525786593077908df25008d8a8c9fc Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 18:47:02 +0200 Subject: [PATCH 79/96] add seed to random_split function --- anomalib/data/utils/split.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 1ce0ab2362..72f97ff79d 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -16,7 +16,7 @@ import math import warnings from enum import Enum -from typing import TYPE_CHECKING, List, Sequence, Union +from typing import TYPE_CHECKING, List, Optional, Sequence, Union import torch @@ -55,7 +55,10 @@ def concatenate_datasets(datasets: Sequence[AnomalibDataset]) -> AnomalibDataset def random_split( - dataset: AnomalibDataset, split_ratio: Union[float, Sequence[float]], label_aware: bool = False + dataset: AnomalibDataset, + split_ratio: Union[float, Sequence[float]], + label_aware: bool = False, + seed: Optional[int] = None, ) -> List[AnomalibDataset]: """Perform a random split of a dataset. @@ -66,6 +69,7 @@ def random_split( [1-split_ratio, split_ratio]. label_aware (bool): When True, the relative occurrence of the different class labels of the source dataset will be maintained in each of the subsets. + seed (Optional[int], optional): Seed that can be passed if results need to be reproducible """ if isinstance(split_ratio, float): @@ -99,8 +103,10 @@ def random_split( "Zero subset length encountered during splitting. This means one of your subsets might be" " empty or devoid of either normal or anomalous images." 
) + # perform random subsampling - indices = torch.randperm(len(label_dataset)) + random_state = torch.Generator().manual_seed(seed) if seed else None + indices = torch.randperm(len(label_dataset), generator=random_state) subsets.append( [label_dataset.subsample(subset_indices) for subset_indices in torch.split(indices, subset_lengths)] ) From 48082877145b23894e73f3efece34fc5a2334a06 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 24 Oct 2022 13:11:58 +0200 Subject: [PATCH 80/96] fix expected columns --- anomalib/data/base/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index d86194a7de..86ce72c96b 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -21,7 +21,7 @@ from anomalib.data.utils import read_image from anomalib.pre_processing import PreProcessor -_EXPECTED_COlS_CLASSIFICATION = ["image_path", "label", "label_index", "mask_path", "split"] +_EXPECTED_COlS_CLASSIFICATION = ["image_path", "label", "label_index", "split"] _EXPECTED_COLS_SEGMENTATION = _EXPECTED_COlS_CLASSIFICATION + ["mask_path"] _EXPECTED_COLS_PERTASK = { "classification": _EXPECTED_COlS_CLASSIFICATION, From 10bbf9c0c28b6c63ddc6f3be5f4b2d282ee7c7e1 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 24 Oct 2022 13:45:43 +0200 Subject: [PATCH 81/96] fix typo --- anomalib/data/base/dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index 86ce72c96b..6b2c9aefd4 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -21,10 +21,10 @@ from anomalib.data.utils import read_image from anomalib.pre_processing import PreProcessor -_EXPECTED_COlS_CLASSIFICATION = ["image_path", "label", "label_index", "split"] -_EXPECTED_COLS_SEGMENTATION = _EXPECTED_COlS_CLASSIFICATION + ["mask_path"] +_EXPECTED_COLS_CLASSIFICATION = ["image_path", "label", "label_index", "split"] +_EXPECTED_COLS_SEGMENTATION = _EXPECTED_COLS_CLASSIFICATION + ["mask_path"] _EXPECTED_COLS_PERTASK = { - "classification": _EXPECTED_COlS_CLASSIFICATION, + "classification": _EXPECTED_COLS_CLASSIFICATION, "segmentation": _EXPECTED_COLS_SEGMENTATION, } From 81d3ca310c973c941a1de78f2cd80380915daaa6 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 28 Oct 2022 14:10:35 +0200 Subject: [PATCH 82/96] add seed parameter to datamodules --- anomalib/data/base/datamodule.py | 13 +++++++++++-- anomalib/data/btech.py | 6 +++--- anomalib/data/folder.py | 5 ++++- anomalib/data/mvtec.py | 2 ++ 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py index b8d8603bdd..38cb6eac61 100644 --- a/anomalib/data/base/datamodule.py +++ b/anomalib/data/base/datamodule.py @@ -27,14 +27,23 @@ class AnomalibDataModule(LightningDataModule, ABC): train_batch_size (int): Batch size used by the train dataloader. test_batch_size (int): Batch size used by the val and test dataloaders. num_workers (int): Number of workers used by the train, val and test dataloaders. + seed (Optional[int], optional): Seed used during random subset splitting. 
""" - def __init__(self, train_batch_size: int, eval_batch_size: int, num_workers: int, val_split_mode: ValSplitMode): + def __init__( + self, + train_batch_size: int, + eval_batch_size: int, + num_workers: int, + val_split_mode: ValSplitMode, + seed: Optional[int] = None, + ): super().__init__() self.train_batch_size = train_batch_size self.eval_batch_size = eval_batch_size self.num_workers = num_workers self.val_split_mode = val_split_mode + self.seed = seed self.train_data: Optional[AnomalibDataset] = None self.val_data: Optional[AnomalibDataset] = None @@ -63,7 +72,7 @@ def _setup(self, _stage: Optional[str] = None) -> None: self.train_data.setup() self.test_data.setup() if self.val_split_mode == ValSplitMode.FROM_TEST: - self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) + self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True, seed=self.seed) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data else: diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 271ad066ec..74b341c9a4 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -124,7 +124,6 @@ def __init__( pre_process: List of pre_processing object containing albumentation compose. split: 'train', 'val' or 'test' task: ``classification`` or ``segmentation`` - seed: seed used for the random subset splitting create_validation_set: Create a validation subset in addition to the train and test subsets Examples: @@ -182,6 +181,7 @@ def __init__( transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_eval: Optional[Union[str, A.Compose]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, + seed: Optional[int] = None, ) -> None: """Instantiate BTech Lightning Data Module. @@ -195,8 +195,8 @@ def __init__( task: ``classification`` or ``segmentation`` transform_config_train: Config for pre-processing during training. transform_config_val: Config for pre-processing during validation. - seed: seed used for the random subset splitting create_validation_set: Create a validation subset in addition to the train and test subsets + seed (Optional[int], optional): Seed used during random subset splitting. Examples: >>> from anomalib.data import BTech @@ -224,7 +224,7 @@ def __init__( >>> data["image"].shape, data["mask"].shape (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256])) """ - super().__init__(train_batch_size, eval_batch_size, num_workers, val_split_mode) + super().__init__(train_batch_size, eval_batch_size, num_workers, val_split_mode, seed) self.root = Path(root) self.category = Path(category) diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index bcfd30adf6..53f5d922d7 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -231,6 +231,7 @@ class Folder(AnomalibDataModule): during validation. Defaults to None. val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + seed (Optional[int], optional): Seed used during random subset splitting. 
""" def __init__( @@ -251,12 +252,14 @@ def __init__( transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_eval: Optional[Union[str, A.Compose]] = None, val_split_mode: ValSplitMode = ValSplitMode.FROM_TEST, + seed: Optional[int] = None, ): super().__init__( train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, num_workers=num_workers, val_split_mode=val_split_mode, + seed=seed, ) self.split_ratio = split_ratio @@ -298,7 +301,7 @@ def _setup(self, _stage: Optional[str] = None): # add some normal images to the test set if not self.test_data.has_normal: - self.train_data, normal_test_data = random_split(self.train_data, self.split_ratio) + self.train_data, normal_test_data = random_split(self.train_data, self.split_ratio, seed=self.seed) self.test_data += normal_test_data super()._setup() diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 8dd70af4a1..2b21edf180 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -162,12 +162,14 @@ def __init__( transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_eval: Optional[Union[str, A.Compose]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, + seed: Optional[int] = None, ): super().__init__( train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, num_workers=num_workers, val_split_mode=val_split_mode, + seed=seed, ) self.root = Path(root) From b372dd1283864105db6606915d5e1501cde26eea Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 28 Oct 2022 14:30:22 +0200 Subject: [PATCH 83/96] set global seed in test entrypoint --- tools/test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/test.py b/tools/test.py index 5427cf9f06..b2772aaf5d 100644 --- a/tools/test.py +++ b/tools/test.py @@ -5,7 +5,7 @@ from argparse import ArgumentParser, Namespace -from pytorch_lightning import Trainer +from pytorch_lightning import Trainer, seed_everything from anomalib.config import get_configurable_parameters from anomalib.data import get_datamodule @@ -40,6 +40,9 @@ def test(): weight_file=args.weight_file, ) + if config.project.seed: + seed_everything(config.project.seed) + datamodule = get_datamodule(config) model = get_model(config) From e07a12c1cbd336ab1f3c1cf67e8557f34cef5f5a Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 28 Oct 2022 14:34:16 +0200 Subject: [PATCH 84/96] add NONE option to valsplitmode --- anomalib/data/base/datamodule.py | 13 +++++++++---- anomalib/data/utils/split.py | 1 + 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py index 38cb6eac61..e843d3450f 100644 --- a/anomalib/data/base/datamodule.py +++ b/anomalib/data/base/datamodule.py @@ -75,15 +75,20 @@ def _setup(self, _stage: Optional[str] = None) -> None: self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True, seed=self.seed) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data - else: + elif self.val_split_mode != ValSplitMode.NONE: raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") @property def is_setup(self): """Checks if setup() has been called.""" - if self.train_data is None or self.val_data is None or self.test_data is None: - return False - return self.train_data.is_setup and self.val_data.is_setup and self.test_data.is_setup + # at least one of [train_data, val_data, test_data] should be setup + if self.train_data is not None and 
self.train_data.is_setup:
+            return True
+        if self.val_data is not None and self.val_data.is_setup:
+            return True
+        if self.test_data is not None and self.test_data.is_setup:
+            return True
+        return False

     def train_dataloader(self) -> TRAIN_DATALOADERS:
         """Get train dataloader."""
diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py
index 72f97ff79d..86249086c2 100644
--- a/anomalib/data/utils/split.py
+++ b/anomalib/data/utils/split.py
@@ -35,6 +35,7 @@ class Split(str, Enum):
 class ValSplitMode(str, Enum):
     """Splitting mode used to obtain validation subset."""

+    NONE = "none"
     SAME_AS_TEST = "same_as_test"
     FROM_TEST = "from_test"

From ffdb47c0b9a652ae0d235af4664e9eb462c009dc Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 28 Oct 2022 15:22:05 +0200
Subject: [PATCH 85/96] clarify setup behaviour in docstring

---
 anomalib/data/base/datamodule.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py
index e843d3450f..bfdbbfb82a 100644
--- a/anomalib/data/base/datamodule.py
+++ b/anomalib/data/base/datamodule.py
@@ -64,7 +64,12 @@ def setup(self, stage: Optional[str] = None):
     def _setup(self, _stage: Optional[str] = None) -> None:
         """Set up the datasets and perform dynamic subset splitting.

-        May be overridden in subclass for custom splitting behaviour.
+        This method may be overridden in subclass for custom splitting behaviour.
+
+        Note: The stage argument is not used here. This is because, for a given instance of an AnomalibDataModule
+        subclass, all three subsets are created at the first call of setup(). This is to accommodate the subset
+        splitting behaviour of anomaly tasks, where the validation set is usually extracted from the test set, and
+        the test set must therefore be created as early as the `fit` stage.
         """
         assert self.train_data is not None
         assert self.test_data is not None

From 63801a28bdca342effdc2d59798e28f9cc83ee24 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Mon, 5 Dec 2022 16:36:51 +0100
Subject: [PATCH 86/96] add logging message

---
 anomalib/data/synthetic.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/anomalib/data/synthetic.py b/anomalib/data/synthetic.py
index 3f8956f21b..e3c88d8537 100644
--- a/anomalib/data/synthetic.py
+++ b/anomalib/data/synthetic.py
@@ -3,6 +3,7 @@
 This dataset can be used when there is a lack of real anomalous data.
""" +import logging import math import os import shutil @@ -19,6 +20,8 @@ from anomalib.data.utils import Augmenter, Split, read_image from anomalib.pre_processing import PreProcessor +logger = logging.getLogger(__name__) + def make_synthetic_dataset( source_samples: DataFrame, im_dir: Union[Path, str], mask_dir: Union[Path, str], anomalous_ratio: float = 0.5 @@ -126,6 +129,7 @@ def from_dataset(cls, dataset): def _setup(self) -> None: """Create samples dataframe.""" + logger.info("Generating synthetic anomalous images for validation set") self.samples = make_synthetic_dataset(self.source_samples, self.im_dir, self.mask_dir, 0.5) def __del__(self): From 74cbc0ae56559cc6b4f83edc7b1e14c1993efbeb Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Tue, 6 Dec 2022 09:33:09 +0100 Subject: [PATCH 87/96] use val_split_ratio for synthetic validation set --- anomalib/data/base/datamodule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py index ddff4d5177..5200e214d7 100644 --- a/anomalib/data/base/datamodule.py +++ b/anomalib/data/base/datamodule.py @@ -86,7 +86,7 @@ def _setup(self, _stage: Optional[str] = None) -> None: elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data elif self.val_split_mode == ValSplitMode.SYNTHETIC: - self.train_data, normal_val_data = random_split(self.train_data, 0.3) + self.train_data, normal_val_data = random_split(self.train_data, self.val_split_ratio) self.val_data = SyntheticValidationSet.from_dataset(normal_val_data) elif self.val_split_mode != ValSplitMode.NONE: raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") From 090cec265c073394df164e1876763aea334540d0 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Tue, 6 Dec 2022 13:26:45 +0100 Subject: [PATCH 88/96] pathlib --- anomalib/data/synthetic.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/anomalib/data/synthetic.py b/anomalib/data/synthetic.py index e3c88d8537..a4ce480ff6 100644 --- a/anomalib/data/synthetic.py +++ b/anomalib/data/synthetic.py @@ -5,10 +5,8 @@ import logging import math -import os import shutil from pathlib import Path -from typing import Union import albumentations as A import cv2 @@ -24,7 +22,7 @@ def make_synthetic_dataset( - source_samples: DataFrame, im_dir: Union[Path, str], mask_dir: Union[Path, str], anomalous_ratio: float = 0.5 + source_samples: DataFrame, im_dir: Path, mask_dir: Path, anomalous_ratio: float = 0.5 ) -> DataFrame: """Convert a set of normal samples into a mixed set of normal and synthetic anomalous samples. @@ -33,13 +31,13 @@ def make_synthetic_dataset( Args: source_samples (DataFrame): Normal images that will be used as source for the synthetic anomalous images. - im_dir (Union[Path, str]): Directory to which the synthetic anomalous image files will be written. - mask_dir (Union[Path, str]): Directory to which the ground truth anomaly masks will be written. + im_dir (Path): Directory to which the synthetic anomalous image files will be written. + mask_dir (Path): Directory to which the ground truth anomaly masks will be written. anomalous_ratio (float): Fraction of source samples that will be converted into anomalous samples. """ assert 1 not in source_samples.label_index.values, "All source images must be normal." - assert os.path.isdir(im_dir), f"{im_dir} is not a folder." 
- assert os.path.isdir(mask_dir), f"{mask_dir} is not a folder" + assert im_dir.is_dir(), f"{im_dir} is not a folder." + assert mask_dir.is_dir(), f"{mask_dir} is not a folder" # filter relevant columns source_samples = source_samples.filter(["image_path", "label", "label_index", "mask_path", "split"]) @@ -77,13 +75,13 @@ def augment(sample: Series) -> Series: # write image aug_im = (aug_im.squeeze().permute((1, 2, 0)) * 255).numpy() aug_im = cv2.cvtColor(aug_im, cv2.COLOR_RGB2BGR) - im_path = str(Path(im_dir) / file_name) - cv2.imwrite(im_path, aug_im) + im_path = im_dir / file_name + cv2.imwrite(str(im_path), aug_im) # write mask mask = (mask.squeeze() * 255).numpy() - mask_path = str(Path(mask_dir) / file_name) - cv2.imwrite(mask_path, mask) - out = dict(image_path=im_path, label="abnormal", label_index=1, mask_path=mask_path, split=Split.VAL) + mask_path = mask_dir / file_name + cv2.imwrite(str(mask_path), mask) + out = dict(image_path=str(im_path), label="abnormal", label_index=1, mask_path=str(mask_path), split=Split.VAL) return Series(out) anomalous_samples = anomalous_samples.apply(augment, axis=1) @@ -113,12 +111,12 @@ def __init__(self, task: str, pre_process: PreProcessor, source_samples: DataFra self.mask_dir = self.root / "ground_truth" # clean up any existing data that may be left over from previous run - if os.path.exists(self.root): + if self.root.exists(): shutil.rmtree(self.root) # create directories - os.makedirs(self.im_dir) - os.makedirs(self.mask_dir) + self.im_dir.mkdir(parents=True) + self.mask_dir.mkdir() self.setup() From 2a8df7b3923f008a8b4143f22c1707897717e16b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 9 Dec 2022 17:08:18 +0100 Subject: [PATCH 89/96] make synthetic anomaly available for test set --- anomalib/data/__init__.py | 7 ++++- anomalib/data/avenue.py | 6 ++-- anomalib/data/base/__init__.py | 4 +-- anomalib/data/base/datamodule.py | 51 ++++++++++++++++++++++++++++++-- anomalib/data/base/video.py | 25 +++++++++++++++- anomalib/data/btech.py | 17 ++++++++++- anomalib/data/folder.py | 24 +++++---------- anomalib/data/mvtec.py | 36 ++++++++++++++++++++-- anomalib/data/synthetic.py | 44 ++++++++++++++++++++------- anomalib/data/ucsd_ped.py | 7 +++-- anomalib/data/utils/__init__.py | 11 ++++++- anomalib/data/utils/split.py | 21 ++++++++++++- 12 files changed, 210 insertions(+), 43 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 2ec51182e9..0d1c4f1d75 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -44,6 +44,8 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: task=config.dataset.task, transform_config_train=config.dataset.transform_config.train, transform_config_eval=config.dataset.transform_config.eval, + test_split_mode=config.dataset.test_split_mode, + test_split_ratio=config.dataset.test_split_ratio, val_split_mode=config.dataset.val_split_mode, val_split_ratio=config.dataset.val_split_ratio, ) @@ -58,6 +60,8 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: task=config.dataset.task, transform_config_train=config.dataset.transform_config.train, transform_config_eval=config.dataset.transform_config.eval, + test_split_mode=config.dataset.test_split_mode, + test_split_ratio=config.dataset.test_split_ratio, val_split_mode=config.dataset.val_split_mode, val_split_ratio=config.dataset.val_split_ratio, ) @@ -70,13 +74,14 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: 
normal_test_dir=config.dataset.normal_test_dir, mask_dir=config.dataset.mask, extensions=config.dataset.extensions, - normal_split_ratio=config.dataset.normal_split_ratio, image_size=(config.dataset.image_size[0], config.dataset.image_size[1]), train_batch_size=config.dataset.train_batch_size, eval_batch_size=config.dataset.eval_batch_size, num_workers=config.dataset.num_workers, transform_config_train=config.dataset.transform_config.train, transform_config_eval=config.dataset.transform_config.eval, + test_split_mode=config.dataset.test_split_mode, + test_split_ratio=config.dataset.test_split_ratio, val_split_mode=config.dataset.val_split_mode, val_split_ratio=config.dataset.val_split_ratio, ) diff --git a/anomalib/data/avenue.py b/anomalib/data/avenue.py index 792d2f663b..459f42677c 100644 --- a/anomalib/data/avenue.py +++ b/anomalib/data/avenue.py @@ -26,7 +26,7 @@ from pandas import DataFrame from torch import Tensor -from anomalib.data.base import AnomalibDataModule, VideoAnomalibDataset +from anomalib.data.base import VideoAnomalibDataModule, VideoAnomalibDataset from anomalib.data.task_type import TaskType from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check from anomalib.data.utils.video import ClipsIndexer @@ -156,7 +156,7 @@ def _setup(self): self.samples = make_avenue_dataset(self.root, self.gt_dir, self.split) -class Avenue(AnomalibDataModule): +class Avenue(VideoAnomalibDataModule): """Avenue DataModule class. Args: @@ -177,6 +177,8 @@ class Avenue(AnomalibDataModule): during validation. Defaults to None. val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + val_split_ratio (float): Fraction of train or test images that will be reserved for validation. + seed (Optional[int], optional): Seed which may be set to a fixed value for reproducibility. 
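
A minimal sketch of the dataset config section that get_datamodule reads after this change; the keys mirror the accesses above, and the values are illustrative defaults:

from omegaconf import OmegaConf

# Illustrative fragment; a real config also carries name, format, path, image_size, etc.
config = OmegaConf.create(
    {
        "dataset": {
            "test_split_mode": "from_dir",  # or "synthetic"
            "test_split_ratio": 0.2,
            "val_split_mode": "same_as_test",
            "val_split_ratio": 0.5,
        }
    }
)
assert config.dataset.test_split_ratio == 0.2
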
""" def __init__( diff --git a/anomalib/data/base/__init__.py b/anomalib/data/base/__init__.py index e158357fc8..0c8fe84257 100644 --- a/anomalib/data/base/__init__.py +++ b/anomalib/data/base/__init__.py @@ -6,6 +6,6 @@ from .datamodule import AnomalibDataModule from .dataset import AnomalibDataset -from .video import VideoAnomalibDataset +from .video import VideoAnomalibDataModule, VideoAnomalibDataset -__all__ = ["AnomalibDataset", "AnomalibDataModule", "VideoAnomalibDataset"] +__all__ = ["AnomalibDataset", "AnomalibDataModule", "VideoAnomalibDataset", "VideoAnomalibDataModule"] diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py index b9e5bf720b..019da5e3ec 100644 --- a/anomalib/data/base/datamodule.py +++ b/anomalib/data/base/datamodule.py @@ -6,6 +6,7 @@ from __future__ import annotations import logging +import warnings from abc import ABC from typing import Any, Dict, List, Optional @@ -15,8 +16,13 @@ from torch.utils.data import DataLoader, default_collate from anomalib.data.base.dataset import AnomalibDataset -from anomalib.data.synthetic import SyntheticValidationSet -from anomalib.data.utils import ValSplitMode, random_split +from anomalib.data.synthetic import SyntheticAnomalyDataset +from anomalib.data.utils import ( + TestSplitMode, + ValSplitMode, + random_split, + split_normal_and_anomalous, +) logger = logging.getLogger(__name__) @@ -61,12 +67,16 @@ def __init__( num_workers: int, val_split_mode: ValSplitMode, val_split_ratio: float, + test_split_mode: Optional[TestSplitMode] = None, + test_split_ratio: Optional[float] = None, seed: Optional[int] = None, ): super().__init__() self.train_batch_size = train_batch_size self.eval_batch_size = eval_batch_size self.num_workers = num_workers + self.test_split_mode = test_split_mode + self.test_split_ratio = test_split_ratio self.val_split_mode = val_split_mode self.val_split_ratio = val_split_ratio self.seed = seed @@ -102,6 +112,41 @@ def _setup(self, _stage: Optional[str] = None) -> None: self.train_data.setup() self.test_data.setup() + + self._create_test_split() + self._create_val_split() + + def _create_test_split(self): + # perform subset splitting for test set + if self.test_split_mode == TestSplitMode.FROM_DIR: + # normal data taken from normal_test_dir if available, otherwise sampled from training set + if not self.test_data.has_normal: + logger.info( + "No normal test images found. Sampling from training set using a split ratio of %d", + self.test_split_ratio, + ) + self.train_data, normal_test_data = random_split(self.train_data, self.test_split_ratio) + self.test_data += normal_test_data + # anomalous data taken from abnormal_dir if available, otherwise raise warning + if not self.test_data.has_anomalous: + warnings.warn( + "Your test set does not contain any anomalous images, which may lead to unreliable " + "evaluation results. To fix, please include anomalous images in your dataset, or set " + "`test_split_mode` to `synthetic`." + ) + elif self.test_split_mode == TestSplitMode.SYNTHETIC: + if not self.test_data.has_normal: + logger.info( + "No normal test images found. 
Sampling from training set using a split ratio of %d", + self.test_split_ratio, + ) + self.train_data, normal_test_data = random_split(self.train_data, self.test_split_ratio) + else: + normal_test_data, _ = split_normal_and_anomalous(self.test_data) + self.test_data = SyntheticAnomalyDataset.from_dataset(normal_test_data) + + def _create_val_split(self): + # perform subset splitting for validation set if self.val_split_mode == ValSplitMode.FROM_TEST: self.test_data, self.val_data = random_split( self.test_data, self.val_split_ratio, label_aware=True, seed=self.seed @@ -110,7 +155,7 @@ def _setup(self, _stage: Optional[str] = None) -> None: self.val_data = self.test_data elif self.val_split_mode == ValSplitMode.SYNTHETIC: self.train_data, normal_val_data = random_split(self.train_data, self.val_split_ratio) - self.val_data = SyntheticValidationSet.from_dataset(normal_val_data) + self.val_data = SyntheticAnomalyDataset.from_dataset(normal_val_data) elif self.val_split_mode != ValSplitMode.NONE: raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") diff --git a/anomalib/data/base/video.py b/anomalib/data/base/video.py index 65735dab05..538b45e6a9 100644 --- a/anomalib/data/base/video.py +++ b/anomalib/data/base/video.py @@ -6,9 +6,10 @@ import torch from torch import Tensor +from anomalib.data.base.datamodule import AnomalibDataModule from anomalib.data.base.dataset import AnomalibDataset from anomalib.data.task_type import TaskType -from anomalib.data.utils import masks_to_boxes +from anomalib.data.utils import ValSplitMode, masks_to_boxes from anomalib.data.utils.video import ClipsIndexer from anomalib.pre_processing import PreProcessor @@ -93,3 +94,25 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: item.pop("mask") return item + + +class VideoAnomalibDataModule(AnomalibDataModule): + """Base class for video data modules.""" + + def _setup(self, _stage: Optional[str] = None) -> None: + """Set up the datasets and perform dynamic subset splitting. + + This method may be overridden in subclass for custom splitting behaviour. + + Video datamodules are not compatible with synthetic anomaly generation. 
+ """ + assert self.train_data is not None + assert self.test_data is not None + + self.train_data.setup() + self.test_data.setup() + + if self.val_split_mode == ValSplitMode.SYNTHETIC: + raise ValueError(f"Val split mode {self.test_split_mode} not supported for video datasets.") + + self._create_val_split() diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 61696c31d0..7d1b0fc75c 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -25,7 +25,13 @@ from anomalib.data.base import AnomalibDataModule, AnomalibDataset from anomalib.data.task_type import TaskType -from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check +from anomalib.data.utils import ( + DownloadProgressBar, + Split, + TestSplitMode, + ValSplitMode, + hash_check, +) from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) @@ -181,6 +187,8 @@ def __init__( task: TaskType = TaskType.SEGMENTATION, transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_eval: Optional[Union[str, A.Compose]] = None, + test_split_mode: TestSplitMode = TestSplitMode.FROM_DIR, + test_split_ratio: float = 0.2, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, val_split_ratio: float = 0.5, seed: Optional[int] = None, @@ -199,6 +207,11 @@ def __init__( transform_config_val: Config for pre-processing during validation. create_validation_set: Create a validation subset in addition to the train and test subsets seed (Optional[int], optional): Seed used during random subset splitting. + test_split_mode (TestSplitMode): Setting that determines how the testing subset is obtained. + test_split_ratio (float): Fraction of images from the train set that will be reserved for testing. + val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + val_split_ratio (float): Fraction of train or test images that will be reserved for validation. + seed (Optional[int], optional): Seed which may be set to a fixed value for reproducibility. Examples: >>> from anomalib.data import BTech @@ -230,6 +243,8 @@ def __init__( train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, num_workers=num_workers, + test_split_mode=test_split_mode, + test_split_ratio=test_split_ratio, val_split_mode=val_split_mode, val_split_ratio=val_split_ratio, seed=seed, diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 957b0e1274..ddb327e7a9 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -15,7 +15,7 @@ from anomalib.data.base import AnomalibDataModule, AnomalibDataset from anomalib.data.task_type import TaskType -from anomalib.data.utils import Split, ValSplitMode, random_split +from anomalib.data.utils import Split, TestSplitMode, ValSplitMode from anomalib.pre_processing.pre_process import PreProcessor @@ -237,7 +237,10 @@ class Folder(AnomalibDataModule): transform_config_val (Optional[Union[str, A.Compose]], optional): Config for pre-processing during validation. Defaults to None. + test_split_mode (TestSplitMode): Setting that determines how the testing subset is obtained. + test_split_ratio (float): Fraction of images from the train set that will be reserved for testing. val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + val_split_ratio (float): Fraction of train or test images that will be reserved for validation. seed (Optional[int], optional): Seed used during random subset splitting. 
""" @@ -258,6 +261,8 @@ def __init__( task: TaskType = TaskType.SEGMENTATION, transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_eval: Optional[Union[str, A.Compose]] = None, + test_split_mode: TestSplitMode = TestSplitMode.FROM_DIR, + test_split_ratio: float = 0.2, val_split_mode: ValSplitMode = ValSplitMode.FROM_TEST, val_split_ratio: float = 0.5, seed: Optional[int] = None, @@ -266,6 +271,8 @@ def __init__( train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, num_workers=num_workers, + test_split_mode=test_split_mode, + test_split_ratio=test_split_ratio, val_split_mode=val_split_mode, val_split_ratio=val_split_ratio, seed=seed, @@ -299,18 +306,3 @@ def __init__( mask_dir=mask_dir, extensions=extensions, ) - - def _setup(self, _stage: Optional[str] = None): - """Set up the datasets for the Folder Data Module.""" - assert self.train_data is not None - assert self.test_data is not None - - self.train_data.setup() - self.test_data.setup() - - # add some normal images to the test set - if not self.test_data.has_normal: - self.train_data, normal_test_data = random_split(self.train_data, self.normal_split_ratio, seed=self.seed) - self.test_data += normal_test_data - - super()._setup() diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 6aa2424cec..e70bb823a4 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -34,7 +34,13 @@ from anomalib.data.base import AnomalibDataModule, AnomalibDataset from anomalib.data.task_type import TaskType -from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check +from anomalib.data.utils import ( + DownloadProgressBar, + Split, + TestSplitMode, + ValSplitMode, + hash_check, +) from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) @@ -149,7 +155,29 @@ def _setup(self): class MVTec(AnomalibDataModule): - """MVTec Datamodule.""" + """MVTec Datamodule. + + Args: + root (str): Path to the root of the dataset + category (str): Category of the MVTec dataset (e.g. "bottle" or "cable"). + image_size (Optional[Union[int, Tuple[int, int]]], optional): Size of the input image. + Defaults to None. + train_batch_size (int, optional): Training batch size. Defaults to 32. + eval_batch_size (int, optional): Test batch size. Defaults to 32. + num_workers (int, optional): Number of workers. Defaults to 8. + task TaskType): Task type, 'classification', 'detection' or 'segmentation' + transform_config_train (Optional[Union[str, A.Compose]], optional): Config for pre-processing + during training. + Defaults to None. + transform_config_val (Optional[Union[str, A.Compose]], optional): Config for pre-processing + during validation. + Defaults to None. + test_split_mode (TestSplitMode): Setting that determines how the testing subset is obtained. + test_split_ratio (float): Fraction of images from the train set that will be reserved for testing. + val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + val_split_ratio (float): Fraction of train or test images that will be reserved for validation. + seed (Optional[int], optional): Seed which may be set to a fixed value for reproducibility. 
+ """ def __init__( self, @@ -162,6 +190,8 @@ def __init__( task: TaskType = TaskType.SEGMENTATION, transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_eval: Optional[Union[str, A.Compose]] = None, + test_split_mode: TestSplitMode = TestSplitMode.FROM_DIR, + test_split_ratio: float = 0.2, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, val_split_ratio: float = 0.5, seed: Optional[int] = None, @@ -170,6 +200,8 @@ def __init__( train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, num_workers=num_workers, + test_split_mode=test_split_mode, + test_split_ratio=test_split_ratio, val_split_mode=val_split_mode, val_split_ratio=val_split_ratio, seed=seed, diff --git a/anomalib/data/synthetic.py b/anomalib/data/synthetic.py index 534ec9a64f..42c4cc7cec 100644 --- a/anomalib/data/synthetic.py +++ b/anomalib/data/synthetic.py @@ -6,7 +6,10 @@ import logging import math import shutil +from copy import deepcopy from pathlib import Path +from tempfile import mkdtemp +from typing import Dict import albumentations as A import cv2 @@ -14,14 +17,17 @@ from albumentations.pytorch import ToTensorV2 from pandas import DataFrame, Series -from anomalib.data import TaskType from anomalib.data.base.dataset import AnomalibDataset +from anomalib.data.task_type import TaskType from anomalib.data.utils import Augmenter, Split, read_image from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) +ROOT = "./.tmp/synthetic_anomaly" + + def make_synthetic_dataset( source_samples: DataFrame, im_dir: Path, mask_dir: Path, anomalous_ratio: float = 0.5 ) -> DataFrame: @@ -92,7 +98,7 @@ def augment(sample: Series) -> Series: return samples -class SyntheticValidationSet(AnomalibDataset): +class SyntheticAnomalyDataset(AnomalibDataset): """Dataset which reads synthetically generated anomalous images from a temporary folder. 
Args: @@ -107,18 +113,18 @@ def __init__(self, task: TaskType, pre_process: PreProcessor, source_samples: Da self.source_samples = source_samples # Files will be written to a temporary directory in the workdir, which is cleaned up after code execution - self.root = Path("./.tmp/synthetic_anomaly") - self.im_dir = self.root / "images" - self.mask_dir = self.root / "ground_truth" + root = Path(ROOT) + root.mkdir(parents=True, exist_ok=True) - # clean up any existing data that may be left over from previous run - if self.root.exists(): - shutil.rmtree(self.root) + self.root = Path(mkdtemp(dir=root)) + self.im_dir = self.root / "abnormal" + self.mask_dir = self.root / "ground_truth" # create directories - self.im_dir.mkdir(parents=True) + self.im_dir.mkdir() self.mask_dir.mkdir() + self._cleanup = True # flag that determines if temp dir is cleaned up when instance is deleted self.setup() @classmethod @@ -126,6 +132,23 @@ def from_dataset(cls, dataset): """Create a synthetic anomaly dataset from an existing dataset of normal images.""" return cls(task=dataset.task, pre_process=dataset.pre_process, source_samples=dataset.samples) + def __copy__(self) -> "SyntheticAnomalyDataset": + """Returns a shallow copy of the dataset object and prevents cleanup when original object is deleted.""" + cls = self.__class__ + new = cls.__new__(cls) + new.__dict__.update(self.__dict__) + self._cleanup = False + return new + + def __deepcopy__(self, _memo: Dict) -> "SyntheticAnomalyDataset": + """Returns a deep copy of the dataset object and prevents cleanup when original object is deleted.""" + cls = self.__class__ + new = cls.__new__(cls) + for key, value in self.__dict__.items(): + setattr(new, key, deepcopy(value)) + self._cleanup = False + return new + def _setup(self) -> None: """Create samples dataframe.""" logger.info("Generating synthetic anomalous images for validation set") @@ -133,4 +156,5 @@ def _setup(self) -> None: def __del__(self): """Make sure the temporary directory is cleaned up when the dataset object is deleted.""" - shutil.rmtree(self.root) + if self._cleanup: + shutil.rmtree(self.root) diff --git a/anomalib/data/ucsd_ped.py b/anomalib/data/ucsd_ped.py index 0ce32ce8b7..fd810c4402 100644 --- a/anomalib/data/ucsd_ped.py +++ b/anomalib/data/ucsd_ped.py @@ -14,8 +14,7 @@ from pandas import DataFrame from torch import Tensor -from anomalib.data.base import AnomalibDataModule -from anomalib.data.base.video import VideoAnomalibDataset +from anomalib.data.base import VideoAnomalibDataModule, VideoAnomalibDataset from anomalib.data.task_type import TaskType from anomalib.data.utils import ( DownloadProgressBar, @@ -169,7 +168,7 @@ def _setup(self): self.samples = make_ucsd_dataset(self.root_category, self.split) -class UCSDped(AnomalibDataModule): +class UCSDped(VideoAnomalibDataModule): """UCSDped DataModule class. Args: @@ -190,6 +189,8 @@ class UCSDped(AnomalibDataModule): during validation. Defaults to None. val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + val_split_ratio (float): Fraction of train or test images that will be reserved for validation. + seed (Optional[int], optional): Seed which may be set to a fixed value for reproducibility. 
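
To make the copy semantics above concrete, a small sketch; normal_dataset stands in for any existing AnomalibDataset of normal images and is hypothetical:

from copy import copy

from anomalib.data.synthetic import SyntheticAnomalyDataset

synthetic = SyntheticAnomalyDataset.from_dataset(normal_dataset)  # normal_dataset is a placeholder
shallow = copy(synthetic)  # shares the temp dir; the original relinquishes cleanup duty

del synthetic                 # temp dir survives because _cleanup was disabled on the original
assert shallow.root.exists()  # the copy now owns the generated files
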
""" def __init__( diff --git a/anomalib/data/utils/__init__.py b/anomalib/data/utils/__init__.py index d274577f54..4add078386 100644 --- a/anomalib/data/utils/__init__.py +++ b/anomalib/data/utils/__init__.py @@ -13,7 +13,14 @@ get_image_height_and_width, read_image, ) -from .split import Split, ValSplitMode, concatenate_datasets, random_split +from .split import ( + Split, + TestSplitMode, + ValSplitMode, + concatenate_datasets, + random_split, + split_normal_and_anomalous, +) __all__ = [ "generate_output_image_filename", @@ -24,9 +31,11 @@ "read_image", "DownloadProgressBar", "random_split", + "split_normal_and_anomalous", "concatenate_datasets", "Split", "ValSplitMode", + "TestSplitMode", "Augmenter", "masks_to_boxes", "boxes_to_masks", diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index acf4338b3e..56cc15a736 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -16,7 +16,7 @@ import math import warnings from enum import Enum -from typing import TYPE_CHECKING, List, Optional, Sequence, Union +from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple, Union import torch @@ -32,6 +32,14 @@ class Split(str, Enum): TEST = "test" +class TestSplitMode(str, Enum): + """Splitting mode used to obtain subset.""" + + NONE = "none" + FROM_DIR = "from_dir" + SYNTHETIC = "synthetic" + + class ValSplitMode(str, Enum): """Splitting mode used to obtain validation subset.""" @@ -117,3 +125,14 @@ def random_split( # outer list: subsets with the given ratio, inner list: per-label unique subsets = list(map(list, zip(*subsets))) return [concatenate_datasets(subset) for subset in subsets] + + +def split_normal_and_anomalous(dataset: AnomalibDataset) -> Tuple[AnomalibDataset, AnomalibDataset]: + """Splits the dataset into the normal and anomalous subsets.""" + samples = dataset.samples + normal_indices = samples[samples.label_index == 0].index + anomalous_indices = samples[samples.label_index == 1].index + + normal_subset = dataset.subsample(list(normal_indices)) + anomalous_subset = dataset.subsample(list(anomalous_indices)) + return normal_subset, anomalous_subset From ea004423c8f5792e3a8bfecc5615834bf41bdeba Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 9 Dec 2022 17:16:16 +0100 Subject: [PATCH 90/96] update configs --- anomalib/models/cflow/config.yaml | 6 ++++-- anomalib/models/dfkde/config.yaml | 6 ++++-- anomalib/models/dfm/config.yaml | 6 ++++-- anomalib/models/draem/config.yaml | 6 ++++-- anomalib/models/fastflow/config.yaml | 6 ++++-- anomalib/models/ganomaly/config.yaml | 6 ++++-- anomalib/models/padim/config.yaml | 6 ++++-- anomalib/models/patchcore/config.yaml | 6 ++++-- anomalib/models/reverse_distillation/config.yaml | 6 ++++-- anomalib/models/stfpm/config.yaml | 6 ++++-- 10 files changed, 40 insertions(+), 20 deletions(-) diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index 6261a8b0dd..9da02805c4 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -12,8 +12,10 @@ dataset: transform_config: train: null eval: null - val_split_mode: same_as_test # options: [same_as_test, from_test] - val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode) + test_split_mode: from_dir # options: [from_dir, synthetic] + test_split_ratio: 0.2 # fraction of train images held out testing (usage depends on test_split_mode) + val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic] + val_split_ratio: 0.5 

 model:
   name: cflow
diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml
index 3fe5dcdeaa..82d4a159c9 100644
--- a/anomalib/models/dfkde/config.yaml
+++ b/anomalib/models/dfkde/config.yaml
@@ -11,8 +11,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)

 model:
   name: dfkde
diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml
index fec94106e8..ecc59aac36 100755
--- a/anomalib/models/dfm/config.yaml
+++ b/anomalib/models/dfm/config.yaml
@@ -11,8 +11,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)

 model:
   name: dfm
diff --git a/anomalib/models/draem/config.yaml b/anomalib/models/draem/config.yaml
index 495b6444bf..31727abe5e 100644
--- a/anomalib/models/draem/config.yaml
+++ b/anomalib/models/draem/config.yaml
@@ -11,8 +11,10 @@ dataset:
   transform_config:
     train: ./anomalib/models/draem/transform_config.yaml
     eval: ./anomalib/models/draem/transform_config.yaml
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
   tiling:
     apply: false
     tile_size: null
diff --git a/anomalib/models/fastflow/config.yaml b/anomalib/models/fastflow/config.yaml
index 05aa838fe2..93953a9ef0 100644
--- a/anomalib/models/fastflow/config.yaml
+++ b/anomalib/models/fastflow/config.yaml
@@ -11,8 +11,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
   tiling:
     apply: false
     tile_size: null
diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml
index a41d9d2421..bbe5738830 100644
--- a/anomalib/models/ganomaly/config.yaml
+++ b/anomalib/models/ganomaly/config.yaml
@@ -12,8 +12,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
   tiling:
     apply: true
     tile_size: 64
diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml
index 315f3e6691..3861bca8a3 100644
--- a/anomalib/models/padim/config.yaml
+++ b/anomalib/models/padim/config.yaml
@@ -11,8 +11,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
   tiling:
     apply: false
     tile_size: null
diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml
index d29a5a39f9..fad1104931 100644
--- a/anomalib/models/patchcore/config.yaml
+++ b/anomalib/models/patchcore/config.yaml
@@ -11,8 +11,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
   tiling:
     apply: false
     tile_size: null
diff --git a/anomalib/models/reverse_distillation/config.yaml b/anomalib/models/reverse_distillation/config.yaml
index 1e5c3f8f82..2deafe3ca3 100644
--- a/anomalib/models/reverse_distillation/config.yaml
+++ b/anomalib/models/reverse_distillation/config.yaml
@@ -12,8 +12,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
   tiling:
     apply: false
     tile_size: 64
diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml
index 504998ec72..1284156847 100644
--- a/anomalib/models/stfpm/config.yaml
+++ b/anomalib/models/stfpm/config.yaml
@@ -12,8 +12,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
   tiling:
     apply: false
     tile_size: null

From dfd2d80266698758bb96c696ad5fa1dd7a98c619 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 9 Dec 2022 17:17:32 +0100
Subject: [PATCH 91/96] add tests

---
 tests/pre_merge/datasets/test_datamodule.py   | 67 ++++++++++++--
 .../pre_merge/datasets/test_synthetic_data.py | 93 +++++++++++++++++++
 2 files changed, 153 insertions(+), 7 deletions(-)
 create mode 100644 tests/pre_merge/datasets/test_synthetic_data.py

diff --git a/tests/pre_merge/datasets/test_datamodule.py b/tests/pre_merge/datasets/test_datamodule.py
index d39c385912..063155de99 100644
--- a/tests/pre_merge/datasets/test_datamodule.py
+++ b/tests/pre_merge/datasets/test_datamodule.py
@@ -37,7 +37,7 @@ def make_avenue_data_module(task="classification", batch_size=1, val_split_mode=
     return data_module


-def make_mvtec_data_module(task="classification", batch_size=1, val_split_mode="from_test"):
+def make_mvtec_data_module(task="classification", batch_size=1, test_split_mode="from_dir", val_split_mode="from_test"):
     data_module = MVTec(
         root=get_dataset_path(dataset="MVTec"),
         category="leather",
@@ -46,6 +46,7 @@ def make_mvtec_data_module(task="classification", batch_size=1, val_split_mode="
         eval_batch_size=batch_size,
         num_workers=0,
         task=task,
+        test_split_mode=test_split_mode,
         val_split_mode=val_split_mode,
     )
     data_module.prepare_data()
@@ -53,7 +54,7 @@ def make_mvtec_data_module(task="classification", batch_size=1, val_split_mode="
     return data_module


-def make_btech_data_module(task="classification", batch_size=1, val_split_mode="from_test"):
+def make_btech_data_module(task="classification", batch_size=1, test_split_mode="from_dir", val_split_mode="from_test"):
     """Create BTech Data Module."""
     data_module = BTech(
         root=get_dataset_path(dataset="BTech"),
@@ -63,6 +64,7 @@
         eval_batch_size=batch_size,
         num_workers=0,
         task=task,
+        test_split_mode=test_split_mode,
         val_split_mode=val_split_mode,
     )
     data_module.prepare_data()
@@ -70,13 +72,22 @@
     return data_module


-def make_folder_data_module(task="classification", batch_size=1, val_split_mode="from_test"):
+def make_folder_data_module(
+    task="classification",
+    batch_size=1,
+    test_split_mode="from_dir",
+    val_split_mode="from_test",
+    normal_dir="good",
+    abnormal_dir="broken_large",
+    normal_test_dir="good_test",
+):
     """Create Folder Data Module."""
     root = get_dataset_path(dataset="bottle")
     data_module = Folder(
         root=root,
-        normal_dir="good",
-        abnormal_dir="broken_large",
+        normal_dir=normal_dir,
+        abnormal_dir=abnormal_dir,
+        normal_test_dir=normal_test_dir,
         mask_dir=os.path.join(root, "ground_truth/broken_large"),
         normal_split_ratio=0.2,
         image_size=(256, 256),
@@ -84,6 +95,7 @@ def make_folder_data_module(task="classification", batch_size=1, val_split_mode=
         eval_batch_size=batch_size,
         num_workers=8,
         task=task,
+        test_split_mode=test_split_mode,
         val_split_mode=val_split_mode,
     )
     data_module.setup()
@@ -116,8 +128,8 @@ def make_ucsdped_data_module(task="classification", batch_size=1, val_split_mode=

 @pytest.fixture(autouse=True)
 def make_data_module():
-    def make(dataset="folder", task="classification", batch_size=1, val_split_mode="from_test"):
-        return DATASETS[dataset](task=task, batch_size=batch_size, val_split_mode=val_split_mode)
+    def make(dataset="folder", **kwargs):
+        return DATASETS[dataset](**kwargs)

     return make

@@ -271,3 +283,44 @@ def test_image_size(self, input_size, effective_image_size, category="shapes", p
         data_module = get_datamodule(configurable_parameters)
         data_module.setup()
         assert next(iter(data_module.train_dataloader()))["image"].shape[-2:] == effective_image_size
+
+
+class TestSubsetSplitting:
+    @pytest.mark.parametrize("dataset", ["folder", "mvtec", "btech"])
+    @pytest.mark.parametrize("test_split_mode", ("from_dir", "synthetic"))
+    @pytest.mark.parametrize("val_split_mode", ("from_test", "synthetic"))
+    def test_non_overlapping_splits(self, make_data_module, dataset, test_split_mode, val_split_mode):
+        """Tests if train, test and val splits are non-overlapping."""
+        data_module = make_data_module(dataset, test_split_mode=test_split_mode, val_split_mode=val_split_mode)
+        train_samples = data_module.train_data.samples
+        val_samples = data_module.val_data.samples
+        test_samples = data_module.test_data.samples
+        assert len(set(train_samples.image_path).intersection(set(test_samples.image_path))) == 0
+        assert len(set(val_samples.image_path).intersection(set(test_samples.image_path))) == 0
+
+    @pytest.mark.parametrize("dataset", ["folder", "mvtec", "btech"])
+    @pytest.mark.parametrize("test_split_mode", ("from_dir", "synthetic"))
+    def test_equal_splits(self, make_data_module, dataset, test_split_mode):
+        """Tests if test and val splits are equal and non-overlapping with train when val_split_mode == same_as_test."""
+        data_module = make_data_module(dataset, test_split_mode=test_split_mode, val_split_mode="same_as_test")
+        train_samples = data_module.train_data.samples
+        val_samples = data_module.val_data.samples
+        test_samples = data_module.test_data.samples
+        assert len(set(train_samples.image_path).intersection(set(test_samples.image_path))) == 0
+        assert len(set(val_samples.image_path).intersection(set(test_samples.image_path))) == len(val_samples)
+
+    @pytest.mark.parametrize("test_split_mode", ("from_dir", "synthetic"))
+    def test_normal_test_dir_omitted(self, make_data_module, test_split_mode):
+        """The test set should always contain normal samples even when no normal_test_dir is provided."""
+        data_module = make_data_module(dataset="folder", test_split_mode=test_split_mode, normal_test_dir=None)
+        assert data_module.test_data.has_normal
+
+    def test_abnormal_dir_omitted_from_dir(self, make_data_module):
+        """The test set should not contain anomalous samples if no abnormal_dir provided and split mode is from_dir."""
+        data_module = make_data_module(dataset="folder", test_split_mode="from_dir", abnormal_dir=None)
+        assert not data_module.test_data.has_anomalous
+
+    def test_abnormal_dir_omitted_synthetic(self, make_data_module):
+        """The test set should contain anomalous samples if no abnormal_dir provided and split mode is 
synthetic.""" + data_module = make_data_module(dataset="folder", test_split_mode="synthetic", abnormal_dir=None) + assert data_module.test_data.has_anomalous diff --git a/tests/pre_merge/datasets/test_synthetic_data.py b/tests/pre_merge/datasets/test_synthetic_data.py new file mode 100644 index 0000000000..f7b97d768b --- /dev/null +++ b/tests/pre_merge/datasets/test_synthetic_data.py @@ -0,0 +1,93 @@ +"""Tests for synthetic anomalous dataset.""" +import os +from copy import copy, deepcopy +from pathlib import Path + +import pytest + +from anomalib.data import TaskType +from anomalib.data.folder import FolderDataset +from anomalib.data.synthetic import SyntheticAnomalyDataset +from anomalib.pre_processing import PreProcessor +from tests.helpers.dataset import get_dataset_path + + +def get_folder_dataset(): + """Create Folder Dataset.""" + root = get_dataset_path(dataset="bottle") + pre_process = PreProcessor(image_size=(256, 256)) + dataset = FolderDataset( + task="segmentation", + pre_process=pre_process, + root=root, + normal_dir="good", + abnormal_dir="broken_large", + mask_dir=os.path.join(root, "ground_truth/broken_large"), + split="train", + ) + dataset.setup() + + return dataset + + +@pytest.fixture(autouse=True) +def make_synthetic_dataset(): + """Create synthetic anomaly dataset from folder dataset.""" + + def make(): + folder_dataset = get_folder_dataset() + synthetic_dataset = SyntheticAnomalyDataset.from_dataset(folder_dataset) + return synthetic_dataset + + return make + + +@pytest.fixture(autouse=True) +def synthetic_dataset_from_samples(): + """Create synthetic anomaly dataset by passing a samples dataframe.""" + folder_dataset = get_folder_dataset() + pre_process = PreProcessor(image_size=(256, 256)) + synthetic_dataset = SyntheticAnomalyDataset( + task=folder_dataset.task, pre_process=pre_process, source_samples=folder_dataset.samples + ) + return synthetic_dataset + + +def test_create_synthetic_dataset(make_synthetic_dataset): + """Tests if the image and mask files listed in the synthetic dataset exist.""" + synthetic_dataset = make_synthetic_dataset() + assert all(Path(path).exists() for path in synthetic_dataset.samples.image_path) + assert all(Path(path).exists() for path in synthetic_dataset.samples.mask_path) + + +def test_create_from_dataset(synthetic_dataset_from_samples): + """Tests if the image and mask files listed in the synthetic dataset exist, when instantiated from samples df.""" + synthetic_dataset = synthetic_dataset_from_samples + assert all(Path(path).exists() for path in synthetic_dataset.samples.image_path) + assert all(Path(path).exists() for path in synthetic_dataset.samples.mask_path) + + +def test_cleanup(make_synthetic_dataset): + """Tests if the temporary directory is cleaned up when the instance is deleted.""" + synthetic_dataset = make_synthetic_dataset() + root = synthetic_dataset.root + del synthetic_dataset + assert not root.exists() + + +def test_copy(make_synthetic_dataset): + """Tests if the dataset is copied correctly, and files still exist after original instance is deleted.""" + synthetic_dataset = make_synthetic_dataset() + synthetic_dataset_cp = copy(synthetic_dataset) + assert all(synthetic_dataset.samples == synthetic_dataset_cp.samples) + del synthetic_dataset + assert synthetic_dataset_cp.root.exists() + + +def test_deepcopy(make_synthetic_dataset): + """Tests if the dataset is deep-copied correctly, and files still exist after original instance is deleted.""" + synthetic_dataset = make_synthetic_dataset() + synthetic_dataset_cp = 
deepcopy(synthetic_dataset)
+    assert all(synthetic_dataset.samples == synthetic_dataset_cp.samples)
+    del synthetic_dataset
+    assert synthetic_dataset_cp.root.exists()

From ce43e091ada82ae03729772d9ebf424948e48a00 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 9 Dec 2022 17:46:02 +0100
Subject: [PATCH 92/96] simplify test set splitting logic

---
 anomalib/data/base/datamodule.py | 45 ++++++++++++------------------
 1 file changed, 19 insertions(+), 26 deletions(-)

diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py
index 019da5e3ec..e53f358f74 100644
--- a/anomalib/data/base/datamodule.py
+++ b/anomalib/data/base/datamodule.py
@@ -6,7 +6,6 @@ from __future__ import annotations

 import logging
-import warnings
 from abc import ABC
 from typing import Any, Dict, List, Optional

@@ -117,43 +116,37 @@ def _setup(self, _stage: Optional[str] = None) -> None:

         self._create_val_split()

     def _create_test_split(self):
-        # perform subset splitting for test set
+        """Obtain the test set based on the settings in the config."""
+        if self.test_data.has_normal:
+            # split the test data into normal and anomalous so these can be processed separately
+            normal_test_data, self.test_data = split_normal_and_anomalous(self.test_data)
+        else:
+            # when the user did not provide any normal images for testing, we sample some from the training set
+            logger.info(
+                "No normal test images found. Sampling from training set using a split ratio of %0.2f",
+                self.test_split_ratio,
+            )
+            self.train_data, normal_test_data = random_split(self.train_data, self.test_split_ratio)
+
         if self.test_split_mode == TestSplitMode.FROM_DIR:
-            # normal data taken from normal_test_dir if available, otherwise sampled from training set
-            if not self.test_data.has_normal:
-                logger.info(
-                    "No normal test images found. 
Sampling from training set using a split ratio of %d", - self.test_split_ratio, - ) - self.train_data, normal_test_data = random_split(self.train_data, self.test_split_ratio) - else: - normal_test_data, _ = split_normal_and_anomalous(self.test_data) self.test_data = SyntheticAnomalyDataset.from_dataset(normal_test_data) + else: + raise ValueError(f"Unsupported Test Split Mode: {self.test_split_mode}") def _create_val_split(self): - # perform subset splitting for validation set + """Obtain the validation set based on the settings in the config.""" if self.val_split_mode == ValSplitMode.FROM_TEST: + # randomly sampled from test set self.test_data, self.val_data = random_split( self.test_data, self.val_split_ratio, label_aware=True, seed=self.seed ) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: + # equal to test set self.val_data = self.test_data elif self.val_split_mode == ValSplitMode.SYNTHETIC: + # converted from random training sample self.train_data, normal_val_data = random_split(self.train_data, self.val_split_ratio) self.val_data = SyntheticAnomalyDataset.from_dataset(normal_val_data) elif self.val_split_mode != ValSplitMode.NONE: From 8b2d35640400f83ea162f095d19965cb64531686 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 12 Dec 2022 14:49:49 +0100 Subject: [PATCH 93/96] update docstring --- anomalib/data/utils/augmenter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anomalib/data/utils/augmenter.py b/anomalib/data/utils/augmenter.py index b08bb11898..b72b9f2d78 100644 --- a/anomalib/data/utils/augmenter.py +++ b/anomalib/data/utils/augmenter.py @@ -36,6 +36,8 @@ class Augmenter: Args: anomaly_source_path (Optional[str]): Path to a folder of images that will be used as source of the anomalous noise. If not specified, random noise will be used instead. + p_anomalous (float): Probability that the anomalous perturbation will be applied to a given image. + beta (float): Parameter that determines the opacity of the noise mask. """ def __init__( From a126af1d90a41ae58f811fd847db4157c05496ea Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 12 Dec 2022 17:49:39 +0100 Subject: [PATCH 94/96] add missing licence --- anomalib/data/synthetic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anomalib/data/synthetic.py b/anomalib/data/synthetic.py index 42c4cc7cec..561e0fdc3f 100644 --- a/anomalib/data/synthetic.py +++ b/anomalib/data/synthetic.py @@ -3,6 +3,9 @@ This dataset can be used when there is a lack of real anomalous data. 
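
A hedged sketch of how the two newly documented Augmenter parameters are exercised, assuming the existing Augmenter.augment_batch(batch) entry point; the batch is a placeholder tensor:

import torch

from anomalib.data.utils import Augmenter

augmenter = Augmenter(anomaly_source_path=None, p_anomalous=0.5, beta=0.5)

batch = torch.rand(8, 3, 256, 256)  # placeholder batch of normal images
augmented, masks = augmenter.augment_batch(batch)  # roughly half the images are perturbed
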
""" +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import logging import math import shutil From b2879c8cefe302c059c315c1c70ce5050188e77f Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 12 Dec 2022 17:55:31 +0100 Subject: [PATCH 95/96] split_normal_and_anomalous -> split_by_label --- anomalib/data/base/datamodule.py | 4 ++-- anomalib/data/utils/__init__.py | 4 ++-- anomalib/data/utils/split.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py index e53f358f74..5870ab56ea 100644 --- a/anomalib/data/base/datamodule.py +++ b/anomalib/data/base/datamodule.py @@ -20,7 +20,7 @@ TestSplitMode, ValSplitMode, random_split, - split_normal_and_anomalous, + split_by_label, ) logger = logging.getLogger(__name__) @@ -119,7 +119,7 @@ def _create_test_split(self): """Obtain the test set based on the settings in the config.""" if self.test_data.has_normal: # split the test data into normal and anomalous so these can be processed separately - normal_test_data, self.test_data = split_normal_and_anomalous(self.test_data) + normal_test_data, self.test_data = split_by_label(self.test_data) else: # when the user did not provide any normal images for testing, we sample some from the training set logger.info( diff --git a/anomalib/data/utils/__init__.py b/anomalib/data/utils/__init__.py index 4add078386..288e167762 100644 --- a/anomalib/data/utils/__init__.py +++ b/anomalib/data/utils/__init__.py @@ -19,7 +19,7 @@ ValSplitMode, concatenate_datasets, random_split, - split_normal_and_anomalous, + split_by_label, ) __all__ = [ @@ -31,7 +31,7 @@ "read_image", "DownloadProgressBar", "random_split", - "split_normal_and_anomalous", + "split_by_label", "concatenate_datasets", "Split", "ValSplitMode", diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 56cc15a736..60f8b7f0e1 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -127,7 +127,7 @@ def random_split( return [concatenate_datasets(subset) for subset in subsets] -def split_normal_and_anomalous(dataset: AnomalibDataset) -> Tuple[AnomalibDataset, AnomalibDataset]: +def split_by_label(dataset: AnomalibDataset) -> Tuple[AnomalibDataset, AnomalibDataset]: """Splits the dataset into the normal and anomalous subsets.""" samples = dataset.samples normal_indices = samples[samples.label_index == 0].index From 532ff8be8894ae66c6e53a1e76fd311cf989c61f Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 12 Dec 2022 18:04:20 +0100 Subject: [PATCH 96/96] VideoAnomalib -> AnomalibVideo --- anomalib/data/avenue.py | 6 +++--- anomalib/data/base/__init__.py | 4 ++-- anomalib/data/base/video.py | 6 +++--- anomalib/data/ucsd_ped.py | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/anomalib/data/avenue.py b/anomalib/data/avenue.py index 459f42677c..ca58b37cde 100644 --- a/anomalib/data/avenue.py +++ b/anomalib/data/avenue.py @@ -26,7 +26,7 @@ from pandas import DataFrame from torch import Tensor -from anomalib.data.base import VideoAnomalibDataModule, VideoAnomalibDataset +from anomalib.data.base import AnomalibVideoDataModule, AnomalibVideoDataset from anomalib.data.task_type import TaskType from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check from anomalib.data.utils.video import ClipsIndexer @@ -121,7 +121,7 @@ def get_mask(self, idx) -> Optional[Tensor]: return masks -class AvenueDataset(VideoAnomalibDataset): +class 
AvenueDataset(AnomalibVideoDataset): """Avenue Dataset class. Args: @@ -156,7 +156,7 @@ def _setup(self): self.samples = make_avenue_dataset(self.root, self.gt_dir, self.split) -class Avenue(VideoAnomalibDataModule): +class Avenue(AnomalibVideoDataModule): """Avenue DataModule class. Args: diff --git a/anomalib/data/base/__init__.py b/anomalib/data/base/__init__.py index 0c8fe84257..936388b228 100644 --- a/anomalib/data/base/__init__.py +++ b/anomalib/data/base/__init__.py @@ -6,6 +6,6 @@ from .datamodule import AnomalibDataModule from .dataset import AnomalibDataset -from .video import VideoAnomalibDataModule, VideoAnomalibDataset +from .video import AnomalibVideoDataModule, AnomalibVideoDataset -__all__ = ["AnomalibDataset", "AnomalibDataModule", "VideoAnomalibDataset", "VideoAnomalibDataModule"] +__all__ = ["AnomalibDataset", "AnomalibDataModule", "AnomalibVideoDataset", "AnomalibVideoDataModule"] diff --git a/anomalib/data/base/video.py b/anomalib/data/base/video.py index 538b45e6a9..b2f3b3678b 100644 --- a/anomalib/data/base/video.py +++ b/anomalib/data/base/video.py @@ -14,7 +14,7 @@ from anomalib.pre_processing import PreProcessor -class VideoAnomalibDataset(AnomalibDataset, ABC): +class AnomalibVideoDataset(AnomalibDataset, ABC): """Base video anomalib dataset class. Args: @@ -49,7 +49,7 @@ def samples(self): @samples.setter def samples(self, samples): """Overwrite samples and re-index subvideos.""" - super(VideoAnomalibDataset, self.__class__).samples.fset(self, samples) + super(AnomalibVideoDataset, self.__class__).samples.fset(self, samples) self._setup_clips() def _setup_clips(self) -> None: @@ -96,7 +96,7 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: return item -class VideoAnomalibDataModule(AnomalibDataModule): +class AnomalibVideoDataModule(AnomalibDataModule): """Base class for video data modules.""" def _setup(self, _stage: Optional[str] = None) -> None: diff --git a/anomalib/data/ucsd_ped.py b/anomalib/data/ucsd_ped.py index fd810c4402..996d786e19 100644 --- a/anomalib/data/ucsd_ped.py +++ b/anomalib/data/ucsd_ped.py @@ -14,7 +14,7 @@ from pandas import DataFrame from torch import Tensor -from anomalib.data.base import VideoAnomalibDataModule, VideoAnomalibDataset +from anomalib.data.base import AnomalibVideoDataModule, AnomalibVideoDataset from anomalib.data.task_type import TaskType from anomalib.data.utils import ( DownloadProgressBar, @@ -134,7 +134,7 @@ def get_clip(self, idx: int) -> Tuple[Tensor, Tensor, Dict[str, Any], int]: return video, torch.empty((1, 0)), {}, video_idx -class UCSDpedDataset(VideoAnomalibDataset): +class UCSDpedDataset(AnomalibVideoDataset): """UCSDped Dataset class. Args: @@ -168,7 +168,7 @@ def _setup(self): self.samples = make_ucsd_dataset(self.root_category, self.split) -class UCSDped(VideoAnomalibDataModule): +class UCSDped(AnomalibVideoDataModule): """UCSDped DataModule class. Args:
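
Finally, a short usage sketch with the renamed video base classes; the root path and category are placeholder values:

from anomalib.data.ucsd_ped import UCSDped

# UCSDped now derives from AnomalibVideoDataModule after the rename.
datamodule = UCSDped(
    root="./datasets/ucsd",  # placeholder path
    category="UCSDped2",
    task="detection",
)
datamodule.setup()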