From ee1cfce349405d383dc9f2ff13fd20b77ac24de0 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 9 Sep 2022 17:37:15 +0200
Subject: [PATCH 01/96] move sample generation to datamodule instead of dataset

---
 anomalib/data/folder.py | 78 +++++++++++++++--------------------------
 anomalib/data/mvtec.py  | 52 +++++++++++++--------------
 2 files changed, 52 insertions(+), 78 deletions(-)

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index 0f3b47adbd..ed2357619d 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -82,7 +82,6 @@ def make_dataset(
     abnormal_dir: Union[str, Path],
     normal_test_dir: Optional[Union[str, Path]] = None,
     mask_dir: Optional[Union[str, Path]] = None,
-    split: Optional[str] = None,
     split_ratio: float = 0.2,
     seed: Optional[int] = None,
     create_validation_set: bool = True,
@@ -120,9 +119,10 @@ def make_dataset(
         dirs = {**dirs, **{"normal_test": normal_test_dir}}
 
     for dir_type, path in dirs.items():
-        filename, label = _prepare_files_labels(path, dir_type, extensions)
-        filenames += filename
-        labels += label
+        if path is not None:
+            filename, label = _prepare_files_labels(path, dir_type, extensions)
+            filenames += filename
+            labels += label
 
     samples = DataFrame({"image_path": filenames, "label": labels})
 
@@ -158,11 +158,6 @@ def make_dataset(
     if create_validation_set:
         samples = create_validation_set_from_test_set(samples, seed=seed, normal_label="normal")
 
-    # Get the data frame for the split.
-    if split is not None and split in ["train", "val", "test"]:
-        samples = samples[samples.split == split]
-        samples = samples.reset_index(drop=True)
-
     return samples
 
 
@@ -171,19 +166,13 @@ class FolderDataset(Dataset):
 
     def __init__(
         self,
-        normal_dir: Union[Path, str],
-        abnormal_dir: Union[Path, str],
+        samples: DataFrame,
         split: str,
         pre_process: PreProcessor,
-        normal_test_dir: Optional[Union[Path, str]] = None,
-        split_ratio: float = 0.2,
         mask_dir: Optional[Union[Path, str]] = None,
-        extensions: Optional[Tuple[str, ...]] = None,
         task: Optional[str] = None,
-        seed: Optional[int] = None,
-        create_validation_set: bool = False,
     ) -> None:
-        """Create Folder Folder Dataset.
+        """Create Folder Dataset.
 
         Args:
             normal_dir (Union[str, Path]): Path to the directory containing normal images.
@@ -232,17 +221,7 @@ def __init__(
         self.task = task
         self.pre_process = pre_process
 
-        self.samples = make_dataset(
-            normal_dir=normal_dir,
-            abnormal_dir=abnormal_dir,
-            normal_test_dir=normal_test_dir,
-            mask_dir=mask_dir,
-            split=split,
-            split_ratio=split_ratio,
-            seed=seed,
-            create_validation_set=create_validation_set,
-            extensions=extensions,
-        )
+        self.samples = samples
 
     def __len__(self) -> int:
         """Get length of the dataset."""
@@ -423,7 +402,7 @@ def __init__(
 
         self.root = _check_and_convert_path(root)
         self.normal_dir = self.root / normal_dir
-        self.abnormal_dir = self.root / abnormal_dir
+        self.abnormal_dir = self.root / abnormal_dir if abnormal_dir is not None else None
         self.normal_test = normal_test_dir
         if normal_test_dir:
             self.normal_test = self.root / normal_test_dir
@@ -461,6 +440,17 @@ def __init__(
         self.val_data: Dataset
         self.inference_data: Dataset
 
+        self.samples = make_dataset(
+            normal_dir=self.normal_dir,
+            abnormal_dir=self.abnormal_dir,
+            normal_test_dir=self.normal_test,
+            mask_dir=mask_dir,
+            split_ratio=split_ratio,
+            seed=seed,
+            create_validation_set=create_validation_set,
+            extensions=extensions,
+        )
+
     def setup(self, stage: Optional[str] = None) -> None:
         """Setup train, validation and test data.
@@ -470,47 +460,35 @@ def setup(self, stage: Optional[str] = None) -> None:
         """
         logger.info("Setting up train, validation, test and prediction datasets.")
         if stage in (None, "fit"):
+            train_samples = self.samples[self.samples.split == "train"]
+            train_samples = train_samples.reset_index(drop=True)
             self.train_data = FolderDataset(
-                normal_dir=self.normal_dir,
-                abnormal_dir=self.abnormal_dir,
-                normal_test_dir=self.normal_test,
+                samples=train_samples,
                 split="train",
-                split_ratio=self.split_ratio,
                 mask_dir=self.mask_dir,
                 pre_process=self.pre_process_train,
-                extensions=self.extensions,
                 task=self.task,
-                seed=self.seed,
-                create_validation_set=self.create_validation_set,
             )
 
         if self.create_validation_set:
+            val_samples = self.samples[self.samples.split == "val"]
+            val_samples = val_samples.reset_index(drop=True)
             self.val_data = FolderDataset(
-                normal_dir=self.normal_dir,
-                abnormal_dir=self.abnormal_dir,
-                normal_test_dir=self.normal_test,
+                samples=val_samples,
                 split="val",
-                split_ratio=self.split_ratio,
                 mask_dir=self.mask_dir,
                 pre_process=self.pre_process_val,
-                extensions=self.extensions,
                 task=self.task,
-                seed=self.seed,
-                create_validation_set=self.create_validation_set,
             )
 
+        test_samples = self.samples[self.samples.split == "test"]
+        test_samples = test_samples.reset_index(drop=True)
         self.test_data = FolderDataset(
-            normal_dir=self.normal_dir,
-            abnormal_dir=self.abnormal_dir,
+            samples=test_samples,
             split="test",
-            normal_test_dir=self.normal_test,
-            split_ratio=self.split_ratio,
             mask_dir=self.mask_dir,
             pre_process=self.pre_process_val,
-            extensions=self.extensions,
             task=self.task,
-            seed=self.seed,
-            create_validation_set=self.create_validation_set,
         )
 
         if stage == "predict":

diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py
index 9b45699d64..8aa52af1d4 100644
--- a/anomalib/data/mvtec.py
+++ b/anomalib/data/mvtec.py
@@ -60,7 +60,6 @@
 def make_mvtec_dataset(
     path: Path,
-    split: Optional[str] = None,
     split_ratio: float = 0.1,
     seed: Optional[int] = None,
     create_validation_set: bool = False,
@@ -110,6 +109,13 @@
     Returns:
         DataFrame: an output dataframe containing samples for the requested split (ie., train or test)
     """
+    if seed is None:
+        warnings.warn(
+            "seed is None."
+            " When seed is not set, images from the normal directory are split between training and test dir."
+            " This will lead to inconsistency between runs."
+        )
+
     samples_list = [(str(path),) + filename.parts[-3:] for filename in path.glob("**/*.png")]
     if len(samples_list) == 0:
         raise RuntimeError(f"Found 0 images in {path}")
@@ -147,11 +153,6 @@
     if create_validation_set:
         samples = create_validation_set_from_test_set(samples, seed=seed)
 
-    # Get the data frame for the split.
-    if split is not None and split in ["train", "val", "test"]:
-        samples = samples[samples.split == split]
-        samples = samples.reset_index(drop=True)
-
     return samples
 
 
@@ -160,13 +161,12 @@ class MVTecDataset(VisionDataset):
 
     def __init__(
         self,
+        samples: DataFrame,
         root: Union[Path, str],
         category: str,
         pre_process: PreProcessor,
         split: str,
         task: str = "segmentation",
-        seed: Optional[int] = None,
-        create_validation_set: bool = False,
     ) -> None:
         """Mvtec AD Dataset class.
@@ -211,26 +211,13 @@ def __init__(
         """
         super().__init__(root)
 
-        if seed is None:
-            warnings.warn(
-                "seed is None."
-                " When seed is not set, images from the normal directory are split between training and test dir."
-                " This will lead to inconsistency between runs."
-            )
-
         self.root = Path(root) if isinstance(root, str) else root
         self.category: str = category
         self.split = split
         self.task = task
 
         self.pre_process = pre_process
-
-        self.samples = make_mvtec_dataset(
-            path=self.root / category,
-            split=self.split,
-            seed=seed,
-            create_validation_set=create_validation_set,
-        )
+        self.samples = samples
 
     def __len__(self) -> int:
         """Get length of the dataset."""
@@ -368,6 +355,12 @@ def __init__(
         self.val_data: Dataset
         self.inference_data: Dataset
 
+        self.samples = make_mvtec_dataset(
+            path=self.root / category,
+            seed=seed,
+            create_validation_set=create_validation_set,
+        )
+
     def prepare_data(self) -> None:
         """Download the dataset if not available."""
         if (self.root / self.category).is_dir():
@@ -404,35 +397,38 @@ def setup(self, stage: Optional[str] = None) -> None:
         """
         logger.info("Setting up train, validation, test and prediction datasets.")
         if stage in (None, "fit"):
+            train_samples = self.samples[self.samples.split == "train"]
+            train_samples = train_samples.reset_index(drop=True)
             self.train_data = MVTecDataset(
+                samples=train_samples,
                 root=self.root,
                 category=self.category,
                 pre_process=self.pre_process_train,
                 split="train",
                 task=self.task,
-                seed=self.seed,
-                create_validation_set=self.create_validation_set,
             )
 
         if self.create_validation_set:
+            val_samples = self.samples[self.samples.split == "val"]
+            val_samples = val_samples.reset_index(drop=True)
             self.val_data = MVTecDataset(
+                samples=val_samples,
                 root=self.root,
                 category=self.category,
                 pre_process=self.pre_process_val,
                 split="val",
                 task=self.task,
-                seed=self.seed,
-                create_validation_set=self.create_validation_set,
             )
 
+        test_samples = self.samples[self.samples.split == "test"]
+        test_samples = test_samples.reset_index(drop=True)
         self.test_data = MVTecDataset(
+            samples=test_samples,
            root=self.root,
            category=self.category,
            pre_process=self.pre_process_val,
            split="test",
            task=self.task,
-            seed=self.seed,
-            create_validation_set=self.create_validation_set,
         )
 
         if stage == "predict":

From ec5199ec89765e975fd9b8c62b6b4badb09b9051 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Mon, 12 Sep 2022 14:15:47 +0200
Subject: [PATCH 02/96] move sample generation from init to setup

---
 anomalib/data/folder.py | 28 ++++++++++++++--------------
 anomalib/data/mvtec.py  | 18 +++++++++---------
 2 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index ed2357619d..3ed88b6be2 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -440,17 +440,6 @@ def __init__(
         self.val_data: Dataset
         self.inference_data: Dataset
 
-        self.samples = make_dataset(
-            normal_dir=self.normal_dir,
-            abnormal_dir=self.abnormal_dir,
-            normal_test_dir=self.normal_test,
-            mask_dir=mask_dir,
-            split_ratio=split_ratio,
-            seed=seed,
-            create_validation_set=create_validation_set,
-            extensions=extensions,
-        )
-
     def setup(self, stage: Optional[str] = None) -> None:
         """Setup train, validation and test data.
 
@@ -458,9 +447,20 @@ def setup(self, stage: Optional[str] = None) -> None:
             stage: Optional[str]: Train/Val/Test stages. (Default value = None)
 
         """
+        samples = make_dataset(
+            normal_dir=self.normal_dir,
+            abnormal_dir=self.abnormal_dir,
+            normal_test_dir=self.normal_test,
+            mask_dir=self.mask_dir,
+            split_ratio=self.split_ratio,
+            seed=self.seed,
+            create_validation_set=self.create_validation_set,
+            extensions=self.extensions,
+        )
+
         logger.info("Setting up train, validation, test and prediction datasets.")
         if stage in (None, "fit"):
-            train_samples = self.samples[self.samples.split == "train"]
+            train_samples = samples[samples.split == "train"]
             train_samples = train_samples.reset_index(drop=True)
             self.train_data = FolderDataset(
                 samples=train_samples,
@@ -471,7 +471,7 @@ def setup(self, stage: Optional[str] = None) -> None:
             )
 
         if self.create_validation_set:
-            val_samples = self.samples[self.samples.split == "val"]
+            val_samples = samples[samples.split == "val"]
             val_samples = val_samples.reset_index(drop=True)
             self.val_data = FolderDataset(
                 samples=val_samples,
@@ -481,7 +481,7 @@ def setup(self, stage: Optional[str] = None) -> None:
                 task=self.task,
             )
 
-        test_samples = self.samples[self.samples.split == "test"]
+        test_samples = samples[samples.split == "test"]
         test_samples = test_samples.reset_index(drop=True)
         self.test_data = FolderDataset(
             samples=test_samples,

diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py
index 8aa52af1d4..c8b3244d63 100644
--- a/anomalib/data/mvtec.py
+++ b/anomalib/data/mvtec.py
@@ -355,12 +355,6 @@ def __init__(
         self.val_data: Dataset
         self.inference_data: Dataset
 
-        self.samples = make_mvtec_dataset(
-            path=self.root / category,
-            seed=seed,
-            create_validation_set=create_validation_set,
-        )
-
     def prepare_data(self) -> None:
         """Download the dataset if not available."""
         if (self.root / self.category).is_dir():
@@ -395,9 +389,15 @@ def setup(self, stage: Optional[str] = None) -> None:
             stage: Optional[str]: Train/Val/Test stages. (Default value = None)
 
         """
+        samples = make_mvtec_dataset(
+            path=self.root / self.category,
+            seed=self.seed,
+            create_validation_set=self.create_validation_set,
+        )
+
         logger.info("Setting up train, validation, test and prediction datasets.")
         if stage in (None, "fit"):
-            train_samples = self.samples[self.samples.split == "train"]
+            train_samples = samples[samples.split == "train"]
             train_samples = train_samples.reset_index(drop=True)
             self.train_data = MVTecDataset(
                 samples=train_samples,
@@ -409,7 +409,7 @@ def setup(self, stage: Optional[str] = None) -> None:
             )
 
         if self.create_validation_set:
-            val_samples = self.samples[self.samples.split == "val"]
+            val_samples = samples[samples.split == "val"]
             val_samples = val_samples.reset_index(drop=True)
             self.val_data = MVTecDataset(
                 samples=val_samples,
@@ -420,7 +420,7 @@ def setup(self, stage: Optional[str] = None) -> None:
                 task=self.task,
             )
 
-        test_samples = self.samples[self.samples.split == "test"]
+        test_samples = samples[samples.split == "test"]
         test_samples = test_samples.reset_index(drop=True)
         self.test_data = MVTecDataset(
             samples=test_samples,

From 9f0a35ee4d05f24afefe33451198758df42a4d3a Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Tue, 13 Sep 2022 13:11:57 +0200
Subject: [PATCH 03/96] remove inference stage and add base classes

---
 anomalib/data/base.py   | 33 +++++++++++++++++++++++++++++++++
 anomalib/data/folder.py | 28 +++++-----------------------
 anomalib/data/mvtec.py  | 29 ++++------------------------
 3 files changed, 42 insertions(+), 48 deletions(-)
 create mode 100644 anomalib/data/base.py

diff --git a/anomalib/data/base.py b/anomalib/data/base.py
new file mode 100644
index 0000000000..722aedfe37
--- /dev/null
+++ b/anomalib/data/base.py
@@ -0,0 +1,33 @@
+"""Anomalib dataset and datamodule base classes."""
+
+# Copyright (C) 2022 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+from abc import ABC
+from typing import Optional
+
+from pandas import DataFrame
+from pytorch_lightning import LightningDataModule
+from torch.utils.data import Dataset
+
+
+class AnomalibDataset(Dataset, ABC):
+    """Base Anomalib dataset."""
+
+    def __init__(self, samples: DataFrame):
+        super().__init__()
+        self.samples = samples
+
+    def contains_anomalous_images(self):
+        """Check if the dataset contains any anomalous images."""
+        return "anomalous" in list(self.samples.label)
+
+
+class AnomalibDataModule(LightningDataModule):
+    """Base Anomalib data module."""
+
+    def __init__(self):
+        super().__init__()
+        self.train_data: Optional[AnomalibDataset] = None
+        self.val_data: Optional[AnomalibDataset] = None
+        self.test_data: Optional[AnomalibDataset] = None

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index 3ed88b6be2..75e40ae981 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -15,14 +15,13 @@
 import cv2
 import numpy as np
 from pandas.core.frame import DataFrame
-from pytorch_lightning.core.datamodule import LightningDataModule
 from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY
 from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
 from torch import Tensor
-from torch.utils.data import DataLoader, Dataset
+from torch.utils.data import DataLoader
 from torchvision.datasets.folder import IMG_EXTENSIONS
 
-from anomalib.data.inference import InferenceDataset
+from anomalib.data.base import AnomalibDataModule, AnomalibDataset
 from anomalib.data.utils import read_image
 from anomalib.data.utils.split import (
     create_validation_set_from_test_set,
@@ -161,11 +157,6 @@ def make_dataset(
     return samples
 
 
-class FolderDataset(Dataset):
+class FolderDataset(AnomalibDataset):
     """Folder Dataset."""
 
     def __init__(
@@ -199,6 +198,7 @@ def __init__(
             provided, `task` should be set to `segmentation`.
 
         """
+        super().__init__(samples)
         self.split = split
 
         if task == "segmentation" and mask_dir is None:
@@ -221,7 +221,6 @@ def __init__(
             self.task = task
 
         self.pre_process = pre_process
-        self.samples = samples
 
     def __len__(self) -> int:
         """Get length of the dataset."""
@@ -271,7 +270,7 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
 
 
 @DATAMODULE_REGISTRY
-class Folder(LightningDataModule):
+class Folder(AnomalibDataModule):
     """Folder Lightning Data Module."""
 
     def __init__(
@@ -434,12 +433,6 @@ def __init__(
         self.create_validation_set = create_validation_set
         self.seed = seed
 
-        self.train_data: Dataset
-        self.test_data: Dataset
-        if create_validation_set:
-            self.val_data: Dataset
-        self.inference_data: Dataset
-
     def setup(self, stage: Optional[str] = None) -> None:
         """Setup train, validation and test data.
 
@@ -491,11 +484,6 @@ def setup(self, stage: Optional[str] = None) -> None:
                 task=self.task,
             )
 
-        if stage == "predict":
-            self.inference_data = InferenceDataset(
-                path=self.root, image_size=self.image_size, transform_config=self.transform_config_val
-            )
-
     def train_dataloader(self) -> TRAIN_DATALOADERS:
         """Get train dataloader."""
         return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers)
@@ -508,9 +496,3 @@ def test_dataloader(self) -> EVAL_DATALOADERS:
         """Get test dataloader."""
         return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers)
-
-    def predict_dataloader(self) -> EVAL_DATALOADERS:
-        """Get predict dataloader."""
-        return DataLoader(
-            self.inference_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers
-        )

diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py
index c8b3244d63..c5df312486 100644
--- a/anomalib/data/mvtec.py
+++ b/anomalib/data/mvtec.py
@@ -39,15 +39,12 @@
 import numpy as np
 import pandas as pd
 from pandas.core.frame import DataFrame
-from pytorch_lightning.core.datamodule import LightningDataModule
 from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY
 from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
 from torch import Tensor
 from torch.utils.data import DataLoader
-from torch.utils.data.dataset import Dataset
-from torchvision.datasets.folder import VisionDataset
 
-from anomalib.data.inference import InferenceDataset
+from anomalib.data.base import AnomalibDataModule, AnomalibDataset
 from anomalib.data.utils import DownloadProgressBar, hash_check, read_image
 from anomalib.data.utils.split import (
     create_validation_set_from_test_set,
@@ -153,7 +150,7 @@ def make_mvtec_dataset(
     return samples
 
 
-class MVTecDataset(VisionDataset):
+class MVTecDataset(AnomalibDataset):
     """MVTec AD PyTorch Dataset."""
 
     def __init__(
@@ -209,7 +206,7 @@ def __init__(
         >>> dataset[0]["image"].shape, dataset[0]["mask"].shape
         (torch.Size([3, 256, 256]), torch.Size([256, 256]))
         """
-        super().__init__(root)
+        super().__init__(samples)
 
         self.root = Path(root) if isinstance(root, str) else root
         self.category: str = category
@@ -267,14 +264,13 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
 
 
 @DATAMODULE_REGISTRY
-class MVTec(LightningDataModule):
+class MVTec(AnomalibDataModule):
     """MVTec AD Lightning Data Module."""
 
     def __init__(
         self,
         root: str,
         category: str,
-        # TODO: Remove default values. IAAALD-211
         image_size: Optional[Union[int, Tuple[int, int]]] = None,
         train_batch_size: int = 32,
         test_batch_size: int = 32,
@@ -349,12 +345,6 @@ def __init__(
         self.task = task
         self.seed = seed
 
-        self.train_data: Dataset
-        self.test_data: Dataset
-        if create_validation_set:
-            self.val_data: Dataset
-        self.inference_data: Dataset
-
     def prepare_data(self) -> None:
         """Download the dataset if not available."""
         if (self.root / self.category).is_dir():
@@ -431,11 +421,6 @@ def setup(self, stage: Optional[str] = None) -> None:
                 task=self.task,
             )
 
-        if stage == "predict":
-            self.inference_data = InferenceDataset(
-                path=self.root, image_size=self.image_size, transform_config=self.transform_config_val
-            )
-
     def train_dataloader(self) -> TRAIN_DATALOADERS:
         """Get train dataloader."""
         return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers)
@@ -448,9 +433,3 @@ def test_dataloader(self) -> EVAL_DATALOADERS:
         """Get test dataloader."""
         return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers)
-
-    def predict_dataloader(self) -> EVAL_DATALOADERS:
-        """Get predict dataloader."""
-        return DataLoader(
-            self.inference_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers
-        )

From dea176fcc186b3b4ee1a64da63b1274b7fe54bae Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Tue, 13 Sep 2022 14:47:47 +0200
Subject: [PATCH 04/96] replace dataset classes with AnomalibDataset

---
 anomalib/data/base.py   |  63 ++++++++++++++++--
 anomalib/data/folder.py | 138 ++++------------------------------
 anomalib/data/mvtec.py  | 131 ++------------------------------
 3 files changed, 76 insertions(+), 256 deletions(-)

diff --git a/anomalib/data/base.py b/anomalib/data/base.py
index 722aedfe37..ad5fd14bf8 100644
--- a/anomalib/data/base.py
+++ b/anomalib/data/base.py
@@ -4,26 +4,79 @@
 # SPDX-License-Identifier: Apache-2.0
 
 from abc import ABC
-from typing import Optional
+from typing import Dict, Optional, Union
 
+import cv2
+import numpy as np
 from pandas import DataFrame
 from pytorch_lightning import LightningDataModule
+from torch import Tensor
 from torch.utils.data import Dataset
 
+from anomalib.data.utils import read_image
+from anomalib.pre_processing import PreProcessor
 
-class AnomalibDataset(Dataset, ABC):
-    """Base Anomalib dataset."""
 
-    def __init__(self, samples: DataFrame):
+class AnomalibDataset(Dataset):
+    """Anomalib dataset."""
+
+    def __init__(self, samples: DataFrame, task: str, split: str, pre_process: PreProcessor):
         super().__init__()
         self.samples = samples
+        self.task = task
+        self.split = split
+        self.pre_process = pre_process
 
     def contains_anomalous_images(self):
         """Check if the dataset contains any anomalous images."""
         return "anomalous" in list(self.samples.label)
 
+    def __len__(self) -> int:
+        """Get length of the dataset."""
+        return len(self.samples)
+
+    def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
+        """Get dataset item for the index ``index``.
+
+        Args:
+            index (int): Index to get the item.
+
+        Returns:
+            Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
+            Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
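+
+        Example (illustrative only; a dataset with at least one sample is assumed):
+            >>> item = dataset[0]
+            >>> sorted(item.keys())  # during training only the image tensor is returned
+            ['image']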
+ """ + image_path = self.samples.image_path[index] + image = read_image(image_path) + + pre_processed = self.pre_process(image=image) + item = {"image": pre_processed["image"]} + + if self.split in ["val", "test"]: + label_index = self.samples.label_index[index] + + item["image_path"] = image_path + item["label"] = label_index + + if self.task == "segmentation": + mask_path = self.samples.mask_path[index] + + # Only Anomalous (1) images has masks in MVTec AD dataset. + # Therefore, create empty mask for Normal (0) images. + if label_index == 0: + mask = np.zeros(shape=image.shape[:2]) + else: + mask = cv2.imread(mask_path, flags=0) / 255.0 + + pre_processed = self.pre_process(image=image, mask=mask) + + item["mask_path"] = mask_path + item["image"] = pre_processed["image"] + item["mask"] = pre_processed["mask"] + + return item + -class AnomalibDataModule(LightningDataModule): +class AnomalibDataModule(LightningDataModule, ABC): """Base Anomalib data module.""" def __init__(self): diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 75e40ae981..e883c0da37 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -9,20 +9,16 @@ import logging import warnings from pathlib import Path -from typing import Dict, Optional, Tuple, Union +from typing import Optional, Tuple, Union import albumentations as A -import cv2 -import numpy as np from pandas.core.frame import DataFrame from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from torch import Tensor from torch.utils.data import DataLoader from torchvision.datasets.folder import IMG_EXTENSIONS from anomalib.data.base import AnomalibDataModule, AnomalibDataset -from anomalib.data.utils import read_image from anomalib.data.utils.split import ( create_validation_set_from_test_set, split_normal_images_in_train_set, @@ -160,115 +156,6 @@ def make_dataset( return samples -class FolderDataset(AnomalibDataset): - """Folder Dataset.""" - - def __init__( - self, - samples: DataFrame, - split: str, - pre_process: PreProcessor, - mask_dir: Optional[Union[Path, str]] = None, - task: Optional[str] = None, - ) -> None: - """Create Folder Dataset. - - Args: - normal_dir (Union[str, Path]): Path to the directory containing normal images. - abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images. - split (Optional[str], optional): Dataset split (ie., either train or test). Defaults to None. - pre_process (Optional[PreProcessor], optional): Image Pro-processor to apply transform. - Defaults to None. - normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing - normal images for the test dataset. Defaults to None. - split_ratio (float, optional): Ratio to split normal training images and add to the - test set in case test set doesn't contain any normal images. - Defaults to 0.2. - mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing - the mask annotations. Defaults to None. - extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the - directory. - task (Optional[str], optional): Task type. (classification or segmentation) Defaults to None. - seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0. - create_validation_set (bool, optional):Boolean to create a validation set from the test set. - Those wanting to create a validation set could set this flag to ``True``. 
-
-        Raises:
-            ValueError: When task is set to classification and `mask_dir` is provided. When `mask_dir` is
-                provided, `task` should be set to `segmentation`.
-
-        """
-        super().__init__(samples)
-        self.split = split
-
-        if task == "segmentation" and mask_dir is None:
-            warnings.warn(
-                "Segmentation task is requested, but mask directory is not provided. "
-                "Classification is to be chosen if mask directory is not provided."
-            )
-            self.task = "classification"
-
-        if task == "classification" and mask_dir:
-            warnings.warn(
-                "Classification task is requested, but mask directory is provided. "
-                "Segmentation task is to be chosen if mask directory is provided."
-            )
-            self.task = "segmentation"
-
-        if task is None or mask_dir is None:
-            self.task = "classification"
-        else:
-            self.task = task
-
-        self.pre_process = pre_process
-
-    def __len__(self) -> int:
-        """Get length of the dataset."""
-        return len(self.samples)
-
-    def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
-        """Get dataset item for the index ``index``.
-
-        Args:
-            index (int): Index to get the item.
-
-        Returns:
-            Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
-            Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
-        """
-        item: Dict[str, Union[str, Tensor]] = {}
-
-        image_path = self.samples.image_path[index]
-        image = read_image(image_path)
-
-        pre_processed = self.pre_process(image=image)
-        item = {"image": pre_processed["image"]}
-
-        if self.split in ["val", "test"]:
-            label_index = self.samples.label_index[index]
-
-            item["image_path"] = image_path
-            item["label"] = label_index
-
-            if self.task == "segmentation":
-                mask_path = self.samples.mask_path[index]
-
-                # Only Anomalous (1) images has masks in MVTec AD dataset.
-                # Therefore, create empty mask for Normal (0) images.
-                if label_index == 0:
-                    mask = np.zeros(shape=image.shape[:2])
-                else:
-                    mask = cv2.imread(mask_path, flags=0) / 255.0
-
-                pre_processed = self.pre_process(image=image, mask=mask)
-
-                item["mask_path"] = mask_path
-                item["image"] = pre_processed["image"]
-                item["mask"] = pre_processed["mask"]
-
-        return item
-
-
 @DATAMODULE_REGISTRY
 class Folder(AnomalibDataModule):
     """Folder Lightning Data Module."""
 
     def __init__(
@@ -409,13 +296,15 @@ def __init__(
         self.extensions = extensions
         self.split_ratio = split_ratio
 
-        if task == "classification" and mask_dir is not None:
-            raise ValueError(
-                "Classification type is set but mask_dir provided. "
-                "If mask_dir is provided task type must be segmentation. "
-                "Check your configuration."
+        if task == "segmentation" and mask_dir is None:
+            warnings.warn(
+                "Segmentation task is requested, but mask directory is not provided. "
+                "Classification is to be chosen if mask directory is not provided."
             )
-        self.task = task
+            self.task = "classification"
+        else:
+            self.task = task
+
         self.transform_config_train = transform_config_train
         self.transform_config_val = transform_config_val
         self.image_size = image_size
@@ -455,10 +344,9 @@ def setup(self, stage: Optional[str] = None) -> None:
         if stage in (None, "fit"):
             train_samples = samples[samples.split == "train"]
             train_samples = train_samples.reset_index(drop=True)
-            self.train_data = FolderDataset(
+            self.train_data = AnomalibDataset(
                 samples=train_samples,
                 split="train",
-                mask_dir=self.mask_dir,
                 pre_process=self.pre_process_train,
                 task=self.task,
             )
 
         if self.create_validation_set:
             val_samples = samples[samples.split == "val"]
             val_samples = val_samples.reset_index(drop=True)
-            self.val_data = FolderDataset(
+            self.val_data = AnomalibDataset(
                 samples=val_samples,
                 split="val",
-                mask_dir=self.mask_dir,
                 pre_process=self.pre_process_val,
                 task=self.task,
             )
 
         test_samples = samples[samples.split == "test"]
         test_samples = test_samples.reset_index(drop=True)
-        self.test_data = FolderDataset(
+        self.test_data = AnomalibDataset(
             samples=test_samples,
             split="test",
-            mask_dir=self.mask_dir,
             pre_process=self.pre_process_val,
             task=self.task,
         )

diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py
index c5df312486..af7d186a59 100644
--- a/anomalib/data/mvtec.py
+++ b/anomalib/data/mvtec.py
@@ -31,21 +31,18 @@
 import tarfile
 import warnings
 from pathlib import Path
-from typing import Dict, Optional, Tuple, Union
+from typing import Optional, Tuple, Union
 from urllib.request import urlretrieve
 
 import albumentations as A
-import cv2
-import numpy as np
 import pandas as pd
 from pandas.core.frame import DataFrame
 from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY
 from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
-from torch import Tensor
 from torch.utils.data import DataLoader
 
 from anomalib.data.base import AnomalibDataModule, AnomalibDataset
-from anomalib.data.utils import DownloadProgressBar, hash_check, read_image
+from anomalib.data.utils import DownloadProgressBar, hash_check
 from anomalib.data.utils.split import (
     create_validation_set_from_test_set,
     split_normal_images_in_train_set,
@@ -153,116 +150,6 @@ def make_mvtec_dataset(
 
-class MVTecDataset(AnomalibDataset):
-    """MVTec AD PyTorch Dataset."""
-
-    def __init__(
-        self,
-        samples: DataFrame,
-        root: Union[Path, str],
-        category: str,
-        pre_process: PreProcessor,
-        split: str,
-        task: str = "segmentation",
-    ) -> None:
-        """Mvtec AD Dataset class.
-
-        Args:
-            root: Path to the MVTec AD dataset
-            category: Name of the MVTec AD category.
-            pre_process: List of pre_processing object containing albumentation compose.
-            split: 'train', 'val' or 'test'
-            task: ``classification`` or ``segmentation``
-            seed: seed used for the random subset splitting
-            create_validation_set: Create a validation subset in addition to the train and test subsets
-
-        Examples:
-            >>> from anomalib.data.mvtec import MVTecDataset
-            >>> from anomalib.data.transforms import PreProcessor
-            >>> pre_process = PreProcessor(image_size=256)
-            >>> dataset = MVTecDataset(
-            ...     root='./datasets/MVTec',
-            ...     category='leather',
-            ...     pre_process=pre_process,
-            ...     task="classification",
-            ...     is_train=True,
-            ... )
-            >>> dataset[0].keys()
-            dict_keys(['image'])
-
-            >>> dataset.split = "test"
-            >>> dataset[0].keys()
-            dict_keys(['image', 'image_path', 'label'])
-
-            >>> dataset.task = "segmentation"
-            >>> dataset.split = "train"
-            >>> dataset[0].keys()
-            dict_keys(['image'])
-
-            >>> dataset.split = "test"
-            >>> dataset[0].keys()
-            dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask'])
-
-            >>> dataset[0]["image"].shape, dataset[0]["mask"].shape
-            (torch.Size([3, 256, 256]), torch.Size([256, 256]))
-        """
-        super().__init__(samples)
-
-        self.root = Path(root) if isinstance(root, str) else root
-        self.category: str = category
-        self.split = split
-        self.task = task
-
-        self.pre_process = pre_process
-        self.samples = samples
-
-    def __len__(self) -> int:
-        """Get length of the dataset."""
-        return len(self.samples)
-
-    def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
-        """Get dataset item for the index ``index``.
-
-        Args:
-            index (int): Index to get the item.
-
-        Returns:
-            Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
-            Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
-        """
-        item: Dict[str, Union[str, Tensor]] = {}
-
-        image_path = self.samples.image_path[index]
-        image = read_image(image_path)
-
-        pre_processed = self.pre_process(image=image)
-        item = {"image": pre_processed["image"]}
-
-        if self.split in ["val", "test"]:
-            label_index = self.samples.label_index[index]
-
-            item["image_path"] = image_path
-            item["label"] = label_index
-
-            if self.task == "segmentation":
-                mask_path = self.samples.mask_path[index]
-
-                # Only Anomalous (1) images has masks in MVTec AD dataset.
-                # Therefore, create empty mask for Normal (0) images.
-                if label_index == 0:
-                    mask = np.zeros(shape=image.shape[:2])
-                else:
-                    mask = cv2.imread(mask_path, flags=0) / 255.0
-
-                pre_processed = self.pre_process(image=image, mask=mask)
-
-                item["mask_path"] = mask_path
-                item["image"] = pre_processed["image"]
-                item["mask"] = pre_processed["mask"]
-
-        return item
-
-
 @DATAMODULE_REGISTRY
 class MVTec(AnomalibDataModule):
     """MVTec AD Lightning Data Module."""
 
@@ -370,7 +257,7 @@ def prepare_data(self) -> None:
             tar_file.extractall(self.root)
 
         logger.info("Cleaning the tar file")
-        (zip_filename).unlink()
+        zip_filename.unlink()
 
     def setup(self, stage: Optional[str] = None) -> None:
         """Setup train, validation and test data.
@@ -397,32 +265,28 @@ def setup(self, stage: Optional[str] = None) -> None:
         logger.info("Setting up train, validation, test and prediction datasets.")
         if stage in (None, "fit"):
             train_samples = samples[samples.split == "train"]
             train_samples = train_samples.reset_index(drop=True)
-            self.train_data = MVTecDataset(
+            self.train_data = AnomalibDataset(
                 samples=train_samples,
-                root=self.root,
-                category=self.category,
                 pre_process=self.pre_process_train,
                 split="train",
                 task=self.task,
             )
 
         if self.create_validation_set:
             val_samples = samples[samples.split == "val"]
             val_samples = val_samples.reset_index(drop=True)
-            self.val_data = MVTecDataset(
+            self.val_data = AnomalibDataset(
                 samples=val_samples,
-                root=self.root,
-                category=self.category,
                 pre_process=self.pre_process_val,
                 split="val",
                 task=self.task,
             )
 
         test_samples = samples[samples.split == "test"]
         test_samples = test_samples.reset_index(drop=True)
-        self.test_data = MVTecDataset(
+        self.test_data = AnomalibDataset(
             samples=test_samples,
-            root=self.root,
-            category=self.category,
             pre_process=self.pre_process_val,
             split="test",
             task=self.task,
         )

From 62a04f868854cabe306c8f87c96eb7e0aca99744 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Tue, 13 Sep 2022 18:18:27 +0200
Subject: [PATCH 05/96] move setup to base class, create samples as class method

---
 anomalib/data/base.py   |  68 ++++++++++++-
 anomalib/data/folder.py | 215 ++++++++++++++--------------------------
 anomalib/data/mvtec.py  | 206 ++++++++++++--------------------------
 3 files changed, 203 insertions(+), 286 deletions(-)

diff --git a/anomalib/data/base.py b/anomalib/data/base.py
index ad5fd14bf8..20bf9f52a9 100644
--- a/anomalib/data/base.py
+++ b/anomalib/data/base.py
@@ -3,9 +3,11 @@
 # Copyright (C) 2022 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
-from abc import ABC
-from typing import Dict, Optional, Union
+import logging
+from abc import ABC, abstractmethod
+from typing import Dict, Optional, Tuple, Union
 
+import albumentations as A
 import cv2
 import numpy as np
 from pandas import DataFrame
@@ -16,6 +18,8 @@
 from anomalib.data.utils import read_image
 from anomalib.pre_processing import PreProcessor
 
+logger = logging.getLogger(__name__)
+
 
 class AnomalibDataset(Dataset):
     """Anomalib dataset."""
@@ -79,8 +83,66 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
 
 class AnomalibDataModule(LightningDataModule, ABC):
     """Base Anomalib data module."""
 
-    def __init__(self):
+    def __init__(
+        self,
+        task: str,
+        transform_config_train: Optional[Union[str, A.Compose]] = None,
+        transform_config_val: Optional[Union[str, A.Compose]] = None,
+        image_size: Optional[Union[int, Tuple[int, int]]] = None,
+        create_validation_set: bool = False,
+    ):
         super().__init__()
+        self.task = task
+        self.create_validation_set = create_validation_set
+
+        if transform_config_train is not None and transform_config_val is None:
+            transform_config_val = transform_config_train
+        self.pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size)
+        self.pre_process_val = PreProcessor(config=transform_config_val, image_size=image_size)
+
         self.train_data: Optional[AnomalibDataset] = None
         self.val_data: Optional[AnomalibDataset] = None
         self.test_data: Optional[AnomalibDataset] = None
+
+    @abstractmethod
+    def _create_samples(self) -> DataFrame:
+        """To be implemented in subclass."""
+
+    def setup(self, stage: Optional[str] = None) -> None:
+        """Setup train, validation and test data.
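+
+        The train, validation and test subsets are built from the dataframe returned by
+        ``self._create_samples()``, so subclasses only need to implement that method.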
+
+        Args:
+            stage: Optional[str]: Train/Val/Test stages. (Default value = None)
+
+        """
+        samples = self._create_samples()
+
+        logger.info("Setting up train, validation, test and prediction datasets.")
+        if stage in (None, "fit"):
+            train_samples = samples[samples.split == "train"]
+            train_samples = train_samples.reset_index(drop=True)
+            self.train_data = AnomalibDataset(
+                samples=train_samples,
+                split="train",
+                task=self.task,
+                pre_process=self.pre_process_train,
+            )
+
+        if self.create_validation_set:
+            val_samples = samples[samples.split == "val"]
+            val_samples = val_samples.reset_index(drop=True)
+            self.val_data = AnomalibDataset(
+                samples=val_samples,
+                split="val",
+                task=self.task,
+                pre_process=self.pre_process_val,
+            )
+
+        test_samples = samples[samples.split == "test"]
+        test_samples = test_samples.reset_index(drop=True)
+        self.test_data = AnomalibDataset(
+            samples=test_samples,
+            split="test",
+            task=self.task,
+            pre_process=self.pre_process_val,
+        )

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index e883c0da37..12770ce2aa 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -18,12 +18,11 @@
 from torch.utils.data import DataLoader
 from torchvision.datasets.folder import IMG_EXTENSIONS
 
-from anomalib.data.base import AnomalibDataModule, AnomalibDataset
+from anomalib.data.base import AnomalibDataModule
 from anomalib.data.utils.split import (
     create_validation_set_from_test_set,
     split_normal_images_in_train_set,
 )
-from anomalib.pre_processing import PreProcessor
 
 logger = logging.getLogger(__name__)
 
@@ -72,90 +71,6 @@ def _prepare_files_labels(
     return filenames, labels
 
 
-def make_dataset(
-    normal_dir: Union[str, Path],
-    abnormal_dir: Union[str, Path],
-    normal_test_dir: Optional[Union[str, Path]] = None,
-    mask_dir: Optional[Union[str, Path]] = None,
-    split_ratio: float = 0.2,
-    seed: Optional[int] = None,
-    create_validation_set: bool = True,
-    extensions: Optional[Tuple[str, ...]] = None,
-):
-    """Make Folder Dataset.
-
-    Args:
-        normal_dir (Union[str, Path]): Path to the directory containing normal images.
-        abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images.
-        normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing
-            normal images for the test dataset. Normal test images will be a split of `normal_dir`
-            if `None`. Defaults to None.
-        mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing
-            the mask annotations. Defaults to None.
-        split (Optional[str], optional): Dataset split (ie., either train or test). Defaults to None.
-        split_ratio (float, optional): Ratio to split normal training images and add to the
-            test set in case test set doesn't contain any normal images.
-            Defaults to 0.2.
-        seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0.
-        create_validation_set (bool, optional):Boolean to create a validation set from the test set.
-            Those wanting to create a validation set could set this flag to ``True``.
-        extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the
- - Returns: - DataFrame: an output dataframe containing samples for the requested split (ie., train or test) - """ - - filenames = [] - labels = [] - dirs = {"normal": normal_dir, "abnormal": abnormal_dir} - - if normal_test_dir: - dirs = {**dirs, **{"normal_test": normal_test_dir}} - - for dir_type, path in dirs.items(): - if path is not None: - filename, label = _prepare_files_labels(path, dir_type, extensions) - filenames += filename - labels += label - - samples = DataFrame({"image_path": filenames, "label": labels}) - - # Create label index for normal (0) and abnormal (1) images. - samples.loc[(samples.label == "normal") | (samples.label == "normal_test"), "label_index"] = 0 - samples.loc[(samples.label == "abnormal"), "label_index"] = 1 - samples.label_index = samples.label_index.astype(int) - - # If a path to mask is provided, add it to the sample dataframe. - if mask_dir is not None: - mask_dir = _check_and_convert_path(mask_dir) - samples["mask_path"] = "" - for index, row in samples.iterrows(): - if row.label_index == 1: - samples.loc[index, "mask_path"] = str(mask_dir / row.image_path.name) - - # Ensure the pathlib objects are converted to str. - # This is because torch dataloader doesn't like pathlib. - samples = samples.astype({"image_path": "str"}) - - # Create train/test split. - # By default, all the normal samples are assigned as train. - # and all the abnormal samples are test. - samples.loc[(samples.label == "normal"), "split"] = "train" - samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test" - - if not normal_test_dir: - samples = split_normal_images_in_train_set( - samples=samples, split_ratio=split_ratio, seed=seed, normal_label="normal" - ) - - # If `create_validation_set` is set to True, the test set is split into half. - if create_validation_set: - samples = create_validation_set_from_test_set(samples, seed=seed, normal_label="normal") - - return samples - - @DATAMODULE_REGISTRY class Folder(AnomalibDataModule): """Folder Lightning Data Module.""" @@ -277,7 +192,13 @@ def __init__( torch.Size([12, 3, 256, 256]) torch.Size([12, 256, 256]) """ - super().__init__() + super().__init__( + task=task, + transform_config_train=transform_config_train, + transform_config_val=transform_config_val, + image_size=image_size, + create_validation_set=create_validation_set, + ) if seed is None and normal_test_dir is None: raise ValueError( @@ -286,16 +207,6 @@ def __init__( " This will lead to inconsistency between runs." ) - self.root = _check_and_convert_path(root) - self.normal_dir = self.root / normal_dir - self.abnormal_dir = self.root / abnormal_dir if abnormal_dir is not None else None - self.normal_test = normal_test_dir - if normal_test_dir: - self.normal_test = self.root / normal_test_dir - self.mask_dir = mask_dir - self.extensions = extensions - self.split_ratio = split_ratio - if task == "segmentation" and mask_dir is None: warnings.warn( "Segmentation task is requested, but mask directory is not provided. 
" @@ -305,16 +216,20 @@ def __init__( else: self.task = task + self.root = _check_and_convert_path(root) + self.normal_dir = self.root / normal_dir + self.abnormal_dir = self.root / abnormal_dir if abnormal_dir is not None else None + self.normal_test_dir = normal_test_dir + if normal_test_dir: + self.normal_test_dir = self.root / normal_test_dir + self.mask_dir = mask_dir + self.extensions = extensions + self.split_ratio = split_ratio + self.transform_config_train = transform_config_train self.transform_config_val = transform_config_val self.image_size = image_size - if self.transform_config_train is not None and self.transform_config_val is None: - self.transform_config_val = self.transform_config_train - - self.pre_process_train = PreProcessor(config=self.transform_config_train, image_size=self.image_size) - self.pre_process_val = PreProcessor(config=self.transform_config_val, image_size=self.image_size) - self.train_batch_size = train_batch_size self.test_batch_size = test_batch_size self.num_workers = num_workers @@ -322,53 +237,69 @@ def __init__( self.create_validation_set = create_validation_set self.seed = seed - def setup(self, stage: Optional[str] = None) -> None: - """Setup train, validation and test data. + def _create_samples(self): + """Create the dataframe with samples for the Folder dataset. - Args: - stage: Optional[str]: Train/Val/Test stages. (Default value = None) + This function creates a dataframe to store the parsed information based on the following format: + |---|-------------------|--------|-------------|------------------|-------| + | | image_path | label | label_index | mask_path | split | + |---|-------------------|--------|-------------|------------------|-------| + | 0 | path/to/image.png | normal | 0 | path/to/mask.png | train | + |---|-------------------|--------|-------------|------------------|-------| + + Returns: + DataFrame: an output dataframe containing the samples of the dataset. """ - samples = make_dataset( - normal_dir=self.normal_dir, - abnormal_dir=self.abnormal_dir, - normal_test_dir=self.normal_test, - mask_dir=self.mask_dir, - split_ratio=self.split_ratio, - seed=self.seed, - create_validation_set=self.create_validation_set, - extensions=self.extensions, - ) - logger.info("Setting up train, validation, test and prediction datasets.") - if stage in (None, "fit"): - train_samples = samples[samples.split == "train"] - train_samples = train_samples.reset_index(drop=True) - self.train_data = AnomalibDataset( - samples=train_samples, - split="train", - pre_process=self.pre_process_train, - task=self.task, + filenames = [] + labels = [] + dirs = {"normal": self.normal_dir, "abnormal": self.abnormal_dir} + + if self.normal_test_dir: + dirs = {**dirs, **{"normal_test": self.normal_test_dir}} + + for dir_type, path in dirs.items(): + if path is not None: + filename, label = _prepare_files_labels(path, dir_type, self.extensions) + filenames += filename + labels += label + + samples = DataFrame({"image_path": filenames, "label": labels}) + + # Create label index for normal (0) and abnormal (1) images. + samples.loc[(samples.label == "normal") | (samples.label == "normal_test"), "label_index"] = 0 + samples.loc[(samples.label == "abnormal"), "label_index"] = 1 + samples.label_index = samples.label_index.astype(int) + + # If a path to mask is provided, add it to the sample dataframe. 
+        if self.mask_dir is not None:
+            self.mask_dir = _check_and_convert_path(self.mask_dir)
+            samples["mask_path"] = ""
+            for index, row in samples.iterrows():
+                if row.label_index == 1:
+                    samples.loc[index, "mask_path"] = str(self.mask_dir / row.image_path.name)
+
+        # Ensure the pathlib objects are converted to str.
+        # This is because torch dataloader doesn't like pathlib.
+        samples = samples.astype({"image_path": "str"})
+
+        # Create train/test split.
+        # By default, all the normal samples are assigned as train.
+        # and all the abnormal samples are test.
+        samples.loc[(samples.label == "normal"), "split"] = "train"
+        samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test"
+
+        if not self.normal_test_dir:
+            samples = split_normal_images_in_train_set(
+                samples=samples, split_ratio=self.split_ratio, seed=self.seed, normal_label="normal"
             )
 
+        # If `create_validation_set` is set to True, the test set is split into half.
         if self.create_validation_set:
-            val_samples = samples[samples.split == "val"]
-            val_samples = val_samples.reset_index(drop=True)
-            self.val_data = AnomalibDataset(
-                samples=val_samples,
-                split="val",
-                pre_process=self.pre_process_val,
-                task=self.task,
-            )
+            samples = create_validation_set_from_test_set(samples, seed=self.seed, normal_label="normal")
 
-        test_samples = samples[samples.split == "test"]
-        test_samples = test_samples.reset_index(drop=True)
-        self.test_data = AnomalibDataset(
-            samples=test_samples,
-            split="test",
-            pre_process=self.pre_process_val,
-            task=self.task,
-        )
+        return samples
 
     def train_dataloader(self) -> TRAIN_DATALOADERS:
         """Get train dataloader."""

diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py
index af7d186a59..466420eebd 100644
--- a/anomalib/data/mvtec.py
+++ b/anomalib/data/mvtec.py
@@ -41,115 +41,16 @@
 from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
 from torch.utils.data import DataLoader
 
-from anomalib.data.base import AnomalibDataModule, AnomalibDataset
+from anomalib.data.base import AnomalibDataModule
 from anomalib.data.utils import DownloadProgressBar, hash_check
 from anomalib.data.utils.split import (
     create_validation_set_from_test_set,
     split_normal_images_in_train_set,
 )
-from anomalib.pre_processing import PreProcessor
 
 logger = logging.getLogger(__name__)
 
 
-def make_mvtec_dataset(
-    path: Path,
-    split_ratio: float = 0.1,
-    seed: Optional[int] = None,
-    create_validation_set: bool = False,
-) -> DataFrame:
-    """Create MVTec AD samples by parsing the MVTec AD data file structure.
-
-    The files are expected to follow the structure:
-        path/to/dataset/split/category/image_filename.png
-        path/to/dataset/ground_truth/category/mask_filename.png
-
-    This function creates a dataframe to store the parsed information based on the following format:
-    |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
-    |   | path          | split | label   | image_path    | mask_path                             | label_index |
-    |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
-    | 0 | datasets/name | test  | defect  | filename.png  | ground_truth/defect/filename_mask.png | 1           |
-    |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
-
-    Args:
-        path (Path): Path to dataset
-        split (str, optional): Dataset split (ie., either train or test). Defaults to None.
-        split_ratio (float, optional): Ratio to split normal training images and add to the
-            test set in case test set doesn't contain any normal images.
-            Defaults to 0.1.
-        seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0.
-        create_validation_set (bool, optional): Boolean to create a validation set from the test set.
-            MVTec AD dataset does not contain a validation set. Those wanting to create a validation set
-            could set this flag to ``True``.
-
-    Example:
-        The following example shows how to get training samples from MVTec AD bottle category:
-
-        >>> root = Path('./MVTec')
-        >>> category = 'bottle'
-        >>> path = root / category
-        >>> path
-        PosixPath('MVTec/bottle')
-
-        >>> samples = make_mvtec_dataset(path, split='train', split_ratio=0.1, seed=0)
-        >>> samples.head()
-           path         split label image_path                       mask_path                                    label_index
-        0  MVTec/bottle train good  MVTec/bottle/train/good/105.png MVTec/bottle/ground_truth/good/105_mask.png 0
-        1  MVTec/bottle train good  MVTec/bottle/train/good/017.png MVTec/bottle/ground_truth/good/017_mask.png 0
-        2  MVTec/bottle train good  MVTec/bottle/train/good/137.png MVTec/bottle/ground_truth/good/137_mask.png 0
-        3  MVTec/bottle train good  MVTec/bottle/train/good/152.png MVTec/bottle/ground_truth/good/152_mask.png 0
-        4  MVTec/bottle train good  MVTec/bottle/train/good/109.png MVTec/bottle/ground_truth/good/109_mask.png 0
-
-    Returns:
-        DataFrame: an output dataframe containing samples for the requested split (ie., train or test)
-    """
-    if seed is None:
-        warnings.warn(
-            "seed is None."
-            " When seed is not set, images from the normal directory are split between training and test dir."
-            " This will lead to inconsistency between runs."
-        )
-
-    samples_list = [(str(path),) + filename.parts[-3:] for filename in path.glob("**/*.png")]
-    if len(samples_list) == 0:
-        raise RuntimeError(f"Found 0 images in {path}")
-
-    samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"])
-    samples = samples[samples.split != "ground_truth"]
-
-    # Create mask_path column
-    samples["mask_path"] = (
-        samples.path
-        + "/ground_truth/"
-        + samples.label
-        + "/"
-        + samples.image_path.str.rstrip("png").str.rstrip(".")
-        + "_mask.png"
-    )
-
-    # Modify image_path column by converting to absolute path
-    samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path
-
-    # Split the normal images in training set if test set doesn't
-    # contain any normal images. This is needed because AUC score
-    # cannot be computed based on 1-class
-    if sum((samples.split == "test") & (samples.label == "good")) == 0:
-        samples = split_normal_images_in_train_set(samples, split_ratio, seed)
-
-    # Good images don't have mask
-    samples.loc[(samples.split == "test") & (samples.label == "good"), "mask_path"] = ""
-
-    # Create label index for normal (0) and anomalous (1) images.
-    samples.loc[(samples.label == "good"), "label_index"] = 0
-    samples.loc[(samples.label != "good"), "label_index"] = 1
-    samples.label_index = samples.label_index.astype(int)
-
-    if create_validation_set:
-        samples = create_validation_set_from_test_set(samples, seed=seed)
-
-    return samples
-
-
 @DATAMODULE_REGISTRY
 class MVTec(AnomalibDataModule):
     """MVTec AD Lightning Data Module."""
 
@@ -165,6 +66,7 @@ def __init__(
         task: str = "segmentation",
         transform_config_train: Optional[Union[str, A.Compose]] = None,
         transform_config_val: Optional[Union[str, A.Compose]] = None,
+        split_ratio: float = 0.2,
         seed: Optional[int] = None,
         create_validation_set: bool = False,
     ) -> None:
@@ -209,7 +111,13 @@ def __init__(
         >>> data["image"].shape, data["mask"].shape
         (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256]))
         """
-        super().__init__()
+        super().__init__(
+            task=task,
+            transform_config_train=transform_config_train,
+            transform_config_val=transform_config_val,
+            image_size=image_size,
+            create_validation_set=create_validation_set,
+        )
 
         self.root = root if isinstance(root, Path) else Path(root)
         self.category = category
@@ -218,12 +126,6 @@ def __init__(
         self.transform_config_val = transform_config_val
         self.image_size = image_size
 
-        if self.transform_config_train is not None and self.transform_config_val is None:
-            self.transform_config_val = self.transform_config_train
-
-        self.pre_process_train = PreProcessor(config=self.transform_config_train, image_size=self.image_size)
-        self.pre_process_val = PreProcessor(config=self.transform_config_val, image_size=self.image_size)
-
         self.train_batch_size = train_batch_size
         self.test_batch_size = test_batch_size
         self.num_workers = num_workers
@@ -231,6 +133,7 @@ def __init__(
         self.create_validation_set = create_validation_set
         self.task = task
         self.seed = seed
+        self.split_ratio = split_ratio
 
     def prepare_data(self) -> None:
         """Download the dataset if not available."""
@@ -259,48 +162,69 @@ def prepare_data(self) -> None:
         logger.info("Cleaning the tar file")
         zip_filename.unlink()
 
-    def setup(self, stage: Optional[str] = None) -> None:
-        """Setup train, validation and test data.
+    def _create_samples(self) -> DataFrame:
+        """Create MVTec AD samples by parsing the MVTec AD data file structure.
 
-        Args:
-            stage: Optional[str]: Train/Val/Test stages. (Default value = None)
+        The files are expected to follow the structure:
+            path/to/dataset/split/category/image_filename.png
+            path/to/dataset/ground_truth/category/mask_filename.png
 
+        This function creates a dataframe to store the parsed information based on the following format:
+        |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
+        |   | path          | split | label   | image_path    | mask_path                             | label_index |
+        |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
+        | 0 | datasets/name | test  | defect  | filename.png  | ground_truth/defect/filename_mask.png | 1           |
+        |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
+
+        Returns:
+            DataFrame: an output dataframe containing the samples of the dataset.
         """
+        if self.seed is None:
+            warnings.warn(
+                "seed is None."
+                " When seed is not set, images from the normal directory are split between training and test dir."
+                " This will lead to inconsistency between runs."
+ ) + + path = self.root / self.category + samples_list = [(str(path),) + filename.parts[-3:] for filename in path.glob("**/*.png")] + if len(samples_list) == 0: + raise RuntimeError(f"Found 0 images in {path}") + + samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"]) + samples = samples[samples.split != "ground_truth"] + + # Create mask_path column + samples["mask_path"] = ( + samples.path + + "/ground_truth/" + + samples.label + + "/" + + samples.image_path.str.rstrip("png").str.rstrip(".") + + "_mask.png" ) - logger.info("Setting up train, validation, test and prediction datasets.") - if stage in (None, "fit"): - train_samples = samples[samples.split == "train"] - train_samples = train_samples.reset_index(drop=True) - self.train_data = AnomalibDataset( - samples=train_samples, - pre_process=self.pre_process_train, - split="train", - task=self.task, - ) + # Modify image_path column by converting to absolute path + samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path + + # Split the normal images in training set if test set doesn't + # contain any normal images. This is needed because AUC score + # cannot be computed based on 1-class + if sum((samples.split == "test") & (samples.label == "good")) == 0: + samples = split_normal_images_in_train_set(samples, self.split_ratio, self.seed) + + # Good images don't have mask + samples.loc[(samples.split == "test") & (samples.label == "good"), "mask_path"] = "" + + # Create label index for normal (0) and anomalous (1) images. + samples.loc[(samples.label == "good"), "label_index"] = 0 + samples.loc[(samples.label != "good"), "label_index"] = 1 + samples.label_index = samples.label_index.astype(int) if self.create_validation_set: - val_samples = samples[samples.split == "val"] - val_samples = val_samples.reset_index(drop=True) - self.val_data = AnomalibDataset( - samples=val_samples, - pre_process=self.pre_process_val, - split="val", - task=self.task, - ) + samples = create_validation_set_from_test_set(samples, seed=self.seed) - test_samples = samples[samples.split == "test"] - test_samples = test_samples.reset_index(drop=True) - self.test_data = AnomalibDataset( - samples=test_samples, - pre_process=self.pre_process_val, - split="test", - task=self.task, - ) + return samples def train_dataloader(self) -> TRAIN_DATALOADERS: """Get train dataloader.""" From e91afad8e9e0187d0e2fa2a5fed1bb038e1bc7db Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Tue, 13 Sep 2022 19:32:23 +0200 Subject: [PATCH 06/96] update docstrings --- anomalib/data/base.py | 19 ++++++++++++++++++- anomalib/data/folder.py | 6 +++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 20bf9f52a9..f053fa7532 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -106,7 +106,24 @@ def __init__( @abstractmethod def _create_samples(self) -> DataFrame: - """To be implemented in subclass.""" + """This method should be implemented in the subclass. + + This method should return a dataframe that contains the information needed by the dataloader to load each of + the dataset items into memory. The dataframe must at least contain the following columns: + split - The subset to which the dataset item is assigned. + image_path - Path to file system location where the image is stored. + label_index - Index of the anomaly label, typically 0 for "normal" and 1 for "anomalous". 
+ + Additionally, when the task type is segmentation, the dataframe must have the mask_path column, which contains + the path the ground truth masks (for the anomalous images only). + + Example of a dataframe returned by calling this method from a concrete class: + |---|-------------------|-----------|-------------|------------------|-------| + | | image_path | label | label_index | mask_path | split | + |---|-------------------|-----------|-------------|------------------|-------| + | 0 | path/to/image.png | anomalous | 0 | path/to/mask.png | train | + |---|-------------------|-----------|-------------|------------------|-------| + """ def setup(self, stage: Optional[str] = None) -> None: """Setup train, validation and test data. diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 12770ce2aa..aea65a4d64 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -240,6 +240,11 @@ def __init__( def _create_samples(self): """Create the dataframe with samples for the Folder dataset. + The files are expected to follow the structure: + path/to/dataset/normal_folder_name/normal_image_name.png + path/to/dataset/abnormal_folder_name/abnormal_image_name.png + + This function creates a dataframe to store the parsed information based on the following format: |---|-------------------|--------|-------------|------------------|-------| | | image_path | label | label_index | mask_path | split | @@ -249,7 +254,6 @@ def _create_samples(self): Returns: DataFrame: an output dataframe containing the samples of the dataset. - """ filenames = [] From df4a805d7f59814da9a1f0e3466c99c527d1e5e0 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 14 Sep 2022 12:37:11 +0200 Subject: [PATCH 07/96] refactor btech to new format --- anomalib/data/btech.py | 352 ++++++++--------------------------------- 1 file changed, 66 insertions(+), 286 deletions(-) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 8b6bac792b..9f746f0e5c 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -11,260 +11,38 @@ import logging import shutil -import warnings import zipfile from pathlib import Path -from typing import Dict, Optional, Tuple, Union +from typing import Optional, Tuple, Union from urllib.request import urlretrieve import albumentations as A import cv2 -import numpy as np import pandas as pd from pandas.core.frame import DataFrame -from pytorch_lightning.core.datamodule import LightningDataModule from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from torch import Tensor from torch.utils.data import DataLoader -from torch.utils.data.dataset import Dataset -from torchvision.datasets.folder import VisionDataset from tqdm import tqdm -from anomalib.data.inference import InferenceDataset -from anomalib.data.utils import DownloadProgressBar, hash_check, read_image +from anomalib.data.base import AnomalibDataModule +from anomalib.data.utils import DownloadProgressBar, hash_check from anomalib.data.utils.split import ( create_validation_set_from_test_set, split_normal_images_in_train_set, ) -from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) -def make_btech_dataset( - path: Path, - split: Optional[str] = None, - split_ratio: float = 0.1, - seed: Optional[int] = None, - create_validation_set: bool = False, -) -> DataFrame: - """Create BTech samples by parsing the BTech data file structure. 
- - The files are expected to follow the structure: - path/to/dataset/split/category/image_filename.png - path/to/dataset/ground_truth/category/mask_filename.png - - Args: - path (Path): Path to dataset - split (str, optional): Dataset split (ie., either train or test). Defaults to None. - split_ratio (float, optional): Ratio to split normal training images and add to the - test set in case test set doesn't contain any normal images. - Defaults to 0.1. - seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0. - create_validation_set (bool, optional): Boolean to create a validation set from the test set. - BTech dataset does not contain a validation set. Those wanting to create a validation set - could set this flag to ``True``. - - Example: - The following example shows how to get training samples from BTech 01 category: - - >>> root = Path('./BTech') - >>> category = '01' - >>> path = root / category - >>> path - PosixPath('BTech/01') - - >>> samples = make_btech_dataset(path, split='train', split_ratio=0.1, seed=0) - >>> samples.head() - path split label image_path mask_path label_index - 0 BTech/01 train 01 BTech/01/train/ok/105.bmp BTech/01/ground_truth/ok/105.png 0 - 1 BTech/01 train 01 BTech/01/train/ok/017.bmp BTech/01/ground_truth/ok/017.png 0 - ... - - Returns: - DataFrame: an output dataframe containing samples for the requested split (ie., train or test) - """ - samples_list = [ - (str(path),) + filename.parts[-3:] for filename in path.glob("**/*") if filename.suffix in (".bmp", ".png") - ] - if len(samples_list) == 0: - raise RuntimeError(f"Found 0 images in {path}") - - samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"]) - samples = samples[samples.split != "ground_truth"] - - # Create mask_path column - samples["mask_path"] = ( - samples.path - + "/ground_truth/" - + samples.label - + "/" - + samples.image_path.str.rstrip("png").str.rstrip(".") - + ".png" - ) - - # Modify image_path column by converting to absolute path - samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path - - # Split the normal images in training set if test set doesn't - # contain any normal images. This is needed because AUC score - # cannot be computed based on 1-class - if sum((samples.split == "test") & (samples.label == "ok")) == 0: - samples = split_normal_images_in_train_set(samples, split_ratio, seed) - - # Good images don't have mask - samples.loc[(samples.split == "test") & (samples.label == "ok"), "mask_path"] = "" - - # Create label index for normal (0) and anomalous (1) images. - samples.loc[(samples.label == "ok"), "label_index"] = 0 - samples.loc[(samples.label != "ok"), "label_index"] = 1 - samples.label_index = samples.label_index.astype(int) - - if create_validation_set: - samples = create_validation_set_from_test_set(samples, seed=seed) - - # Get the data frame for the split. - if split is not None and split in ["train", "val", "test"]: - samples = samples[samples.split == split] - samples = samples.reset_index(drop=True) - - return samples - - -class BTechDataset(VisionDataset): - """BTech PyTorch Dataset.""" - - def __init__( - self, - root: Union[Path, str], - category: str, - pre_process: PreProcessor, - split: str, - task: str = "segmentation", - seed: Optional[int] = None, - create_validation_set: bool = False, - ) -> None: - """Btech Dataset class. - - Args: - root: Path to the BTech dataset - category: Name of the BTech category. 
- pre_process: List of pre_processing object containing albumentation compose. - split: 'train', 'val' or 'test' - task: ``classification`` or ``segmentation`` - seed: seed used for the random subset splitting - create_validation_set: Create a validation subset in addition to the train and test subsets - - Examples: - >>> from anomalib.data.btech import BTechDataset - >>> from anomalib.data.transforms import PreProcessor - >>> pre_process = PreProcessor(image_size=256) - >>> dataset = BTechDataset( - ... root='./datasets/BTech', - ... category='leather', - ... pre_process=pre_process, - ... task="classification", - ... is_train=True, - ... ) - >>> dataset[0].keys() - dict_keys(['image']) - - >>> dataset.split = "test" - >>> dataset[0].keys() - dict_keys(['image', 'image_path', 'label']) - - >>> dataset.task = "segmentation" - >>> dataset.split = "train" - >>> dataset[0].keys() - dict_keys(['image']) - - >>> dataset.split = "test" - >>> dataset[0].keys() - dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask']) - - >>> dataset[0]["image"].shape, dataset[0]["mask"].shape - (torch.Size([3, 256, 256]), torch.Size([256, 256])) - """ - super().__init__(root) - - if seed is None: - warnings.warn( - "seed is None." - " When seed is not set, images from the normal directory are split between training and test dir." - " This will lead to inconsistency between runs." - ) - - self.root = Path(root) if isinstance(root, str) else root - self.category: str = category - self.split = split - self.task = task - - self.pre_process = pre_process - - self.samples = make_btech_dataset( - path=self.root / category, - split=self.split, - seed=seed, - create_validation_set=create_validation_set, - ) - - def __len__(self) -> int: - """Get length of the dataset.""" - return len(self.samples) - - def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: - """Get dataset item for the index ``index``. - - Args: - index (int): Index to get the item. - - Returns: - Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training. - Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box. - """ - item: Dict[str, Union[str, Tensor]] = {} - - image_path = self.samples.image_path[index] - image = read_image(image_path) - - pre_processed = self.pre_process(image=image) - item = {"image": pre_processed["image"]} - - if self.split in ["val", "test"]: - label_index = self.samples.label_index[index] - - item["image_path"] = image_path - item["label"] = label_index - - if self.task == "segmentation": - mask_path = self.samples.mask_path[index] - - # Only Anomalous (1) images has masks in BTech dataset. - # Therefore, create empty mask for Normal (0) images. - if label_index == 0: - mask = np.zeros(shape=image.shape[:2]) - else: - mask = cv2.imread(mask_path, flags=0) / 255.0 - - pre_processed = self.pre_process(image=image, mask=mask) - - item["mask_path"] = mask_path - item["image"] = pre_processed["image"] - item["mask"] = pre_processed["mask"] - - return item - - @DATAMODULE_REGISTRY -class BTech(LightningDataModule): +class BTech(AnomalibDataModule): """BTechDataModule Lightning Data Module.""" def __init__( self, root: str, category: str, - # TODO: Remove default values. 
IAAALD-211 image_size: Optional[Union[int, Tuple[int, int]]] = None, train_batch_size: int = 32, test_batch_size: int = 32, @@ -272,6 +50,7 @@ def __init__( task: str = "segmentation", transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_val: Optional[Union[str, A.Compose]] = None, + split_ratio: float = 0.2, seed: Optional[int] = None, create_validation_set: bool = False, ) -> None: @@ -316,21 +95,21 @@ def __init__( >>> data["image"].shape, data["mask"].shape (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256])) """ - super().__init__() + super().__init__( + task=task, + transform_config_train=transform_config_train, + transform_config_val=transform_config_val, + image_size=image_size, + create_validation_set=create_validation_set, + ) self.root = root if isinstance(root, Path) else Path(root) self.category = category - self.dataset_path = self.root / self.category + self.path = self.root / self.category self.transform_config_train = transform_config_train self.transform_config_val = transform_config_val self.image_size = image_size - if self.transform_config_train is not None and self.transform_config_val is None: - self.transform_config_val = self.transform_config_train - - self.pre_process_train = PreProcessor(config=self.transform_config_train, image_size=self.image_size) - self.pre_process_val = PreProcessor(config=self.transform_config_val, image_size=self.image_size) - self.train_batch_size = train_batch_size self.test_batch_size = test_batch_size self.num_workers = num_workers @@ -338,12 +117,7 @@ def __init__( self.create_validation_set = create_validation_set self.task = task self.seed = seed - - self.train_data: Dataset - self.test_data: Dataset - if create_validation_set: - self.val_data: Dataset - self.inference_data: Dataset + self.split_ratio = split_ratio def prepare_data(self) -> None: """Download the dataset if not available.""" @@ -386,53 +160,65 @@ def prepare_data(self) -> None: logger.info("Cleaning the tar file") zip_filename.unlink() - def setup(self, stage: Optional[str] = None) -> None: - """Setup train, validation and test data. + def _create_samples(self) -> DataFrame: + """Create BTech samples by parsing the BTech data file structure. - BTech dataset uses BTech dataset structure, which is the reason for - using `anomalib.data.btech.BTech` class to get the dataset items. + The files are expected to follow the structure: + path/to/dataset/category/split/[ok|ko]/image_filename.bmp + path/to/dataset/category/ground_truth/ko/mask_filename.png - Args: - stage: Optional[str]: Train/Val/Test stages. (Default value = None) + This function creates a dataframe to store the parsed information based on the following format: + |---|---------------|-------|---------|---------------|---------------------------------------|-------------| + | | path | split | label | image_path | mask_path | label_index | + |---|---------------|-------|---------|---------------|---------------------------------------|-------------| + | 0 | datasets/name | test | ko | filename.png | ground_truth/ko/filename_mask.png | 1 | + |---|---------------|-------|---------|---------------|---------------------------------------|-------------| + Returns: + DataFrame: an output dataframe containing the samples of the dataset. 
""" - logger.info("Setting up train, validation, test and prediction datasets.") - if stage in (None, "fit"): - self.train_data = BTechDataset( - root=self.root, - category=self.category, - pre_process=self.pre_process_train, - split="train", - task=self.task, - seed=self.seed, - create_validation_set=self.create_validation_set, - ) + samples_list = [ + (str(self.path),) + filename.parts[-3:] + for filename in self.path.glob("**/*") + if filename.suffix in (".bmp", ".png") + ] + if len(samples_list) == 0: + raise RuntimeError(f"Found 0 images in {self.path}") + + samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"]) + samples = samples[samples.split != "ground_truth"] + + # Create mask_path column + samples["mask_path"] = ( + samples.path + + "/ground_truth/" + + samples.label + + "/" + + samples.image_path.str.rstrip("bmp").str.rstrip(".") + + ".png" + ) + + # Modify image_path column by converting to absolute path + samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path + + # Split the normal images in training set if test set doesn't + # contain any normal images. This is needed because AUC score + # cannot be computed based on 1-class + if sum((samples.split == "test") & (samples.label == "ok")) == 0: + samples = split_normal_images_in_train_set(samples, self.split_ratio, self.seed) + + # Good images don't have mask + samples.loc[(samples.split == "test") & (samples.label == "ok"), "mask_path"] = "" + + # Create label index for normal (0) and anomalous (1) images. + samples.loc[(samples.label == "ok"), "label_index"] = 0 + samples.loc[(samples.label != "ok"), "label_index"] = 1 + samples.label_index = samples.label_index.astype(int) if self.create_validation_set: - self.val_data = BTechDataset( - root=self.root, - category=self.category, - pre_process=self.pre_process_val, - split="val", - task=self.task, - seed=self.seed, - create_validation_set=self.create_validation_set, - ) - - self.test_data = BTechDataset( - root=self.root, - category=self.category, - pre_process=self.pre_process_val, - split="test", - task=self.task, - seed=self.seed, - create_validation_set=self.create_validation_set, - ) + samples = create_validation_set_from_test_set(samples, seed=self.seed) - if stage == "predict": - self.inference_data = InferenceDataset( - path=self.root, image_size=self.image_size, transform_config=self.transform_config_val - ) + return samples def train_dataloader(self) -> TRAIN_DATALOADERS: """Get train dataloader.""" @@ -446,9 +232,3 @@ def val_dataloader(self) -> EVAL_DATALOADERS: def test_dataloader(self) -> EVAL_DATALOADERS: """Get test dataloader.""" return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) - - def predict_dataloader(self) -> EVAL_DATALOADERS: - """Get predict dataloader.""" - return DataLoader( - self.inference_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers - ) From c225a835ac7295162d6f3a4c42b7f7158f98a4c2 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 14 Sep 2022 17:14:58 +0200 Subject: [PATCH 08/96] allow training with no anomalous data --- anomalib/data/__init__.py | 7 ++++--- anomalib/utils/metrics/adaptive_threshold.py | 10 ++++++++++ tools/train.py | 7 +++++-- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 8c295a1061..f1691620f5 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ 
-7,7 +7,8 @@ from typing import Union from omegaconf import DictConfig, ListConfig -from pytorch_lightning import LightningDataModule + +from anomalib.data.base import AnomalibDataModule from .btech import BTech from .folder import Folder @@ -17,7 +18,7 @@ logger = logging.getLogger(__name__) -def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule: +def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: """Get Anomaly Datamodule. Args: @@ -28,7 +29,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule """ logger.info("Loading the datamodule") - datamodule: LightningDataModule + datamodule: AnomalibDataModule if config.dataset.format.lower() == "mvtec": datamodule = MVTec( diff --git a/anomalib/utils/metrics/adaptive_threshold.py b/anomalib/utils/metrics/adaptive_threshold.py index fd112433f1..868c6e2ad6 100644 --- a/anomalib/utils/metrics/adaptive_threshold.py +++ b/anomalib/utils/metrics/adaptive_threshold.py @@ -3,6 +3,8 @@ # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import warnings + import torch from torchmetrics import PrecisionRecallCurve @@ -33,6 +35,14 @@ def compute(self) -> torch.Tensor: recall: torch.Tensor thresholds: torch.Tensor + if not any(1 in batch for batch in self.target): + warnings.warn( + "The validation set does not contain any anomalous images. As a result, the adaptive threshold will " + "take the value of the highest anomaly score observed in the normal validation images, which may lead " + "to poor predictions. For a more reliable adaptive threshold computation, please add some anomalous " + "images to the validation set." + ) + precision, recall, thresholds = super().compute() f1_score = (2 * precision * recall) / (precision + recall + 1e-10) if thresholds.dim() == 0: diff --git a/tools/train.py b/tools/train.py index 0e5daa3b10..b1f176a591 100644 --- a/tools/train.py +++ b/tools/train.py @@ -63,8 +63,11 @@ def train(): load_model_callback = LoadModelCallback(weights_path=trainer.checkpoint_callback.best_model_path) trainer.callbacks.insert(0, load_model_callback) - logger.info("Testing the model.") - trainer.test(model=model, datamodule=datamodule) + if datamodule.test_data.contains_anomalous_images(): + logger.info("Testing the model.") + trainer.test(model=model, datamodule=datamodule) + else: + logger.info("No anomalous images found in dataset. Skipping test stage.") if __name__ == "__main__": From ac0dc8a939ec3c01d683218d895d12a3c7cf8d6c Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 15 Sep 2022 10:56:38 +0200 Subject: [PATCH 09/96] remove MVTec name from comment --- anomalib/data/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index f053fa7532..4561f30365 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -64,7 +64,7 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: if self.task == "segmentation": mask_path = self.samples.mask_path[index] - # Only Anomalous (1) images has masks in MVTec AD dataset. + # Only Anomalous (1) images have masks in anomaly datasets # Therefore, create empty mask for Normal (0) images. 
if label_index == 0: mask = np.zeros(shape=image.shape[:2]) From 5d90209cb036e3d6464fece4c1deac021efe598e Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 15 Sep 2022 10:58:07 +0200 Subject: [PATCH 10/96] raise NotImplementedError in base class --- anomalib/data/base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 4561f30365..b02b1f7d3c 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -124,6 +124,7 @@ def _create_samples(self) -> DataFrame: | 0 | path/to/image.png | anomalous | 0 | path/to/mask.png | train | |---|-------------------|-----------|-------------|------------------|-------| """ + raise NotImplementedError def setup(self, stage: Optional[str] = None) -> None: """Setup train, validation and test data. From c1e6724f4c3cf516c086bfe2058c188a7d446f07 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 15 Sep 2022 14:12:15 +0200 Subject: [PATCH 11/96] allow both png and bmp images for btech --- anomalib/data/btech.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 9f746f0e5c..270dcc09bf 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -194,7 +194,7 @@ def _create_samples(self) -> DataFrame: + "/ground_truth/" + samples.label + "/" - + samples.image_path.str.rstrip("bmp").str.rstrip(".") + + samples.image_path.str.rstrip("bmp|png").str.rstrip(".") + ".png" ) From 2d70d89dfee713bdd235e095f1b00ab79a24399b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 16 Sep 2022 10:24:43 +0200 Subject: [PATCH 12/96] use label_index to check if dataset contains anomalous images --- anomalib/data/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index b02b1f7d3c..1d853c783c 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -33,7 +33,7 @@ def __init__(self, samples: DataFrame, task: str, split: str, pre_process: PrePr def contains_anomalous_images(self): """Check if the dataset contains any anomalous images.""" - return "anomalous" in list(self.samples.label) + return 1 in list(self.samples.label_index) def __len__(self) -> int: """Get length of the dataset.""" From f5f17db19a3f23c174cfda725b28fd1da7cea355 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 16 Sep 2022 15:46:42 +0200 Subject: [PATCH 13/96] refactor getitem in dataset class --- anomalib/data/base.py | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 1d853c783c..3503ed164e 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -51,31 +51,29 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: """ image_path = self.samples.image_path[index] image = read_image(image_path) + label_index = self.samples.label_index[index] - pre_processed = self.pre_process(image=image) - item = {"image": pre_processed["image"]} + item = dict(image_path=image_path, label=label_index) - if self.split in ["val", "test"]: - label_index = self.samples.label_index[index] + if self.task == "classification": + pre_processed = self.pre_process(image=image) + elif self.task == "segmentation": + mask_path = self.samples.mask_path[index] - item["image_path"] = image_path - item["label"] = label_index + # Only Anomalous (1) images have masks in anomaly datasets + # Therefore, create empty mask for Normal (0) images. 
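+            # A zero-valued mask marks every pixel as normal; anomalous masks are
+            # loaded as grayscale below (flags=0) and rescaled from 8-bit values to [0, 1].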
+ if label_index == 0: + mask = np.zeros(shape=image.shape[:2]) + else: + mask = cv2.imread(mask_path, flags=0) / 255.0 - if self.task == "segmentation": - mask_path = self.samples.mask_path[index] + pre_processed = self.pre_process(image=image, mask=mask) - # Only Anomalous (1) images have masks in anomaly datasets - # Therefore, create empty mask for Normal (0) images. - if label_index == 0: - mask = np.zeros(shape=image.shape[:2]) - else: - mask = cv2.imread(mask_path, flags=0) / 255.0 - - pre_processed = self.pre_process(image=image, mask=mask) - - item["mask_path"] = mask_path - item["image"] = pre_processed["image"] - item["mask"] = pre_processed["mask"] + item["mask_path"] = mask_path + item["mask"] = pre_processed["mask"] + else: + raise ValueError(f"Unknown task type: {self.task}") + item["image"] = pre_processed["image"] return item From f02065f50ba3792fbbf1101d2fad249f8bebd456 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 16 Sep 2022 15:53:13 +0200 Subject: [PATCH 14/96] use iloc for indexing --- anomalib/data/base.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 3503ed164e..cb56b9359e 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -49,16 +49,16 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training. Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box. """ - image_path = self.samples.image_path[index] + image_path = self.samples.iloc[index].image_path image = read_image(image_path) - label_index = self.samples.label_index[index] + label_index = self.samples.iloc[index].label_index item = dict(image_path=image_path, label=label_index) if self.task == "classification": pre_processed = self.pre_process(image=image) elif self.task == "segmentation": - mask_path = self.samples.mask_path[index] + mask_path = self.samples.iloc[index].mask_path # Only Anomalous (1) images have masks in anomaly datasets # Therefore, create empty mask for Normal (0) images. 
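The switch to iloc in the patch above is a one-line change, but the motivation is easy to miss: plain bracket indexing on a pandas column looks up rows by index label, which only coincides with position when the dataframe carries a clean RangeIndex, whereas iloc is always positional. A minimal sketch of the difference (the dataframe here is illustrative, not part of anomalib):

import pandas as pd

df = pd.DataFrame({"image_path": ["a.png", "b.png", "c.png"]})
subset = df[df.image_path != "b.png"]  # remaining index labels: [0, 2]

# Label-based lookup fails: no row carries the label 1 after filtering.
# subset.image_path[1]  # -> KeyError
# Positional lookup works for any 0 <= index < len(subset):
print(subset.iloc[1].image_path)  # prints "c.png"

Since the datamodule resets the index when it slices the per-split dataframes, the two forms currently coincide; the change reads as defensive hardening for any caller that passes in an unfiltered or re-ordered samples dataframe.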
From 9cba9da174594d7b4388bc9b57130871dbbdf511 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 16 Sep 2022 16:43:30 +0200 Subject: [PATCH 15/96] move dataloader getters to base class --- anomalib/data/base.py | 22 +++++++++++++++++++++- anomalib/data/btech.py | 26 +++----------------------- anomalib/data/folder.py | 26 +++----------------------- anomalib/data/mvtec.py | 33 ++++++--------------------------- 4 files changed, 33 insertions(+), 74 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index cb56b9359e..aa5b5083c0 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -12,8 +12,9 @@ import numpy as np from pandas import DataFrame from pytorch_lightning import LightningDataModule +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS from torch import Tensor -from torch.utils.data import Dataset +from torch.utils.data import DataLoader, Dataset from anomalib.data.utils import read_image from anomalib.pre_processing import PreProcessor @@ -84,6 +85,9 @@ class AnomalibDataModule(LightningDataModule, ABC): def __init__( self, task: str, + train_batch_size: int, + test_batch_size: int, + num_workers: int, transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_val: Optional[Union[str, A.Compose]] = None, image_size: Optional[Union[int, Tuple[int, int]]] = None, @@ -91,6 +95,9 @@ def __init__( ): super().__init__() self.task = task + self.train_batch_size = train_batch_size + self.test_batch_size = test_batch_size + self.num_workers = num_workers self.create_validation_set = create_validation_set if transform_config_train is not None and transform_config_val is None: @@ -162,3 +169,16 @@ def setup(self, stage: Optional[str] = None) -> None: task=self.task, pre_process=self.pre_process_val, ) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + """Get train dataloader.""" + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) + + def val_dataloader(self) -> EVAL_DATALOADERS: + """Get validation dataloader.""" + dataset = self.val_data if self.create_validation_set else self.test_data + return DataLoader(dataset=dataset, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + + def test_dataloader(self) -> EVAL_DATALOADERS: + """Get test dataloader.""" + return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 270dcc09bf..f1125ce0ff 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -21,8 +21,6 @@ import pandas as pd from pandas.core.frame import DataFrame from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from torch.utils.data import DataLoader from tqdm import tqdm from anomalib.data.base import AnomalibDataModule @@ -97,6 +95,9 @@ def __init__( """ super().__init__( task=task, + train_batch_size=train_batch_size, + test_batch_size=test_batch_size, + num_workers=num_workers, transform_config_train=transform_config_train, transform_config_val=transform_config_val, image_size=image_size, @@ -106,16 +107,8 @@ def __init__( self.root = root if isinstance(root, Path) else Path(root) self.category = category self.path = self.root / self.category - self.transform_config_train = transform_config_train - self.transform_config_val = transform_config_val - self.image_size = image_size - - 
self.train_batch_size = train_batch_size - self.test_batch_size = test_batch_size - self.num_workers = num_workers self.create_validation_set = create_validation_set - self.task = task self.seed = seed self.split_ratio = split_ratio @@ -219,16 +212,3 @@ def _create_samples(self) -> DataFrame: samples = create_validation_set_from_test_set(samples, seed=self.seed) return samples - - def train_dataloader(self) -> TRAIN_DATALOADERS: - """Get train dataloader.""" - return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) - - def val_dataloader(self) -> EVAL_DATALOADERS: - """Get validation dataloader.""" - dataset = self.val_data if self.create_validation_set else self.test_data - return DataLoader(dataset=dataset, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) - - def test_dataloader(self) -> EVAL_DATALOADERS: - """Get test dataloader.""" - return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index aea65a4d64..e485863771 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -14,8 +14,6 @@ import albumentations as A from pandas.core.frame import DataFrame from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from torch.utils.data import DataLoader from torchvision.datasets.folder import IMG_EXTENSIONS from anomalib.data.base import AnomalibDataModule @@ -194,6 +192,9 @@ def __init__( """ super().__init__( task=task, + train_batch_size=train_batch_size, + test_batch_size=test_batch_size, + num_workers=num_workers, transform_config_train=transform_config_train, transform_config_val=transform_config_val, image_size=image_size, @@ -226,14 +227,6 @@ def __init__( self.extensions = extensions self.split_ratio = split_ratio - self.transform_config_train = transform_config_train - self.transform_config_val = transform_config_val - self.image_size = image_size - - self.train_batch_size = train_batch_size - self.test_batch_size = test_batch_size - self.num_workers = num_workers - self.create_validation_set = create_validation_set self.seed = seed @@ -304,16 +297,3 @@ def _create_samples(self): samples = create_validation_set_from_test_set(samples, seed=self.seed, normal_label="normal") return samples - - def train_dataloader(self) -> TRAIN_DATALOADERS: - """Get train dataloader.""" - return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) - - def val_dataloader(self) -> EVAL_DATALOADERS: - """Get validation dataloader.""" - dataset = self.val_data if self.create_validation_set else self.test_data - return DataLoader(dataset=dataset, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) - - def test_dataloader(self) -> EVAL_DATALOADERS: - """Get test dataloader.""" - return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 466420eebd..0f59961182 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -38,8 +38,6 @@ import pandas as pd from pandas.core.frame import DataFrame from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from torch.utils.data import DataLoader from anomalib.data.base 
import AnomalibDataModule from anomalib.data.utils import DownloadProgressBar, hash_check @@ -113,6 +111,9 @@ def __init__( """ super().__init__( task=task, + train_batch_size=train_batch_size, + test_batch_size=test_batch_size, + num_workers=num_workers, transform_config_train=transform_config_train, transform_config_val=transform_config_val, image_size=image_size, @@ -121,17 +122,9 @@ def __init__( self.root = root if isinstance(root, Path) else Path(root) self.category = category - self.dataset_path = self.root / self.category - self.transform_config_train = transform_config_train - self.transform_config_val = transform_config_val - self.image_size = image_size - - self.train_batch_size = train_batch_size - self.test_batch_size = test_batch_size - self.num_workers = num_workers + self.path = self.root / self.category self.create_validation_set = create_validation_set - self.task = task self.seed = seed self.split_ratio = split_ratio @@ -186,10 +179,9 @@ def _create_samples(self) -> DataFrame: " This will lead to inconsistency between runs." ) - path = self.root / self.category - samples_list = [(str(path),) + filename.parts[-3:] for filename in path.glob("**/*.png")] + samples_list = [(str(self.path),) + filename.parts[-3:] for filename in self.path.glob("**/*.png")] if len(samples_list) == 0: - raise RuntimeError(f"Found 0 images in {path}") + raise RuntimeError(f"Found 0 images in {self.path}") samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"]) samples = samples[samples.split != "ground_truth"] @@ -225,16 +217,3 @@ def _create_samples(self) -> DataFrame: samples = create_validation_set_from_test_set(samples, seed=self.seed) return samples - - def train_dataloader(self) -> TRAIN_DATALOADERS: - """Get train dataloader.""" - return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) - - def val_dataloader(self) -> EVAL_DATALOADERS: - """Get validation dataloader.""" - dataset = self.val_data if self.create_validation_set else self.test_data - return DataLoader(dataset=dataset, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) - - def test_dataloader(self) -> EVAL_DATALOADERS: - """Get test dataloader.""" - return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) From 5b3e8410f11a0472703429b5b9fce5fe5f20c94a Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 16 Sep 2022 18:31:19 +0200 Subject: [PATCH 16/96] refactor to add validate stage in setup --- anomalib/data/base.py | 69 +++++++++++++++++++++++++++-------------- anomalib/data/btech.py | 16 +++++----- anomalib/data/folder.py | 22 ++++++------- anomalib/data/mvtec.py | 16 +++++----- tools/train.py | 2 +- 5 files changed, 74 insertions(+), 51 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index aa5b5083c0..ed61a83c9c 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -32,10 +32,6 @@ def __init__(self, samples: DataFrame, task: str, split: str, pre_process: PrePr self.split = split self.pre_process = pre_process - def contains_anomalous_images(self): - """Check if the dataset contains any anomalous images.""" - return 1 in list(self.samples.label_index) - def __len__(self) -> int: """Get length of the dataset.""" return len(self.samples) @@ -109,6 +105,8 @@ def __init__( self.val_data: Optional[AnomalibDataset] = None self.test_data: Optional[AnomalibDataset] = None + self._samples: Optional[DataFrame] = None + 
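+    # Implemented by each concrete datamodule; setup() caches the returned
+    # dataframe in self._samples, which get_samples() then filters per split.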
@abstractmethod def _create_samples(self) -> DataFrame: """This method should be implemented in the subclass. @@ -131,44 +129,70 @@ def _create_samples(self) -> DataFrame: """ raise NotImplementedError + def get_samples(self, split: Optional[str] = None) -> DataFrame: + """Retrieve the samples of the full dataset or one of the splits (train, val, test). + + Args: + split: (str): The split for which we want to retrieve the samples ("train", "val" or "test"). When + left empty, all samples will be returned. + + Returns: + DataFrame: A dataframe containing the samples of the split or full dataset. + """ + assert self._samples is not None, "Samples have not been created yet." + if split is None: + return self._samples + samples = self._samples[self._samples.split == split] + return samples.reset_index(drop=True) + def setup(self, stage: Optional[str] = None) -> None: """Setup train, validation and test data. Args: stage: Optional[str]: Train/Val/Test stages. (Default value = None) - """ - samples = self._create_samples() + self._samples = self._create_samples() logger.info("Setting up train, validation, test and prediction datasets.") if stage in (None, "fit"): - train_samples = samples[samples.split == "train"] - train_samples = train_samples.reset_index(drop=True) + samples = self.get_samples("train") self.train_data = AnomalibDataset( - samples=train_samples, + samples=samples, split="train", task=self.task, pre_process=self.pre_process_train, ) - if self.create_validation_set: - val_samples = samples[samples.split == "val"] - val_samples = val_samples.reset_index(drop=True) + if stage in (None, "fit", "validate"): + samples = self.get_samples("val") if self.create_validation_set else self.get_samples("test") self.val_data = AnomalibDataset( - samples=val_samples, + samples=samples, split="val", task=self.task, pre_process=self.pre_process_val, ) - test_samples = samples[samples.split == "test"] - test_samples = test_samples.reset_index(drop=True) - self.test_data = AnomalibDataset( - samples=test_samples, - split="test", - task=self.task, - pre_process=self.pre_process_val, - ) + if stage in (None, "test"): + samples = self.get_samples("test") + self.test_data = AnomalibDataset( + samples=samples, + split="test", + task=self.task, + pre_process=self.pre_process_val, + ) + + def contains_anomalous_images(self, split: Optional[str] = None) -> bool: + """Check if the dataset or the specified subset contains any anomalous images. + + Args: + split (str): the subset of interest ("train", "val" or "test"). When left empty, the full dataset will be + checked. + + Returns: + bool: Boolean indicating if any anomalous images have been assigned to the dataset or subset. 
+ """ + samples = self.get_samples(split) + return 1 in list(samples.label_index) def train_dataloader(self) -> TRAIN_DATALOADERS: """Get train dataloader.""" @@ -176,8 +200,7 @@ def train_dataloader(self) -> TRAIN_DATALOADERS: def val_dataloader(self) -> EVAL_DATALOADERS: """Get validation dataloader.""" - dataset = self.val_data if self.create_validation_set else self.test_data - return DataLoader(dataset=dataset, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + return DataLoader(self.val_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) def test_dataloader(self) -> EVAL_DATALOADERS: """Get test dataloader.""" diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index f1125ce0ff..489841ab94 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -93,6 +93,14 @@ def __init__( >>> data["image"].shape, data["mask"].shape (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256])) """ + self.root = root if isinstance(root, Path) else Path(root) + self.category = category + self.path = self.root / self.category + + self.create_validation_set = create_validation_set + self.seed = seed + self.split_ratio = split_ratio + super().__init__( task=task, train_batch_size=train_batch_size, @@ -104,14 +112,6 @@ def __init__( create_validation_set=create_validation_set, ) - self.root = root if isinstance(root, Path) else Path(root) - self.category = category - self.path = self.root / self.category - - self.create_validation_set = create_validation_set - self.seed = seed - self.split_ratio = split_ratio - def prepare_data(self) -> None: """Download the dataset if not available.""" if (self.root / self.category).is_dir(): diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index e485863771..22f6257308 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -190,17 +190,6 @@ def __init__( torch.Size([12, 3, 256, 256]) torch.Size([12, 256, 256]) """ - super().__init__( - task=task, - train_batch_size=train_batch_size, - test_batch_size=test_batch_size, - num_workers=num_workers, - transform_config_train=transform_config_train, - transform_config_val=transform_config_val, - image_size=image_size, - create_validation_set=create_validation_set, - ) - if seed is None and normal_test_dir is None: raise ValueError( "Both seed and normal_test_dir cannot be None." @@ -230,6 +219,17 @@ def __init__( self.create_validation_set = create_validation_set self.seed = seed + super().__init__( + task=task, + train_batch_size=train_batch_size, + test_batch_size=test_batch_size, + num_workers=num_workers, + transform_config_train=transform_config_train, + transform_config_val=transform_config_val, + image_size=image_size, + create_validation_set=create_validation_set, + ) + def _create_samples(self): """Create the dataframe with samples for the Folder dataset. 
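With the stage-aware setup() above, the datamodule can also be exercised by hand, outside a Lightning Trainer. A rough sketch against the Folder datamodule as it stands at this point in the series (directory names and the seed are placeholders, not anomalib defaults):

from anomalib.data.folder import Folder

datamodule = Folder(
    root="./datasets/my_dataset",
    normal_dir="good",
    abnormal_dir="defect",
    image_size=256,
    seed=42,  # required here because no dedicated normal_test_dir is given
)
datamodule.setup(stage="fit")   # builds the samples dataframe, then the train/val datasets
datamodule.setup(stage="test")  # rebuilds the samples, then the test dataset

print(datamodule.get_samples("train").head())        # train rows only, index reset
print(datamodule.contains_anomalous_images("test"))  # True if any test label_index == 1

Under a Trainer the same hooks fire automatically: trainer.fit() triggers setup("fit"), trainer.test() triggers setup("test"), and trainer.validate() now hits the new "validate" branch introduced by this commit.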
diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 0f59961182..1772baf4f1 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -109,6 +109,14 @@ def __init__( >>> data["image"].shape, data["mask"].shape (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256])) """ + self.root = root if isinstance(root, Path) else Path(root) + self.category = category + self.path = self.root / self.category + + self.create_validation_set = create_validation_set + self.seed = seed + self.split_ratio = split_ratio + super().__init__( task=task, train_batch_size=train_batch_size, @@ -120,14 +128,6 @@ def __init__( create_validation_set=create_validation_set, ) - self.root = root if isinstance(root, Path) else Path(root) - self.category = category - self.path = self.root / self.category - - self.create_validation_set = create_validation_set - self.seed = seed - self.split_ratio = split_ratio - def prepare_data(self) -> None: """Download the dataset if not available.""" if (self.root / self.category).is_dir(): diff --git a/tools/train.py b/tools/train.py index b1f176a591..33952a7e20 100644 --- a/tools/train.py +++ b/tools/train.py @@ -63,7 +63,7 @@ def train(): load_model_callback = LoadModelCallback(weights_path=trainer.checkpoint_callback.best_model_path) trainer.callbacks.insert(0, load_model_callback) - if datamodule.test_data.contains_anomalous_images(): + if datamodule.contains_anomalous_images("test"): logger.info("Testing the model.") trainer.test(model=model, datamodule=datamodule) else: From f652227e377c8389e3bf481d39a571cfd17db6a8 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 21 Sep 2022 14:08:20 +0200 Subject: [PATCH 17/96] implement alternative datamodules solution --- anomalib/data/base.py | 161 +++++++++++++++++++----------------- anomalib/data/folder.py | 175 ++++++++++++++++++++++++---------------- 2 files changed, 193 insertions(+), 143 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index ed61a83c9c..e7ce591d00 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -5,6 +5,7 @@ import logging from abc import ABC, abstractmethod +from enum import Enum from typing import Dict, Optional, Tuple, Union import albumentations as A @@ -22,20 +23,91 @@ logger = logging.getLogger(__name__) +class Subset(str, Enum): + FULL = "full" + TRAIN = "train" + VAL = "val" + TEST = "test" + + class AnomalibDataset(Dataset): """Anomalib dataset.""" - def __init__(self, samples: DataFrame, task: str, split: str, pre_process: PreProcessor): + def __init__( + self, + task: str, + pre_process: PreProcessor, + split: Subset = Subset.FULL, + samples: Optional[DataFrame] = None, + seed: Optional[int] = None, + ): super().__init__() - self.samples = samples self.task = task self.split = split self.pre_process = pre_process + self.seed = seed + if samples is None: + self.samples = self._create_samples() + else: + self.samples = samples + self.samples = self.get_samples(self.split) def __len__(self) -> int: """Get length of the dataset.""" return len(self.samples) + @abstractmethod + def _create_samples(self) -> DataFrame: + """This method should be implemented in the subclass. + + This method should return a dataframe that contains the information needed by the dataloader to load each of + the dataset items into memory. The dataframe must at least contain the following columns: + split - The subset to which the dataset item is assigned. + image_path - Path to file system location where the image is stored. 
+ label_index - Index of the anomaly label, typically 0 for "normal" and 1 for "anomalous". + + Additionally, when the task type is segmentation, the dataframe must have the mask_path column, which contains + the path the ground truth masks (for the anomalous images only). + + Example of a dataframe returned by calling this method from a concrete class: + |---|-------------------|-----------|-------------|------------------|-------| + | | image_path | label | label_index | mask_path | split | + |---|-------------------|-----------|-------------|------------------|-------| + | 0 | path/to/image.png | anomalous | 0 | path/to/mask.png | train | + |---|-------------------|-----------|-------------|------------------|-------| + """ + raise NotImplementedError + + def _get_subset(self, split: Subset): + samples = self.get_samples(split) + return AnomalibDataset( + task=self.task, pre_process=self.pre_process, split=split, samples=samples, seed=self.seed + ) + + def train_subset(self): + return self._get_subset(Subset.TRAIN) + + def val_subset(self): + return self._get_subset(Subset.VAL) + + def test_subset(self): + return self._get_subset(Subset.TEST) + + def get_samples(self, split: Subset): + """Retrieve the samples of the full dataset or one of the splits (train, val, test). + + Args: + split: (str): The split for which we want to retrieve the samples ("train", "val" or "test"). When + left empty, all samples will be returned. + + Returns: + DataFrame: A dataframe containing the samples of the split or full dataset. + """ + if split == Subset.FULL: + return self.samples + samples = self.samples[self.samples.split == split] + return samples.reset_index(drop=True) + def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: """Get dataset item for the index ``index``. @@ -107,92 +179,31 @@ def __init__( self._samples: Optional[DataFrame] = None - @abstractmethod - def _create_samples(self) -> DataFrame: - """This method should be implemented in the subclass. - - This method should return a dataframe that contains the information needed by the dataloader to load each of - the dataset items into memory. The dataframe must at least contain the following columns: - split - The subset to which the dataset item is assigned. - image_path - Path to file system location where the image is stored. - label_index - Index of the anomaly label, typically 0 for "normal" and 1 for "anomalous". - - Additionally, when the task type is segmentation, the dataframe must have the mask_path column, which contains - the path the ground truth masks (for the anomalous images only). + self.data: Optional[AnomalibDataset] = None - Example of a dataframe returned by calling this method from a concrete class: - |---|-------------------|-----------|-------------|------------------|-------| - | | image_path | label | label_index | mask_path | split | - |---|-------------------|-----------|-------------|------------------|-------| - | 0 | path/to/image.png | anomalous | 0 | path/to/mask.png | train | - |---|-------------------|-----------|-------------|------------------|-------| - """ + @abstractmethod + def create_dataset(self) -> AnomalibDataset: raise NotImplementedError - def get_samples(self, split: Optional[str] = None) -> DataFrame: - """Retrieve the samples of the full dataset or one of the splits (train, val, test). - - Args: - split: (str): The split for which we want to retrieve the samples ("train", "val" or "test"). When - left empty, all samples will be returned. 
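+    # Note: Lightning calls prepare_data only once per node, and state assigned
+    # here is not broadcast to the other workers under DDP, so building
+    # self.data in this hook assumes a single-process setting.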
+ def prepare_data(self) -> None: + self.data = self.create_dataset() - Returns: - DataFrame: A dataframe containing the samples of the split or full dataset. - """ - assert self._samples is not None, "Samples have not been created yet." - if split is None: - return self._samples - samples = self._samples[self._samples.split == split] - return samples.reset_index(drop=True) + def contains_anomalous_images(self, split): + samples = self.data.get_samples(split) + return 1 in list(samples.label_index) - def setup(self, stage: Optional[str] = None) -> None: + def setup(self, stage: Optional[str] = None): """Setup train, validation and test data. Args: stage: Optional[str]: Train/Val/Test stages. (Default value = None) """ - self._samples = self._create_samples() - - logger.info("Setting up train, validation, test and prediction datasets.") if stage in (None, "fit"): - samples = self.get_samples("train") - self.train_data = AnomalibDataset( - samples=samples, - split="train", - task=self.task, - pre_process=self.pre_process_train, - ) - + self.train_data = self.data.train_subset() if stage in (None, "fit", "validate"): - samples = self.get_samples("val") if self.create_validation_set else self.get_samples("test") - self.val_data = AnomalibDataset( - samples=samples, - split="val", - task=self.task, - pre_process=self.pre_process_val, - ) - + self.val_data = self.data.val_subset() if self.create_validation_set else self.data.test_subset() if stage in (None, "test"): - samples = self.get_samples("test") - self.test_data = AnomalibDataset( - samples=samples, - split="test", - task=self.task, - pre_process=self.pre_process_val, - ) - - def contains_anomalous_images(self, split: Optional[str] = None) -> bool: - """Check if the dataset or the specified subset contains any anomalous images. - - Args: - split (str): the subset of interest ("train", "val" or "test"). When left empty, the full dataset will be - checked. - - Returns: - bool: Boolean indicating if any anomalous images have been assigned to the dataset or subset. - """ - samples = self.get_samples(split) - return 1 in list(samples.label_index) + self.test_data = self.data.test_subset() def train_dataloader(self) -> TRAIN_DATALOADERS: """Get train dataloader.""" diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 22f6257308..8a95280b9b 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -16,7 +16,7 @@ from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY from torchvision.datasets.folder import IMG_EXTENSIONS -from anomalib.data.base import AnomalibDataModule +from anomalib.data.base import AnomalibDataModule, AnomalibDataset from anomalib.data.utils.split import ( create_validation_set_from_test_set, split_normal_images_in_train_set, @@ -69,6 +69,99 @@ def _prepare_files_labels( return filenames, labels +class FolderDataset(AnomalibDataset): + def __init__( + self, + normal_dir, + abnormal_dir, + normal_test_dir, + mask_dir, + extensions, + split_ratio, + seed, + create_validation_set, + *args, + **kwargs, + ): + self.normal_dir = normal_dir + self.abnormal_dir = abnormal_dir + self.normal_test_dir = normal_test_dir + self.extensions = extensions + self.mask_dir = mask_dir + self.split_ratio = split_ratio + self.seed = seed + self.create_validation_set = create_validation_set + super().__init__(*args, **kwargs) + + def _create_samples(self): + """Create the dataframe with samples for the Folder dataset. 
+ + The files are expected to follow the structure: + path/to/dataset/normal_folder_name/normal_image_name.png + path/to/dataset/abnormal_folder_name/abnormal_image_name.png + + + This function creates a dataframe to store the parsed information based on the following format: + |---|-------------------|--------|-------------|------------------|-------| + | | image_path | label | label_index | mask_path | split | + |---|-------------------|--------|-------------|------------------|-------| + | 0 | path/to/image.png | normal | 0 | path/to/mask.png | train | + |---|-------------------|--------|-------------|------------------|-------| + + Returns: + DataFrame: an output dataframe containing the samples of the dataset. + """ + + filenames = [] + labels = [] + dirs = {"normal": self.normal_dir, "abnormal": self.abnormal_dir} + + if self.normal_test_dir: + dirs = {**dirs, **{"normal_test": self.normal_test_dir}} + + for dir_type, path in dirs.items(): + if path is not None: + filename, label = _prepare_files_labels(path, dir_type, self.extensions) + filenames += filename + labels += label + + samples = DataFrame({"image_path": filenames, "label": labels}) + + # Create label index for normal (0) and abnormal (1) images. + samples.loc[(samples.label == "normal") | (samples.label == "normal_test"), "label_index"] = 0 + samples.loc[(samples.label == "abnormal"), "label_index"] = 1 + samples.label_index = samples.label_index.astype(int) + + # If a path to mask is provided, add it to the sample dataframe. + if self.mask_dir is not None: + self.mask_dir = _check_and_convert_path(self.mask_dir) + samples["mask_path"] = "" + for index, row in samples.iterrows(): + if row.label_index == 1: + samples.loc[index, "mask_path"] = str(self.mask_dir / row.image_path.name) + + # Ensure the pathlib objects are converted to str. + # This is because torch dataloader doesn't like pathlib. + samples = samples.astype({"image_path": "str"}) + + # Create train/test split. + # By default, all the normal samples are assigned as train. + # and all the abnormal samples are test. + samples.loc[(samples.label == "normal"), "split"] = "train" + samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test" + + if not self.normal_test_dir: + samples = split_normal_images_in_train_set( + samples=samples, split_ratio=self.split_ratio, seed=self.seed, normal_label="normal" + ) + + # If `create_validation_set` is set to True, the test set is split into half. + if self.create_validation_set: + samples = create_validation_set_from_test_set(samples, seed=self.seed, normal_label="normal") + + return samples + + @DATAMODULE_REGISTRY class Folder(AnomalibDataModule): """Folder Lightning Data Module.""" @@ -230,70 +323,16 @@ def __init__( create_validation_set=create_validation_set, ) - def _create_samples(self): - """Create the dataframe with samples for the Folder dataset. 
- - The files are expected to follow the structure: - path/to/dataset/normal_folder_name/normal_image_name.png - path/to/dataset/abnormal_folder_name/abnormal_image_name.png - - - This function creates a dataframe to store the parsed information based on the following format: - |---|-------------------|--------|-------------|------------------|-------| - | | image_path | label | label_index | mask_path | split | - |---|-------------------|--------|-------------|------------------|-------| - | 0 | path/to/image.png | normal | 0 | path/to/mask.png | train | - |---|-------------------|--------|-------------|------------------|-------| - - Returns: - DataFrame: an output dataframe containing the samples of the dataset. - """ - - filenames = [] - labels = [] - dirs = {"normal": self.normal_dir, "abnormal": self.abnormal_dir} - - if self.normal_test_dir: - dirs = {**dirs, **{"normal_test": self.normal_test_dir}} - - for dir_type, path in dirs.items(): - if path is not None: - filename, label = _prepare_files_labels(path, dir_type, self.extensions) - filenames += filename - labels += label - - samples = DataFrame({"image_path": filenames, "label": labels}) - - # Create label index for normal (0) and abnormal (1) images. - samples.loc[(samples.label == "normal") | (samples.label == "normal_test"), "label_index"] = 0 - samples.loc[(samples.label == "abnormal"), "label_index"] = 1 - samples.label_index = samples.label_index.astype(int) - - # If a path to mask is provided, add it to the sample dataframe. - if self.mask_dir is not None: - self.mask_dir = _check_and_convert_path(self.mask_dir) - samples["mask_path"] = "" - for index, row in samples.iterrows(): - if row.label_index == 1: - samples.loc[index, "mask_path"] = str(self.mask_dir / row.image_path.name) - - # Ensure the pathlib objects are converted to str. - # This is because torch dataloader doesn't like pathlib. - samples = samples.astype({"image_path": "str"}) - - # Create train/test split. - # By default, all the normal samples are assigned as train. - # and all the abnormal samples are test. - samples.loc[(samples.label == "normal"), "split"] = "train" - samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test" - - if not self.normal_test_dir: - samples = split_normal_images_in_train_set( - samples=samples, split_ratio=self.split_ratio, seed=self.seed, normal_label="normal" - ) - - # If `create_validation_set` is set to True, the test set is split into half. 
- if self.create_validation_set: - samples = create_validation_set_from_test_set(samples, seed=self.seed, normal_label="normal") - - return samples + def create_dataset(self): + return FolderDataset( + normal_dir=self.normal_dir, + abnormal_dir=self.abnormal_dir, + normal_test_dir=self.normal_test_dir, + mask_dir=self.mask_dir, + extensions=self.extensions, + split_ratio=self.split_ratio, + seed=self.seed, + create_validation_set=self.create_validation_set, + task=self.task, + pre_process=self.pre_process_train, + ) From 0e565a42260f55f649ceb8ffe6b61632161b1983 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 21 Sep 2022 17:07:15 +0200 Subject: [PATCH 18/96] small improvements --- anomalib/data/base.py | 30 +++++++++++++++--------------- anomalib/data/folder.py | 21 ++++++++++----------- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index e7ce591d00..cb8c3a7f22 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -78,20 +78,19 @@ def _create_samples(self) -> DataFrame: """ raise NotImplementedError - def _get_subset(self, split: Subset): + def _get_subset(self, split: Subset, pre_process: Optional[PreProcessor] = None): samples = self.get_samples(split) - return AnomalibDataset( - task=self.task, pre_process=self.pre_process, split=split, samples=samples, seed=self.seed - ) + pre_process = self.pre_process if pre_process is None else pre_process + return AnomalibDataset(task=self.task, pre_process=pre_process, split=split, samples=samples, seed=self.seed) - def train_subset(self): - return self._get_subset(Subset.TRAIN) + def train_subset(self, pre_process: Optional[PreProcessor] = None): + return self._get_subset(Subset.TRAIN, pre_process=pre_process) - def val_subset(self): - return self._get_subset(Subset.VAL) + def val_subset(self, pre_process: Optional[PreProcessor] = None): + return self._get_subset(Subset.VAL, pre_process=pre_process) - def test_subset(self): - return self._get_subset(Subset.TEST) + def test_subset(self, pre_process: Optional[PreProcessor] = None): + return self._get_subset(Subset.TEST, pre_process=pre_process) def get_samples(self, split: Subset): """Retrieve the samples of the full dataset or one of the splits (train, val, test). @@ -159,14 +158,12 @@ def __init__( transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_val: Optional[Union[str, A.Compose]] = None, image_size: Optional[Union[int, Tuple[int, int]]] = None, - create_validation_set: bool = False, ): super().__init__() self.task = task self.train_batch_size = train_batch_size self.test_batch_size = test_batch_size self.num_workers = num_workers - self.create_validation_set = create_validation_set if transform_config_train is not None and transform_config_val is None: transform_config_val = transform_config_train @@ -199,11 +196,14 @@ def setup(self, stage: Optional[str] = None): stage: Optional[str]: Train/Val/Test stages. 
(Default value = None) """ if stage in (None, "fit"): - self.train_data = self.data.train_subset() + self.train_data = self.data.train_subset(pre_process=self.pre_process_train) if stage in (None, "fit", "validate"): - self.val_data = self.data.val_subset() if self.create_validation_set else self.data.test_subset() + if self.contains_anomalous_images("val"): + self.val_data = self.data.val_subset(pre_process=self.pre_process_val) + else: + self.val_data = self.data.test_subset(pre_process=self.pre_process_val) if stage in (None, "test"): - self.test_data = self.data.test_subset() + self.test_data = self.data.test_subset(pre_process=self.pre_process_val) def train_dataloader(self) -> TRAIN_DATALOADERS: """Get train dataloader.""" diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 8a95280b9b..29ba0dedf1 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -283,6 +283,16 @@ def __init__( torch.Size([12, 3, 256, 256]) torch.Size([12, 256, 256]) """ + super().__init__( + task=task, + train_batch_size=train_batch_size, + test_batch_size=test_batch_size, + num_workers=num_workers, + transform_config_train=transform_config_train, + transform_config_val=transform_config_val, + image_size=image_size, + ) + if seed is None and normal_test_dir is None: raise ValueError( "Both seed and normal_test_dir cannot be None." @@ -312,17 +322,6 @@ def __init__( self.create_validation_set = create_validation_set self.seed = seed - super().__init__( - task=task, - train_batch_size=train_batch_size, - test_batch_size=test_batch_size, - num_workers=num_workers, - transform_config_train=transform_config_train, - transform_config_val=transform_config_val, - image_size=image_size, - create_validation_set=create_validation_set, - ) - def create_dataset(self): return FolderDataset( normal_dir=self.normal_dir, From 297195a032bb182a69560b3a8c2bc0288f7aec5b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 7 Oct 2022 12:20:03 +0200 Subject: [PATCH 19/96] improve design --- anomalib/data/__init__.py | 14 +- anomalib/data/base.py | 177 ++++++------- anomalib/data/folder.py | 409 ++++++++++++------------------ anomalib/data/mvtec.py | 278 +++++++------------- anomalib/data/utils/split.py | 47 +++- anomalib/models/padim/config.yaml | 2 +- tools/train.py | 2 +- 7 files changed, 388 insertions(+), 541 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index f1691620f5..d1da8af375 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -11,9 +11,9 @@ from anomalib.data.base import AnomalibDataModule from .btech import BTech -from .folder import Folder +from .folder import FolderDataModule from .inference import InferenceDataset -from .mvtec import MVTec +from .mvtec import MVTecDataModule logger = logging.getLogger(__name__) @@ -32,7 +32,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: datamodule: AnomalibDataModule if config.dataset.format.lower() == "mvtec": - datamodule = MVTec( + datamodule = MVTecDataModule( # TODO: Remove config values. 
IAAALD-211 root=config.dataset.path, category=config.dataset.category, @@ -40,11 +40,10 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: train_batch_size=config.dataset.train_batch_size, test_batch_size=config.dataset.test_batch_size, num_workers=config.dataset.num_workers, - seed=config.project.seed, task=config.dataset.task, transform_config_train=config.dataset.transform_config.train, transform_config_val=config.dataset.transform_config.val, - create_validation_set=config.dataset.create_validation_set, + val_split_mode=config.dataset.validation_split_mode, ) elif config.dataset.format.lower() == "btech": datamodule = BTech( @@ -62,7 +61,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: create_validation_set=config.dataset.create_validation_set, ) elif config.dataset.format.lower() == "folder": - datamodule = Folder( + datamodule = FolderDataModule( root=config.dataset.path, normal_dir=config.dataset.normal_dir, abnormal_dir=config.dataset.abnormal_dir, @@ -71,14 +70,13 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: mask_dir=config.dataset.mask, extensions=config.dataset.extensions, split_ratio=config.dataset.split_ratio, - seed=config.project.seed, image_size=(config.dataset.image_size[0], config.dataset.image_size[1]), train_batch_size=config.dataset.train_batch_size, test_batch_size=config.dataset.test_batch_size, num_workers=config.dataset.num_workers, transform_config_train=config.dataset.transform_config.train, transform_config_val=config.dataset.transform_config.val, - create_validation_set=config.dataset.create_validation_set, + val_split_mode=config.dataset.validation_split_mode, ) else: raise ValueError( diff --git a/anomalib/data/base.py b/anomalib/data/base.py index cb8c3a7f22..9c5ab99ed5 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -3,14 +3,16 @@ # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import logging from abc import ABC, abstractmethod from enum import Enum -from typing import Dict, Optional, Tuple, Union +from typing import Dict, Optional, Union -import albumentations as A import cv2 import numpy as np +import pandas as pd from pandas import DataFrame from pytorch_lightning import LightningDataModule from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS @@ -23,89 +25,54 @@ logger = logging.getLogger(__name__) -class Subset(str, Enum): +class Split(str, Enum): FULL = "full" TRAIN = "train" VAL = "val" TEST = "test" +class ValSplitMode(str, Enum): + SAME_AS_TEST = "same_as_test" + FROM_TEST = "from_test" + + class AnomalibDataset(Dataset): """Anomalib dataset.""" - def __init__( - self, - task: str, - pre_process: PreProcessor, - split: Subset = Subset.FULL, - samples: Optional[DataFrame] = None, - seed: Optional[int] = None, - ): + def __init__(self, task: str, pre_process: PreProcessor, samples: Optional[DataFrame] = None): super().__init__() self.task = task - self.split = split self.pre_process = pre_process - self.seed = seed - if samples is None: - self.samples = self._create_samples() - else: - self.samples = samples - self.samples = self.get_samples(self.split) + self._samples = samples def __len__(self) -> int: """Get length of the dataset.""" - return len(self.samples) - - @abstractmethod - def _create_samples(self) -> DataFrame: - """This method should be implemented in the subclass. 
-
-        This method should return a dataframe that contains the information needed by the dataloader to load each of
-        the dataset items into memory. The dataframe must at least contain the following columns:
-            split - The subset to which the dataset item is assigned.
-            image_path - Path to file system location where the image is stored.
-            label_index - Index of the anomaly label, typically 0 for "normal" and 1 for "anomalous".
-
-        Additionally, when the task type is segmentation, the dataframe must have the mask_path column, which contains
-        the path the ground truth masks (for the anomalous images only).
-
-        Example of a dataframe returned by calling this method from a concrete class:
-        |---|-------------------|-----------|-------------|------------------|-------|
-        |   | image_path        | label     | label_index | mask_path        | split |
-        |---|-------------------|-----------|-------------|------------------|-------|
-        | 0 | path/to/image.png | anomalous | 0           | path/to/mask.png | train |
-        |---|-------------------|-----------|-------------|------------------|-------|
-        """
-        raise NotImplementedError
-
-    def _get_subset(self, split: Subset, pre_process: Optional[PreProcessor] = None):
-        samples = self.get_samples(split)
-        pre_process = self.pre_process if pre_process is None else pre_process
-        return AnomalibDataset(task=self.task, pre_process=pre_process, split=split, samples=samples, seed=self.seed)
+        assert isinstance(self._samples, DataFrame)
+        return len(self._samples)

-    def train_subset(self, pre_process: Optional[PreProcessor] = None):
-        return self._get_subset(Subset.TRAIN, pre_process=pre_process)
+    def subsample(self, indices):
+        """Subsample the dataset, keeping only the samples at the given indices."""
+        return AnomalibDataset(task=self.task, pre_process=self.pre_process, samples=self.samples.iloc[indices])

-    def val_subset(self, pre_process: Optional[PreProcessor] = None):
-        return self._get_subset(Subset.VAL, pre_process=pre_process)
+    @property
+    def is_setup(self) -> bool:
+        """Has setup() been called?"""
+        return isinstance(self._samples, DataFrame)

-    def test_subset(self, pre_process: Optional[PreProcessor] = None):
-        return self._get_subset(Subset.TEST, pre_process=pre_process)
+    @property
+    def samples(self) -> DataFrame:
+        """Get the samples dataframe. Raises a RuntimeError if setup() has not been called yet."""
+        if not self.is_setup:
+            raise RuntimeError("Dataset is not setup yet. Call setup() first.")
+        return self._samples

-    def get_samples(self, split: Subset):
-        """Retrieve the samples of the full dataset or one of the splits (train, val, test).
+    @property
+    def has_normal(self) -> bool:
+        """Check if the dataset contains any normal (label_index == 0) images."""
+        return 0 in list(self.samples.label_index)

-        Args:
-            split: (str): The split for which we want to retrieve the samples ("train", "val" or "test"). When
-                left empty, all samples will be returned.
+    @property
+    def has_anomalous(self) -> bool:
+        """Check if the dataset contains any anomalous (label_index == 1) images."""
+        return 1 in list(self.samples.label_index)

-        Returns:
-            DataFrame: A dataframe containing the samples of the split or full dataset.
-        """
-        if split == Subset.FULL:
-            return self.samples
-        samples = self.samples[self.samples.split == split]
-        return samples.reset_index(drop=True)

     def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
         """Get dataset item for the index ``index``.

         Args:
             index (int): Index to get the item.

         Returns:
             Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
                 Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
""" - image_path = self.samples.iloc[index].image_path + assert isinstance(self._samples, DataFrame) + + image_path = self._samples.iloc[index].image_path image = read_image(image_path) - label_index = self.samples.iloc[index].label_index + label_index = self._samples.iloc[index].label_index item = dict(image_path=image_path, label=label_index) if self.task == "classification": pre_processed = self.pre_process(image=image) elif self.task == "segmentation": - mask_path = self.samples.iloc[index].mask_path + mask_path = self._samples.iloc[index].mask_path # Only Anomalous (1) images have masks in anomaly datasets # Therefore, create empty mask for Normal (0) images. @@ -145,6 +114,35 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: return item + def __add__(self, other_dataset: AnomalibDataset): + assert self.is_setup and other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first." + samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True) + return AnomalibDataset(self.task, self.pre_process, samples) + + def setup(self) -> None: + """Load data/metadata into memory""" + if not self.is_setup: + self._setup() + assert self.is_setup, "setup() should set self._samples" + + def _setup(self) -> DataFrame: + """previous _create_samples() + This method should return a dataframe that contains the information needed by the dataloader to load each of + the dataset items into memory. + The dataframe must at least contain the following columns: + split: the subset to which the dataset item is assigned. + image_path: path to file system location where the image is stored. + label_index: index of the anomaly label, typically 0 for "normal" and 1 for "anomalous". + mask_path (if task == "segmentation"): path to the ground truth masks (for the anomalous images only). 
+ Example: + |---|-------------------|-----------|-------------|------------------|-------| + | | image_path | label | label_index | mask_path | split | + |---|-------------------|-----------|-------------|------------------|-------| + | 0 | path/to/image.png | anomalous | 0 | path/to/mask.png | train | + |---|-------------------|-----------|-------------|------------------|-------| + """ + pass + class AnomalibDataModule(LightningDataModule, ABC): """Base Anomalib data module.""" @@ -155,20 +153,14 @@ def __init__( train_batch_size: int, test_batch_size: int, num_workers: int, - transform_config_train: Optional[Union[str, A.Compose]] = None, - transform_config_val: Optional[Union[str, A.Compose]] = None, - image_size: Optional[Union[int, Tuple[int, int]]] = None, + val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, ): super().__init__() self.task = task self.train_batch_size = train_batch_size self.test_batch_size = test_batch_size self.num_workers = num_workers - - if transform_config_train is not None and transform_config_val is None: - transform_config_val = transform_config_train - self.pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) - self.pre_process_val = PreProcessor(config=transform_config_val, image_size=image_size) + self.val_split_mode = val_split_mode self.train_data: Optional[AnomalibDataset] = None self.val_data: Optional[AnomalibDataset] = None @@ -178,32 +170,25 @@ def __init__( self.data: Optional[AnomalibDataset] = None - @abstractmethod - def create_dataset(self) -> AnomalibDataset: - raise NotImplementedError - - def prepare_data(self) -> None: - self.data = self.create_dataset() - - def contains_anomalous_images(self, split): - samples = self.data.get_samples(split) - return 1 in list(samples.label_index) - def setup(self, stage: Optional[str] = None): """Setup train, validation and test data. Args: stage: Optional[str]: Train/Val/Test stages. 
(Default value = None) """ - if stage in (None, "fit"): - self.train_data = self.data.train_subset(pre_process=self.pre_process_train) - if stage in (None, "fit", "validate"): - if self.contains_anomalous_images("val"): - self.val_data = self.data.val_subset(pre_process=self.pre_process_val) - else: - self.val_data = self.data.test_subset(pre_process=self.pre_process_val) - if stage in (None, "test"): - self.test_data = self.data.test_subset(pre_process=self.pre_process_val) + if not self.is_setup: + self._setup(stage) + assert self.is_setup + + @abstractmethod + def _setup(self, _stage: Optional[str] = None) -> None: + pass + + @property + def is_setup(self): + if self.train_data is None or self.val_data is None or self.test_data is None: + return False + return self.train_data.is_setup and self.val_data.is_setup and self.test_data.is_setup def train_dataloader(self) -> TRAIN_DATALOADERS: """Get train dataloader.""" diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 29ba0dedf1..cebba3ea46 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -6,23 +6,15 @@ # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import logging -import warnings from pathlib import Path from typing import Optional, Tuple, Union -import albumentations as A -from pandas.core.frame import DataFrame -from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY +from pandas import DataFrame from torchvision.datasets.folder import IMG_EXTENSIONS -from anomalib.data.base import AnomalibDataModule, AnomalibDataset -from anomalib.data.utils.split import ( - create_validation_set_from_test_set, - split_normal_images_in_train_set, -) - -logger = logging.getLogger(__name__) +from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode +from anomalib.data.utils.split import split_normals_and_anomalous +from anomalib.pre_processing.pre_process import PreProcessor def _check_and_convert_path(path: Union[str, Path]) -> Path: @@ -69,269 +61,194 @@ def _prepare_files_labels( return filenames, labels -class FolderDataset(AnomalibDataset): - def __init__( - self, - normal_dir, - abnormal_dir, - normal_test_dir, - mask_dir, - extensions, - split_ratio, - seed, - create_validation_set, - *args, - **kwargs, - ): - self.normal_dir = normal_dir - self.abnormal_dir = abnormal_dir - self.normal_test_dir = normal_test_dir - self.extensions = extensions - self.mask_dir = mask_dir - self.split_ratio = split_ratio - self.seed = seed - self.create_validation_set = create_validation_set - super().__init__(*args, **kwargs) +def make_folder_dataset( + normal_dir: Union[str, Path], + abnormal_dir: Union[str, Path], + normal_test_dir: Optional[Union[str, Path]] = None, + mask_dir: Optional[Union[str, Path]] = None, + split: Optional[str] = None, + extensions: Optional[Tuple[str, ...]] = None, +): + """Make Folder Dataset. - def _create_samples(self): - """Create the dataframe with samples for the Folder dataset. + Args: + normal_dir (Union[str, Path]): Path to the directory containing normal images. + abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images. + normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing + normal images for the test dataset. Normal test images will be a split of `normal_dir` + if `None`. Defaults to None. + mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing + the mask annotations. Defaults to None. 
+        split (Optional[str], optional): Dataset split (i.e., either train or test). Defaults to None.
+        extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the
+            directory.
+
+    Returns:
+        DataFrame: an output dataframe containing samples for the requested split (i.e., train or test)
+    """
+
+    filenames = []
+    labels = []
+    dirs = {"normal": normal_dir, "abnormal": abnormal_dir}
+
+    if normal_test_dir:
+        dirs = {**dirs, **{"normal_test": normal_test_dir}}
+
+    for dir_type, path in dirs.items():
+        filename, label = _prepare_files_labels(path, dir_type, extensions)
+        filenames += filename
+        labels += label
+
+    samples = DataFrame({"image_path": filenames, "label": labels})
+
+    # Create label index for normal (0) and abnormal (1) images.
+    samples.loc[(samples.label == "normal") | (samples.label == "normal_test"), "label_index"] = 0
+    samples.loc[(samples.label == "abnormal"), "label_index"] = 1
+    samples.label_index = samples.label_index.astype(int)
+
+    # If a path to mask is provided, add it to the sample dataframe.
+    if mask_dir is not None:
+        mask_dir = _check_and_convert_path(mask_dir)
+        samples["mask_path"] = ""
+        for index, row in samples.iterrows():
+            if row.label_index == 1:
+                samples.loc[index, "mask_path"] = str(mask_dir / row.image_path.name)
+
+    # Ensure the pathlib objects are converted to str.
+    # This is because torch dataloader doesn't like pathlib.
+    samples = samples.astype({"image_path": "str"})
+
+    # Create train/test split.
+    # By default, all normal samples are assigned to the train split, and all
+    # abnormal and normal_test samples to the test split.
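+    # For example (illustrative numbers): 100 images in normal_dir and 20 in
+    # abnormal_dir yield a 100-image train split and a 20-image test split at this
+    # point; images from normal_test_dir, if provided, also go to the test split.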
+ samples.loc[(samples.label == "normal"), "split"] = "train" + samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test" - # If a path to mask is provided, add it to the sample dataframe. - if self.mask_dir is not None: - self.mask_dir = _check_and_convert_path(self.mask_dir) - samples["mask_path"] = "" - for index, row in samples.iterrows(): - if row.label_index == 1: - samples.loc[index, "mask_path"] = str(self.mask_dir / row.image_path.name) + # Get the data frame for the split. + if split != Split.FULL: + samples = samples[samples.split == split] + samples = samples.reset_index(drop=True) - # Ensure the pathlib objects are converted to str. - # This is because torch dataloader doesn't like pathlib. - samples = samples.astype({"image_path": "str"}) + return samples - # Create train/test split. - # By default, all the normal samples are assigned as train. - # and all the abnormal samples are test. - samples.loc[(samples.label == "normal"), "split"] = "train" - samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test" - if not self.normal_test_dir: - samples = split_normal_images_in_train_set( - samples=samples, split_ratio=self.split_ratio, seed=self.seed, normal_label="normal" - ) +class Folder(AnomalibDataset): + def __init__( + self, + task: str, + pre_process: PreProcessor, + split: Split, + # + normal_dir: Union[str, Path], + abnormal_dir: Union[str, Path], + normal_test_dir: Optional[Union[str, Path]] = None, + mask_dir: Optional[Union[str, Path]] = None, + val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, + extensions=None, + samples=None, + ) -> None: + super().__init__(task, pre_process, samples=samples) - # If `create_validation_set` is set to True, the test set is split into half. - if self.create_validation_set: - samples = create_validation_set_from_test_set(samples, seed=self.seed, normal_label="normal") + self.split = split - return samples + self.normal_dir = normal_dir + self.abnormal_dir = abnormal_dir + self.normal_test_dir = normal_test_dir + self.mask_dir = mask_dir + self.extensions = extensions + self.val_split_mode = val_split_mode -@DATAMODULE_REGISTRY -class Folder(AnomalibDataModule): - """Folder Lightning Data Module.""" + def _setup(self): + self._samples = make_folder_dataset( + normal_dir=self.normal_dir, + abnormal_dir=self.abnormal_dir, + normal_test_dir=self.normal_test_dir, + mask_dir=self.mask_dir, + split=self.split, + extensions=self.extensions, + ) + +class FolderDataModule(AnomalibDataModule): def __init__( self, - root: Union[str, Path], - normal_dir: str = "normal", - abnormal_dir: str = "abnormal", - task: str = "classification", - normal_test_dir: Optional[Union[Path, str]] = None, - mask_dir: Optional[Union[Path, str]] = None, - extensions: Optional[Tuple[str, ...]] = None, - split_ratio: float = 0.2, - seed: Optional[int] = None, - image_size: Optional[Union[int, Tuple[int, int]]] = None, - train_batch_size: int = 32, - test_batch_size: int = 32, - num_workers: int = 8, - transform_config_train: Optional[Union[str, A.Compose]] = None, - transform_config_val: Optional[Union[str, A.Compose]] = None, - create_validation_set: bool = False, - ) -> None: - """Folder Dataset PL Datamodule. - - Args: - root (Union[str, Path]): Path to the root folder containing normal and abnormal dirs. - normal_dir (str, optional): Name of the directory containing normal images. - Defaults to "normal". - abnormal_dir (str, optional): Name of the directory containing abnormal images. 
- Defaults to "abnormal". - task (str, optional): Task type. Could be either classification or segmentation. - Defaults to "classification". - normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing - normal images for the test dataset. Defaults to None. - mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing - the mask annotations. Defaults to None. - extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the - directory. Defaults to None. - split_ratio (float, optional): Ratio to split normal training images and add to the - test set in case test set doesn't contain any normal images. - Defaults to 0.2. - seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0. - image_size (Optional[Union[int, Tuple[int, int]]], optional): Size of the input image. - Defaults to None. - train_batch_size (int, optional): Training batch size. Defaults to 32. - test_batch_size (int, optional): Test batch size. Defaults to 32. - num_workers (int, optional): Number of workers. Defaults to 8. - transform_config_train (Optional[Union[str, A.Compose]], optional): Config for pre-processing - during training. - Defaults to None. - transform_config_val (Optional[Union[str, A.Compose]], optional): Config for pre-processing - during validation. - Defaults to None. - create_validation_set (bool, optional):Boolean to create a validation set from the test set. - Those wanting to create a validation set could set this flag to ``True``. - - Examples: - Assume that we use Folder Dataset for the MVTec/bottle/broken_large category. We would do: - >>> from anomalib.data import Folder - >>> datamodule = Folder( - ... root="./datasets/MVTec/bottle/test", - ... normal="good", - ... abnormal="broken_large", - ... image_size=256 - ... ) - >>> datamodule.setup() - >>> i, data = next(enumerate(datamodule.train_dataloader())) - >>> data["image"].shape - torch.Size([16, 3, 256, 256]) - - >>> i, test_data = next(enumerate(datamodule.test_dataloader())) - >>> test_data.keys() - dict_keys(['image']) - - We could also create a Folder DataModule for datasets containing mask annotations. - The dataset expects that mask annotation filenames must be same as the original filename. - To this end, we modified mask filenames in MVTec AD bottle category. - Now we could try folder data module using the mvtec bottle broken large category - >>> datamodule = Folder( - ... root="./datasets/bottle/test", - ... normal="good", - ... abnormal="broken_large", - ... mask_dir="./datasets/bottle/ground_truth/broken_large", - ... image_size=256 - ... ) - - >>> i , train_data = next(enumerate(datamodule.train_dataloader())) - >>> train_data.keys() - dict_keys(['image']) - >>> train_data["image"].shape - torch.Size([16, 3, 256, 256]) - - >>> i, test_data = next(enumerate(datamodule.test_dataloader())) - dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask']) - >>> print(test_data["image"].shape, test_data["mask"].shape) - torch.Size([24, 3, 256, 256]) torch.Size([24, 256, 256]) - - By default, Folder Data Module does not create a validation set. If a validation set - is needed it could be set as follows: - - >>> datamodule = Folder( - ... root="./datasets/bottle/test", - ... normal="good", - ... abnormal="broken_large", - ... mask_dir="./datasets/bottle/ground_truth/broken_large", - ... image_size=256, - ... create_validation_set=True, - ... 
) - - >>> i, val_data = next(enumerate(datamodule.val_dataloader())) - >>> val_data.keys() - dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask']) - >>> print(val_data["image"].shape, val_data["mask"].shape) - torch.Size([12, 3, 256, 256]) torch.Size([12, 256, 256]) - - >>> i, test_data = next(enumerate(datamodule.test_dataloader())) - >>> print(test_data["image"].shape, test_data["mask"].shape) - torch.Size([12, 3, 256, 256]) torch.Size([12, 256, 256]) - - """ + root, + task, + train_batch_size, + test_batch_size, + image_size, + num_workers, + val_split_mode, + # + normal_dir, + abnormal_dir, + normal_test_dir, + mask_dir, + split_ratio, + transform_config_train=None, + transform_config_val=None, + extensions=None, + ): super().__init__( task=task, train_batch_size=train_batch_size, test_batch_size=test_batch_size, num_workers=num_workers, - transform_config_train=transform_config_train, - transform_config_val=transform_config_val, - image_size=image_size, + val_split_mode=val_split_mode, ) - if seed is None and normal_test_dir is None: - raise ValueError( - "Both seed and normal_test_dir cannot be None." - " When seed is not set, images from the normal directory are split between training and test dir." - " This will lead to inconsistency between runs." - ) - - if task == "segmentation" and mask_dir is None: - warnings.warn( - "Segmentation task is requested, but mask directory is not provided. " - "Classification is to be chosen if mask directory is not provided." - ) - self.task = "classification" - else: - self.task = task - - self.root = _check_and_convert_path(root) - self.normal_dir = self.root / normal_dir - self.abnormal_dir = self.root / abnormal_dir if abnormal_dir is not None else None - self.normal_test_dir = normal_test_dir - if normal_test_dir: - self.normal_test_dir = self.root / normal_test_dir - self.mask_dir = mask_dir - self.extensions = extensions self.split_ratio = split_ratio - self.create_validation_set = create_validation_set - self.seed = seed + pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) + pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) - def create_dataset(self): - return FolderDataset( - normal_dir=self.normal_dir, - abnormal_dir=self.abnormal_dir, - normal_test_dir=self.normal_test_dir, - mask_dir=self.mask_dir, - extensions=self.extensions, - split_ratio=self.split_ratio, - seed=self.seed, - create_validation_set=self.create_validation_set, - task=self.task, - pre_process=self.pre_process_train, + normal_dir = Path(root) / Path(normal_dir) + abnormal_dir = Path(root) / Path(abnormal_dir) + + self.train_data = Folder( + task=task, + pre_process=pre_process_train, + split=Split.TRAIN, + normal_dir=normal_dir, + abnormal_dir=abnormal_dir, + normal_test_dir=normal_test_dir, + mask_dir=mask_dir, + extensions=extensions, ) + + self.test_data = Folder( + task=task, + pre_process=pre_process_infer, + split=Split.TEST, + normal_dir=normal_dir, + abnormal_dir=abnormal_dir, + normal_test_dir=normal_test_dir, + mask_dir=mask_dir, + extensions=extensions, + ) + + def _setup(self, _stage: Optional[str] = None): + + assert self.train_data is not None + assert self.test_data is not None + + self.train_data.setup() + self.test_data.setup() + + if not self.test_data.has_normal: + self.train_data, normal_test_data = split_normals_and_anomalous(self.train_data, self.split_ratio) + self.test_data += normal_test_data + + if self.val_split_mode == ValSplitMode.FROM_TEST: + self.val_data, 
self.test_data = split_normals_and_anomalous(self.test_data, 0.5) + elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: + self.val_data = self.test_data + else: + raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 1772baf4f1..5c26a87f59 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -1,58 +1,77 @@ -"""MVTec AD Dataset (CC BY-NC-SA 4.0). - -Description: - This script contains PyTorch Dataset, Dataloader and PyTorch - Lightning DataModule for the MVTec AD dataset. - - If the dataset is not on the file system, the script downloads and - extracts the dataset and create PyTorch data objects. - -License: - MVTec AD dataset is released under the Creative Commons - Attribution-NonCommercial-ShareAlike 4.0 International License - (CC BY-NC-SA 4.0)(https://creativecommons.org/licenses/by-nc-sa/4.0/). - -Reference: - - Paul Bergmann, Kilian Batzner, Michael Fauser, David Sattlegger, Carsten Steger: - The MVTec Anomaly Detection Dataset: A Comprehensive Real-World Dataset for - Unsupervised Anomaly Detection; in: International Journal of Computer Vision - 129(4):1038-1059, 2021, DOI: 10.1007/s11263-020-01400-4. - - - Paul Bergmann, Michael Fauser, David Sattlegger, Carsten Steger: MVTec AD — - A Comprehensive Real-World Dataset for Unsupervised Anomaly Detection; - in: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR), - 9584-9592, 2019, DOI: 10.1109/CVPR.2019.00982. -""" - -# Copyright (C) 2022 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -import logging -import tarfile -import warnings from pathlib import Path from typing import Optional, Tuple, Union -from urllib.request import urlretrieve import albumentations as A -import pandas as pd -from pandas.core.frame import DataFrame -from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY +from pandas import DataFrame -from anomalib.data.base import AnomalibDataModule -from anomalib.data.utils import DownloadProgressBar, hash_check -from anomalib.data.utils.split import ( - create_validation_set_from_test_set, - split_normal_images_in_train_set, -) +from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode +from anomalib.data.utils.split import split_normals_and_anomalous +from anomalib.pre_processing import PreProcessor -logger = logging.getLogger(__name__) +def make_mvtec_dataset(root: Union[str, Path], split: Split = Split.FULL) -> DataFrame: + """Create MVTec AD samples by parsing the MVTec AD data file structure. -@DATAMODULE_REGISTRY -class MVTec(AnomalibDataModule): - """MVTec AD Lightning Data Module.""" + The files are expected to follow the structure: + path/to/dataset/split/category/image_filename.png + path/to/dataset/ground_truth/category/mask_filename.png + This function creates a dataframe to store the parsed information based on the following format: + |---|---------------|-------|---------|---------------|---------------------------------------|-------------| + | | path | split | label | image_path | mask_path | label_index | + |---|---------------|-------|---------|---------------|---------------------------------------|-------------| + | 0 | datasets/name | test | defect | filename.png | ground_truth/defect/filename_mask.png | 1 | + |---|---------------|-------|---------|---------------|---------------------------------------|-------------| + + Returns: + DataFrame: an output dataframe containing the samples of the dataset. 
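+
+    Example:
+        Assuming the MVTec AD dataset has been extracted under ./datasets/MVTec (the
+        path is shown for illustration only), the test samples of a category can be
+        gathered as follows:
+
+        >>> root = Path("./datasets/MVTec/bottle")
+        >>> samples = make_mvtec_dataset(root, split=Split.TEST)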
+ """ + samples_list = [(str(root),) + filename.parts[-3:] for filename in Path(root).glob("**/*.png")] + if len(samples_list) == 0: + raise RuntimeError(f"Found 0 images in {root}") + + samples = DataFrame(samples_list, columns=["path", "split", "label", "image_path"]) + samples = samples[samples.split != "ground_truth"] + + # Create mask_path column + samples["mask_path"] = ( + samples.path + + "/ground_truth/" + + samples.label + + "/" + + samples.image_path.str.rstrip("png").str.rstrip(".") + + "_mask.png" + ) + + # Modify image_path column by converting to absolute path + samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path + + # Good images don't have mask + samples.loc[(samples.split == "test") & (samples.label == "good"), "mask_path"] = "" + + # Create label index for normal (0) and anomalous (1) images. + samples.loc[(samples.label == "good"), "label_index"] = 0 + samples.loc[(samples.label != "good"), "label_index"] = 1 + samples.label_index = samples.label_index.astype(int) + + if split != Split.FULL: + samples = samples[samples.split == split].reset_index(drop=True) + + return samples + + +class MVTec(AnomalibDataset): + def __init__(self, task: str, pre_process: PreProcessor, split: Split, root, category, samples=None) -> None: + super().__init__(task=task, pre_process=pre_process, samples=samples) + + self.root_category = Path(root) / Path(category) + self.split = split + + def _setup(self): + self._samples = make_mvtec_dataset(self.root_category, split=self.split) + + +class MVTecDataModule(AnomalibDataModule): def __init__( self, root: str, @@ -64,156 +83,39 @@ def __init__( task: str = "segmentation", transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_val: Optional[Union[str, A.Compose]] = None, - split_ratio: float = 0.2, - seed: Optional[int] = None, - create_validation_set: bool = False, - ) -> None: - """Mvtec AD Lightning Data Module. - - Args: - root: Path to the MVTec AD dataset - category: Name of the MVTec AD category. - image_size: Variable to which image is resized. - train_batch_size: Training batch size. - test_batch_size: Testing batch size. - num_workers: Number of workers. - task: ``classification`` or ``segmentation`` - transform_config_train: Config for pre-processing during training. - transform_config_val: Config for pre-processing during validation. - seed: seed used for the random subset splitting - create_validation_set: Create a validation subset in addition to the train and test subsets - - Examples - >>> from anomalib.data import MVTec - >>> datamodule = MVTec( - ... root="./datasets/MVTec", - ... category="leather", - ... image_size=256, - ... train_batch_size=32, - ... test_batch_size=32, - ... num_workers=8, - ... transform_config_train=None, - ... transform_config_val=None, - ... 
) - >>> datamodule.setup() - - >>> i, data = next(enumerate(datamodule.train_dataloader())) - >>> data.keys() - dict_keys(['image']) - >>> data["image"].shape - torch.Size([32, 3, 256, 256]) - - >>> i, data = next(enumerate(datamodule.val_dataloader())) - >>> data.keys() - dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask']) - >>> data["image"].shape, data["mask"].shape - (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256])) - """ - self.root = root if isinstance(root, Path) else Path(root) - self.category = category - self.path = self.root / self.category - - self.create_validation_set = create_validation_set - self.seed = seed - self.split_ratio = split_ratio - + val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, + ): super().__init__( task=task, train_batch_size=train_batch_size, test_batch_size=test_batch_size, num_workers=num_workers, - transform_config_train=transform_config_train, - transform_config_val=transform_config_val, - image_size=image_size, - create_validation_set=create_validation_set, - ) - - def prepare_data(self) -> None: - """Download the dataset if not available.""" - if (self.root / self.category).is_dir(): - logger.info("Found the dataset.") - else: - self.root.mkdir(parents=True, exist_ok=True) - - logger.info("Downloading the Mvtec AD dataset.") - url = "https://www.mydrive.ch/shares/38536/3830184030e49fe74747669442f0f282/download/420938113-1629952094" - dataset_name = "mvtec_anomaly_detection.tar.xz" - zip_filename = self.root / dataset_name - with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc="MVTec AD") as progress_bar: - urlretrieve( - url=f"{url}/{dataset_name}", - filename=zip_filename, - reporthook=progress_bar.update_to, - ) - logger.info("Checking hash") - hash_check(zip_filename, "eefca59f2cede9c3fc5b6befbfec275e") - - logger.info("Extracting the dataset.") - with tarfile.open(zip_filename) as tar_file: - tar_file.extractall(self.root) - - logger.info("Cleaning the tar file") - zip_filename.unlink() - - def _create_samples(self) -> DataFrame: - """Create MVTec AD samples by parsing the MVTec AD data file structure. - - The files are expected to follow the structure: - path/to/dataset/split/category/image_filename.png - path/to/dataset/ground_truth/category/mask_filename.png - - This function creates a dataframe to store the parsed information based on the following format: - |---|---------------|-------|---------|---------------|---------------------------------------|-------------| - | | path | split | label | image_path | mask_path | label_index | - |---|---------------|-------|---------|---------------|---------------------------------------|-------------| - | 0 | datasets/name | test | defect | filename.png | ground_truth/defect/filename_mask.png | 1 | - |---|---------------|-------|---------|---------------|---------------------------------------|-------------| - - Returns: - DataFrame: an output dataframe containing the samples of the dataset. - """ - if self.seed is None: - warnings.warn( - "seed is None." - " When seed is not set, images from the normal directory are split between training and test dir." - " This will lead to inconsistency between runs." 
-            )
-
-        samples_list = [(str(self.path),) + filename.parts[-3:] for filename in self.path.glob("**/*.png")]
-        if len(samples_list) == 0:
-            raise RuntimeError(f"Found 0 images in {self.path}")
-
-        samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"])
-        samples = samples[samples.split != "ground_truth"]
-
-        # Create mask_path column
-        samples["mask_path"] = (
-            samples.path
-            + "/ground_truth/"
-            + samples.label
-            + "/"
-            + samples.image_path.str.rstrip("png").str.rstrip(".")
-            + "_mask.png"
-        )
-
-        # Modify image_path column by converting to absolute path
-        samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path
-
-        # Split the normal images in training set if test set doesn't
-        # contain any normal images. This is needed because AUC score
-        # cannot be computed based on 1-class
-        if sum((samples.split == "test") & (samples.label == "good")) == 0:
-            samples = split_normal_images_in_train_set(samples, self.split_ratio, self.seed)
-
-        # Good images don't have mask
-        samples.loc[(samples.split == "test") & (samples.label == "good"), "mask_path"] = ""
-
-        # Create label index for normal (0) and anomalous (1) images.
-        samples.loc[(samples.label == "good"), "label_index"] = 0
-        samples.loc[(samples.label != "good"), "label_index"] = 1
-        samples.label_index = samples.label_index.astype(int)
-
-        if self.create_validation_set:
-            samples = create_validation_set_from_test_set(samples, seed=self.seed)
-
-        return samples
+
+        self.val_split_mode = val_split_mode
+
+        pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size)
+        pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size)
+
+        self.train_data = MVTec(
+            task=task, pre_process=pre_process_train, split=Split.TRAIN, root=root, category=category
+        )
+        self.test_data = MVTec(task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category)
+
+    def _setup(self, _stage: Optional[str] = None) -> None:
+        """Set up the datasets and perform dynamic subset splitting if necessary.
+
+        This method may be overridden in subclasses for custom splitting behaviour.
+        """
+        assert self.train_data is not None
+        assert self.test_data is not None
+
+        self.train_data.setup()
+        self.test_data.setup()
+        if self.val_split_mode == ValSplitMode.FROM_TEST:
+            self.val_data, self.test_data = split_normals_and_anomalous(self.test_data, 0.5)
+        elif self.val_split_mode == ValSplitMode.SAME_AS_TEST:
+            self.val_data = self.test_data
+        else:
+            raise ValueError(f"Unknown validation split mode: {self.val_split_mode}")
diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py
index 311928bb6b..ba47dbdaa4 100644
--- a/anomalib/data/utils/split.py
+++ b/anomalib/data/utils/split.py
@@ -12,9 +12,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import random
-from typing import Optional
+from typing import Optional, Tuple
 
 from pandas.core.frame import DataFrame
+from torch.utils.data import Subset
+
+from anomalib.data.base import AnomalibDataset
 
 
 def split_normal_images_in_train_set(
@@ -84,3 +87,45 @@ def create_validation_set_from_test_set(
         samples.loc[indices_to_sample, "split"] = "val"
 
     return samples
+
+
+def split_normals_and_anomalous(
+    dataset: "AnomalibDataset", split_ratio: float, seed: Optional[int] = None
+) -> Tuple[Subset, Subset]:
+    """Randomly split a dataset into two non-overlapping subsets, sampling normals and anomalous separately.
+    Args:
+        dataset (AnomalibDataset): AnomalibDataset object.
+        split_ratio (float): Fraction (between 0 and 1) of the images that goes to the NEW split.
+        seed (int): Random seed to ensure reproducibility.
+    Returns:
+        Tuple[AnomalibDataset, AnomalibDataset]: (new split, old split).
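+    Example:
+        A sketch of carving a validation set out of an existing test set (assuming
+        ``dataset`` is an AnomalibDataset that has already been set up):
+
+        >>> val_data, test_data = split_normals_and_anomalous(dataset, split_ratio=0.5, seed=42)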
+    """
+
+    assert 0 < split_ratio < 1, "Split ratio must be between 0 and 1."
+    if seed is not None:
+        assert seed >= 0, "Seed must be non-negative."
+        random.seed(seed)
+
+    # get the indices of the normal/anomalous images in the dataset
+    normals_indices = dataset.samples.index[dataset.samples.label_index == 0].to_list()
+    anomalous_indices = dataset.samples.index[dataset.samples.label_index == 1].to_list()
+
+    # get the number of normal/anomalous images that will go to the new split
+    new_split_n_normals = int(len(normals_indices) * split_ratio)
+    new_split_n_anomalous = int(len(anomalous_indices) * split_ratio)
+
+    # randomly sample the indices of the normal/anomalous images that will go to the new split
+    new_split_normals_indices = random.sample(population=normals_indices, k=new_split_n_normals)
+    new_split_anomalous_indices = random.sample(population=anomalous_indices, k=new_split_n_anomalous)
+
+    # indices that remain in the original split
+    old_split_normals_indices = list(set(normals_indices) - set(new_split_normals_indices))
+    old_split_anomalous_indices = list(set(anomalous_indices) - set(new_split_anomalous_indices))
+
+    # create the new split and the (reduced) original split
+    new_split = dataset.subsample(new_split_normals_indices + new_split_anomalous_indices)
+    old_split = dataset.subsample(old_split_normals_indices + old_split_anomalous_indices)
+
+    return new_split, old_split
diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml
index 92e66618dc..a12d1d7a25 100644
--- a/anomalib/models/padim/config.yaml
+++ b/anomalib/models/padim/config.yaml
@@ -11,7 +11,7 @@ dataset:
   transform_config:
     train: null
     val: null
-  create_validation_set: false
+  validation_split_mode: from_test
 tiling:
   apply: false
   tile_size: null
diff --git a/tools/train.py b/tools/train.py
index 33952a7e20..37b894af79 100644
--- a/tools/train.py
+++ b/tools/train.py
@@ -63,7 +63,7 @@ def train():
         load_model_callback = LoadModelCallback(weights_path=trainer.checkpoint_callback.best_model_path)
         trainer.callbacks.insert(0, load_model_callback)
 
-    if datamodule.contains_anomalous_images("test"):
+    if datamodule.test_data.has_anomalous:
         logger.info("Testing the model.")
         trainer.test(model=model, datamodule=datamodule)
     else:
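A rough sketch of how the new validation_split_mode is consumed end to end (the path,
category and batch sizes are illustrative, and the MVTec AD data is assumed to already
be available under root):

    from anomalib.data.base import ValSplitMode
    from anomalib.data.mvtec import MVTecDataModule

    datamodule = MVTecDataModule(
        root="./datasets/MVTec",
        category="bottle",
        image_size=256,
        train_batch_size=32,
        test_batch_size=32,
        num_workers=8,
        val_split_mode=ValSplitMode.FROM_TEST,
    )
    datamodule.setup()
    # FROM_TEST: val_data and test_data hold disjoint halves of the test set.
    # SAME_AS_TEST: val_data is simply the test set itself.

Because ValSplitMode is a str Enum, get_datamodule can pass the raw config string
("from_test" or "same_as_test") straight into this argument.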
From 94cabb7ba637478cf6f2c97fe5c1a42ce248ad39 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 7 Oct 2022 18:00:28 +0200
Subject: [PATCH 20/96] remove unused constructor arguments

---
 anomalib/data/base.py        | 6 ------
 anomalib/data/folder.py      | 6 ++++--
 anomalib/data/mvtec.py       | 2 --
 anomalib/data/utils/split.py | 2 +-
 4 files changed, 5 insertions(+), 11 deletions(-)

diff --git a/anomalib/data/base.py b/anomalib/data/base.py
index 9c5ab99ed5..0b25f5aa66 100644
--- a/anomalib/data/base.py
+++ b/anomalib/data/base.py
@@ -149,18 +149,14 @@ class AnomalibDataModule(LightningDataModule, ABC):
 
     def __init__(
         self,
-        task: str,
         train_batch_size: int,
         test_batch_size: int,
         num_workers: int,
-        val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST,
     ):
         super().__init__()
-        self.task = task
         self.train_batch_size = train_batch_size
         self.test_batch_size = test_batch_size
         self.num_workers = num_workers
-        self.val_split_mode = val_split_mode
 
         self.train_data: Optional[AnomalibDataset] = None
         self.val_data: Optional[AnomalibDataset] = None
@@ -168,8 +164,6 @@ def __init__(
 
         self._samples: Optional[DataFrame] = None
 
-        self.data: Optional[AnomalibDataset] = None
-
     def setup(self, stage: Optional[str] = None):
         """Setup train, validation and test data.
 
diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index cebba3ea46..be20425269 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -197,13 +197,12 @@ def __init__(
         extensions=None,
     ):
         super().__init__(
-            task=task,
             train_batch_size=train_batch_size,
             test_batch_size=test_batch_size,
             num_workers=num_workers,
-            val_split_mode=val_split_mode,
         )
 
+        self.val_split_mode = val_split_mode
         self.split_ratio = split_ratio
 
         pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size)
@@ -242,11 +241,14 @@ def _setup(self, _stage: Optional[str] = None):
 
         self.train_data.setup()
         self.test_data.setup()
 
+        # add some normal images to the test set
         if not self.test_data.has_normal:
             self.train_data, normal_test_data = split_normals_and_anomalous(self.train_data, self.split_ratio)
             self.test_data += normal_test_data
 
+        # split validation set from test set
         if self.val_split_mode == ValSplitMode.FROM_TEST:
+            assert self.test_data is not None
             self.val_data, self.test_data = split_normals_and_anomalous(self.test_data, 0.5)
         elif self.val_split_mode == ValSplitMode.SAME_AS_TEST:
             self.val_data = self.test_data
diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py
index 5c26a87f59..72408a2eee 100644
--- a/anomalib/data/mvtec.py
+++ b/anomalib/data/mvtec.py
@@ -86,11 +86,9 @@ def __init__(
         val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST,
     ):
         super().__init__(
-            task=task,
             train_batch_size=train_batch_size,
             test_batch_size=test_batch_size,
             num_workers=num_workers,
-            val_split_mode=val_split_mode,
         )
 
         self.val_split_mode = val_split_mode
diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py
index ba47dbdaa4..c8ed8a5e90 100644
--- a/anomalib/data/utils/split.py
+++ b/anomalib/data/utils/split.py
@@ -91,7 +91,7 @@ def create_validation_set_from_test_set(
 
 def split_normals_and_anomalous(
     dataset: "AnomalibDataset", split_ratio: float, seed: Optional[int] = None
-) -> Tuple[Subset, Subset]:
+) -> Tuple[AnomalibDataset, AnomalibDataset]:
     """Randomly split a dataset into two non-overlapping subsets, sampling normals and anomalous separately.
     Args:
         dataset (AnomalibDataset): AnomalibDataset object.
From 1ee8a962fa0686f0eb3aa5fb5d8bfc3b06889cbb Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 7 Oct 2022 18:11:30 +0200
Subject: [PATCH 21/96] adapt btech to new design

---
 anomalib/data/__init__.py |  15 +--
 anomalib/data/btech.py    | 254 +++++++++++++++++++++++++-------------
 anomalib/data/mvtec.py    |  52 +++++++-
 3 files changed, 221 insertions(+), 100 deletions(-)

diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py
index d1da8af375..7ba61dc5c2 100644
--- a/anomalib/data/__init__.py
+++ b/anomalib/data/__init__.py
@@ -10,7 +10,7 @@
 
 from anomalib.data.base import AnomalibDataModule
 
-from .btech import BTech
+from .btech import BTechDataModule
 from .folder import FolderDataModule
 from .inference import InferenceDataset
 from .mvtec import MVTecDataModule
@@ -33,7 +33,6 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule
 
     if config.dataset.format.lower() == "mvtec":
         datamodule = MVTecDataModule(
-            # TODO: Remove config values. IAAALD-211
IAAALD-211
            root=config.dataset.path,
            category=config.dataset.category,
            image_size=(config.dataset.image_size[0], config.dataset.image_size[1]),
@@ -46,19 +45,17 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule:
            val_split_mode=config.dataset.validation_split_mode,
        )
    elif config.dataset.format.lower() == "btech":
-        datamodule = BTech(
-            # TODO: Remove config values. IAAALD-211
+        datamodule = BTechDataModule(
            root=config.dataset.path,
            category=config.dataset.category,
            image_size=(config.dataset.image_size[0], config.dataset.image_size[1]),
            train_batch_size=config.dataset.train_batch_size,
            test_batch_size=config.dataset.test_batch_size,
            num_workers=config.dataset.num_workers,
-            seed=config.project.seed,
            task=config.dataset.task,
            transform_config_train=config.dataset.transform_config.train,
            transform_config_val=config.dataset.transform_config.val,
-            create_validation_set=config.dataset.create_validation_set,
+            val_split_mode=config.dataset.validation_split_mode,
        )
    elif config.dataset.format.lower() == "folder":
        datamodule = FolderDataModule(
@@ -90,8 +87,8 @@

 __all__ = [
     "get_datamodule",
-    "BTech",
-    "Folder",
+    "BTechDataModule",
+    "FolderDataModule",
     "InferenceDataset",
-    "MVTec",
+    "MVTecDataModule",
 ]
diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py
index 489841ab94..fa799cc6e7 100644
--- a/anomalib/data/btech.py
+++ b/anomalib/data/btech.py
@@ -23,18 +23,153 @@
 from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY
 from tqdm import tqdm

-from anomalib.data.base import AnomalibDataModule
+from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode
 from anomalib.data.utils import DownloadProgressBar, hash_check
-from anomalib.data.utils.split import (
-    create_validation_set_from_test_set,
-    split_normal_images_in_train_set,
-)
+from anomalib.data.utils.split import split_normals_and_anomalous
+from anomalib.pre_processing import PreProcessor

 logger = logging.getLogger(__name__)


+def make_btech_dataset(path: Path, split: Optional[str] = None) -> DataFrame:
+    """Create BTech samples by parsing the BTech data file structure.
+
+    The files are expected to follow the structure:
+        path/to/dataset/split/category/image_filename.png
+        path/to/dataset/ground_truth/category/mask_filename.png
+
+    Args:
+        path (Path): Path to dataset
+        split (str, optional): Dataset split (ie., either train or test). Defaults to None.
+
+    Example:
+        The following example shows how to get training samples from BTech 01 category:
+
+        >>> root = Path('./BTech')
+        >>> category = '01'
+        >>> path = root / category
+        >>> path
+        PosixPath('BTech/01')
+
+        >>> samples = make_btech_dataset(path, split='train')
+        >>> samples.head()
+               path  split label image_path                 mask_path                        label_index
+        0  BTech/01  train 01    BTech/01/train/ok/105.bmp  BTech/01/ground_truth/ok/105.png 0
+        1  BTech/01  train 01    BTech/01/train/ok/017.bmp  BTech/01/ground_truth/ok/017.png 0
+        ...
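+
+        The resulting ``label_index`` column encodes normal ("ok") images as 0 and anomalous images as 1.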
+
+    Returns:
+        DataFrame: an output dataframe containing samples for the requested split (ie., train or test)
+    """
+    samples_list = [
+        (str(path),) + filename.parts[-3:] for filename in path.glob("**/*") if filename.suffix in (".bmp", ".png")
+    ]
+    if len(samples_list) == 0:
+        raise RuntimeError(f"Found 0 images in {path}")
+
+    samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"])
+    samples = samples[samples.split != "ground_truth"]
+
+    # Create mask_path column
+    samples["mask_path"] = (
+        samples.path
+        + "/ground_truth/"
+        + samples.label
+        + "/"
+        + samples.image_path.str.rstrip("bmp|png").str.rstrip(".")
+        + ".png"
+    )
+
+    # Modify image_path column by converting to absolute path
+    samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path
+
+    # Good images don't have mask
+    samples.loc[(samples.split == "test") & (samples.label == "ok"), "mask_path"] = ""
+
+    # Create label index for normal (0) and anomalous (1) images.
+    samples.loc[(samples.label == "ok"), "label_index"] = 0
+    samples.loc[(samples.label != "ok"), "label_index"] = 1
+    samples.label_index = samples.label_index.astype(int)
+
+    # Get the data frame for the split.
+    if split != Split.FULL:
+        samples = samples[samples.split == split]
+        samples = samples.reset_index(drop=True)
+
+    return samples
+
+
+class BTech(AnomalibDataset):
+    """BTech PyTorch Dataset."""
+
+    def __init__(
+        self,
+        root: Union[Path, str],
+        category: str,
+        pre_process: PreProcessor,
+        split: Split,
+        task: str = "segmentation",
+        samples: Optional[DataFrame] = None,
+    ) -> None:
+        """BTech Dataset class.
+
+        Args:
+            root: Path to the BTech dataset
+            category: Name of the BTech category.
+            pre_process: Pre-processor object containing the albumentations compose.
+            split: 'train', 'val' or 'test'
+            task: ``classification`` or ``segmentation``
+            samples: Optionally provide a prepared samples dataframe; when omitted, the samples
+                are created in ``_setup``.
+
+        Examples:
+            >>> from anomalib.data.btech import BTech
+            >>> from anomalib.pre_processing import PreProcessor
+            >>> pre_process = PreProcessor(image_size=256)
+            >>> dataset = BTech(
+            ...     root='./datasets/BTech',
+            ...     category='01',
+            ...     pre_process=pre_process,
+            ...     task="classification",
+            ...     split="train",
+            ... )
+            >>> dataset[0].keys()
+            dict_keys(['image'])
+
+            >>> dataset.split = "test"
+            >>> dataset[0].keys()
+            dict_keys(['image', 'image_path', 'label'])
+
+            >>> dataset.task = "segmentation"
+            >>> dataset.split = "train"
+            >>> dataset[0].keys()
+            dict_keys(['image'])
+
+            >>> dataset.split = "test"
+            >>> dataset[0].keys()
+            dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask'])
+
+            >>> dataset[0]["image"].shape, dataset[0]["mask"].shape
+            (torch.Size([3, 256, 256]), torch.Size([256, 256]))
+        """
+        super().__init__(task, pre_process, samples)
+
+        self.root_category = Path(root) / Path(category)
+        self.split = split
+
+    def _setup(self):
+        self._samples = make_btech_dataset(path=self.root_category, split=self.split)
+
+
 @DATAMODULE_REGISTRY
-class BTech(AnomalibDataModule):
+class BTechDataModule(AnomalibDataModule):
     """BTechDataModule Lightning Data Module."""

     def __init__(
@@ -48,9 +183,7 @@ def __init__(
         task: str = "segmentation",
         transform_config_train: Optional[Union[str, A.Compose]] = None,
         transform_config_val: Optional[Union[str, A.Compose]] = None,
-        split_ratio: float = 0.2,
-        seed: Optional[int] = None,
-        create_validation_set: bool = False,
+        val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST,
     ) -> None:
         """Instantiate BTech Lightning Data Module.

@@ -67,7 +200,7 @@ def __init__(
-            seed: seed used for the random subset splitting
-            create_validation_set: Create a validation subset in addition to the train and test subsets
+            val_split_mode: Setting that determines how the validation subset is obtained

-        Examples
+        Examples:
-            >>> from anomalib.data import BTech
-            >>> datamodule = BTech(
+            >>> from anomalib.data import BTechDataModule
+            >>> datamodule = BTechDataModule(
             ...     root="./datasets/BTech",
             ...     category="01",
@@ -93,24 +226,19 @@ def __init__(
             >>> data["image"].shape, data["mask"].shape
             (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256]))
         """
-        self.root = root if isinstance(root, Path) else Path(root)
-        self.category = category
-        self.path = self.root / self.category
-
-        self.create_validation_set = create_validation_set
-        self.seed = seed
-        self.split_ratio = split_ratio
-
-        super().__init__(
-            task=task,
-            train_batch_size=train_batch_size,
-            test_batch_size=test_batch_size,
-            num_workers=num_workers,
-            transform_config_train=transform_config_train,
-            transform_config_val=transform_config_val,
-            image_size=image_size,
-            create_validation_set=create_validation_set,
+        super().__init__(train_batch_size, test_batch_size, num_workers)
+
+        self.root = Path(root)
+        self.category = Path(category)
+        self.val_split_mode = val_split_mode
+
+        pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size)
+        pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size)
+
+        self.train_data = BTech(
+            task=task, pre_process=pre_process_train, split=Split.TRAIN, root=root, category=category
         )
+        self.test_data = BTech(task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category)

     def prepare_data(self) -> None:
         """Download the dataset if not available."""
@@ -153,62 +281,16 @@ def prepare_data(self) -> None:
         logger.info("Cleaning the tar file")
         zip_filename.unlink()

-    def _create_samples(self) -> DataFrame:
-        """Create BTech samples by parsing the BTech data file structure.
-
-        The files are expected to follow the structure:
-            path/to/dataset/category/split/[ok|ko]/image_filename.bmp
-            path/to/dataset/category/ground_truth/ko/mask_filename.png
-
-        This function creates a dataframe to store the parsed information based on the following format:
-        |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
-        |   | path          | split | label   | image_path    | mask_path                             | label_index |
-        |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
-        | 0 | datasets/name | test  | ko      | filename.png  | ground_truth/ko/filename_mask.png     | 1           |
-        |---|---------------|-------|---------|---------------|---------------------------------------|-------------|
-
-        Returns:
-            DataFrame: an output dataframe containing the samples of the dataset.
-        """
-        samples_list = [
-            (str(self.path),) + filename.parts[-3:]
-            for filename in self.path.glob("**/*")
-            if filename.suffix in (".bmp", ".png")
-        ]
-        if len(samples_list) == 0:
-            raise RuntimeError(f"Found 0 images in {self.path}")
-
-        samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"])
-        samples = samples[samples.split != "ground_truth"]
-
-        # Create mask_path column
-        samples["mask_path"] = (
-            samples.path
-            + "/ground_truth/"
-            + samples.label
-            + "/"
-            + samples.image_path.str.rstrip("bmp|png").str.rstrip(".")
-            + ".png"
-        )
-
-        # Modify image_path column by converting to absolute path
-        samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path
-
-        # Split the normal images in training set if test set doesn't
-        # contain any normal images. This is needed because AUC score
-        # cannot be computed based on 1-class
-        if sum((samples.split == "test") & (samples.label == "ok")) == 0:
-            samples = split_normal_images_in_train_set(samples, self.split_ratio, self.seed)
-
-        # Good images don't have mask
-        samples.loc[(samples.split == "test") & (samples.label == "ok"), "mask_path"] = ""
-
-        # Create label index for normal (0) and anomalous (1) images.
-        samples.loc[(samples.label == "ok"), "label_index"] = 0
-        samples.loc[(samples.label != "ok"), "label_index"] = 1
-        samples.label_index = samples.label_index.astype(int)
-
-        if self.create_validation_set:
-            samples = create_validation_set_from_test_set(samples, seed=self.seed)
-
-        return samples
+    def _setup(self, _stage: Optional[str] = None):
+        """Set up the datasets and perform dynamic subset splitting."""
+        assert self.train_data is not None
+        assert self.test_data is not None
+
+        self.train_data.setup()
+        self.test_data.setup()
+        if self.val_split_mode == ValSplitMode.FROM_TEST:
+            self.val_data, self.test_data = split_normals_and_anomalous(self.test_data, 0.5)
+        elif self.val_split_mode == ValSplitMode.SAME_AS_TEST:
+            self.val_data = self.test_data
+        else:
+            raise ValueError(f"Unknown validation split mode: {self.val_split_mode}")
diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py
index 72408a2eee..6e2ba99e5d 100644
--- a/anomalib/data/mvtec.py
+++ b/anomalib/data/mvtec.py
@@ -1,3 +1,28 @@
+"""MVTec AD Dataset (CC BY-NC-SA 4.0).
+
+Description:
+    This script contains PyTorch Dataset, Dataloader and PyTorch
+    Lightning DataModule for the MVTec AD dataset.
+    If the dataset is not on the file system, the script downloads and
+    extracts the dataset and creates PyTorch data objects.
+License:
+    MVTec AD dataset is released under the Creative Commons
+    Attribution-NonCommercial-ShareAlike 4.0 International License
+    (CC BY-NC-SA 4.0)(https://creativecommons.org/licenses/by-nc-sa/4.0/).
+Reference:
+    - Paul Bergmann, Kilian Batzner, Michael Fauser, David Sattlegger, Carsten Steger:
+      The MVTec Anomaly Detection Dataset: A Comprehensive Real-World Dataset for
+      Unsupervised Anomaly Detection; in: International Journal of Computer Vision
+      129(4):1038-1059, 2021, DOI: 10.1007/s11263-020-01400-4.
+    - Paul Bergmann, Michael Fauser, David Sattlegger, Carsten Steger: MVTec AD —
+      A Comprehensive Real-World Dataset for Unsupervised Anomaly Detection;
+      in: IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR),
+      9584-9592, 2019, DOI: 10.1109/CVPR.2019.00982.
+"""
+
+# Copyright (C) 2022 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
 from pathlib import Path
 from typing import Optional, Tuple, Union

@@ -61,7 +86,25 @@ def make_mvtec_dataset(root: Union[str, Path], split: Split = Split.FULL) -> Dat

 class MVTec(AnomalibDataset):
-    def __init__(self, task: str, pre_process: PreProcessor, split: Split, root, category, samples=None) -> None:
+    """MVTec dataset class.
+
+    Args:
+        task (str): Task type, either 'classification' or 'segmentation'
+        pre_process (PreProcessor): Pre-processor object
+        split (Split): Split of the dataset, usually Split.TRAIN or Split.TEST
+        root (str): Path to the root of the dataset
+        category (str): Sub-category of the dataset, e.g. 'bottle'
+    """
+
+    def __init__(
+        self,
+        task: str,
+        pre_process: PreProcessor,
+        split: Split,
+        root: str,
+        category: str,
+        samples: Optional[DataFrame] = None,
+    ) -> None:
         super().__init__(task=task, pre_process=pre_process, samples=samples)

         self.root_category = Path(root) / Path(category)
@@ -72,6 +115,8 @@


 class MVTecDataModule(AnomalibDataModule):
+    """MVTec Datamodule."""
+
     def __init__(
         self,
         root: str,
@@ -102,10 +147,7 @@ def __init__(
         self.test_data = MVTec(task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category)

     def _setup(self, _stage: Optional[str] = None) -> None:
-        """Set up the datasets and perform dynamic subset splitting if necessary.
-
-        This method may be overridden in subclasses for custom splitting behaviour.
- """ + """Set up the datasets and perform dynamic subset splitting.""" assert self.train_data is not None assert self.test_data is not None From 7fc5483ef57cd1d1eaf85dcacf719fa03bbded94 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 7 Oct 2022 18:17:20 +0200 Subject: [PATCH 22/96] add prepare_data method for mvtec --- anomalib/data/mvtec.py | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 6e2ba99e5d..8e14587162 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -23,16 +23,22 @@ # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import logging +import tarfile from pathlib import Path from typing import Optional, Tuple, Union +from urllib.request import urlretrieve import albumentations as A from pandas import DataFrame from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode +from anomalib.data.utils import DownloadProgressBar, hash_check from anomalib.data.utils.split import split_normals_and_anomalous from anomalib.pre_processing import PreProcessor +logger = logging.getLogger(__name__) + def make_mvtec_dataset(root: Union[str, Path], split: Split = Split.FULL) -> DataFrame: """Create MVTec AD samples by parsing the MVTec AD data file structure. @@ -136,6 +142,8 @@ def __init__( num_workers=num_workers, ) + self.root = Path(root) + self.category = Path(category) self.val_split_mode = val_split_mode pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) @@ -146,6 +154,33 @@ def __init__( ) self.test_data = MVTec(task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category) + def prepare_data(self) -> None: + """Download the dataset if not available.""" + if (self.root / self.category).is_dir(): + logger.info("Found the dataset.") + else: + self.root.mkdir(parents=True, exist_ok=True) + + logger.info("Downloading the Mvtec AD dataset.") + url = "https://www.mydrive.ch/shares/38536/3830184030e49fe74747669442f0f282/download/420938113-1629952094" + dataset_name = "mvtec_anomaly_detection.tar.xz" + zip_filename = self.root / dataset_name + with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc="MVTec AD") as progress_bar: + urlretrieve( + url=f"{url}/{dataset_name}", + filename=zip_filename, + reporthook=progress_bar.update_to, + ) + logger.info("Checking hash") + hash_check(zip_filename, "eefca59f2cede9c3fc5b6befbfec275e") + + logger.info("Extracting the dataset.") + with tarfile.open(zip_filename) as tar_file: + tar_file.extractall(self.root) + + logger.info("Cleaning the tar file") + (zip_filename).unlink() + def _setup(self, _stage: Optional[str] = None) -> None: """Set up the datasets and perform dynamic subset splitting.""" assert self.train_data is not None From 1ac7c652d9b66e7e49429138d170dc3a47165eae Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 10 Oct 2022 16:08:04 +0200 Subject: [PATCH 23/96] implement more generic random splitting function --- anomalib/data/base.py | 9 +- anomalib/data/btech.py | 4 +- anomalib/data/folder.py | 6 +- anomalib/data/mvtec.py | 5 +- anomalib/data/utils/split.py | 156 +++++++++++------------------------ 5 files changed, 63 insertions(+), 117 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 0b25f5aa66..392fa32438 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -52,7 +52,8 @@ def __len__(self) -> int: return len(self._samples) def subsample(self, indices): - 
return AnomalibDataset(task=self.task, pre_process=self.pre_process, samples=self.samples.iloc[indices]) + samples = self.samples.iloc[indices].reset_index(drop=True) + return AnomalibDataset(task=self.task, pre_process=self.pre_process, samples=samples) @property def is_setup(self) -> bool: @@ -119,6 +120,12 @@ def __add__(self, other_dataset: AnomalibDataset): samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True) return AnomalibDataset(self.task, self.pre_process, samples) + def __radd__(self, other): + if other == 0: + return self + else: + return self.__add__(other) + def setup(self) -> None: """Load data/metadata into memory""" if not self.is_setup: diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index fa799cc6e7..f7d03ac19b 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -25,7 +25,7 @@ from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode from anomalib.data.utils import DownloadProgressBar, hash_check -from anomalib.data.utils.split import split_normals_and_anomalous +from anomalib.data.utils.split import random_split from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) @@ -289,7 +289,7 @@ def _setup(self, _stage: Optional[str] = None): self.train_data.setup() self.test_data.setup() if self.val_split_mode == ValSplitMode.FROM_TEST: - self.val_data, self.test_data = split_normals_and_anomalous(self.test_data, 0.5) + self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data else: diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index be20425269..eef498735d 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -13,7 +13,7 @@ from torchvision.datasets.folder import IMG_EXTENSIONS from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode -from anomalib.data.utils.split import split_normals_and_anomalous +from anomalib.data.utils.split import random_split from anomalib.pre_processing.pre_process import PreProcessor @@ -243,13 +243,13 @@ def _setup(self, _stage: Optional[str] = None): # add some normal images to the test set if not self.test_data.has_normal: - self.train_data, normal_test_data = split_normals_and_anomalous(self.train_data, self.split_ratio) + self.train_data, normal_test_data = random_split(self.train_data, self.split_ratio) self.test_data += normal_test_data # split validation set from test set if self.val_split_mode == ValSplitMode.FROM_TEST: assert self.test_data is not None - self.val_data, self.test_data = split_normals_and_anomalous(self.test_data, 0.5) + self.val_data, self.test_data = random_split(self.train_data, [0.5, 0.5], label_aware=True) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data else: diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 70fca5c1fe..f5655090e9 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -34,7 +34,7 @@ from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode from anomalib.data.utils import DownloadProgressBar, hash_check -from anomalib.data.utils.split import split_normals_and_anomalous +from anomalib.data.utils.split import random_split from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) @@ -175,6 +175,7 @@ def __init__( self.category = Path(category) self.val_split_mode = val_split_mode + # TODO: Get 
rid of PreProcessor by passing transform directly pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) @@ -218,7 +219,7 @@ def _setup(self, _stage: Optional[str] = None) -> None: self.train_data.setup() self.test_data.setup() if self.val_split_mode == ValSplitMode.FROM_TEST: - self.val_data, self.test_data = split_normals_and_anomalous(self.test_data, 0.5) + self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data else: diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index c8ed8a5e90..b45fc23ea8 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -11,121 +11,59 @@ # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -import random -from typing import Optional, Tuple +import math +import warnings +from typing import Sequence, Union -from pandas.core.frame import DataFrame -from torch.utils.data import Subset +from torch import randperm, split from anomalib.data.base import AnomalibDataset -def split_normal_images_in_train_set( - samples: DataFrame, split_ratio: float = 0.1, seed: Optional[int] = None, normal_label: str = "good" -) -> DataFrame: - """Split normal images in train set. - - This function splits the normal images in training set and assigns the - values to the test set. This is particularly useful especially when the - test set does not contain any normal images. - - This is important because when the test set doesn't have any normal images, - AUC computation fails due to having single class. +def random_split( + dataset: AnomalibDataset, split_ratio: Union[float, Sequence[float]], label_aware: bool = False +) -> Sequence[AnomalibDataset]: + """Perform a random split of a dataset. Args: - samples (DataFrame): Dataframe containing dataset info such as filenames, splits etc. - split_ratio (float, optional): Train-Test normal image split ratio. Defaults to 0.1. - seed (int, optional): Random seed to ensure reproducibility. Defaults to 0. - normal_label (str): Name of the normal label. For MVTec AD, for instance, this is normal_label. - - Returns: - DataFrame: Output dataframe where the part of the training set is assigned to test set. + dataset (AnomalibDataset): Source dataset + split_ratio (Union[float, Sequence[float]]): Fractions of the splits that will be produced. The values in the + sequence must sum to 1. If a single value is passed, the ratio will be converted to + [1-split_ratio, split_ratio]. + label_aware (bool): When True, the relative occurrence of the different class labels of the source dataset will + be maintained in each of the subsets. """ - if seed is not None: - random.seed(seed) - - normal_train_image_indices = samples.index[(samples.split == "train") & (samples.label == normal_label)].to_list() - num_normal_train_images = len(normal_train_image_indices) - num_normal_valid_images = int(num_normal_train_images * split_ratio) - - indices_to_split_from_train_set = random.sample(population=normal_train_image_indices, k=num_normal_valid_images) - samples.loc[indices_to_split_from_train_set, "split"] = "test" - - return samples - - -def create_validation_set_from_test_set( - samples: DataFrame, seed: Optional[int] = None, normal_label: str = "good" -) -> DataFrame: - """Craete Validation Set from Test Set. 
-
-    This function creates a validation set from test set by splitting both
-    normal and abnormal samples to two.
-
-    Args:
-        samples (DataFrame): Dataframe containing dataset info such as filenames, splits etc.
-        seed (int, optional): Random seed to ensure reproducibility. Defaults to 0.
-        normal_label (str): Name of the normal label. For MVTec AD, for instance, this is normal_label.
-    """
-
-    if seed is not None:
-        random.seed(seed)
-
-    # Split normal images.
-    normal_test_image_indices = samples.index[(samples.split == "test") & (samples.label == normal_label)].to_list()
-    num_normal_valid_images = len(normal_test_image_indices) // 2
-
-    indices_to_sample = random.sample(population=normal_test_image_indices, k=num_normal_valid_images)
-    samples.loc[indices_to_sample, "split"] = "val"
-
-    # Split abnormal images.
-    abnormal_test_image_indices = samples.index[(samples.split == "test") & (samples.label != normal_label)].to_list()
-    num_abnormal_valid_images = len(abnormal_test_image_indices) // 2
-
-    indices_to_sample = random.sample(population=abnormal_test_image_indices, k=num_abnormal_valid_images)
-    samples.loc[indices_to_sample, "split"] = "val"
-
-    return samples
-
-
-def split_normals_and_anomalous(
-    dataset: "AnomalibDataset", split_ratio: float, seed: Optional[int] = None
-) -> Tuple[AnomalibDataset, AnomalibDataset]:
-    """Wrap dataset wit torch.utils.data.Subset twice to create two (non-overlaping) subsets.
-    Args:
-        dataset (AnomalibDataset): AnomalibDataset object.
-        split_ratio (float): Split ratio (0 to 1) that goes to the NEW split.
-        seed (int): Random seed to ensure reproducibility.
-    Returns:
-        Tuple[AnomalibDataset, AnomalibDataset]: (new split, old split).
-    """
-
-    assert 0 < split_ratio < 1, "Split ratio must be between 0 and 1."
-    if seed is not None:
-        assert seed >= 0, "Seed must be non-negative."
-        random.seed(seed)
-
-    # get the indices of the normal/anomalous images in the dataset
-    normals_indices = dataset.samples.index[dataset.samples.label_index == 0].to_list()
-    anomalous_indices = dataset.samples.index[dataset.samples.label_index == 1].to_list()
-
-    # get the number of normal/anomalous images that will go to the new split
-    new_split_n_normals = int(len(normals_indices) * split_ratio)
-    new_split_n_anomalous = int(len(anomalous_indices) * split_ratio)
-
-    # randomly sample the indices of the normal/anomalous images that will go to the new split
-    new_split_normals_indices = random.sample(population=normals_indices, k=new_split_n_normals)
-    new_split_anomalous_indices = random.sample(population=anomalous_indices, k=new_split_n_anomalous)
-
-    # indices that remain in the original split
-    old_split_normals_indices = list(set(normals_indices) - set(new_split_normals_indices))
-    old_split_anomalous_indices = list(set(anomalous_indices) - set(new_split_anomalous_indices))
-
-    # create the new split and the (reduced) original split
-    new_split = dataset.subsample(new_split_normals_indices + new_split_anomalous_indices)
-    old_split = dataset.subsample(old_split_normals_indices + old_split_anomalous_indices)
-
-    return new_split, old_split
+    if isinstance(split_ratio, float):
+        split_ratio = [1 - split_ratio, split_ratio]
+
+    assert math.isclose(sum(split_ratio), 1) and sum(split_ratio) <= 1, "split ratios must sum to 1."
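+    # every ratio must lie strictly between 0 and 1; a ratio of 0 or 1 would produce an empty subset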
+    assert all(0 < ratio < 1 for ratio in split_ratio), "all split ratios must be between 0 and 1."
+
+    # create list of source data
+    if label_aware:
+        indices_per_label = [group.index for _, group in dataset.samples.groupby("label_index")]
+        datasets = [dataset.subsample(indices) for indices in indices_per_label]
+    else:
+        datasets = [dataset]
+
+    # split each (label-aware) subset of source data
+    subsets = []
+    for dataset in datasets:
+        # get subset lengths
+        subset_lengths = []
+        for ratio in split_ratio:
+            subset_lengths.append(int(math.floor(len(dataset) * ratio)))
+        for i in range(len(dataset) - sum(subset_lengths)):
+            subset_idx = i % sum(subset_lengths)
+            subset_lengths[subset_idx] += 1
+        for index, length in enumerate(subset_lengths):
+            if length == 0:
+                warnings.warn(f"Length of subset at index {index} is 0.")
+        # perform random subsampling
+        indices = randperm(len(dataset))
+        subsets.append([dataset.subsample(subset_indices) for subset_indices in split(indices, subset_lengths)])
+
+    # concatenate and return
+    subsets = list(map(list, zip(*subsets)))
+    return tuple(sum(subset) for subset in subsets)

From 965ea949b8c22816583128ab1ca6396f94f8bcfc Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Mon, 10 Oct 2022 17:15:47 +0200
Subject: [PATCH 24/96] update docstrings for folder module

---
 anomalib/data/folder.py | 118 +++++++++++++++++++++++++++++-----------
 1 file changed, 86 insertions(+), 32 deletions(-)

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index eef498735d..4e88b22db4 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -9,6 +9,7 @@
 from pathlib import Path
 from typing import Optional, Tuple, Union

+import albumentations as A
 from pandas import DataFrame
 from torchvision.datasets.folder import IMG_EXTENSIONS

@@ -79,13 +80,7 @@ def make_folder_dataset(
             if `None`. Defaults to None.
         mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing
             the mask annotations. Defaults to None.
-        split (Optional[str], optional): Dataset split (ie., either train or test). Defaults to None.
-        split_ratio (float, optional): Ratio to split normal training images and add to the
-            test set in case test set doesn't contain any normal images.
-            Defaults to 0.2.
-        seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0.
-        create_validation_set (bool, optional):Boolean to create a validation set from the test set.
-            Those wanting to create a validation set could set this flag to ``True``.
+        split (Optional[Split], optional): Dataset split (ie., Split.FULL, Split.TRAIN or Split.TEST). Defaults to None.

         extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the
             directory.
@@ -139,13 +134,39 @@

 class Folder(AnomalibDataset):
+    """Folder dataset.
+
+    Args:
+        task (str): Task type (classification or segmentation).
+        pre_process (PreProcessor): Image Pre-processor to apply transform.
+        split (Split): Fixed subset split that follows from folder structure on file system. Choose from
+            [Split.FULL, Split.TRAIN, Split.TEST]
+
+        root (Union[str, Path]): Root folder of the dataset.
+        normal_dir (Union[str, Path]): Path to the directory containing normal images.
+        abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images.
+ normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing + normal images for the test dataset. Defaults to None. + mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing + the mask annotations. Defaults to None. + + extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the + directory. + val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + + Raises: + ValueError: When task is set to classification and `mask_dir` is provided. When `mask_dir` is + provided, `task` should be set to `segmentation`. + """ + def __init__( self, task: str, pre_process: PreProcessor, split: Split, # + root: Union[str, Path], normal_dir: Union[str, Path], abnormal_dir: Union[str, Path], normal_test_dir: Optional[Union[str, Path]] = None, mask_dir: Optional[Union[str, Path]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, - extensions=None, - samples=None, + extensions: Optional[Tuple[str]] = None, + samples: DataFrame = None, ) -> None: super().__init__(task, pre_process, samples=samples) self.split = split - - self.normal_dir = normal_dir - self.abnormal_dir = abnormal_dir + self.normal_dir = Path(root) / Path(normal_dir) + self.abnormal_dir = Path(root) / Path(abnormal_dir) self.normal_test_dir = normal_test_dir self.mask_dir = mask_dir self.extensions = extensions @@ -166,6 +187,7 @@ def __init__( self.val_split_mode = val_split_mode def _setup(self): + """Assign samples.""" self._samples = make_folder_dataset( normal_dir=self.normal_dir, abnormal_dir=self.abnormal_dir, @@ -177,24 +199,57 @@ def _setup(self): class FolderDataModule(AnomalibDataModule): + """Folder DataModule. + + Args: + root (Union[str, Path]): Path to the root folder containing normal and abnormal dirs. + normal_dir (Union[str, Path]): Name of the directory containing normal images. + Defaults to "normal". + abnormal_dir (str, optional): Name of the directory containing abnormal images. + Defaults to "abnormal". + normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing + normal images for the test dataset. Defaults to None. + mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing + the mask annotations. Defaults to None. + split_ratio (float, optional): Ratio to split normal training images and add to the + test set in case test set doesn't contain any normal images. + Defaults to 0.2. + extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the + directory. Defaults to None. + image_size (Optional[Union[int, Tuple[int, int]]], optional): Size of the input image. + Defaults to None. + train_batch_size (int, optional): Training batch size. Defaults to 32. + test_batch_size (int, optional): Test batch size. Defaults to 32. + num_workers (int, optional): Number of workers. Defaults to 8. + task (str, optional): Task type. Could be either classification or segmentation. + Defaults to "classification". + transform_config_train (Optional[Union[str, A.Compose]], optional): Config for pre-processing + during training. + Defaults to None. + transform_config_val (Optional[Union[str, A.Compose]], optional): Config for pre-processing + during validation. + Defaults to None. + val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. 
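+
+    Example:
+        A minimal, illustrative sketch (the dataset location and directory names below are
+        placeholder assumptions, not part of this module):
+
+        >>> datamodule = FolderDataModule(
+        ...     root="./datasets/bottle",
+        ...     normal_dir="good",
+        ...     abnormal_dir="broken_large",
+        ...     normal_test_dir=None,
+        ...     mask_dir=None,
+        ...     split_ratio=0.2,
+        ...     task="classification",
+        ...     image_size=256,
+        ... )
+        >>> datamodule.setup()
+        >>> batch = next(iter(datamodule.train_dataloader()))
+        >>> batch["image"].shape
+        torch.Size([32, 3, 256, 256])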
+ """ + def __init__( self, - root, - task, - train_batch_size, - test_batch_size, - image_size, - num_workers, - val_split_mode, + root: Union[str, Path], + normal_dir: Union[str, Path], + abnormal_dir: Union[str, Path], + normal_test_dir: Union[str, Path], + mask_dir: Union[str, Path], + split_ratio: float, + extensions: Optional[Tuple[str]] = None, # - normal_dir, - abnormal_dir, - normal_test_dir, - mask_dir, - split_ratio, - transform_config_train=None, - transform_config_val=None, - extensions=None, + image_size: Optional[Union[int, Tuple[int, int]]] = None, + train_batch_size: int = 32, + test_batch_size: int = 32, + num_workers: int = 8, + task: str = "segmentation", + transform_config_train: Optional[Union[str, A.Compose]] = None, + transform_config_val: Optional[Union[str, A.Compose]] = None, + val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, ): super().__init__( train_batch_size=train_batch_size, @@ -208,13 +263,11 @@ def __init__( pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) - normal_dir = Path(root) / Path(normal_dir) - abnormal_dir = Path(root) / Path(abnormal_dir) - self.train_data = Folder( task=task, pre_process=pre_process_train, split=Split.TRAIN, + root=root, normal_dir=normal_dir, abnormal_dir=abnormal_dir, normal_test_dir=normal_test_dir, @@ -226,6 +279,7 @@ def __init__( task=task, pre_process=pre_process_infer, split=Split.TEST, + root=root, normal_dir=normal_dir, abnormal_dir=abnormal_dir, normal_test_dir=normal_test_dir, @@ -234,7 +288,7 @@ def __init__( ) def _setup(self, _stage: Optional[str] = None): - + """Set up the datasets for the Folder Data Module.""" assert self.train_data is not None assert self.test_data is not None @@ -249,7 +303,7 @@ def _setup(self, _stage: Optional[str] = None): # split validation set from test set if self.val_split_mode == ValSplitMode.FROM_TEST: assert self.test_data is not None - self.val_data, self.test_data = random_split(self.train_data, [0.5, 0.5], label_aware=True) + self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data else: From 2a9f6f8a18158350c1a1e214f1249a85f58d261e Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 10 Oct 2022 19:06:45 +0200 Subject: [PATCH 25/96] ensure type consistency when performing operations on dataset --- anomalib/data/base.py | 68 ++++++++++++++++++++++++++---------- anomalib/data/utils/split.py | 21 +++++++++-- 2 files changed, 67 insertions(+), 22 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 392fa32438..268f82782f 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -5,10 +5,11 @@ from __future__ import annotations +import copy import logging from abc import ABC, abstractmethod from enum import Enum -from typing import Dict, Optional, Union +from typing import Dict, Optional, Sequence, Union import cv2 import numpy as np @@ -26,6 +27,8 @@ class Split(str, Enum): + """Split of a subset.""" + FULL = "full" TRAIN = "train" VAL = "val" @@ -33,11 +36,13 @@ class Split(str, Enum): class ValSplitMode(str, Enum): + """Splitting mode used to obtain validation subset.""" + SAME_AS_TEST = "same_as_test" FROM_TEST = "from_test" -class AnomalibDataset(Dataset): +class AnomalibDataset(Dataset, ABC): """Anomalib dataset.""" def __init__(self, task: str, pre_process: PreProcessor, samples: 
Optional[DataFrame] = None):
@@ -51,28 +56,45 @@ def __len__(self) -> int:
         assert isinstance(self._samples, DataFrame)
         return len(self._samples)

-    def subsample(self, indices):
-        samples = self.samples.iloc[indices].reset_index(drop=True)
-        return AnomalibDataset(task=self.task, pre_process=self.pre_process, samples=samples)
+    def subsample(self, indices: Sequence[int], inplace=False) -> AnomalibDataset:
+        """Subsamples the dataset at the provided indices.
+
+        Args:
+            indices (Sequence[int]): Indices at which the dataset is to be subsampled.
+            inplace (bool): When true, the subsampling will be performed on the instance itself.
+        """
+        dataset = self if inplace else copy.deepcopy(self)
+        dataset.assign_samples(self.samples.iloc[indices].reset_index(drop=True))
+        return dataset

     @property
     def is_setup(self) -> bool:
-        """Has setup() been called?"""
+        """Checks if setup() has been called."""
         return isinstance(self._samples, DataFrame)

     @property
     def samples(self) -> DataFrame:
-        """TODO"""
+        """Get the samples dataframe."""
         if not self.is_setup:
             raise RuntimeError("Dataset is not setup yet. Call setup() first.")
         return self._samples

+    def assign_samples(self, samples: DataFrame):
+        """Overwrite the samples with a new dataframe.
+
+        Args:
+            samples (DataFrame): DataFrame with new samples.
+        """
+        self._samples = samples
+
     @property
     def has_normal(self) -> bool:
+        """Check if the dataset contains any normal samples."""
         return 0 in list(self.samples.label_index)

     @property
     def has_anomalous(self) -> bool:
+        """Check if the dataset contains any anomalous samples."""
         return 1 in list(self.samples.label_index)

     def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
@@ -115,25 +137,24 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:

         return item

-    def __add__(self, other_dataset: AnomalibDataset):
+    def __add__(self, other_dataset: AnomalibDataset) -> AnomalibDataset:
+        """Concatenate this dataset with another dataset."""
+        assert isinstance(other_dataset, self.__class__), "Cannot concatenate datasets that are not of same type."
         assert self.is_setup and other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first."
-        samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True)
-        return AnomalibDataset(self.task, self.pre_process, samples)
-
-    def __radd__(self, other):
-        if other == 0:
-            return self
-        else:
-            return self.__add__(other)
+        dataset = copy.deepcopy(self)
+        dataset.assign_samples(pd.concat([self.samples, other_dataset.samples], ignore_index=True))
+        return dataset

     def setup(self) -> None:
-        """Load data/metadata into memory"""
+        """Load data/metadata into memory."""
         if not self.is_setup:
             self._setup()
         assert self.is_setup, "setup() should set self._samples"

+    @abstractmethod
     def _setup(self) -> DataFrame:
-        """previous _create_samples()
+        """Set up the dataset.
+
         This method should return a dataframe that contains the information needed by the dataloader to load each of
         the dataset items into memory. The dataframe must at least contain the following columns:
         split: the subset to which the dataset item is assigned.
         image_path: path to file system location where the image is stored.
         label_index: index of the anomaly label, typically 0 for "normal" and 1 for "anomalous".
         mask_path (if task == "segmentation"): path to the ground truth masks (for the anomalous images only).
+
         Example:
         |---|-------------------|-----------|-------------|------------------|-------|
         |   | image_path        | label     | label_index | mask_path        | split |
         |---|-------------------|-----------|-------------|------------------|-------|
         | 0 | path/to/image.png | anomalous | 0           | path/to/mask.png | train |
         |---|-------------------|-----------|-------------|------------------|-------|
         """
         pass


 class AnomalibDataModule(LightningDataModule, ABC):
-    """Base Anomalib data module."""
+    """Base Anomalib data module.
+
+    Args:
+        train_batch_size (int): Batch size used by the train dataloader.
+        test_batch_size (int): Batch size used by the val and test dataloaders.
+        num_workers (int): Number of workers used by the train, val and test dataloaders.
+    """

     def __init__(
         self,
@@ -211,7 +212,7 @@

     @abstractmethod
     def _setup(self, _stage: Optional[str] = None) -> None:
+        """To be implemented in concrete subclass."""
         pass

     @property
     def is_setup(self):
+        """Checks if setup() has been called."""
         if self.train_data is None or self.val_data is None or self.test_data is None:
             return False
         return self.train_data.is_setup and self.val_data.is_setup and self.test_data.is_setup
diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py
index b45fc23ea8..367359bdf2 100644
--- a/anomalib/data/utils/split.py
+++ b/anomalib/data/utils/split.py
@@ -13,16 +13,31 @@
 import math
 import warnings
-from typing import Sequence, Union
+from typing import List, Sequence, Union

 from torch import randperm, split

 from anomalib.data.base import AnomalibDataset


+def concatenate_datasets(datasets: Sequence[AnomalibDataset]) -> AnomalibDataset:
+    """Concatenate multiple datasets into a single dataset object.
+
+    Args:
+        datasets (Sequence[AnomalibDataset]): Sequence of at least two datasets.
+
+    Returns:
+        AnomalibDataset: Dataset that contains the combined samples of all input datasets.
+    """
+    concat_dataset = datasets[0]
+    for dataset in datasets[1:]:
+        concat_dataset += dataset
+    return concat_dataset
+
+
 def random_split(
     dataset: AnomalibDataset, split_ratio: Union[float, Sequence[float]], label_aware: bool = False
-) -> Sequence[AnomalibDataset]:
+) -> List[AnomalibDataset]:
     """Perform a random split of a dataset.
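+
+    A single float ``split_ratio`` is shorthand for ``[1 - split_ratio, split_ratio]``; for example,
+    ``random_split(dataset, 0.25)`` returns two subsets holding roughly 75% and 25% of the samples.
+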
Args: @@ -66,4 +81,4 @@ def random_split( # concatenate and return subsets = list(map(list, zip(*subsets))) - return tuple(sum(subset) for subset in subsets) + return [concatenate_datasets(subset) for subset in subsets] From 84997b9d9d6cc637ab8e70a4d91ffb104c792beb Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 10 Oct 2022 19:28:00 +0200 Subject: [PATCH 26/96] change imports --- anomalib/data/utils/split.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 367359bdf2..2b11e32475 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -15,7 +15,7 @@ import warnings from typing import List, Sequence, Union -from torch import randperm, split +import torch from anomalib.data.base import AnomalibDataset @@ -76,8 +76,8 @@ def random_split( if length == 0: warnings.warn(f"Length of subset at index {index} is 0.") # perform random subsampling - indices = randperm(len(dataset)) - subsets.append([dataset.subsample(subset_indices) for subset_indices in split(indices, subset_lengths)]) + indices = torch.randperm(len(dataset)) + subsets.append([dataset.subsample(subset_indices) for subset_indices in torch.split(indices, subset_lengths)]) # concatenate and return subsets = list(map(list, zip(*subsets))) From f21c652c933c636b6ead107a44d36c768a925ebd Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 10 Oct 2022 19:34:10 +0200 Subject: [PATCH 27/96] change variable names --- anomalib/data/utils/split.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 2b11e32475..266583d907 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -58,26 +58,28 @@ def random_split( # create list of source data if label_aware: indices_per_label = [group.index for _, group in dataset.samples.groupby("label_index")] - datasets = [dataset.subsample(indices) for indices in indices_per_label] + per_label_datasets = [dataset.subsample(indices) for indices in indices_per_label] else: - datasets = [dataset] + per_label_datasets = [dataset] # split each (label-aware) subset of source data subsets = [] - for dataset in datasets: + for label_dataset in per_label_datasets: # get subset lengths subset_lengths = [] for ratio in split_ratio: - subset_lengths.append(int(math.floor(len(dataset) * ratio))) - for i in range(len(dataset) - sum(subset_lengths)): + subset_lengths.append(int(math.floor(len(label_dataset) * ratio))) + for i in range(len(label_dataset) - sum(subset_lengths)): subset_idx = i % sum(subset_lengths) subset_lengths[subset_idx] += 1 for index, length in enumerate(subset_lengths): if length == 0: warnings.warn(f"Length of subset at index {index} is 0.") # perform random subsampling - indices = torch.randperm(len(dataset)) - subsets.append([dataset.subsample(subset_indices) for subset_indices in torch.split(indices, subset_lengths)]) + indices = torch.randperm(len(label_dataset)) + subsets.append( + [label_dataset.subsample(subset_indices) for subset_indices in torch.split(indices, subset_lengths)] + ) # concatenate and return subsets = list(map(list, zip(*subsets))) From ab7d0ff61693767eee17aca49ba5f1834e17f3e1 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 10 Oct 2022 19:34:48 +0200 Subject: [PATCH 28/96] replace pass with NotImplementedError --- anomalib/data/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anomalib/data/base.py 
b/anomalib/data/base.py
index 268f82782f..d6dcea8b17 100644
--- a/anomalib/data/base.py
+++ b/anomalib/data/base.py
@@ -170,7 +170,7 @@ def _setup(self) -> DataFrame:
         | 0 | path/to/image.png | anomalous | 0           | path/to/mask.png | train |
         |---|-------------------|-----------|-------------|------------------|-------|
         """
-        pass
+        raise NotImplementedError


 class AnomalibDataModule(LightningDataModule, ABC):
@@ -212,7 +212,7 @@
     def _setup(self, _stage: Optional[str] = None) -> None:
         """To be implemented in concrete subclass."""
-        pass
+        raise NotImplementedError

     @property
     def is_setup(self):

From d7e47a942195c72122849696ecde74e33fd2f770 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Tue, 11 Oct 2022 14:05:39 +0200
Subject: [PATCH 29/96] allow training on folder without test images

---
 anomalib/data/folder.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index 4e88b22db4..c059129191 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -64,7 +64,7 @@ def _prepare_files_labels(

 def make_folder_dataset(
     normal_dir: Union[str, Path],
-    abnormal_dir: Union[str, Path],
+    abnormal_dir: Optional[Union[str, Path]] = None,
     normal_test_dir: Optional[Union[str, Path]] = None,
     mask_dir: Optional[Union[str, Path]] = None,
     split: Optional[str] = None,
@@ -90,7 +90,10 @@
     filenames = []
     labels = []

-    dirs = {"normal": normal_dir, "abnormal": abnormal_dir}
+    dirs = {"normal": normal_dir}
+
+    if abnormal_dir:
+        dirs = {**dirs, **{"abnormal": abnormal_dir}}

     if normal_test_dir:
         dirs = {**dirs, **{"normal_test": normal_test_dir}}
@@ -168,7 +171,7 @@ def __init__(
         #
         root: Union[str, Path],
         normal_dir: Union[str, Path],
-        abnormal_dir: Union[str, Path],
+        abnormal_dir: Optional[Union[str, Path]] = None,
         normal_test_dir: Optional[Union[str, Path]] = None,
         mask_dir: Optional[Union[str, Path]] = None,
         val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST,
@@ -179,7 +182,7 @@

         self.split = split
         self.normal_dir = Path(root) / Path(normal_dir)
-        self.abnormal_dir = Path(root) / Path(abnormal_dir)
+        self.abnormal_dir = Path(root) / Path(abnormal_dir) if abnormal_dir else None
         self.normal_test_dir = normal_test_dir
         self.mask_dir = mask_dir
         self.extensions = extensions

From da851c6869dd31712359828749be2b205592d4b5 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Tue, 11 Oct 2022 14:23:37 +0200
Subject: [PATCH 30/96] use relative path for normal_test_dir

---
 anomalib/data/folder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index c059129191..01f6f1f031 100644
--- a/anomalib/data/folder.py
+++ b/anomalib/data/folder.py
@@ -183,7 +183,7 @@ def __init__(
         self.split = split
         self.normal_dir = Path(root) / Path(normal_dir)
         self.abnormal_dir = Path(root) / Path(abnormal_dir) if abnormal_dir else None
-        self.normal_test_dir = normal_test_dir
+        self.normal_test_dir = Path(root) / Path(normal_test_dir) if normal_test_dir else None
         self.mask_dir = mask_dir
         self.extensions = extensions

From f3e38ba384ad4ec716201798f097ee74ada734bb Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Tue, 11 Oct 2022 14:43:00 +0200
Subject: [PATCH 31/96] fix dataset tests

---
 anomalib/data/folder.py                  |  8 ++++----
 tests/pre_merge/datasets/test_dataset.py | 15 +++++++++------
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py
index
01f6f1f031..e0fe0c16dc 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -240,9 +240,9 @@ def __init__( root: Union[str, Path], normal_dir: Union[str, Path], abnormal_dir: Union[str, Path], - normal_test_dir: Union[str, Path], - mask_dir: Union[str, Path], - split_ratio: float, + normal_test_dir: Optional[Union[str, Path]] = None, + mask_dir: Optional[Union[str, Path]] = None, + split_ratio: float = 0.2, extensions: Optional[Tuple[str]] = None, # image_size: Optional[Union[int, Tuple[int, int]]] = None, @@ -252,7 +252,7 @@ def __init__( task: str = "segmentation", transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_val: Optional[Union[str, A.Compose]] = None, - val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, + val_split_mode: ValSplitMode = ValSplitMode.FROM_TEST, ): super().__init__( train_batch_size=train_batch_size, diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index 06d9629b45..789b833ff8 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -6,7 +6,12 @@ import pytest from anomalib.config import update_input_size_config -from anomalib.data import BTech, Folder, MVTec, get_datamodule +from anomalib.data import ( + BTechDataModule, + FolderDataModule, + MVTecDataModule, + get_datamodule, +) from anomalib.pre_processing.transforms import Denormalize, ToNumpy from tests.helpers.config import get_test_configurable_parameters from tests.helpers.dataset import TestDataset, get_dataset_path @@ -14,7 +19,7 @@ @pytest.fixture(autouse=True) def mvtec_data_module(): - datamodule = MVTec( + datamodule = MVTecDataModule( root=get_dataset_path(dataset="MVTec"), category="leather", image_size=(256, 256), @@ -31,7 +36,7 @@ def mvtec_data_module(): @pytest.fixture(autouse=True) def btech_data_module(): """Create BTech Data Module.""" - datamodule = BTech( + datamodule = BTechDataModule( root=get_dataset_path(dataset="BTech"), category="01", image_size=(256, 256), @@ -49,19 +54,17 @@ def btech_data_module(): def folder_data_module(): """Create Folder Data Module.""" root = get_dataset_path(dataset="bottle") - datamodule = Folder( + datamodule = FolderDataModule( root=root, normal_dir="good", abnormal_dir="broken_large", mask_dir=os.path.join(root, "ground_truth/broken_large"), task="segmentation", split_ratio=0.2, - seed=0, image_size=(256, 256), train_batch_size=32, test_batch_size=32, num_workers=8, - create_validation_set=True, ) datamodule.setup() From f4719f2f7dad840beb7f1926fa4084a4baa4aca8 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Tue, 11 Oct 2022 15:00:05 +0200 Subject: [PATCH 32/96] update validation set parameter in configs --- anomalib/config/config.py | 7 +++++++ anomalib/models/cflow/config.yaml | 2 +- anomalib/models/dfkde/config.yaml | 2 +- anomalib/models/dfm/config.yaml | 2 +- anomalib/models/draem/config.yaml | 2 +- anomalib/models/fastflow/config.yaml | 2 +- anomalib/models/ganomaly/config.yaml | 2 +- anomalib/models/patchcore/config.yaml | 2 +- anomalib/models/reverse_distillation/config.yaml | 2 +- anomalib/models/stfpm/config.yaml | 2 +- 10 files changed, 16 insertions(+), 9 deletions(-) diff --git a/anomalib/config/config.py b/anomalib/config/config.py index 6312c1012f..9b174bc162 100644 --- a/anomalib/config/config.py +++ b/anomalib/config/config.py @@ -136,6 +136,13 @@ def get_configurable_parameters( if "format" not in config.dataset.keys(): config.dataset.format = "mvtec" + if "create_validation_set" in 
config.dataset.keys(): + warn( + "The 'create_validation_set' parameter is deprecated and will be removed in v0.4.0. Please use " + "validation_split_mode instead." + ) + config.dataset.validation_split_mode = "from_test" if config.dataset.create_validation_set else "same_as_test" + config = update_input_size_config(config) # Project Configs diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index be166c2417..0a8eec5a65 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -13,7 +13,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test model: name: cflow diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml index 070f0c2456..538a806bc6 100644 --- a/anomalib/models/dfkde/config.yaml +++ b/anomalib/models/dfkde/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test model: name: dfkde diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml index 47db50fb4e..104f7a2a07 100755 --- a/anomalib/models/dfm/config.yaml +++ b/anomalib/models/dfm/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test model: name: dfm diff --git a/anomalib/models/draem/config.yaml b/anomalib/models/draem/config.yaml index 9f3326daa1..05ab67360c 100644 --- a/anomalib/models/draem/config.yaml +++ b/anomalib/models/draem/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: ./anomalib/models/draem/transform_config.yaml val: ./anomalib/models/draem/transform_config.yaml - create_validation_set: false + validation_split_mode: same_as_test tiling: apply: false tile_size: null diff --git a/anomalib/models/fastflow/config.yaml b/anomalib/models/fastflow/config.yaml index b02f430fd9..cef97d58fb 100644 --- a/anomalib/models/fastflow/config.yaml +++ b/anomalib/models/fastflow/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test tiling: apply: false tile_size: null diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml index 2e5dfb6bba..3e0a0cc677 100644 --- a/anomalib/models/ganomaly/config.yaml +++ b/anomalib/models/ganomaly/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test tiling: apply: true tile_size: 64 diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml index 31567ad530..9f98f7604a 100644 --- a/anomalib/models/patchcore/config.yaml +++ b/anomalib/models/patchcore/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test tiling: apply: false tile_size: null diff --git a/anomalib/models/reverse_distillation/config.yaml b/anomalib/models/reverse_distillation/config.yaml index d30f3baedf..cc474091e4 100644 --- a/anomalib/models/reverse_distillation/config.yaml +++ b/anomalib/models/reverse_distillation/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test tiling: apply: false tile_size: 64 diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml index fe3637bf27..524e58e42b 100644 
--- a/anomalib/models/stfpm/config.yaml +++ b/anomalib/models/stfpm/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null val: null - create_validation_set: false + validation_split_mode: same_as_test tiling: apply: false tile_size: null From e25a587f034981cf93aaf7ebc4920e76ea9bb3cd Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Tue, 11 Oct 2022 15:26:57 +0200 Subject: [PATCH 33/96] change default argument --- anomalib/models/padim/config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml index a12d1d7a25..058e78cd25 100644 --- a/anomalib/models/padim/config.yaml +++ b/anomalib/models/padim/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: from_test + validation_split_mode: same_as_test tiling: apply: false tile_size: null @@ -58,7 +58,7 @@ logging: log_graph: false # Logs the model graph to respective logger. optimization: - export_mode: null #options: onnx, openvino + export_mode: openvino #options: onnx, openvino # PL Trainer Args. Don't add extra parameter here. trainer: From fb84cd1d032140170ec776cf306df1c13ca6fbe7 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 12 Oct 2022 10:53:10 +0200 Subject: [PATCH 34/96] use setter for samples --- anomalib/data/base.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index d6dcea8b17..2f9076d7de 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -64,7 +64,7 @@ def subsample(self, indices: Sequence[int], inplace=False) -> AnomalibDataset: inplace (bool): When true, the subsampling will be performed on the instance itself. """ dataset = self if inplace else copy.deepcopy(self) - dataset.assign_samples(self.samples.iloc[indices].reset_index(drop=True)) + dataset.samples = self.samples.iloc[indices].reset_index(drop=True) return dataset @property @@ -79,7 +79,8 @@ def samples(self) -> DataFrame: raise RuntimeError("Dataset is not setup yet. Call setup() first.") return self._samples - def assign_samples(self, samples: DataFrame): + @samples.setter + def samples(self, samples: DataFrame): """Overwrite the samples with a new dataframe. Args: @@ -142,7 +143,7 @@ def __add__(self, other_dataset: AnomalibDataset) -> AnomalibDataset: assert isinstance(other_dataset, self.__class__), "Cannot concatenate datasets that are not of same type." assert self.is_setup and other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first." 
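The samples property/setter pair introduced above is what lets subsample() and __add__() use plain assignments such as dataset.samples = ... instead of a dedicated mutator. A stripped-down sketch of the pattern with simplified types (DatasetSketch and its members are stand-ins, not the real class):

    from typing import Optional

    from pandas import DataFrame


    class DatasetSketch:
        """Minimal stand-in for the samples property/setter pair."""

        def __init__(self) -> None:
            self._samples: Optional[DataFrame] = None

        @property
        def samples(self) -> DataFrame:
            # Fail loudly when setup() never populated the frame.
            if self._samples is None:
                raise RuntimeError("Dataset is not setup yet. Call setup() first.")
            return self._samples

        @samples.setter
        def samples(self, samples: DataFrame) -> None:
            self._samples = samples


    dataset = DatasetSketch()
    dataset.samples = DataFrame({"image_path": ["a.png", "b.png"]})
    print(len(dataset.samples))  # 2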
dataset = copy.deepcopy(self) - dataset.assign_samples(pd.concat([self.samples, other_dataset.samples], ignore_index=True)) + dataset.samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True) return dataset def setup(self) -> None: From cfa4f52aca25d32a82b28eaa47e5acc0688a921b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 12 Oct 2022 10:56:26 +0200 Subject: [PATCH 35/96] hint options for val_split_mode --- anomalib/models/cflow/config.yaml | 2 +- anomalib/models/dfkde/config.yaml | 2 +- anomalib/models/dfm/config.yaml | 2 +- anomalib/models/draem/config.yaml | 2 +- anomalib/models/fastflow/config.yaml | 2 +- anomalib/models/ganomaly/config.yaml | 2 +- anomalib/models/padim/config.yaml | 4 ++-- anomalib/models/patchcore/config.yaml | 2 +- anomalib/models/reverse_distillation/config.yaml | 2 +- anomalib/models/stfpm/config.yaml | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index 0a8eec5a65..725589b8e1 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -13,7 +13,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] model: name: cflow diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml index 538a806bc6..5fc7b53861 100644 --- a/anomalib/models/dfkde/config.yaml +++ b/anomalib/models/dfkde/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] model: name: dfkde diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml index 104f7a2a07..34256daa32 100755 --- a/anomalib/models/dfm/config.yaml +++ b/anomalib/models/dfm/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] model: name: dfm diff --git a/anomalib/models/draem/config.yaml b/anomalib/models/draem/config.yaml index 05ab67360c..510e661d15 100644 --- a/anomalib/models/draem/config.yaml +++ b/anomalib/models/draem/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: ./anomalib/models/draem/transform_config.yaml val: ./anomalib/models/draem/transform_config.yaml - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null diff --git a/anomalib/models/fastflow/config.yaml b/anomalib/models/fastflow/config.yaml index cef97d58fb..59d2a12aa5 100644 --- a/anomalib/models/fastflow/config.yaml +++ b/anomalib/models/fastflow/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml index 3e0a0cc677..c8d09276f2 100644 --- a/anomalib/models/ganomaly/config.yaml +++ b/anomalib/models/ganomaly/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: true tile_size: 64 diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml index 
058e78cd25..b857d6d692 100644 --- a/anomalib/models/padim/config.yaml +++ b/anomalib/models/padim/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null @@ -58,7 +58,7 @@ logging: log_graph: false # Logs the model graph to respective logger. optimization: - export_mode: openvino #options: onnx, openvino + export_mode: null #options: onnx, openvino # PL Trainer Args. Don't add extra parameter here. trainer: diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml index 9f98f7604a..c97603ea2e 100644 --- a/anomalib/models/patchcore/config.yaml +++ b/anomalib/models/patchcore/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null diff --git a/anomalib/models/reverse_distillation/config.yaml b/anomalib/models/reverse_distillation/config.yaml index cc474091e4..8dcdd0fe9c 100644 --- a/anomalib/models/reverse_distillation/config.yaml +++ b/anomalib/models/reverse_distillation/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: 64 diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml index 524e58e42b..4a2b251173 100644 --- a/anomalib/models/stfpm/config.yaml +++ b/anomalib/models/stfpm/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null val: null - validation_split_mode: same_as_test + validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null From 624e5229ed4b387547a34c5826001878954fbcea Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 12 Oct 2022 10:58:25 +0200 Subject: [PATCH 36/96] update assert message and docstring --- anomalib/data/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 2f9076d7de..dd3b0ec2b7 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -140,7 +140,7 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: def __add__(self, other_dataset: AnomalibDataset) -> AnomalibDataset: """Concatenate this dataset with another dataset.""" - assert isinstance(other_dataset, self.__class__), "Cannot concatenate datasets that are not of same type." + assert isinstance(other_dataset, self.__class__), "Cannot concatenate datasets that are not of the same type." assert self.is_setup and other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first." 
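The "# options: [same_as_test, from_test]" hints added across the configs in PATCH 35 correspond to members of the ValSplitMode enum seen earlier in this series. A small sketch of how the YAML string resolves to the enum; the member names are taken from the diffs, while this standalone enum is only a mirror of the real one in anomalib.data.base:

    from enum import Enum


    class ValSplitMode(str, Enum):
        """Mirrors the values documented in the config files."""

        SAME_AS_TEST = "same_as_test"
        FROM_TEST = "from_test"


    # A (str, Enum) subclass converts directly from the YAML string value:
    assert ValSplitMode("from_test") is ValSplitMode.FROM_TEST
    assert ValSplitMode("same_as_test") is ValSplitMode.SAME_AS_TEST

Using (str, Enum) keeps the config values plain strings while still giving the datamodules a typed set of options to dispatch on.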
dataset = copy.deepcopy(self) dataset.samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True) @@ -168,7 +168,7 @@ def _setup(self) -> DataFrame: |---|-------------------|-----------|-------------|------------------|-------| | | image_path | label | label_index | mask_path | split | |---|-------------------|-----------|-------------|------------------|-------| - | 0 | path/to/image.png | anomalous | 0 | path/to/mask.png | train | + | 0 | path/to/image.png | anomalous | 1 | path/to/mask.png | train | |---|-------------------|-----------|-------------|------------------|-------| """ raise NotImplementedError From 0bd77f9325f1df8c10780106e2b8a7ca338b9c61 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 12 Oct 2022 15:51:20 +0200 Subject: [PATCH 37/96] revert name change dataset vs datamodule --- anomalib/data/__init__.py | 18 +++++++++--------- anomalib/data/btech.py | 10 ++++++---- anomalib/data/folder.py | 8 ++++---- anomalib/data/mvtec.py | 10 ++++++---- tests/pre_merge/datasets/test_dataset.py | 13 ++++--------- 5 files changed, 29 insertions(+), 30 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 7ba61dc5c2..6e77606aba 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -10,10 +10,10 @@ from anomalib.data.base import AnomalibDataModule -from .btech import BTechDataModule -from .folder import FolderDataModule +from .btech import BTech +from .folder import Folder from .inference import InferenceDataset -from .mvtec import MVTecDataModule +from .mvtec import MVTec logger = logging.getLogger(__name__) @@ -32,7 +32,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: datamodule: AnomalibDataModule if config.dataset.format.lower() == "mvtec": - datamodule = MVTecDataModule( + datamodule = MVTec( root=config.dataset.path, category=config.dataset.category, image_size=(config.dataset.image_size[0], config.dataset.image_size[1]), @@ -45,7 +45,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: val_split_mode=config.dataset.validation_split_mode, ) elif config.dataset.format.lower() == "btech": - datamodule = BTechDataModule( + datamodule = BTech( root=config.dataset.path, category=config.dataset.category, image_size=(config.dataset.image_size[0], config.dataset.image_size[1]), @@ -58,7 +58,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: val_split_mode=config.dataset.validation_split_mode, ) elif config.dataset.format.lower() == "folder": - datamodule = FolderDataModule( + datamodule = Folder( root=config.dataset.path, normal_dir=config.dataset.normal_dir, abnormal_dir=config.dataset.abnormal_dir, @@ -87,8 +87,8 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: __all__ = [ "get_datamodule", - "BTechDataModule", - "FolderDataModule", + "BTech", + "Folder", "InferenceDataset", - "MVTecDataModule", + "MVTec", ] diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index f7d03ac19b..e0a45d5d62 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -106,7 +106,7 @@ def make_btech_dataset(path: Path, split: Optional[str] = None) -> DataFrame: return samples -class BTech(AnomalibDataset): +class BTechDataset(AnomalibDataset): """BTech PyTorch Dataset.""" def __init__( @@ -169,7 +169,7 @@ def _setup(self): @DATAMODULE_REGISTRY -class BTechDataModule(AnomalibDataModule): +class BTech(AnomalibDataModule): """BTechDataModule Lightning Data Module.""" def __init__( @@ 
-235,10 +235,12 @@ def __init__( pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) - self.train_data = BTech( + self.train_data = BTechDataset( task=task, pre_process=pre_process_train, split=Split.TRAIN, root=root, category=category ) - self.test_data = BTech(task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category) + self.test_data = BTechDataset( + task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category + ) def prepare_data(self) -> None: """Download the dataset if not available.""" diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index e0fe0c16dc..a34a96f0d4 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -136,7 +136,7 @@ def make_folder_dataset( return samples -class Folder(AnomalibDataset): +class FolderDataset(AnomalibDataset): """Folder dataset. Args: @@ -201,7 +201,7 @@ def _setup(self): ) -class FolderDataModule(AnomalibDataModule): +class Folder(AnomalibDataModule): """Folder DataModule. Args: @@ -266,7 +266,7 @@ def __init__( pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) - self.train_data = Folder( + self.train_data = FolderDataset( task=task, pre_process=pre_process_train, split=Split.TRAIN, @@ -278,7 +278,7 @@ def __init__( extensions=extensions, ) - self.test_data = Folder( + self.test_data = FolderDataset( task=task, pre_process=pre_process_infer, split=Split.TEST, diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index f5655090e9..21ce00e622 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -120,7 +120,7 @@ def make_mvtec_dataset(root: Union[str, Path], split: Split = Split.FULL) -> Dat return samples -class MVTec(AnomalibDataset): +class MVTecDataset(AnomalibDataset): """MVTec dataset class. 
Args: @@ -149,7 +149,7 @@ def _setup(self): self._samples = make_mvtec_dataset(self.root_category, split=self.split) -class MVTecDataModule(AnomalibDataModule): +class MVTec(AnomalibDataModule): """MVTec Datamodule.""" def __init__( @@ -179,10 +179,12 @@ def __init__( pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) - self.train_data = MVTec( + self.train_data = MVTecDataset( task=task, pre_process=pre_process_train, split=Split.TRAIN, root=root, category=category ) - self.test_data = MVTec(task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category) + self.test_data = MVTecDataset( + task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category + ) def prepare_data(self) -> None: """Download the dataset if not available.""" diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index 789b833ff8..39707bb69a 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -6,12 +6,7 @@ import pytest from anomalib.config import update_input_size_config -from anomalib.data import ( - BTechDataModule, - FolderDataModule, - MVTecDataModule, - get_datamodule, -) +from anomalib.data import BTech, Folder, MVTec, get_datamodule from anomalib.pre_processing.transforms import Denormalize, ToNumpy from tests.helpers.config import get_test_configurable_parameters from tests.helpers.dataset import TestDataset, get_dataset_path @@ -19,7 +14,7 @@ @pytest.fixture(autouse=True) def mvtec_data_module(): - datamodule = MVTecDataModule( + datamodule = MVTec( root=get_dataset_path(dataset="MVTec"), category="leather", image_size=(256, 256), @@ -36,7 +31,7 @@ def mvtec_data_module(): @pytest.fixture(autouse=True) def btech_data_module(): """Create BTech Data Module.""" - datamodule = BTechDataModule( + datamodule = BTech( root=get_dataset_path(dataset="BTech"), category="01", image_size=(256, 256), @@ -54,7 +49,7 @@ def btech_data_module(): def folder_data_module(): """Create Folder Data Module.""" root = get_dataset_path(dataset="bottle") - datamodule = FolderDataModule( + datamodule = Folder( root=root, normal_dir="good", abnormal_dir="broken_large", From 6bed98f39d4c616d34a3d90223d306cd026b9673 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 12 Oct 2022 15:57:16 +0200 Subject: [PATCH 38/96] typing and docstrings --- anomalib/data/folder.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index a34a96f0d4..c935f48550 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -67,20 +67,21 @@ def make_folder_dataset( abnormal_dir: Optional[Union[str, Path]] = None, normal_test_dir: Optional[Union[str, Path]] = None, mask_dir: Optional[Union[str, Path]] = None, - split: Optional[str] = None, + split: Optional[Union[Split, str]] = None, extensions: Optional[Tuple[str, ...]] = None, ): """Make Folder Dataset. Args: normal_dir (Union[str, Path]): Path to the directory containing normal images. - abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images. + abnormal_dir (Optional[Union[str, Path]], optional): Path to the directory containing abnormal images. normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing normal images for the test dataset. Normal test images will be a split of `normal_dir` if `None`. 
Defaults to None. mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing the mask annotations. Defaults to None. - split (Optional[Split], optional): Dataset split (ie., Split.FULL, Split.TRAIN or Split.TEST). Defaults to None. + split (Optional[Union[Split, str]], optional): Dataset split (ie., Split.FULL, Split.TRAIN or Split.TEST). + Defaults to None. extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the directory. @@ -147,8 +148,7 @@ class FolderDataset(AnomalibDataset): root (Union[str, Path]): Root folder of the dataset. normal_dir (Union[str, Path]): Path to the directory containing normal images. - abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images. - split (Optional[str], optional): Dataset split (ie., either train or test). Defaults to None. + abnormal_dir (Optional[Union[str, Path]], optional): Path to the directory containing abnormal images. normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing normal images for the test dataset. Defaults to None. mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing @@ -175,7 +175,7 @@ def __init__( normal_test_dir: Optional[Union[str, Path]] = None, mask_dir: Optional[Union[str, Path]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, - extensions: Optional[Tuple[str]] = None, + extensions: Optional[Tuple[str, ...]] = None, samples: DataFrame = None, ) -> None: super().__init__(task, pre_process, samples=samples) @@ -208,7 +208,7 @@ class Folder(AnomalibDataModule): root (Union[str, Path]): Path to the root folder containing normal and abnormal dirs. normal_dir (Union[str, Path]): Name of the directory containing normal images. Defaults to "normal". - abnormal_dir (str, optional): Name of the directory containing abnormal images. + abnormal_dir (Union[str, Path]): Name of the directory containing abnormal images. Defaults to "abnormal". normal_test_dir (Optional[Union[str, Path]], optional): Path to the directory containing normal images for the test dataset. Defaults to None. From fc34f8eb9763755667a49521bfcce181be225481 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 12 Oct 2022 16:00:24 +0200 Subject: [PATCH 39/96] remove samples argument from dataset constructor --- anomalib/data/base.py | 4 ++-- anomalib/data/btech.py | 3 +-- anomalib/data/folder.py | 3 +-- anomalib/data/mvtec.py | 3 +-- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index dd3b0ec2b7..f79847292d 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -45,11 +45,11 @@ class ValSplitMode(str, Enum): class AnomalibDataset(Dataset, ABC): """Anomalib dataset.""" - def __init__(self, task: str, pre_process: PreProcessor, samples: Optional[DataFrame] = None): + def __init__(self, task: str, pre_process: PreProcessor): super().__init__() self.task = task self.pre_process = pre_process - self._samples = samples + self._samples = None def __len__(self) -> int: """Get length of the dataset.""" diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index e0a45d5d62..0bdab70d6a 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -116,7 +116,6 @@ def __init__( pre_process: PreProcessor, split: Split, task: str = "segmentation", - samples: Optional[DataFrame] = None, ) -> None: """Btech Dataset class. 
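With the samples argument removed from the constructors, a dataset is now created empty and builds its own samples frame in setup(). A usage sketch mirroring the Folder fixtures added later in this series; the dataset root is a hypothetical path:

    from anomalib.data.folder import FolderDataset
    from anomalib.pre_processing import PreProcessor

    dataset = FolderDataset(
        task="classification",
        pre_process=PreProcessor(image_size=(256, 256)),
        split="train",
        root="datasets/bottle",  # hypothetical dataset location
        normal_dir="good",
        abnormal_dir="broken_large",
    )
    dataset.setup()  # populates dataset.samples via make_folder_dataset()
    print(len(dataset))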
@@ -159,7 +158,7 @@ def __init__( >>> dataset[0]["image"].shape, dataset[0]["mask"].shape (torch.Size([3, 256, 256]), torch.Size([256, 256])) """ - super().__init__(task, pre_process, samples) + super().__init__(task, pre_process) self.root_category = Path(root) / Path(category) self.split = split diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index c935f48550..b2fa9ab3d7 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -176,9 +176,8 @@ def __init__( mask_dir: Optional[Union[str, Path]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, extensions: Optional[Tuple[str, ...]] = None, - samples: DataFrame = None, ) -> None: - super().__init__(task, pre_process, samples=samples) + super().__init__(task, pre_process) self.split = split self.normal_dir = Path(root) / Path(normal_dir) diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 21ce00e622..6b67c0bb33 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -138,9 +138,8 @@ def __init__( split: Split, root: str, category: str, - samples: Optional[DataFrame] = None, ) -> None: - super().__init__(task=task, pre_process=pre_process, samples=samples) + super().__init__(task=task, pre_process=pre_process) self.root_category = Path(root) / Path(category) self.split = split From 1482c138f5735cc40ea50d74f01c64ab35d75a3b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 12 Oct 2022 16:40:51 +0200 Subject: [PATCH 40/96] val/test -> eval --- anomalib/config/config.py | 16 +++++++++++++++- anomalib/data/__init__.py | 12 ++++++------ anomalib/data/base.py | 8 ++++---- anomalib/data/btech.py | 10 +++++----- anomalib/data/folder.py | 10 +++++----- anomalib/data/mvtec.py | 10 +++++----- anomalib/models/cflow/config.yaml | 4 ++-- anomalib/models/dfkde/config.yaml | 4 ++-- anomalib/models/dfm/config.yaml | 4 ++-- anomalib/models/draem/config.yaml | 4 ++-- anomalib/models/fastflow/config.yaml | 4 ++-- anomalib/models/ganomaly/config.yaml | 4 ++-- anomalib/models/padim/config.yaml | 4 ++-- anomalib/models/patchcore/config.yaml | 4 ++-- anomalib/models/reverse_distillation/config.yaml | 4 ++-- anomalib/models/stfpm/config.yaml | 4 ++-- tests/pre_merge/datasets/test_dataset.py | 6 +++--- 17 files changed, 63 insertions(+), 49 deletions(-) diff --git a/anomalib/config/config.py b/anomalib/config/config.py index 9b174bc162..38d9e5d531 100644 --- a/anomalib/config/config.py +++ b/anomalib/config/config.py @@ -139,10 +139,24 @@ def get_configurable_parameters( if "create_validation_set" in config.dataset.keys(): warn( "The 'create_validation_set' parameter is deprecated and will be removed in v0.4.0. Please use " - "validation_split_mode instead." + "'validation_split_mode' instead." ) config.dataset.validation_split_mode = "from_test" if config.dataset.create_validation_set else "same_as_test" + if "test_batch_size" in config.dataset.keys(): + warn( + "The 'test_batch_size' parameter is deprecated and will be removed in v0.4.0. Please use " + "'eval_batch_size' instead." + ) + config.dataset.eval_batch_size = config.dataset.test_batch_size + + if "transform_config" in config.dataset.keys() and "val" in config.dataset.transform_config.keys(): + warn( + "The 'transform_config.val' parameter is deprecated and will be removed in v0.4.0. Please use " + "'transform_config.eval' instead." 
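The same deprecation pattern covers the renamed evaluation keys. A sketch of its effect on an old-style config, again assuming a hand-built OmegaConf object:

    from omegaconf import OmegaConf

    config = OmegaConf.create(
        {
            "dataset": {
                "test_batch_size": 16,
                "transform_config": {"train": None, "val": None},
            }
        }
    )

    # Old keys are copied onto their new names, so the rest of the code base
    # only ever reads 'eval_batch_size' and 'transform_config.eval'.
    if "test_batch_size" in config.dataset.keys():
        config.dataset.eval_batch_size = config.dataset.test_batch_size
    if "val" in config.dataset.transform_config.keys():
        config.dataset.transform_config.eval = config.dataset.transform_config.val

    assert config.dataset.eval_batch_size == 16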
+ ) + config.dataset.transform_config.eval = config.dataset.transform_config.val + config = update_input_size_config(config) # Project Configs diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 6e77606aba..98ee0394e2 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -37,11 +37,11 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: category=config.dataset.category, image_size=(config.dataset.image_size[0], config.dataset.image_size[1]), train_batch_size=config.dataset.train_batch_size, - test_batch_size=config.dataset.test_batch_size, + eval_batch_size=config.dataset.eval_batch_size, num_workers=config.dataset.num_workers, task=config.dataset.task, transform_config_train=config.dataset.transform_config.train, - transform_config_val=config.dataset.transform_config.val, + transform_config_eval=config.dataset.transform_config.eval, val_split_mode=config.dataset.validation_split_mode, ) elif config.dataset.format.lower() == "btech": @@ -50,11 +50,11 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: category=config.dataset.category, image_size=(config.dataset.image_size[0], config.dataset.image_size[1]), train_batch_size=config.dataset.train_batch_size, - test_batch_size=config.dataset.test_batch_size, + eval_batch_size=config.dataset.eval_batch_size, num_workers=config.dataset.num_workers, task=config.dataset.task, transform_config_train=config.dataset.transform_config.train, - transform_config_val=config.dataset.transform_config.val, + transform_config_eval=config.dataset.transform_config.eval, val_split_mode=config.dataset.validation_split_mode, ) elif config.dataset.format.lower() == "folder": @@ -69,10 +69,10 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: split_ratio=config.dataset.split_ratio, image_size=(config.dataset.image_size[0], config.dataset.image_size[1]), train_batch_size=config.dataset.train_batch_size, - test_batch_size=config.dataset.test_batch_size, + eval_batch_size=config.dataset.eval_batch_size, num_workers=config.dataset.num_workers, transform_config_train=config.dataset.transform_config.train, - transform_config_val=config.dataset.transform_config.val, + transform_config_eval=config.dataset.transform_config.eval, val_split_mode=config.dataset.validation_split_mode, ) else: diff --git a/anomalib/data/base.py b/anomalib/data/base.py index f79847292d..55ed39b5a9 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -186,12 +186,12 @@ class AnomalibDataModule(LightningDataModule, ABC): def __init__( self, train_batch_size: int, - test_batch_size: int, + eval_batch_size: int, num_workers: int, ): super().__init__() self.train_batch_size = train_batch_size - self.test_batch_size = test_batch_size + self.eval_batch_size = eval_batch_size self.num_workers = num_workers self.train_data: Optional[AnomalibDataset] = None @@ -228,8 +228,8 @@ def train_dataloader(self) -> TRAIN_DATALOADERS: def val_dataloader(self) -> EVAL_DATALOADERS: """Get validation dataloader.""" - return DataLoader(self.val_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + return DataLoader(self.val_data, shuffle=False, batch_size=self.eval_batch_size, num_workers=self.num_workers) def test_dataloader(self) -> EVAL_DATALOADERS: """Get test dataloader.""" - return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + return DataLoader(self.test_data, shuffle=False, 
batch_size=self.eval_batch_size, num_workers=self.num_workers) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 0bdab70d6a..e501275390 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -177,11 +177,11 @@ def __init__( category: str, image_size: Optional[Union[int, Tuple[int, int]]] = None, train_batch_size: int = 32, - test_batch_size: int = 32, + eval_batch_size: int = 32, num_workers: int = 8, task: str = "segmentation", transform_config_train: Optional[Union[str, A.Compose]] = None, - transform_config_val: Optional[Union[str, A.Compose]] = None, + transform_config_eval: Optional[Union[str, A.Compose]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, ) -> None: """Instantiate BTech Lightning Data Module. @@ -225,20 +225,20 @@ def __init__( >>> data["image"].shape, data["mask"].shape (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256])) """ - super().__init__(train_batch_size, test_batch_size, num_workers) + super().__init__(train_batch_size, eval_batch_size, num_workers) self.root = Path(root) self.category = Path(category) self.val_split_mode = val_split_mode pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) - pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) + pre_process_eval = PreProcessor(config=transform_config_eval, image_size=image_size) self.train_data = BTechDataset( task=task, pre_process=pre_process_train, split=Split.TRAIN, root=root, category=category ) self.test_data = BTechDataset( - task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category + task=task, pre_process=pre_process_eval, split=Split.TEST, root=root, category=category ) def prepare_data(self) -> None: diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index b2fa9ab3d7..2f31c5cfb8 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -246,16 +246,16 @@ def __init__( # image_size: Optional[Union[int, Tuple[int, int]]] = None, train_batch_size: int = 32, - test_batch_size: int = 32, + eval_batch_size: int = 32, num_workers: int = 8, task: str = "segmentation", transform_config_train: Optional[Union[str, A.Compose]] = None, - transform_config_val: Optional[Union[str, A.Compose]] = None, + transform_config_eval: Optional[Union[str, A.Compose]] = None, val_split_mode: ValSplitMode = ValSplitMode.FROM_TEST, ): super().__init__( train_batch_size=train_batch_size, - test_batch_size=test_batch_size, + eval_batch_size=eval_batch_size, num_workers=num_workers, ) @@ -263,7 +263,7 @@ def __init__( self.split_ratio = split_ratio pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) - pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) + pre_process_eval = PreProcessor(config=transform_config_eval, image_size=image_size) self.train_data = FolderDataset( task=task, @@ -279,7 +279,7 @@ def __init__( self.test_data = FolderDataset( task=task, - pre_process=pre_process_infer, + pre_process=pre_process_eval, split=Split.TEST, root=root, normal_dir=normal_dir, diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 6b67c0bb33..252c556b80 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -157,16 +157,16 @@ def __init__( category: str, image_size: Optional[Union[int, Tuple[int, int]]] = None, train_batch_size: int = 32, - test_batch_size: int = 32, + eval_batch_size: int = 32, num_workers: int = 8, task: str = "segmentation", transform_config_train: 
Optional[Union[str, A.Compose]] = None, - transform_config_val: Optional[Union[str, A.Compose]] = None, + transform_config_eval: Optional[Union[str, A.Compose]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, ): super().__init__( train_batch_size=train_batch_size, - test_batch_size=test_batch_size, + eval_batch_size=eval_batch_size, num_workers=num_workers, ) @@ -176,13 +176,13 @@ def __init__( # TODO: Get rid of PreProcessor by passing transform directly pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) - pre_process_infer = PreProcessor(config=transform_config_val, image_size=image_size) + pre_process_eval = PreProcessor(config=transform_config_eval, image_size=image_size) self.train_data = MVTecDataset( task=task, pre_process=pre_process_train, split=Split.TRAIN, root=root, category=category ) self.test_data = MVTecDataset( - task=task, pre_process=pre_process_infer, split=Split.TEST, root=root, category=category + task=task, pre_process=pre_process_eval, split=Split.TEST, root=root, category=category ) def prepare_data(self) -> None: diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index 725589b8e1..2a823620ba 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -6,13 +6,13 @@ dataset: task: segmentation image_size: 256 train_batch_size: 16 - test_batch_size: 16 + eval_batch_size: 16 inference_batch_size: 16 fiber_batch_size: 64 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] model: diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml index 5fc7b53861..7e9961f660 100644 --- a/anomalib/models/dfkde/config.yaml +++ b/anomalib/models/dfkde/config.yaml @@ -6,11 +6,11 @@ dataset: task: classification image_size: 256 train_batch_size: 32 - test_batch_size: 32 + eval_batch_size: 32 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] model: diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml index 34256daa32..807f39e5db 100755 --- a/anomalib/models/dfm/config.yaml +++ b/anomalib/models/dfm/config.yaml @@ -6,11 +6,11 @@ dataset: task: classification image_size: 256 train_batch_size: 32 - test_batch_size: 32 + eval_batch_size: 32 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] model: diff --git a/anomalib/models/draem/config.yaml b/anomalib/models/draem/config.yaml index 510e661d15..5f225e4cff 100644 --- a/anomalib/models/draem/config.yaml +++ b/anomalib/models/draem/config.yaml @@ -6,11 +6,11 @@ dataset: task: segmentation image_size: 256 train_batch_size: 8 - test_batch_size: 32 + eval_batch_size: 32 num_workers: 8 transform_config: train: ./anomalib/models/draem/transform_config.yaml - val: ./anomalib/models/draem/transform_config.yaml + eval: ./anomalib/models/draem/transform_config.yaml validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false diff --git a/anomalib/models/fastflow/config.yaml b/anomalib/models/fastflow/config.yaml index 59d2a12aa5..d1ad0d6eae 100644 --- a/anomalib/models/fastflow/config.yaml +++ b/anomalib/models/fastflow/config.yaml @@ -6,11 +6,11 @@ dataset: category: bottle image_size: 256 # options: [256, 256, 448, 384] - for each supported backbone train_batch_size: 32 - 
test_batch_size: 32 + eval_batch_size: 32 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml index c8d09276f2..542f117df1 100644 --- a/anomalib/models/ganomaly/config.yaml +++ b/anomalib/models/ganomaly/config.yaml @@ -6,12 +6,12 @@ dataset: task: classification image_size: 256 train_batch_size: 32 - test_batch_size: 32 + eval_batch_size: 32 inference_batch_size: 32 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: true diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml index b857d6d692..bb08d58ab5 100644 --- a/anomalib/models/padim/config.yaml +++ b/anomalib/models/padim/config.yaml @@ -6,11 +6,11 @@ dataset: task: segmentation image_size: 256 train_batch_size: 32 - test_batch_size: 32 + eval_batch_size: 32 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml index c97603ea2e..38fc14bb38 100644 --- a/anomalib/models/patchcore/config.yaml +++ b/anomalib/models/patchcore/config.yaml @@ -6,11 +6,11 @@ dataset: category: bottle image_size: 224 train_batch_size: 32 - test_batch_size: 1 + eval_batch_size: 1 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false diff --git a/anomalib/models/reverse_distillation/config.yaml b/anomalib/models/reverse_distillation/config.yaml index 8dcdd0fe9c..1a6a697f36 100644 --- a/anomalib/models/reverse_distillation/config.yaml +++ b/anomalib/models/reverse_distillation/config.yaml @@ -6,12 +6,12 @@ dataset: task: segmentation image_size: 256 train_batch_size: 32 - test_batch_size: 32 + eval_batch_size: 32 inference_batch_size: 32 num_workers: 8 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml index 4a2b251173..a25a558f41 100644 --- a/anomalib/models/stfpm/config.yaml +++ b/anomalib/models/stfpm/config.yaml @@ -6,12 +6,12 @@ dataset: task: segmentation image_size: 256 train_batch_size: 32 - test_batch_size: 32 + eval_batch_size: 32 inference_batch_size: 32 num_workers: 36 transform_config: train: null - val: null + eval: null validation_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index 39707bb69a..b625621b35 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -19,7 +19,7 @@ def mvtec_data_module(): category="leather", image_size=(256, 256), train_batch_size=1, - test_batch_size=1, + eval_batch_size=1, num_workers=0, ) datamodule.prepare_data() @@ -36,7 +36,7 @@ def btech_data_module(): category="01", image_size=(256, 256), train_batch_size=1, - test_batch_size=1, + eval_batch_size=1, num_workers=0, ) datamodule.prepare_data() @@ -58,7 +58,7 @@ def folder_data_module(): split_ratio=0.2, image_size=(256, 256), train_batch_size=32, - test_batch_size=32, + 
eval_batch_size=32, num_workers=8, ) datamodule.setup() From e16816333ac4b272187cfdbb18d8124315ea3876 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 13 Oct 2022 13:21:43 +0200 Subject: [PATCH 41/96] remove Split.Full from enum --- anomalib/data/base.py | 1 - anomalib/data/btech.py | 8 ++++---- anomalib/data/folder.py | 9 ++++----- anomalib/data/mvtec.py | 10 +++++----- tests/pre_merge/datasets/test_dataset.py | 3 +++ 5 files changed, 16 insertions(+), 15 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 55ed39b5a9..b5ce95716d 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -29,7 +29,6 @@ class Split(str, Enum): """Split of a subset.""" - FULL = "full" TRAIN = "train" VAL = "val" TEST = "test" diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index e501275390..934b5d57eb 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -31,7 +31,7 @@ logger = logging.getLogger(__name__) -def make_btech_dataset(path: Path, split: Optional[str] = None) -> DataFrame: +def make_btech_dataset(path: Path, split: Optional[Union[Split, str]] = None) -> DataFrame: """Create BTech samples by parsing the BTech data file structure. The files are expected to follow the structure: @@ -40,7 +40,7 @@ def make_btech_dataset(path: Path, split: Optional[str] = None) -> DataFrame: Args: path (Path): Path to dataset - split (str, optional): Dataset split (ie., either train or test). Defaults to None. + split (Optional[Union[Split, str]], optional): Dataset split (ie., either train or test). Defaults to None. split_ratio (float, optional): Ratio to split normal training images and add to the test set in case test set doesn't contain any normal images. Defaults to 0.1. @@ -99,7 +99,7 @@ def make_btech_dataset(path: Path, split: Optional[str] = None) -> DataFrame: samples.label_index = samples.label_index.astype(int) # Get the data frame for the split. - if split != Split.FULL: + if split: samples = samples[samples.split == split] samples = samples.reset_index(drop=True) @@ -114,7 +114,7 @@ def __init__( root: Union[Path, str], category: str, pre_process: PreProcessor, - split: Split, + split: Optional[Union[Split, str]] = None, task: str = "segmentation", ) -> None: """Btech Dataset class. diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 2f31c5cfb8..db42d6ef6b 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -130,7 +130,7 @@ def make_folder_dataset( samples.loc[(samples.label == "abnormal") | (samples.label == "normal_test"), "split"] = "test" # Get the data frame for the split. - if split != Split.FULL: + if split: samples = samples[samples.split == split] samples = samples.reset_index(drop=True) @@ -143,8 +143,8 @@ class FolderDataset(AnomalibDataset): Args: task (str): Task type. (classification or segmentation). pre_process (PreProcessor): Image Pre-processor to apply transform. - split (Split): Fixed subset split that follows from folder structure on file system. Choose from - [Split.FULL, Split.TRAIN, Split.TEST] + split (Optional[Union[Split, str]]): Fixed subset split that follows from folder structure on file system. + Choose from [Split.FULL, Split.TRAIN, Split.TEST] root (Union[str, Path]): Root folder of the dataset. normal_dir (Union[str, Path]): Path to the directory containing normal images. 
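One inconsistency worth flagging in this commit: the FolderDataset docstring above still lists Split.FULL among the choices, although the same commit deletes FULL from the Split enum (which now holds only TRAIN, VAL and TEST); requesting the full dataset is now expressed by passing split=None. The following sketch illustrates the new optional-split filtering on a toy samples frame (filter_split is an illustrative helper, pandas assumed):

    from typing import Optional

    from pandas import DataFrame

    samples = DataFrame(
        {"image_path": ["a.png", "b.png", "c.png"], "split": ["train", "train", "test"]}
    )


    def filter_split(samples: DataFrame, split: Optional[str] = None) -> DataFrame:
        """Return the requested subset, or the full frame when split is falsy."""
        if split:
            samples = samples[samples.split == split].reset_index(drop=True)
        return samples


    assert len(filter_split(samples)) == 3           # no split -> full dataset
    assert len(filter_split(samples, "train")) == 2  # subset only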
@@ -167,13 +167,12 @@ def __init__( self, task: str, pre_process: PreProcessor, - split: Split, - # root: Union[str, Path], normal_dir: Union[str, Path], abnormal_dir: Optional[Union[str, Path]] = None, normal_test_dir: Optional[Union[str, Path]] = None, mask_dir: Optional[Union[str, Path]] = None, + split: Optional[Union[Split, str]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, extensions: Optional[Tuple[str, ...]] = None, ) -> None: diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 252c556b80..704c8f6626 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -40,7 +40,7 @@ logger = logging.getLogger(__name__) -def make_mvtec_dataset(root: Union[str, Path], split: Split = Split.FULL) -> DataFrame: +def make_mvtec_dataset(root: Union[str, Path], split: Optional[Union[Split, str]] = None) -> DataFrame: """Create MVTec AD samples by parsing the MVTec AD data file structure. The files are expected to follow the structure: @@ -56,7 +56,7 @@ def make_mvtec_dataset(root: Union[str, Path], split: Split = Split.FULL) -> Dat Args: path (Path): Path to dataset - split (str, optional): Dataset split (ie., either train or test). Defaults to None. + split (Optional[Union[Split, str]], optional): Dataset split (ie., either train or test). Defaults to None. split_ratio (float, optional): Ratio to split normal training images and add to the test set in case test set doesn't contain any normal images. Defaults to 0.1. @@ -114,7 +114,7 @@ def make_mvtec_dataset(root: Union[str, Path], split: Split = Split.FULL) -> Dat samples.loc[(samples.label != "good"), "label_index"] = 1 samples.label_index = samples.label_index.astype(int) - if split != Split.FULL: + if split: samples = samples[samples.split == split].reset_index(drop=True) return samples @@ -126,7 +126,7 @@ class MVTecDataset(AnomalibDataset): Args: task (str): Task type, either 'classification' or 'segmentation' pre_process (PreProcessor): Pre-processor object - split (Split): Split of the dataset, usually Split.TRAIN or Split. TEST + split (Optional[Union[Split, str]]): Split of the dataset, usually Split.TRAIN or Split.TEST root (str): Path to the root of the dataset category (str): Sub-category of the dataset, e.g. 
'bottle' """ @@ -135,9 +135,9 @@ def __init__( self, task: str, pre_process: PreProcessor, - split: Split, root: str, category: str, + split: Optional[Union[Split, str]] = None, ) -> None: super().__init__(task=task, pre_process=pre_process) diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index b625621b35..a893c01478 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -21,6 +21,7 @@ def mvtec_data_module(): train_batch_size=1, eval_batch_size=1, num_workers=0, + val_split_mode="from_test", ) datamodule.prepare_data() datamodule.setup() @@ -38,6 +39,7 @@ def btech_data_module(): train_batch_size=1, eval_batch_size=1, num_workers=0, + val_split_mode="from_test", ) datamodule.prepare_data() datamodule.setup() @@ -60,6 +62,7 @@ def folder_data_module(): train_batch_size=32, eval_batch_size=32, num_workers=8, + val_split_mode="from_test", ) datamodule.setup() From 5071dcf96abeed9f44bebdf95d50ed358fcaca8a Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 13 Oct 2022 14:35:37 +0200 Subject: [PATCH 42/96] sort samples when setting --- anomalib/data/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index b5ce95716d..53d0c571c4 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -85,7 +85,7 @@ def samples(self, samples: DataFrame): Args: samples (DataFrame): DataFrame with new samples. """ - self._samples = samples + self._samples = samples.sort_values(by="image_path", ignore_index=True) @property def has_normal(self) -> bool: From e175d7d0309975ca13e374874341c7916aa9d23a Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 13 Oct 2022 14:36:50 +0200 Subject: [PATCH 43/96] update warn message --- anomalib/data/utils/split.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 266583d907..e4c356af92 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -72,9 +72,9 @@ def random_split( for i in range(len(label_dataset) - sum(subset_lengths)): subset_idx = i % sum(subset_lengths) subset_lengths[subset_idx] += 1 - for index, length in enumerate(subset_lengths): - if length == 0: - warnings.warn(f"Length of subset at index {index} is 0.") + if 0 in subset_lengths: + warnings.warn("Zero subset length encountered during splitting. This means one of your subsets might be" + " empty or devoid of either normal or anomalous images.") # perform random subsampling indices = torch.randperm(len(label_dataset)) subsets.append( From 03773b0211cd68e0f27d2f1f7a20e77feed232ff Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 13 Oct 2022 14:37:49 +0200 Subject: [PATCH 44/96] formatting --- anomalib/data/utils/split.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index e4c356af92..824a27f594 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -73,8 +73,10 @@ def random_split( subset_idx = i % sum(subset_lengths) subset_lengths[subset_idx] += 1 if 0 in subset_lengths: - warnings.warn("Zero subset length encountered during splitting. This means one of your subsets might be" - " empty or devoid of either normal or anomalous images.") + warnings.warn( + "Zero subset length encountered during splitting. This means one of your subsets might be" + " empty or devoid of either normal or anomalous images." 
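The rewritten warning fires when the requested fractions leave a subset without any images. How the fractions become integer lengths can be read off the surrounding random_split() code; below is a simplified, self-contained version of that arithmetic (resolve_subset_lengths is an illustrative helper, and the round-robin indexing is slightly simplified relative to the original):

    import math
    from typing import List


    def resolve_subset_lengths(total: int, fractions: List[float]) -> List[int]:
        """Simplified version of the length computation in random_split()."""
        lengths = [math.floor(total * fraction) for fraction in fractions]
        # Hand the images lost to flooring back out, one per subset, round-robin.
        for i in range(total - sum(lengths)):
            lengths[i % len(lengths)] += 1
        return lengths


    print(resolve_subset_lengths(10, [0.4, 0.35, 0.25]))  # [5, 3, 2]
    print(resolve_subset_lengths(2, [0.5, 0.3, 0.2]))     # [2, 0, 0] -> triggers the warning

The warning matters most for label-aware splits of small test sets, where a class with few images can easily end up with a zero-length subset.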
+ ) # perform random subsampling indices = torch.randperm(len(label_dataset)) subsets.append( From 3910c32766441a6e59b677bf5c37ad3db05b2b35 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 13 Oct 2022 14:40:40 +0200 Subject: [PATCH 45/96] use setter when creating samples in dataset classes --- anomalib/data/btech.py | 2 +- anomalib/data/folder.py | 2 +- anomalib/data/mvtec.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 934b5d57eb..b7f913750e 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -164,7 +164,7 @@ def __init__( self.split = split def _setup(self): - self._samples = make_btech_dataset(path=self.root_category, split=self.split) + self.samples = make_btech_dataset(path=self.root_category, split=self.split) @DATAMODULE_REGISTRY diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index db42d6ef6b..b67d7abe4d 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -189,7 +189,7 @@ def __init__( def _setup(self): """Assign samples.""" - self._samples = make_folder_dataset( + self.samples = make_folder_dataset( normal_dir=self.normal_dir, abnormal_dir=self.abnormal_dir, normal_test_dir=self.normal_test_dir, diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 704c8f6626..445ba48440 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -145,7 +145,7 @@ def __init__( self.split = split def _setup(self): - self._samples = make_mvtec_dataset(self.root_category, split=self.split) + self.samples = make_mvtec_dataset(self.root_category, split=self.split) class MVTec(AnomalibDataModule): From 894ef123e7bc885498d5d5e4955057d79c57097a Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 13 Oct 2022 14:41:47 +0200 Subject: [PATCH 46/96] add tests for new dataset class --- tests/pre_merge/datasets/test_datamodule.py | 244 ++++++++++++++++++ tests/pre_merge/datasets/test_dataset.py | 272 ++++---------------- 2 files changed, 289 insertions(+), 227 deletions(-) create mode 100644 tests/pre_merge/datasets/test_datamodule.py diff --git a/tests/pre_merge/datasets/test_datamodule.py b/tests/pre_merge/datasets/test_datamodule.py new file mode 100644 index 0000000000..a893c01478 --- /dev/null +++ b/tests/pre_merge/datasets/test_datamodule.py @@ -0,0 +1,244 @@ +"""Test Dataset.""" + +import os + +import numpy as np +import pytest + +from anomalib.config import update_input_size_config +from anomalib.data import BTech, Folder, MVTec, get_datamodule +from anomalib.pre_processing.transforms import Denormalize, ToNumpy +from tests.helpers.config import get_test_configurable_parameters +from tests.helpers.dataset import TestDataset, get_dataset_path + + +@pytest.fixture(autouse=True) +def mvtec_data_module(): + datamodule = MVTec( + root=get_dataset_path(dataset="MVTec"), + category="leather", + image_size=(256, 256), + train_batch_size=1, + eval_batch_size=1, + num_workers=0, + val_split_mode="from_test", + ) + datamodule.prepare_data() + datamodule.setup() + + return datamodule + + +@pytest.fixture(autouse=True) +def btech_data_module(): + """Create BTech Data Module.""" + datamodule = BTech( + root=get_dataset_path(dataset="BTech"), + category="01", + image_size=(256, 256), + train_batch_size=1, + eval_batch_size=1, + num_workers=0, + val_split_mode="from_test", + ) + datamodule.prepare_data() + datamodule.setup() + + return datamodule + + +@pytest.fixture(autouse=True) +def folder_data_module(): + """Create Folder Data Module.""" + root = 
get_dataset_path(dataset="bottle") + datamodule = Folder( + root=root, + normal_dir="good", + abnormal_dir="broken_large", + mask_dir=os.path.join(root, "ground_truth/broken_large"), + task="segmentation", + split_ratio=0.2, + image_size=(256, 256), + train_batch_size=32, + eval_batch_size=32, + num_workers=8, + val_split_mode="from_test", + ) + datamodule.setup() + + return datamodule + + +@pytest.fixture(autouse=True) +def data_sample(mvtec_data_module): + _, data = next(enumerate(mvtec_data_module.train_dataloader())) + return data + + +class TestMVTecDataModule: + """Test MVTec AD Data Module.""" + + def test_batch_size(self, mvtec_data_module): + """test_mvtec_datamodule [summary]""" + _, train_data_sample = next(enumerate(mvtec_data_module.train_dataloader())) + _, val_data_sample = next(enumerate(mvtec_data_module.val_dataloader())) + assert train_data_sample["image"].shape[0] == 1 + assert val_data_sample["image"].shape[0] == 1 + + def test_val_and_test_dataloaders_has_mask_and_gt(self, mvtec_data_module): + """Test Validation and Test dataloaders should return filenames, image, mask and label.""" + _, val_data = next(enumerate(mvtec_data_module.val_dataloader())) + _, test_data = next(enumerate(mvtec_data_module.test_dataloader())) + + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) + + def test_non_overlapping_splits(self, mvtec_data_module): + """This test ensures that the train and test splits generated are non-overlapping.""" + assert ( + len( + set(mvtec_data_module.test_data.samples["image_path"].values).intersection( + set(mvtec_data_module.train_data.samples["image_path"].values) + ) + ) + == 0 + ), "Found train and test split contamination" + + +class TestBTechDataModule: + """Test BTech Data Module.""" + + def test_batch_size(self, btech_data_module): + """Test batch size.""" + _, train_data_sample = next(enumerate(btech_data_module.train_dataloader())) + _, val_data_sample = next(enumerate(btech_data_module.val_dataloader())) + assert train_data_sample["image"].shape[0] == 1 + assert val_data_sample["image"].shape[0] == 1 + + def test_val_and_test_dataloaders_has_mask_and_gt(self, btech_data_module): + """Test Validation and Test dataloaders should return filenames, image, mask and label.""" + _, val_data = next(enumerate(btech_data_module.val_dataloader())) + _, test_data = next(enumerate(btech_data_module.test_dataloader())) + + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) + + def test_non_overlapping_splits(self, btech_data_module): + """This test ensures that the train and test splits generated are non-overlapping.""" + assert ( + len( + set(btech_data_module.test_data.samples["image_path"].values).intersection( + set(btech_data_module.train_data.samples["image_path"].values) + ) + ) + == 0 + ), "Found train and test split contamination" + + +class TestFolderDataModule: + """Test Folder Data Module.""" + + def test_batch_size(self, folder_data_module): + """Test batch size.""" + _, train_data_sample = next(enumerate(folder_data_module.train_dataloader())) + _, val_data_sample = next(enumerate(folder_data_module.val_dataloader())) + assert train_data_sample["image"].shape[0] == 16 + assert val_data_sample["image"].shape[0] == 12 + + def 
test_val_and_test_dataloaders_has_mask_and_gt(self, folder_data_module): + """Test Validation and Test dataloaders should return filenames, image, mask and label.""" + _, val_data = next(enumerate(folder_data_module.val_dataloader())) + _, test_data = next(enumerate(folder_data_module.test_dataloader())) + + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) + + def test_non_overlapping_splits(self, folder_data_module): + """This test ensures that the train and test splits generated are non-overlapping.""" + assert ( + len( + set(folder_data_module.test_data.samples["image_path"].values).intersection( + set(folder_data_module.train_data.samples["image_path"].values) + ) + ) + == 0 + ), "Found train and test split contamination" + + +class TestDenormalize: + """Test Denormalize Util.""" + + def test_denormalize_image_pixel_values(self, data_sample): + """Test Denormalize denormalizes tensor into [0, 256] range.""" + denormalized_sample = Denormalize().__call__(data_sample["image"].squeeze()) + assert denormalized_sample.min() >= 0 and denormalized_sample.max() <= 256 + + def test_denormalize_return_numpy(self, data_sample): + """Denormalize should return a numpy array.""" + denormalized_sample = Denormalize()(data_sample["image"].squeeze()) + assert isinstance(denormalized_sample, np.ndarray) + + def test_denormalize_channel_order(self, data_sample): + """Denormalize should return a numpy array of order [HxWxC]""" + denormalized_sample = Denormalize().__call__(data_sample["image"].squeeze()) + assert len(denormalized_sample.shape) == 3 and denormalized_sample.shape[-1] == 3 + + def test_representation(self): + """Test Denormalize representation should return string + Denormalize()""" + assert str(Denormalize()) == "Denormalize()" + + +class TestToNumpy: + """Test ToNumpy whether it properly converts tensor into numpy array.""" + + def test_to_numpy_image_pixel_values(self, data_sample): + """Test ToNumpy should return an array whose pixels in the range of [0, + 256]""" + array = ToNumpy()(data_sample["image"]) + assert array.min() >= 0 and array.max() <= 256 + + def test_to_numpy_converts_tensor_to_np_array(self, data_sample): + """ToNumpy returns a numpy array.""" + array = ToNumpy()(data_sample["image"]) + assert isinstance(array, np.ndarray) + + def test_to_numpy_channel_order(self, data_sample): + """ToNumpy() should return a numpy array of order [HxWxC]""" + array = ToNumpy()(data_sample["image"]) + assert len(array.shape) == 3 and array.shape[-1] == 3 + + def test_one_channel_images(self, data_sample): + """One channel tensor should be converted to HxW np array.""" + data = data_sample["image"][:, 0, :, :].unsqueeze(0) + array = ToNumpy()(data) + assert len(array.shape) == 2 + + def test_representation(self): + """Test ToNumpy() representation should return string `ToNumpy()`""" + assert str(ToNumpy()) == "ToNumpy()" + + +class TestConfigToDataModule: + """Tests that check if the dataset parameters in the config achieve the desired effect.""" + + @pytest.mark.parametrize( + ["input_size", "effective_image_size"], + [ + (512, (512, 512)), + ((245, 276), (245, 276)), + ((263, 134), (263, 134)), + ((267, 267), (267, 267)), + ], + ) + @TestDataset(num_train=20, num_test=10) + def test_image_size(self, input_size, effective_image_size, category="shapes", path=None): + """Test if the image size parameter works as expected.""" + 
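The non-overlapping-splits assertions repeated in the three test classes above boil down to one set intersection on image paths. The same check as a small reusable helper; a sketch only, where the datamodule argument is any of the fixtures defined in this file:

    def assert_no_split_contamination(datamodule) -> None:
        """Fail if any image ends up in both the train and the test subset."""
        train_paths = set(datamodule.train_data.samples["image_path"].values)
        test_paths = set(datamodule.test_data.samples["image_path"].values)
        assert not train_paths & test_paths, "Found train and test split contamination"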
configurable_parameters = get_test_configurable_parameters(dataset_path=path, model_name="stfpm") + configurable_parameters.dataset.category = category + configurable_parameters.dataset.image_size = input_size + configurable_parameters = update_input_size_config(configurable_parameters) + + data_module = get_datamodule(configurable_parameters) + data_module.setup() + assert iter(data_module.train_dataloader()).__next__()["image"].shape[-2:] == effective_image_size diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index a893c01478..bc06286ec8 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -1,244 +1,62 @@ -"""Test Dataset.""" +"""Test the AnomalibDataset class.""" -import os +import random -import numpy as np import pytest -from anomalib.config import update_input_size_config -from anomalib.data import BTech, Folder, MVTec, get_datamodule -from anomalib.pre_processing.transforms import Denormalize, ToNumpy -from tests.helpers.config import get_test_configurable_parameters -from tests.helpers.dataset import TestDataset, get_dataset_path +from anomalib.data.folder import FolderDataset +from anomalib.data.utils.split import concatenate_datasets, random_split +from anomalib.pre_processing import PreProcessor +from tests.helpers.dataset import get_dataset_path @pytest.fixture(autouse=True) -def mvtec_data_module(): - datamodule = MVTec( - root=get_dataset_path(dataset="MVTec"), - category="leather", - image_size=(256, 256), - train_batch_size=1, - eval_batch_size=1, - num_workers=0, - val_split_mode="from_test", +def folder_dataset(): + """Create Folder Dataset.""" + root = get_dataset_path(dataset="bottle") + pre_process = PreProcessor(image_size=(256, 256)) + dataset = FolderDataset( + task="classification", + pre_process=pre_process, + root=root, + normal_dir="good", + abnormal_dir="broken_large", ) - datamodule.prepare_data() - datamodule.setup() + dataset.setup() - return datamodule + return dataset -@pytest.fixture(autouse=True) -def btech_data_module(): - """Create BTech Data Module.""" - datamodule = BTech( - root=get_dataset_path(dataset="BTech"), - category="01", - image_size=(256, 256), - train_batch_size=1, - eval_batch_size=1, - num_workers=0, - val_split_mode="from_test", - ) - datamodule.prepare_data() - datamodule.setup() +class TestAnomalibDataset: + def test_subsample(self, folder_dataset): + """Test the subsample functionality.""" - return datamodule + sample_size = int(0.5 * len(folder_dataset)) + indices = random.sample(range(len(folder_dataset)), sample_size) + subset = folder_dataset.subsample(indices) + # check if the dataset has been subsampled to correct size + assert len(subset) == sample_size + # check if index has been reset + assert subset.samples.index.start == 0 + assert subset.samples.index.stop == sample_size -@pytest.fixture(autouse=True) -def folder_data_module(): - """Create Folder Data Module.""" - root = get_dataset_path(dataset="bottle") - datamodule = Folder( - root=root, - normal_dir="good", - abnormal_dir="broken_large", - mask_dir=os.path.join(root, "ground_truth/broken_large"), - task="segmentation", - split_ratio=0.2, - image_size=(256, 256), - train_batch_size=32, - eval_batch_size=32, - num_workers=8, - val_split_mode="from_test", - ) - datamodule.setup() + def test_random_split(self, folder_dataset): + """Test the random subset splitting.""" - return datamodule + # split the dataset + subsets = random_split(folder_dataset, [0.4, 0.35, 0.25], 
label_aware=True) + # check if subset splitting has been performed correctly + assert len(subsets) == 3 -@pytest.fixture(autouse=True) -def data_sample(mvtec_data_module): - _, data = next(enumerate(mvtec_data_module.train_dataloader())) - return data - - -class TestMVTecDataModule: - """Test MVTec AD Data Module.""" - - def test_batch_size(self, mvtec_data_module): - """test_mvtec_datamodule [summary]""" - _, train_data_sample = next(enumerate(mvtec_data_module.train_dataloader())) - _, val_data_sample = next(enumerate(mvtec_data_module.val_dataloader())) - assert train_data_sample["image"].shape[0] == 1 - assert val_data_sample["image"].shape[0] == 1 - - def test_val_and_test_dataloaders_has_mask_and_gt(self, mvtec_data_module): - """Test Validation and Test dataloaders should return filenames, image, mask and label.""" - _, val_data = next(enumerate(mvtec_data_module.val_dataloader())) - _, test_data = next(enumerate(mvtec_data_module.test_dataloader())) - - assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) - assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) - - def test_non_overlapping_splits(self, mvtec_data_module): - """This test ensures that the train and test splits generated are non-overlapping.""" - assert ( - len( - set(mvtec_data_module.test_data.samples["image_path"].values).intersection( - set(mvtec_data_module.train_data.samples["image_path"].values) - ) - ) - == 0 - ), "Found train and test split contamination" - - -class TestBTechDataModule: - """Test BTech Data Module.""" - - def test_batch_size(self, btech_data_module): - """Test batch size.""" - _, train_data_sample = next(enumerate(btech_data_module.train_dataloader())) - _, val_data_sample = next(enumerate(btech_data_module.val_dataloader())) - assert train_data_sample["image"].shape[0] == 1 - assert val_data_sample["image"].shape[0] == 1 - - def test_val_and_test_dataloaders_has_mask_and_gt(self, btech_data_module): - """Test Validation and Test dataloaders should return filenames, image, mask and label.""" - _, val_data = next(enumerate(btech_data_module.val_dataloader())) - _, test_data = next(enumerate(btech_data_module.test_dataloader())) - - assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) - assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) - - def test_non_overlapping_splits(self, btech_data_module): - """This test ensures that the train and test splits generated are non-overlapping.""" - assert ( - len( - set(btech_data_module.test_data.samples["image_path"].values).intersection( - set(btech_data_module.train_data.samples["image_path"].values) - ) - ) - == 0 - ), "Found train and test split contamination" - - -class TestFolderDataModule: - """Test Folder Data Module.""" - - def test_batch_size(self, folder_data_module): - """Test batch size.""" - _, train_data_sample = next(enumerate(folder_data_module.train_dataloader())) - _, val_data_sample = next(enumerate(folder_data_module.val_dataloader())) - assert train_data_sample["image"].shape[0] == 16 - assert val_data_sample["image"].shape[0] == 12 - - def test_val_and_test_dataloaders_has_mask_and_gt(self, folder_data_module): - """Test Validation and Test dataloaders should return filenames, image, mask and label.""" - _, val_data = next(enumerate(folder_data_module.val_dataloader())) - _, test_data = next(enumerate(folder_data_module.test_dataloader())) - - assert 
sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) - assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) - - def test_non_overlapping_splits(self, folder_data_module): - """This test ensures that the train and test splits generated are non-overlapping.""" - assert ( - len( - set(folder_data_module.test_data.samples["image_path"].values).intersection( - set(folder_data_module.train_data.samples["image_path"].values) - ) - ) - == 0 - ), "Found train and test split contamination" - - -class TestDenormalize: - """Test Denormalize Util.""" - - def test_denormalize_image_pixel_values(self, data_sample): - """Test Denormalize denormalizes tensor into [0, 256] range.""" - denormalized_sample = Denormalize().__call__(data_sample["image"].squeeze()) - assert denormalized_sample.min() >= 0 and denormalized_sample.max() <= 256 - - def test_denormalize_return_numpy(self, data_sample): - """Denormalize should return a numpy array.""" - denormalized_sample = Denormalize()(data_sample["image"].squeeze()) - assert isinstance(denormalized_sample, np.ndarray) - - def test_denormalize_channel_order(self, data_sample): - """Denormalize should return a numpy array of order [HxWxC]""" - denormalized_sample = Denormalize().__call__(data_sample["image"].squeeze()) - assert len(denormalized_sample.shape) == 3 and denormalized_sample.shape[-1] == 3 - - def test_representation(self): - """Test Denormalize representation should return string - Denormalize()""" - assert str(Denormalize()) == "Denormalize()" - - -class TestToNumpy: - """Test ToNumpy whether it properly converts tensor into numpy array.""" - - def test_to_numpy_image_pixel_values(self, data_sample): - """Test ToNumpy should return an array whose pixels in the range of [0, - 256]""" - array = ToNumpy()(data_sample["image"]) - assert array.min() >= 0 and array.max() <= 256 - - def test_to_numpy_converts_tensor_to_np_array(self, data_sample): - """ToNumpy returns a numpy array.""" - array = ToNumpy()(data_sample["image"]) - assert isinstance(array, np.ndarray) - - def test_to_numpy_channel_order(self, data_sample): - """ToNumpy() should return a numpy array of order [HxWxC]""" - array = ToNumpy()(data_sample["image"]) - assert len(array.shape) == 3 and array.shape[-1] == 3 - - def test_one_channel_images(self, data_sample): - """One channel tensor should be converted to HxW np array.""" - data = data_sample["image"][:, 0, :, :].unsqueeze(0) - array = ToNumpy()(data) - assert len(array.shape) == 2 - - def test_representation(self): - """Test ToNumpy() representation should return string `ToNumpy()`""" - assert str(ToNumpy()) == "ToNumpy()" - - -class TestConfigToDataModule: - """Tests that check if the dataset parameters in the config achieve the desired effect.""" - - @pytest.mark.parametrize( - ["input_size", "effective_image_size"], - [ - (512, (512, 512)), - ((245, 276), (245, 276)), - ((263, 134), (263, 134)), - ((267, 267), (267, 267)), - ], - ) - @TestDataset(num_train=20, num_test=10) - def test_image_size(self, input_size, effective_image_size, category="shapes", path=None): - """Test if the image size parameter works as expected.""" - configurable_parameters = get_test_configurable_parameters(dataset_path=path, model_name="stfpm") - configurable_parameters.dataset.category = category - configurable_parameters.dataset.image_size = input_size - configurable_parameters = update_input_size_config(configurable_parameters) - - data_module = 
get_datamodule(configurable_parameters) - data_module.setup() - assert iter(data_module.train_dataloader()).__next__()["image"].shape[-2:] == effective_image_size + # reconstruct the original dataset by concatenating the subsets + reconstructed_dataset = concatenate_datasets(subsets) + + # check if reconstructed dataset is equal to original dataset + assert folder_dataset.samples.equals(reconstructed_dataset.samples) + + # check if warning raised when one of the subsets is empty + split_ratios = [1 - (1 / (len(folder_dataset) + 1)), 1 / (len(folder_dataset) + 1)] + with pytest.warns(): + subsets = random_split(folder_dataset, split_ratios) From 44009e27e193d5a20132d92addeacfd35a42bb02 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Thu, 13 Oct 2022 15:17:13 +0200 Subject: [PATCH 47/96] add test case for label aware random split --- tests/pre_merge/datasets/test_dataset.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index bc06286ec8..81daad062c 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -2,6 +2,7 @@ import random +import pandas as pd import pytest from anomalib.data.folder import FolderDataset @@ -44,15 +45,12 @@ def test_subsample(self, folder_dataset): def test_random_split(self, folder_dataset): """Test the random subset splitting.""" - # split the dataset - subsets = random_split(folder_dataset, [0.4, 0.35, 0.25], label_aware=True) - + # subset splitting + subsets = random_split(folder_dataset, [0.4, 0.35, 0.25]) # check if subset splitting has been performed correctly assert len(subsets) == 3 - # reconstruct the original dataset by concatenating the subsets reconstructed_dataset = concatenate_datasets(subsets) - # check if reconstructed dataset is equal to original dataset assert folder_dataset.samples.equals(reconstructed_dataset.samples) @@ -60,3 +58,17 @@ def test_random_split(self, folder_dataset): split_ratios = [1 - (1 / (len(folder_dataset) + 1)), 1 / (len(folder_dataset) + 1)] with pytest.warns(): subsets = random_split(folder_dataset, split_ratios) + + # label-aware subset splitting + samples = folder_dataset.samples + normal_samples = samples[samples["label_index"] == 0] + anomalous_samples = samples[samples["label_index"] == 1] + samples = pd.concat([normal_samples, anomalous_samples[0:5]]) + folder_dataset.samples = samples + + subsets = random_split(folder_dataset, [0.4, 0.4, 0.2], label_aware=True) + + # 5 anomalous images in total, so the first two subsets should each have 2, and the last subset 1 + assert len(subsets[0].samples[subsets[0].samples["label_index"] == 1]) == 2 + assert len(subsets[1].samples[subsets[1].samples["label_index"] == 1]) == 2 + assert len(subsets[2].samples[subsets[2].samples["label_index"] == 1]) == 1 From 012ed479b2fd79889a39442ab176782f27248d6b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 11:00:55 +0200 Subject: [PATCH 48/96] update parameter name in inferencers --- anomalib/deploy/inferencers/openvino_inferencer.py | 2 +- anomalib/deploy/inferencers/torch_inferencer.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/anomalib/deploy/inferencers/openvino_inferencer.py b/anomalib/deploy/inferencers/openvino_inferencer.py index 804abb52d0..e68ef0f294 100644 --- a/anomalib/deploy/inferencers/openvino_inferencer.py +++ b/anomalib/deploy/inferencers/openvino_inferencer.py @@ -94,7 +94,7 @@ def pre_process(self, image: 
np.ndarray) -> np.ndarray: np.ndarray: pre-processed image. """ transform_config = ( - self.config.dataset.transform_config.val if "transform_config" in self.config.dataset.keys() else None + self.config.dataset.transform_config.eval if "transform_config" in self.config.dataset.keys() else None ) image_size = tuple(self.config.dataset.image_size) pre_processor = PreProcessor(transform_config, image_size) diff --git a/anomalib/deploy/inferencers/torch_inferencer.py b/anomalib/deploy/inferencers/torch_inferencer.py index 795149e6c6..ff2d8813a6 100644 --- a/anomalib/deploy/inferencers/torch_inferencer.py +++ b/anomalib/deploy/inferencers/torch_inferencer.py @@ -96,7 +96,7 @@ def pre_process(self, image: np.ndarray) -> Tensor: Tensor: pre-processed image. """ transform_config = ( - self.config.dataset.transform_config.val if "transform_config" in self.config.dataset.keys() else None + self.config.dataset.transform_config.eval if "transform_config" in self.config.dataset.keys() else None ) image_size = tuple(self.config.dataset.image_size) pre_processor = PreProcessor(transform_config, image_size) From 62b176e53c4c72afbbec349c347b5b756c85a4ec Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 11:44:19 +0200 Subject: [PATCH 49/96] move _setup implementation to base class --- anomalib/data/base.py | 20 ++++++++++++++++---- anomalib/data/btech.py | 15 --------------- anomalib/data/folder.py | 9 +-------- anomalib/data/mvtec.py | 15 --------------- anomalib/data/utils/__init__.py | 3 +++ anomalib/data/utils/split.py | 7 +++++-- 6 files changed, 25 insertions(+), 44 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 53d0c571c4..10b7c2ecf4 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -20,7 +20,7 @@ from torch import Tensor from torch.utils.data import DataLoader, Dataset -from anomalib.data.utils import read_image +from anomalib.data.utils import random_split, read_image from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) @@ -209,10 +209,22 @@ def setup(self, stage: Optional[str] = None): self._setup(stage) assert self.is_setup - @abstractmethod def _setup(self, _stage: Optional[str] = None) -> None: - """To be implemented in conrete subclass.""" - raise NotImplementedError + """Set up the datasets and perform dynamic subset splitting. + + May be overridden in subclass for custom splitting behaviour. 
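+
+        The default implementation splits the validation subset off the test set, as configured by
+        ``self.val_split_mode``. A rough sketch of how a subclass can combine custom splitting with
+        this default behaviour (``MyDataModule`` is purely illustrative; ``Folder`` below follows
+        the same pattern):
+
+        >>> class MyDataModule(AnomalibDataModule):
+        ...     def _setup(self, _stage=None):
+        ...         # perform any custom subset handling first ...
+        ...         super()._setup()  # ... then delegate the validation split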
+ """ + assert self.train_data is not None + assert self.test_data is not None + + self.train_data.setup() + self.test_data.setup() + if self.val_split_mode == ValSplitMode.FROM_TEST: + self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) + elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: + self.val_data = self.test_data + else: + raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") @property def is_setup(self): diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index b7f913750e..76be5ca087 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -25,7 +25,6 @@ from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode from anomalib.data.utils import DownloadProgressBar, hash_check -from anomalib.data.utils.split import random_split from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) @@ -281,17 +280,3 @@ def prepare_data(self) -> None: logger.info("Cleaning the tar file") zip_filename.unlink() - - def _setup(self, _stage: Optional[str] = None): - """Set up the datasets and perform dynamic subset splitting.""" - assert self.train_data is not None - assert self.test_data is not None - - self.train_data.setup() - self.test_data.setup() - if self.val_split_mode == ValSplitMode.FROM_TEST: - self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) - elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: - self.val_data = self.test_data - else: - raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index b67d7abe4d..2a8f63b79b 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -301,11 +301,4 @@ def _setup(self, _stage: Optional[str] = None): self.train_data, normal_test_data = random_split(self.train_data, self.split_ratio) self.test_data += normal_test_data - # split validation set from test set - if self.val_split_mode == ValSplitMode.FROM_TEST: - assert self.test_data is not None - self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) - elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: - self.val_data = self.test_data - else: - raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") + super()._setup() diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 445ba48440..e52d361c91 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -34,7 +34,6 @@ from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode from anomalib.data.utils import DownloadProgressBar, hash_check -from anomalib.data.utils.split import random_split from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) @@ -211,17 +210,3 @@ def prepare_data(self) -> None: logger.info("Cleaning the tar file") (zip_filename).unlink() - - def _setup(self, _stage: Optional[str] = None) -> None: - """Set up the datasets and perform dynamic subset splitting.""" - assert self.train_data is not None - assert self.test_data is not None - - self.train_data.setup() - self.test_data.setup() - if self.val_split_mode == ValSplitMode.FROM_TEST: - self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) - elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: - self.val_data = self.test_data - else: - raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") diff --git 
a/anomalib/data/utils/__init__.py b/anomalib/data/utils/__init__.py index 5059b51c06..52b21b8fcf 100644 --- a/anomalib/data/utils/__init__.py +++ b/anomalib/data/utils/__init__.py @@ -11,6 +11,7 @@ get_image_height_and_width, read_image, ) +from .split import concatenate_datasets, random_split __all__ = [ "generate_output_image_filename", @@ -20,4 +21,6 @@ "random_2d_perlin", "read_image", "DownloadProgressBar", + "random_split", + "concatenate_datasets", ] diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 824a27f594..5ab5f6074e 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -11,13 +11,16 @@ # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from __future__ import annotations + import math import warnings -from typing import List, Sequence, Union +from typing import TYPE_CHECKING, List, Sequence, Union import torch -from anomalib.data.base import AnomalibDataset +if TYPE_CHECKING: + from anomalib.data.base import AnomalibDataset def concatenate_datasets(datasets: Sequence[AnomalibDataset]) -> AnomalibDataset: From 7e957b690d751c1ca302de9d6b2fd9525cbd8810 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 11:44:36 +0200 Subject: [PATCH 50/96] address codacy issues --- tests/pre_merge/datasets/test_datamodule.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/pre_merge/datasets/test_datamodule.py b/tests/pre_merge/datasets/test_datamodule.py index a893c01478..7d9000fc17 100644 --- a/tests/pre_merge/datasets/test_datamodule.py +++ b/tests/pre_merge/datasets/test_datamodule.py @@ -180,7 +180,7 @@ def test_denormalize_return_numpy(self, data_sample): def test_denormalize_channel_order(self, data_sample): """Denormalize should return a numpy array of order [HxWxC]""" - denormalized_sample = Denormalize().__call__(data_sample["image"].squeeze()) + denormalized_sample = Denormalize()(data_sample["image"].squeeze()) assert len(denormalized_sample.shape) == 3 and denormalized_sample.shape[-1] == 3 def test_representation(self): @@ -241,4 +241,4 @@ def test_image_size(self, input_size, effective_image_size, category="shapes", p data_module = get_datamodule(configurable_parameters) data_module.setup() - assert iter(data_module.train_dataloader()).__next__()["image"].shape[-2:] == effective_image_size + assert next(iter(data_module.train_dataloader()))["image"].shape[-2:] == effective_image_size From 25f503d26fe6af5796c1e34843d4ae9719913786 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 11:53:17 +0200 Subject: [PATCH 51/96] fix pylint issues --- anomalib/data/base.py | 8 ++------ anomalib/data/btech.py | 3 +-- anomalib/data/folder.py | 2 +- anomalib/data/mvtec.py | 2 +- 4 files changed, 5 insertions(+), 10 deletions(-) diff --git a/anomalib/data/base.py b/anomalib/data/base.py index 10b7c2ecf4..0ac402e047 100644 --- a/anomalib/data/base.py +++ b/anomalib/data/base.py @@ -182,16 +182,12 @@ class AnomalibDataModule(LightningDataModule, ABC): num_workers (int): Number of workers used by the train, val and test dataloaders. 
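+        val_split_mode (ValSplitMode): Determines how the validation subset is obtained; ``from_test``
+            randomly splits it off the test set, while ``same_as_test`` reuses the test set.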
""" - def __init__( - self, - train_batch_size: int, - eval_batch_size: int, - num_workers: int, - ): + def __init__(self, train_batch_size: int, eval_batch_size: int, num_workers: int, val_split_mode: ValSplitMode): super().__init__() self.train_batch_size = train_batch_size self.eval_batch_size = eval_batch_size self.num_workers = num_workers + self.val_split_mode = val_split_mode self.train_data: Optional[AnomalibDataset] = None self.val_data: Optional[AnomalibDataset] = None diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 76be5ca087..f6a4879245 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -224,11 +224,10 @@ def __init__( >>> data["image"].shape, data["mask"].shape (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256])) """ - super().__init__(train_batch_size, eval_batch_size, num_workers) + super().__init__(train_batch_size, eval_batch_size, num_workers, val_split_mode) self.root = Path(root) self.category = Path(category) - self.val_split_mode = val_split_mode pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) pre_process_eval = PreProcessor(config=transform_config_eval, image_size=image_size) diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 2a8f63b79b..f5b3c415a3 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -256,9 +256,9 @@ def __init__( train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, num_workers=num_workers, + val_split_mode=val_split_mode, ) - self.val_split_mode = val_split_mode self.split_ratio = split_ratio pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index e52d361c91..33353f5eed 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -167,11 +167,11 @@ def __init__( train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, num_workers=num_workers, + val_split_mode=val_split_mode, ) self.root = Path(root) self.category = Path(category) - self.val_split_mode = val_split_mode # TODO: Get rid of PreProcessor by passing transform directly pre_process_train = PreProcessor(config=transform_config_train, image_size=image_size) From 12459281440f2b996912d86194d4dbbac9943627 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 12:02:21 +0200 Subject: [PATCH 52/96] codacy --- tests/pre_merge/datasets/test_datamodule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pre_merge/datasets/test_datamodule.py b/tests/pre_merge/datasets/test_datamodule.py index 7d9000fc17..6b41137f69 100644 --- a/tests/pre_merge/datasets/test_datamodule.py +++ b/tests/pre_merge/datasets/test_datamodule.py @@ -170,7 +170,7 @@ class TestDenormalize: def test_denormalize_image_pixel_values(self, data_sample): """Test Denormalize denormalizes tensor into [0, 256] range.""" - denormalized_sample = Denormalize().__call__(data_sample["image"].squeeze()) + denormalized_sample = Denormalize()(data_sample["image"].squeeze()) assert denormalized_sample.min() >= 0 and denormalized_sample.max() <= 256 def test_denormalize_return_numpy(self, data_sample): From 0459a0d5f250311cb083443d08d6fc0e78bd8cbd Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 12:30:49 +0200 Subject: [PATCH 53/96] update example dataset config in docs --- docs/source/how_to_guides/train_custom_data.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/how_to_guides/train_custom_data.rst 
b/docs/source/how_to_guides/train_custom_data.rst index 5974ccffed..9a70fdc88e 100644 --- a/docs/source/how_to_guides/train_custom_data.rst +++ b/docs/source/how_to_guides/train_custom_data.rst @@ -82,12 +82,12 @@ Let's choose `Padim algorithm `_, copy the seed: 0 image_size: 256 train_batch_size: 32 - test_batch_size: 32 + eval_batch_size: 32 num_workers: 8 transform_config: train: null - val: null - create_validation_set: true + eval: null + validation_split_mode: from_test # determines how the validation set is created, options [same_as_test, from_test] tiling: apply: false tile_size: null From 30dc45aafb8495bfbdcf74e59fc98f09a13ff303 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 12:47:32 +0200 Subject: [PATCH 54/96] fix test --- tests/pre_merge/utils/metrics/test_adaptive_threshold.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/pre_merge/utils/metrics/test_adaptive_threshold.py b/tests/pre_merge/utils/metrics/test_adaptive_threshold.py index 1a7eef5b61..607a544c2e 100644 --- a/tests/pre_merge/utils/metrics/test_adaptive_threshold.py +++ b/tests/pre_merge/utils/metrics/test_adaptive_threshold.py @@ -39,6 +39,7 @@ def test_non_adaptive_threshold(): """ config = get_test_configurable_parameters(config_path="anomalib/models/padim/config.yaml") + config.dataset.num_workers = 0 config.model.normalization_method = "none" config.metrics.threshold.adaptive = False config.trainer.fast_dev_run = True From 85c475a70657144d51faa432a2f5e56b600a4004 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 15:32:39 +0200 Subject: [PATCH 55/96] move base classes to separate files (avoid circular import) --- anomalib/data/__init__.py | 5 +- anomalib/data/base.py | 242 -------------------------------- anomalib/data/btech.py | 4 +- anomalib/data/folder.py | 4 +- anomalib/data/mvtec.py | 4 +- anomalib/data/utils/__init__.py | 4 +- anomalib/data/utils/split.py | 18 ++- 7 files changed, 29 insertions(+), 252 deletions(-) delete mode 100644 anomalib/data/base.py diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 98ee0394e2..0cc71d9d0b 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -8,8 +8,7 @@ from omegaconf import DictConfig, ListConfig -from anomalib.data.base import AnomalibDataModule - +from .base import AnomalibDataModule, AnomalibDataset from .btech import BTech from .folder import Folder from .inference import InferenceDataset @@ -86,6 +85,8 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: __all__ = [ + "AnomalibDataset", + "AnomalibDataModule", "get_datamodule", "BTech", "Folder", diff --git a/anomalib/data/base.py b/anomalib/data/base.py deleted file mode 100644 index 0ac402e047..0000000000 --- a/anomalib/data/base.py +++ /dev/null @@ -1,242 +0,0 @@ -"""Anomalib dataset and datamodule base classes.""" - -# Copyright (C) 2022 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from __future__ import annotations - -import copy -import logging -from abc import ABC, abstractmethod -from enum import Enum -from typing import Dict, Optional, Sequence, Union - -import cv2 -import numpy as np -import pandas as pd -from pandas import DataFrame -from pytorch_lightning import LightningDataModule -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from torch import Tensor -from torch.utils.data import DataLoader, Dataset - -from anomalib.data.utils import random_split, read_image -from anomalib.pre_processing import PreProcessor - -logger = 
logging.getLogger(__name__) - - -class Split(str, Enum): - """Split of a subset.""" - - TRAIN = "train" - VAL = "val" - TEST = "test" - - -class ValSplitMode(str, Enum): - """Splitting mode used to obtain validation subset.""" - - SAME_AS_TEST = "same_as_test" - FROM_TEST = "from_test" - - -class AnomalibDataset(Dataset, ABC): - """Anomalib dataset.""" - - def __init__(self, task: str, pre_process: PreProcessor): - super().__init__() - self.task = task - self.pre_process = pre_process - self._samples = None - - def __len__(self) -> int: - """Get length of the dataset.""" - assert isinstance(self._samples, DataFrame) - return len(self._samples) - - def subsample(self, indices: Sequence[int], inplace=False) -> AnomalibDataset: - """Subsamples the dataset at the provided indices. - - Args: - indices (Sequence[int]): Indices at which the dataset is to be subsampled. - inplace (bool): When true, the subsampling will be performed on the instance itself. - """ - dataset = self if inplace else copy.deepcopy(self) - dataset.samples = self.samples.iloc[indices].reset_index(drop=True) - return dataset - - @property - def is_setup(self) -> bool: - """Checks if setup() been called.""" - return isinstance(self._samples, DataFrame) - - @property - def samples(self) -> DataFrame: - """Get the samples dataframe.""" - if not self.is_setup: - raise RuntimeError("Dataset is not setup yet. Call setup() first.") - return self._samples - - @samples.setter - def samples(self, samples: DataFrame): - """Overwrite the samples with a new dataframe. - - Args: - samples (DataFrame): DataFrame with new samples. - """ - self._samples = samples.sort_values(by="image_path", ignore_index=True) - - @property - def has_normal(self) -> bool: - """Check if the dataset contains any normal samples.""" - return 0 in list(self.samples.label_index) - - @property - def has_anomalous(self) -> bool: - """Check if the dataset contains any anomalous samples.""" - return 1 in list(self.samples.label_index) - - def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: - """Get dataset item for the index ``index``. - - Args: - index (int): Index to get the item. - - Returns: - Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training. - Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box. - """ - assert isinstance(self._samples, DataFrame) - - image_path = self._samples.iloc[index].image_path - image = read_image(image_path) - label_index = self._samples.iloc[index].label_index - - item = dict(image_path=image_path, label=label_index) - - if self.task == "classification": - pre_processed = self.pre_process(image=image) - elif self.task == "segmentation": - mask_path = self._samples.iloc[index].mask_path - - # Only Anomalous (1) images have masks in anomaly datasets - # Therefore, create empty mask for Normal (0) images. - if label_index == 0: - mask = np.zeros(shape=image.shape[:2]) - else: - mask = cv2.imread(mask_path, flags=0) / 255.0 - - pre_processed = self.pre_process(image=image, mask=mask) - - item["mask_path"] = mask_path - item["mask"] = pre_processed["mask"] - else: - raise ValueError(f"Unknown task type: {self.task}") - item["image"] = pre_processed["image"] - - return item - - def __add__(self, other_dataset: AnomalibDataset) -> AnomalibDataset: - """Concatenate this dataset with another dataset.""" - assert isinstance(other_dataset, self.__class__), "Cannot concatenate datasets that are not of the same type." 
- assert self.is_setup and other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first." - dataset = copy.deepcopy(self) - dataset.samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True) - return dataset - - def setup(self) -> None: - """Load data/metadata into memory.""" - if not self.is_setup: - self._setup() - assert self.is_setup, "setup() should set self._samples" - - @abstractmethod - def _setup(self) -> DataFrame: - """Set up the data module. - - This method should return a dataframe that contains the information needed by the dataloader to load each of - the dataset items into memory. - The dataframe must at least contain the following columns: - split: the subset to which the dataset item is assigned. - image_path: path to file system location where the image is stored. - label_index: index of the anomaly label, typically 0 for "normal" and 1 for "anomalous". - mask_path (if task == "segmentation"): path to the ground truth masks (for the anomalous images only). - - Example: - |---|-------------------|-----------|-------------|------------------|-------| - | | image_path | label | label_index | mask_path | split | - |---|-------------------|-----------|-------------|------------------|-------| - | 0 | path/to/image.png | anomalous | 1 | path/to/mask.png | train | - |---|-------------------|-----------|-------------|------------------|-------| - """ - raise NotImplementedError - - -class AnomalibDataModule(LightningDataModule, ABC): - """Base Anomalib data module. - - Args: - train_batch_size (int): Batch size used by the train dataloader. - test_batch_size (int): Batch size used by the val and test dataloaders. - num_workers (int): Number of workers used by the train, val and test dataloaders. - """ - - def __init__(self, train_batch_size: int, eval_batch_size: int, num_workers: int, val_split_mode: ValSplitMode): - super().__init__() - self.train_batch_size = train_batch_size - self.eval_batch_size = eval_batch_size - self.num_workers = num_workers - self.val_split_mode = val_split_mode - - self.train_data: Optional[AnomalibDataset] = None - self.val_data: Optional[AnomalibDataset] = None - self.test_data: Optional[AnomalibDataset] = None - - self._samples: Optional[DataFrame] = None - - def setup(self, stage: Optional[str] = None): - """Setup train, validation and test data. - - Args: - stage: Optional[str]: Train/Val/Test stages. (Default value = None) - """ - if not self.is_setup: - self._setup(stage) - assert self.is_setup - - def _setup(self, _stage: Optional[str] = None) -> None: - """Set up the datasets and perform dynamic subset splitting. - - May be overridden in subclass for custom splitting behaviour. 
- """ - assert self.train_data is not None - assert self.test_data is not None - - self.train_data.setup() - self.test_data.setup() - if self.val_split_mode == ValSplitMode.FROM_TEST: - self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) - elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: - self.val_data = self.test_data - else: - raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") - - @property - def is_setup(self): - """Checks if setup() has been called.""" - if self.train_data is None or self.val_data is None or self.test_data is None: - return False - return self.train_data.is_setup and self.val_data.is_setup and self.test_data.is_setup - - def train_dataloader(self) -> TRAIN_DATALOADERS: - """Get train dataloader.""" - return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) - - def val_dataloader(self) -> EVAL_DATALOADERS: - """Get validation dataloader.""" - return DataLoader(self.val_data, shuffle=False, batch_size=self.eval_batch_size, num_workers=self.num_workers) - - def test_dataloader(self) -> EVAL_DATALOADERS: - """Get test dataloader.""" - return DataLoader(self.test_data, shuffle=False, batch_size=self.eval_batch_size, num_workers=self.num_workers) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index f6a4879245..97bdeabef5 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -23,8 +23,8 @@ from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY from tqdm import tqdm -from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode -from anomalib.data.utils import DownloadProgressBar, hash_check +from anomalib.data import AnomalibDataModule, AnomalibDataset +from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index f5b3c415a3..b0907788fb 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -13,8 +13,8 @@ from pandas import DataFrame from torchvision.datasets.folder import IMG_EXTENSIONS -from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode -from anomalib.data.utils.split import random_split +from anomalib.data import AnomalibDataModule, AnomalibDataset +from anomalib.data.utils import Split, ValSplitMode, random_split from anomalib.pre_processing.pre_process import PreProcessor diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 33353f5eed..7c8bbee9bd 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -32,8 +32,8 @@ import albumentations as A from pandas import DataFrame -from anomalib.data.base import AnomalibDataModule, AnomalibDataset, Split, ValSplitMode -from anomalib.data.utils import DownloadProgressBar, hash_check +from anomalib.data import AnomalibDataModule, AnomalibDataset +from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) diff --git a/anomalib/data/utils/__init__.py b/anomalib/data/utils/__init__.py index 52b21b8fcf..53eb3f8ef2 100644 --- a/anomalib/data/utils/__init__.py +++ b/anomalib/data/utils/__init__.py @@ -11,7 +11,7 @@ get_image_height_and_width, read_image, ) -from .split import concatenate_datasets, random_split +from .split import Split, ValSplitMode, concatenate_datasets, random_split __all__ 
= [ "generate_output_image_filename", @@ -23,4 +23,6 @@ "DownloadProgressBar", "random_split", "concatenate_datasets", + "Split", + "ValSplitMode", ] diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 5ab5f6074e..bec80e3bfd 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -15,12 +15,28 @@ import math import warnings +from enum import Enum from typing import TYPE_CHECKING, List, Sequence, Union import torch if TYPE_CHECKING: - from anomalib.data.base import AnomalibDataset + from anomalib.data import AnomalibDataset + + +class Split(str, Enum): + """Split of a subset.""" + + TRAIN = "train" + VAL = "val" + TEST = "test" + + +class ValSplitMode(str, Enum): + """Splitting mode used to obtain validation subset.""" + + SAME_AS_TEST = "same_as_test" + FROM_TEST = "from_test" def concatenate_datasets(datasets: Sequence[AnomalibDataset]) -> AnomalibDataset: From 0552c1ad8b96529e1db640a1f0a0000ab10e723a Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 16:30:20 +0200 Subject: [PATCH 56/96] add synthetic dataset class --- anomalib/data/synthetic.py | 104 +++++++++++++++++++++++++++++++++++ anomalib/data/utils/split.py | 1 + 2 files changed, 105 insertions(+) create mode 100644 anomalib/data/synthetic.py diff --git a/anomalib/data/synthetic.py b/anomalib/data/synthetic.py new file mode 100644 index 0000000000..53ab7e3a05 --- /dev/null +++ b/anomalib/data/synthetic.py @@ -0,0 +1,104 @@ +"""Dataset that generates synthetic anomalies. + +This dataset can be used when there is a lack of real anomalous data. +""" + +import os +import tempfile +from pathlib import Path +from typing import Union + +import albumentations as A +import cv2 +import pandas as pd +from albumentations.pytorch import ToTensorV2 +from pandas import DataFrame + +from anomalib.data.base.dataset import AnomalibDataset +from anomalib.data.utils import read_image +from anomalib.models.draem.utils import Augmenter +from anomalib.pre_processing import PreProcessor + + +def make_synthetic_dataset(normal_samples: DataFrame, root: Union[Path, str]) -> DataFrame: + """Convert a set of normal samples into a mixed set of normal and synthetic anomalous samples. + + The synthetic images will be saved to the file system in the specified root directory under /images. + For the synthetic anomalous images, the masks will be saved under /ground_truth. + + Args: + normal_samples (DataFrame): DataFrame describing a set of normal images. + root (Union[Path, str]): Root directory to which the image files will be written. 
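+
+    Returns:
+        DataFrame: A samples dataframe describing the generated images and, for the anomalous
+            samples, the corresponding ground truth masks.
+
+    Example:
+        A rough sketch of the intended usage, assuming ``normal_samples`` holds only normal images:
+
+        >>> synthetic_samples = make_synthetic_dataset(normal_samples, "./tmp/synthetic")
+        >>> set(synthetic_samples.label)  # a mix of "normal" and "abnormal" samples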
+ """ + im_dir = Path(root) / "images" + mask_dir = Path(root) / "ground_truth" + os.makedirs(im_dir) + os.makedirs(mask_dir) + + # make fakes + augmenter = Augmenter("./datasets/dtd") + + transform = A.Compose([A.ToFloat(), ToTensorV2()]) + + new_samples_list = [] + for index, sample in normal_samples.iterrows(): + # load image + im = read_image(sample.image_path) + # to tensor + im = transform(image=im)["image"].unsqueeze(0) + # apply rand aug + aug_im, mask = augmenter.augment_batch(im) + # + is_anomalous = mask.max() == 1 + # write image + aug_im = (aug_im.squeeze().permute((1, 2, 0)) * 255).numpy() + aug_im = cv2.cvtColor(aug_im, cv2.COLOR_RGB2BGR) + im_path = im_dir / (str(index).zfill(3) + ".png") + cv2.imwrite(str(im_path), aug_im) + # write mask + if is_anomalous: + mask = (mask.squeeze() * 255).numpy() + mask_path = mask_dir / (str(index).zfill(3) + ".png") + cv2.imwrite(str(mask_path), mask) + # update path in samples + new_samples_list.append( + dict( + image_path=str(im_path), + label="abnormal" if is_anomalous else "normal", + label_index=1 if is_anomalous else 0, + mask_path=str(mask_path) if is_anomalous else "", + split=None, + ) + ) + + return pd.DataFrame(new_samples_list) + + +class SyntheticValidationSet(AnomalibDataset): + """Dataset which reads synthetically generated anomalous images from a temporary folder. + + Args: + task (str): Task type, either "classification" or "segmentation". + pre_process (PreProcessor): Preprocessor object used to transform the input images. + normal_samples (DataFrame): Normal samples to which the anomalous augmentations will be applied. + """ + + def __init__(self, task: str, pre_process: PreProcessor, normal_samples: DataFrame): + super().__init__(task, pre_process) + + self.normal_samples = normal_samples + self.tempfolder = tempfile.TemporaryDirectory(dir="./datasets") + self.setup() + + @classmethod + def from_dataset(cls, dataset): + """Create a synthetic anomaly dataset from an existing dataset of normal images.""" + return cls(task=dataset.task, pre_process=dataset.pre_process, normal_samples=dataset.samples) + + def _setup(self) -> None: + """Create samples dataframe.""" + self.samples = make_synthetic_dataset(self.normal_samples, self.tempfolder.name) + + def __del__(self): + """Make sure the temporary directory is cleaned up when the dataset object is deleted.""" + self.tempfolder.cleanup() diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index bec80e3bfd..1c93d86095 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -37,6 +37,7 @@ class ValSplitMode(str, Enum): SAME_AS_TEST = "same_as_test" FROM_TEST = "from_test" + SYNTHETIC = "synthetic" def concatenate_datasets(datasets: Sequence[AnomalibDataset]) -> AnomalibDataset: From bf4f5372375ff4e9ceadd13dc9db8fffdc504294 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 17:07:09 +0200 Subject: [PATCH 57/96] move augmenter to data directory --- anomalib/data/synthetic.py | 5 +- anomalib/data/utils/__init__.py | 2 + .../{models/draem => data}/utils/augmenter.py | 29 ++++++--- anomalib/models/draem/lightning_model.py | 2 +- anomalib/models/draem/perlin_new.py | 59 +++++++++++++++++++ anomalib/models/draem/utils/__init__.py | 8 --- 6 files changed, 85 insertions(+), 20 deletions(-) rename anomalib/{models/draem => data}/utils/augmenter.py (86%) create mode 100644 anomalib/models/draem/perlin_new.py delete mode 100644 anomalib/models/draem/utils/__init__.py diff --git a/anomalib/data/synthetic.py 
b/anomalib/data/synthetic.py index 53ab7e3a05..6b9e7b7bf8 100644 --- a/anomalib/data/synthetic.py +++ b/anomalib/data/synthetic.py @@ -15,8 +15,7 @@ from pandas import DataFrame from anomalib.data.base.dataset import AnomalibDataset -from anomalib.data.utils import read_image -from anomalib.models.draem.utils import Augmenter +from anomalib.data.utils import Augmenter, read_image from anomalib.pre_processing import PreProcessor @@ -36,7 +35,7 @@ def make_synthetic_dataset(normal_samples: DataFrame, root: Union[Path, str]) -> os.makedirs(mask_dir) # make fakes - augmenter = Augmenter("./datasets/dtd") + augmenter = Augmenter("./datasets/dtd", beta=(0.01, 0.2)) transform = A.Compose([A.ToFloat(), ToTensorV2()]) diff --git a/anomalib/data/utils/__init__.py b/anomalib/data/utils/__init__.py index 53eb3f8ef2..ecb869568f 100644 --- a/anomalib/data/utils/__init__.py +++ b/anomalib/data/utils/__init__.py @@ -3,6 +3,7 @@ # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +from .augmenter import Augmenter from .download import DownloadProgressBar, hash_check from .generators import random_2d_perlin from .image import ( @@ -25,4 +26,5 @@ "concatenate_datasets", "Split", "ValSplitMode", + "Augmenter", ] diff --git a/anomalib/models/draem/utils/augmenter.py b/anomalib/data/utils/augmenter.py similarity index 86% rename from anomalib/models/draem/utils/augmenter.py rename to anomalib/data/utils/augmenter.py index 6433c4338c..e9d8e3f4ce 100644 --- a/anomalib/models/draem/utils/augmenter.py +++ b/anomalib/data/utils/augmenter.py @@ -13,7 +13,7 @@ import glob import math import random -from typing import Optional, Tuple +from typing import Optional, Tuple, Union import cv2 import imgaug.augmenters as iaa @@ -22,7 +22,7 @@ from torch import Tensor from torchvision.datasets.folder import IMG_EXTENSIONS -from anomalib.data.utils import random_2d_perlin +from anomalib.data.utils.generators.perlin import random_2d_perlin def nextpow2(value): @@ -38,7 +38,15 @@ class Augmenter: noise. If not specified, random noise will be used instead. 
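+        p_anomalous (float): Probability with which the anomalous perturbation is applied to a given
+            image. Images that are left normal receive an all-zero perturbation and mask.
+        beta (Union[float, Tuple[float, float]]): Blend factor used when overlaying the perturbation
+            on the input image, or a (min, max) range from which the factor is sampled per image.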
""" - def __init__(self, anomaly_source_path: Optional[str] = None): + def __init__( + self, + anomaly_source_path: Optional[str] = None, + p_anomalous: float = 0.5, + beta: Union[float, Tuple[float, float]] = (0.2, 1.0), + ): + + self.p_anomalous = p_anomalous + self.beta = beta self.anomaly_source_paths = [] if anomaly_source_path is not None: @@ -132,7 +140,7 @@ def augment_batch(self, batch: Tensor) -> Tuple[Tensor, Tensor]: perturbations_list = [] masks_list = [] for _ in range(batch_size): - if random.random() > 0.5: # include 50% normal samples + if random.random() < self.p_anomalous: # include 50% normal samples perturbations_list.append(torch.zeros((channels, height, width))) masks_list.append(torch.zeros((1, height, width))) else: @@ -147,9 +155,14 @@ def augment_batch(self, batch: Tensor) -> Tuple[Tensor, Tensor]: masks = torch.stack(masks_list).to(batch.device) # Apply perturbations batch wise - beta = torch.rand(batch_size) * 0.8 - beta = beta.view(batch_size, 1, 1, 1).expand_as(batch).to(batch.device) - - augmented_batch = batch * (1 - masks) + (1 - beta) * perturbations + beta * batch * (masks) + if isinstance(self.beta, float): + beta = self.beta + elif isinstance(self.beta, tuple): + beta = torch.rand(batch_size) * (self.beta[1] - self.beta[0]) + self.beta[0] + beta = beta.view(batch_size, 1, 1, 1).expand_as(batch).to(batch.device) + else: + raise ValueError("Beta must be either float or tuple of floats") + + augmented_batch = batch * (1 - masks) + (beta) * perturbations + (1 - beta) * batch * (masks) return augmented_batch, masks diff --git a/anomalib/models/draem/lightning_model.py b/anomalib/models/draem/lightning_model.py index 18e3d2d41b..e54c8e9bd4 100644 --- a/anomalib/models/draem/lightning_model.py +++ b/anomalib/models/draem/lightning_model.py @@ -14,10 +14,10 @@ from pytorch_lightning.utilities.cli import MODEL_REGISTRY from torch import Tensor, nn +from anomalib.data.utils import Augmenter from anomalib.models.components import AnomalyModule from anomalib.models.draem.loss import DraemLoss from anomalib.models.draem.torch_model import DraemModel -from anomalib.models.draem.utils import Augmenter __all__ = ["Draem", "DraemLightning"] diff --git a/anomalib/models/draem/perlin_new.py b/anomalib/models/draem/perlin_new.py new file mode 100644 index 0000000000..d20e2e6957 --- /dev/null +++ b/anomalib/models/draem/perlin_new.py @@ -0,0 +1,59 @@ +import torch as th +from matplotlib import pyplot as plt + + +def interp(t): + # return 3 * t**2 - 2 * t ** 3 + return 6 * t**5 - 15 * t**4 + 10 * t**3 + + +def fade(t): + return 6 * t**5 - 15 * t**4 + 10 * t**3 + + +def perlin(width, height, scale=10, device=None): + gx, gy = th.randn(2, width + 1, height + 1, 1, 1, device=device) + xs = th.linspace(0, 1, scale + 1)[:-1, None].to(device) + ys = th.linspace(0, 1, scale + 1)[None, :-1].to(device) + + wx = 1 - interp(xs) + wy = 1 - interp(ys) + + dots = 0 + dots += wx * wy * (gx[:-1, :-1] * xs + gy[:-1, :-1] * ys) + dots += (1 - wx) * wy * (-gx[1:, :-1] * (1 - xs) + gy[1:, :-1] * ys) + dots += wx * (1 - wy) * (gx[:-1, 1:] * xs - gy[:-1, 1:] * (1 - ys)) + dots += (1 - wx) * (1 - wy) * (-gx[1:, 1:] * (1 - xs) - gy[1:, 1:] * (1 - ys)) + + return dots.permute(0, 2, 1, 3).contiguous().view(width * scale, height * scale) + + +# def my_perlin(width, height, scale=10): + + +def perlin_ms(octaves=[1, 1, 1, 1], width=2, height=2, device=None): + scale = 2 ** len(octaves) + out = 0 + for oct in octaves: + p = perlin(width, height, scale, device) + out += p * oct + scale //= 2 + width *= 2 
+ height *= 2 + return out + + +if __name__ == "__main__": + perlin = perlin(224, 224, 2) + plt.figure(figsize=(12, 12)) + plt.imshow(perlin) + plt.show() + + plt.figure(figsize=(12, 12)) + for idx, rho in enumerate([1, 2, 4, 8]): + plt.subplot(2, 2, idx + 1) + out = perlin_ms([rho**-i for i in range(4)], 6, 6).cpu().numpy() + # out = perlin(6, 6, 2**rho).cpu().numpy() + plt.imshow(out) + plt.title(f"Decay for finer grids as {rho} ** -scale") + plt.show() diff --git a/anomalib/models/draem/utils/__init__.py b/anomalib/models/draem/utils/__init__.py deleted file mode 100644 index dde7003813..0000000000 --- a/anomalib/models/draem/utils/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -"""Helpers for the DRAEM model implementation.""" - -# Copyright (C) 2022 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -from .augmenter import Augmenter - -__all__ = ["Augmenter"] From cc328967952a17d99630665edc262f22e4e05668 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 17:10:38 +0200 Subject: [PATCH 58/96] add base classes --- anomalib/data/base/__init__.py | 10 ++ anomalib/data/base/datamodule.py | 89 ++++++++++++++++++ anomalib/data/base/dataset.py | 155 +++++++++++++++++++++++++++++++ 3 files changed, 254 insertions(+) create mode 100644 anomalib/data/base/__init__.py create mode 100644 anomalib/data/base/datamodule.py create mode 100644 anomalib/data/base/dataset.py diff --git a/anomalib/data/base/__init__.py b/anomalib/data/base/__init__.py new file mode 100644 index 0000000000..afb5a62463 --- /dev/null +++ b/anomalib/data/base/__init__.py @@ -0,0 +1,10 @@ +"""Base classes for custom dataset and datamodules.""" + +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + + +from .datamodule import AnomalibDataModule +from .dataset import AnomalibDataset + +__all__ = ["AnomalibDataset", "AnomalibDataModule"] diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py new file mode 100644 index 0000000000..b8d8603bdd --- /dev/null +++ b/anomalib/data/base/datamodule.py @@ -0,0 +1,89 @@ +"""Anomalib datamodule base class.""" + +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import logging +from abc import ABC +from typing import Optional + +from pandas import DataFrame +from pytorch_lightning import LightningDataModule +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from torch.utils.data import DataLoader + +from anomalib.data.base.dataset import AnomalibDataset +from anomalib.data.utils import ValSplitMode, random_split + +logger = logging.getLogger(__name__) + + +class AnomalibDataModule(LightningDataModule, ABC): + """Base Anomalib data module. + + Args: + train_batch_size (int): Batch size used by the train dataloader. + test_batch_size (int): Batch size used by the val and test dataloaders. + num_workers (int): Number of workers used by the train, val and test dataloaders. 
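+        val_split_mode (ValSplitMode): Determines how the validation subset is obtained; ``from_test``
+            randomly splits it off the test set, while ``same_as_test`` reuses the test set.
+
+        Example:
+            A rough sketch of the intended usage through one of the concrete subclasses (``MVTec``
+            here); the keyword values are illustrative only:
+
+            >>> datamodule = MVTec(root="./datasets/MVTec", category="bottle", image_size=(256, 256),
+            ...                    train_batch_size=32, eval_batch_size=32, num_workers=8,
+            ...                    val_split_mode="from_test")
+            >>> datamodule.setup()
+            >>> batch = next(iter(datamodule.train_dataloader()))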
+ """ + + def __init__(self, train_batch_size: int, eval_batch_size: int, num_workers: int, val_split_mode: ValSplitMode): + super().__init__() + self.train_batch_size = train_batch_size + self.eval_batch_size = eval_batch_size + self.num_workers = num_workers + self.val_split_mode = val_split_mode + + self.train_data: Optional[AnomalibDataset] = None + self.val_data: Optional[AnomalibDataset] = None + self.test_data: Optional[AnomalibDataset] = None + + self._samples: Optional[DataFrame] = None + + def setup(self, stage: Optional[str] = None): + """Setup train, validation and test data. + + Args: + stage: Optional[str]: Train/Val/Test stages. (Default value = None) + """ + if not self.is_setup: + self._setup(stage) + assert self.is_setup + + def _setup(self, _stage: Optional[str] = None) -> None: + """Set up the datasets and perform dynamic subset splitting. + + May be overridden in subclass for custom splitting behaviour. + """ + assert self.train_data is not None + assert self.test_data is not None + + self.train_data.setup() + self.test_data.setup() + if self.val_split_mode == ValSplitMode.FROM_TEST: + self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) + elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: + self.val_data = self.test_data + else: + raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") + + @property + def is_setup(self): + """Checks if setup() has been called.""" + if self.train_data is None or self.val_data is None or self.test_data is None: + return False + return self.train_data.is_setup and self.val_data.is_setup and self.test_data.is_setup + + def train_dataloader(self) -> TRAIN_DATALOADERS: + """Get train dataloader.""" + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) + + def val_dataloader(self) -> EVAL_DATALOADERS: + """Get validation dataloader.""" + return DataLoader(self.val_data, shuffle=False, batch_size=self.eval_batch_size, num_workers=self.num_workers) + + def test_dataloader(self) -> EVAL_DATALOADERS: + """Get test dataloader.""" + return DataLoader(self.test_data, shuffle=False, batch_size=self.eval_batch_size, num_workers=self.num_workers) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py new file mode 100644 index 0000000000..e5ae8fceaf --- /dev/null +++ b/anomalib/data/base/dataset.py @@ -0,0 +1,155 @@ +"""Anomalib dataset and datamodule base classes.""" + +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import copy +import logging +from abc import ABC, abstractmethod +from typing import Dict, Sequence, Union + +import cv2 +import numpy as np +import pandas as pd +from pandas import DataFrame +from torch import Tensor +from torch.utils.data import Dataset + +from anomalib.data.utils import read_image +from anomalib.pre_processing import PreProcessor + +logger = logging.getLogger(__name__) + + +class AnomalibDataset(Dataset, ABC): + """Anomalib dataset.""" + + def __init__(self, task: str, pre_process: PreProcessor): + super().__init__() + self.task = task + self.pre_process = pre_process + self._samples = None + + def __len__(self) -> int: + """Get length of the dataset.""" + assert isinstance(self._samples, DataFrame) + return len(self._samples) + + def subsample(self, indices: Sequence[int], inplace=False) -> AnomalibDataset: + """Subsamples the dataset at the provided indices. 
+
+        Args:
+            indices (Sequence[int]): Indices at which the dataset is to be subsampled.
+            inplace (bool): When true, the subsampling will be performed on the instance itself.
+        """
+        dataset = self if inplace else copy.deepcopy(self)
+        dataset.samples = self.samples.iloc[indices].reset_index(drop=True)
+        return dataset
+
+    @property
+    def is_setup(self) -> bool:
+        """Checks if setup() has been called."""
+        return isinstance(self._samples, DataFrame)
+
+    @property
+    def samples(self) -> DataFrame:
+        """Get the samples dataframe."""
+        if not self.is_setup:
+            raise RuntimeError("Dataset is not setup yet. Call setup() first.")
+        return self._samples
+
+    @samples.setter
+    def samples(self, samples: DataFrame):
+        """Overwrite the samples with a new dataframe.
+
+        Args:
+            samples (DataFrame): DataFrame with new samples.
+        """
+        self._samples = samples.sort_values(by="image_path", ignore_index=True)
+
+    @property
+    def has_normal(self) -> bool:
+        """Check if the dataset contains any normal samples."""
+        return 0 in list(self.samples.label_index)
+
+    @property
+    def has_anomalous(self) -> bool:
+        """Check if the dataset contains any anomalous samples."""
+        return 1 in list(self.samples.label_index)
+
+    def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
+        """Get dataset item for the index ``index``.
+
+        Args:
+            index (int): Index to get the item.
+
+        Returns:
+            Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
+                Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
+        """
+        assert isinstance(self._samples, DataFrame)
+
+        image_path = self._samples.iloc[index].image_path
+        image = read_image(image_path)
+        label_index = self._samples.iloc[index].label_index
+
+        item = dict(image_path=image_path, label=label_index)
+
+        if self.task == "classification":
+            pre_processed = self.pre_process(image=image)
+        elif self.task == "segmentation":
+            mask_path = self._samples.iloc[index].mask_path
+
+            # Only Anomalous (1) images have masks in anomaly datasets
+            # Therefore, create empty mask for Normal (0) images.
+            if label_index == 0:
+                mask = np.zeros(shape=image.shape[:2])
+            else:
+                mask = cv2.imread(mask_path, flags=0) / 255.0
+
+            pre_processed = self.pre_process(image=image, mask=mask)
+
+            item["mask_path"] = mask_path
+            item["mask"] = pre_processed["mask"]
+        else:
+            raise ValueError(f"Unknown task type: {self.task}")
+        item["image"] = pre_processed["image"]
+
+        return item
+
+    def __add__(self, other_dataset: AnomalibDataset) -> AnomalibDataset:
+        """Concatenate this dataset with another dataset."""
+        assert isinstance(other_dataset, self.__class__), "Cannot concatenate datasets that are not of the same type."
+        assert self.is_setup and other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first."
+        dataset = copy.deepcopy(self)
+        dataset.samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True)
+        return dataset
+
+    def setup(self) -> None:
+        """Load data/metadata into memory."""
+        if not self.is_setup:
+            self._setup()
+        assert self.is_setup, "setup() should set self._samples"
+
+    @abstractmethod
+    def _setup(self) -> DataFrame:
+        """Set up the data module.
+
+        This method should return a dataframe that contains the information needed by the dataloader to load each of
+        the dataset items into memory.
+        The dataframe must at least contain the following columns:
+            split: the subset to which the dataset item is assigned.
+ image_path: path to file system location where the image is stored. + label_index: index of the anomaly label, typically 0 for "normal" and 1 for "anomalous". + mask_path (if task == "segmentation"): path to the ground truth masks (for the anomalous images only). + + Example: + |---|-------------------|-----------|-------------|------------------|-------| + | | image_path | label | label_index | mask_path | split | + |---|-------------------|-----------|-------------|------------------|-------| + | 0 | path/to/image.png | anomalous | 1 | path/to/mask.png | train | + |---|-------------------|-----------|-------------|------------------|-------| + """ + raise NotImplementedError From 23d47666bd15e317c163eef02c26782e9f5142e8 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 17:12:01 +0200 Subject: [PATCH 59/96] update docstring --- anomalib/data/base/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index e5ae8fceaf..c73c06b185 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -1,4 +1,4 @@ -"""Anomalib dataset and datamodule base classes.""" +"""Anomalib dataset base class.""" # Copyright (C) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 From 05ba31df773fd4b0c1c59f546f5bbdc716d8ac51 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 17:18:43 +0200 Subject: [PATCH 60/96] use synthetic dataset in base datamodule --- anomalib/data/base/datamodule.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py index b8d8603bdd..576d68bc43 100644 --- a/anomalib/data/base/datamodule.py +++ b/anomalib/data/base/datamodule.py @@ -15,6 +15,7 @@ from torch.utils.data import DataLoader from anomalib.data.base.dataset import AnomalibDataset +from anomalib.data.synthetic import SyntheticValidationSet from anomalib.data.utils import ValSplitMode, random_split logger = logging.getLogger(__name__) @@ -66,6 +67,9 @@ def _setup(self, _stage: Optional[str] = None) -> None: self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data + elif self.val_split_mode == ValSplitMode.SYNTHETIC: + self.train_data, normal_val_data = random_split(self.train_data, 0.5) + self.val_data = SyntheticValidationSet.from_dataset(normal_val_data) else: raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") From e8d7998c669baf9f8f1f0827c008cbafc8473c7e Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 14 Oct 2022 17:21:33 +0200 Subject: [PATCH 61/96] fix imports --- anomalib/data/btech.py | 2 +- anomalib/data/folder.py | 2 +- anomalib/data/mvtec.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 97bdeabef5..1636d5ec9f 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -23,7 +23,7 @@ from pytorch_lightning.utilities.cli import DATAMODULE_REGISTRY from tqdm import tqdm -from anomalib.data import AnomalibDataModule, AnomalibDataset +from anomalib.data.base import AnomalibDataModule, AnomalibDataset from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check from anomalib.pre_processing import PreProcessor diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index b0907788fb..bcfd30adf6 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py 
@@ -13,7 +13,7 @@ from pandas import DataFrame from torchvision.datasets.folder import IMG_EXTENSIONS -from anomalib.data import AnomalibDataModule, AnomalibDataset +from anomalib.data.base import AnomalibDataModule, AnomalibDataset from anomalib.data.utils import Split, ValSplitMode, random_split from anomalib.pre_processing.pre_process import PreProcessor diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 7c8bbee9bd..8dd70af4a1 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -32,7 +32,7 @@ import albumentations as A from pandas import DataFrame -from anomalib.data import AnomalibDataModule, AnomalibDataset +from anomalib.data.base import AnomalibDataModule, AnomalibDataset from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check from anomalib.pre_processing import PreProcessor From 26b6b83375483333eda530f1755e7812d371161b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 17 Oct 2022 15:11:09 +0200 Subject: [PATCH 62/96] clean up synthetic anomaly dataset implementation --- anomalib/data/synthetic.py | 128 +++++++++++++++++++++++-------------- 1 file changed, 79 insertions(+), 49 deletions(-) diff --git a/anomalib/data/synthetic.py b/anomalib/data/synthetic.py index 6b9e7b7bf8..3f8956f21b 100644 --- a/anomalib/data/synthetic.py +++ b/anomalib/data/synthetic.py @@ -3,8 +3,9 @@ This dataset can be used when there is a lack of real anomalous data. """ +import math import os -import tempfile +import shutil from pathlib import Path from typing import Union @@ -12,65 +13,81 @@ import cv2 import pandas as pd from albumentations.pytorch import ToTensorV2 -from pandas import DataFrame +from pandas import DataFrame, Series from anomalib.data.base.dataset import AnomalibDataset -from anomalib.data.utils import Augmenter, read_image +from anomalib.data.utils import Augmenter, Split, read_image from anomalib.pre_processing import PreProcessor -def make_synthetic_dataset(normal_samples: DataFrame, root: Union[Path, str]) -> DataFrame: +def make_synthetic_dataset( + source_samples: DataFrame, im_dir: Union[Path, str], mask_dir: Union[Path, str], anomalous_ratio: float = 0.5 +) -> DataFrame: """Convert a set of normal samples into a mixed set of normal and synthetic anomalous samples. The synthetic images will be saved to the file system in the specified root directory under /images. For the synthetic anomalous images, the masks will be saved under /ground_truth. Args: - normal_samples (DataFrame): DataFrame describing a set of normal images. - root (Union[Path, str]): Root directory to which the image files will be written. + source_samples (DataFrame): Normal images that will be used as source for the synthetic anomalous images. + im_dir (Union[Path, str]): Directory to which the synthetic anomalous image files will be written. + mask_dir (Union[Path, str]): Directory to which the ground truth anomaly masks will be written. + anomalous_ratio (float): Fraction of source samples that will be converted into anomalous samples. """ - im_dir = Path(root) / "images" - mask_dir = Path(root) / "ground_truth" - os.makedirs(im_dir) - os.makedirs(mask_dir) - - # make fakes - augmenter = Augmenter("./datasets/dtd", beta=(0.01, 0.2)) - + assert 1 not in source_samples.label_index.values, "All source images must be normal." + assert os.path.isdir(im_dir), f"{im_dir} is not a folder." 
+    assert os.path.isdir(mask_dir), f"{mask_dir} is not a folder"
+
+    # filter relevant columns
+    source_samples = source_samples.filter(["image_path", "label", "label_index", "mask_path", "split"])
+    # randomly select samples for augmentation
+    n_anomalous = int(anomalous_ratio * len(source_samples))
+    anomalous_samples = source_samples.sample(n_anomalous)
+    normal_samples = source_samples.drop(anomalous_samples.index)
+    anomalous_samples = anomalous_samples.reset_index(drop=True)
+
+    # initialize augmenter
+    augmenter = Augmenter("./datasets/dtd", p_anomalous=1.0, beta=(0.01, 0.2))
+
+    # initialize transform for source images
     transform = A.Compose([A.ToFloat(), ToTensorV2()])

-    new_samples_list = []
-    for index, sample in normal_samples.iterrows():
-        # load image
-        im = read_image(sample.image_path)
-        # to tensor
-        im = transform(image=im)["image"].unsqueeze(0)
-        # apply rand aug
-        aug_im, mask = augmenter.augment_batch(im)
-        #
-        is_anomalous = mask.max() == 1
+    def augment(sample: Series) -> Series:
+        """Helper function to apply synthetic anomalous augmentation to a sample from a dataframe.
+
+        Reads an image, applies the augmentations, writes the augmented image and corresponding mask to the file system,
+        and returns a new Series object with the updated labels and file locations.
+
+        Args:
+            sample (Series): DataFrame row containing info about the image that will be augmented.
+
+        Returns:
+            Series: DataFrame row with updated information about the augmented image.
+        """
+        # read and transform image
+        image = read_image(sample.image_path)
+        image = transform(image=image)["image"].unsqueeze(0)
+        # apply anomalous perturbation
+        aug_im, mask = augmenter.augment_batch(image)
+        # target file name with leading zeros
+        file_name = f"{str(sample.name).zfill(int(math.log10(n_anomalous)) + 1)}.png"
         # write image
         aug_im = (aug_im.squeeze().permute((1, 2, 0)) * 255).numpy()
         aug_im = cv2.cvtColor(aug_im, cv2.COLOR_RGB2BGR)
-        im_path = im_dir / (str(index).zfill(3) + ".png")
-        cv2.imwrite(str(im_path), aug_im)
+        im_path = str(Path(im_dir) / file_name)
+        cv2.imwrite(im_path, aug_im)
         # write mask
-        if is_anomalous:
-            mask = (mask.squeeze() * 255).numpy()
-            mask_path = mask_dir / (str(index).zfill(3) + ".png")
-            cv2.imwrite(str(mask_path), mask)
-        # update path in samples
-        new_samples_list.append(
-            dict(
-                image_path=str(im_path),
-                label="abnormal" if is_anomalous else "normal",
-                label_index=1 if is_anomalous else 0,
-                mask_path=str(mask_path) if is_anomalous else "",
-                split=None,
-            )
-        )
-
-    return pd.DataFrame(new_samples_list)
+        mask = (mask.squeeze() * 255).numpy()
+        mask_path = str(Path(mask_dir) / file_name)
+        cv2.imwrite(str(mask_path), mask)
+        out = dict(image_path=im_path, label="abnormal", label_index=1, mask_path=mask_path, split=Split.VAL)
+        return Series(out)
+
+    anomalous_samples = anomalous_samples.apply(augment, axis=1)
+
+    samples = pd.concat([normal_samples, anomalous_samples], ignore_index=True)
+
+    return samples


 class SyntheticValidationSet(AnomalibDataset):
@@ -79,25 +96,38 @@ class SyntheticValidationSet(AnomalibDataset):
     Args:
         task (str): Task type, either "classification" or "segmentation".
         pre_process (PreProcessor): Preprocessor object used to transform the input images.
-        normal_samples (DataFrame): Normal samples to which the anomalous augmentations will be applied.
+        source_samples (DataFrame): Normal samples to which the anomalous augmentations will be applied.
""" - def __init__(self, task: str, pre_process: PreProcessor, normal_samples: DataFrame): + def __init__(self, task: str, pre_process: PreProcessor, source_samples: DataFrame): super().__init__(task, pre_process) - self.normal_samples = normal_samples - self.tempfolder = tempfile.TemporaryDirectory(dir="./datasets") + self.source_samples = source_samples + + # Files will be written to a temporary directory in the workdir, which is cleaned up after code execution + self.root = Path("./.tmp/synthetic_anomaly") + self.im_dir = self.root / "images" + self.mask_dir = self.root / "ground_truth" + + # clean up any existing data that may be left over from previous run + if os.path.exists(self.root): + shutil.rmtree(self.root) + + # create directories + os.makedirs(self.im_dir) + os.makedirs(self.mask_dir) + self.setup() @classmethod def from_dataset(cls, dataset): """Create a synthetic anomaly dataset from an existing dataset of normal images.""" - return cls(task=dataset.task, pre_process=dataset.pre_process, normal_samples=dataset.samples) + return cls(task=dataset.task, pre_process=dataset.pre_process, source_samples=dataset.samples) def _setup(self) -> None: """Create samples dataframe.""" - self.samples = make_synthetic_dataset(self.normal_samples, self.tempfolder.name) + self.samples = make_synthetic_dataset(self.source_samples, self.im_dir, self.mask_dir, 0.5) def __del__(self): """Make sure the temporary directory is cleaned up when the dataset object is deleted.""" - self.tempfolder.cleanup() + shutil.rmtree(self.root) From c32fee94bb656b3e607201852530b89afd8d3446 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 17 Oct 2022 15:35:16 +0200 Subject: [PATCH 63/96] fix mistake in augmenter --- anomalib/data/utils/augmenter.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anomalib/data/utils/augmenter.py b/anomalib/data/utils/augmenter.py index e9d8e3f4ce..b08bb11898 100644 --- a/anomalib/data/utils/augmenter.py +++ b/anomalib/data/utils/augmenter.py @@ -140,7 +140,7 @@ def augment_batch(self, batch: Tensor) -> Tuple[Tensor, Tensor]: perturbations_list = [] masks_list = [] for _ in range(batch_size): - if random.random() < self.p_anomalous: # include 50% normal samples + if random.random() > self.p_anomalous: # include normal samples perturbations_list.append(torch.zeros((channels, height, width))) masks_list.append(torch.zeros((1, height, width))) else: @@ -159,7 +159,7 @@ def augment_batch(self, batch: Tensor) -> Tuple[Tensor, Tensor]: beta = self.beta elif isinstance(self.beta, tuple): beta = torch.rand(batch_size) * (self.beta[1] - self.beta[0]) + self.beta[0] - beta = beta.view(batch_size, 1, 1, 1).expand_as(batch).to(batch.device) + beta = beta.view(batch_size, 1, 1, 1).expand_as(batch).to(batch.device) # type: ignore else: raise ValueError("Beta must be either float or tuple of floats") From e1204349c87a630a4b9c8cd23af57a5a98069a7d Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 17 Oct 2022 15:51:57 +0200 Subject: [PATCH 64/96] change default split ratio --- anomalib/data/base/datamodule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py index 576d68bc43..361676a60a 100644 --- a/anomalib/data/base/datamodule.py +++ b/anomalib/data/base/datamodule.py @@ -68,7 +68,7 @@ def _setup(self, _stage: Optional[str] = None) -> None: elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data elif self.val_split_mode == ValSplitMode.SYNTHETIC: - 
self.train_data, normal_val_data = random_split(self.train_data, 0.5) + self.train_data, normal_val_data = random_split(self.train_data, 0.3) self.val_data = SyntheticValidationSet.from_dataset(normal_val_data) else: raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") From 14ee645399651eeeee740d99ef268dd8c7bc76d8 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 17 Oct 2022 17:22:43 +0200 Subject: [PATCH 65/96] remove accidentally added file --- anomalib/models/draem/perlin_new.py | 59 ----------------------------- 1 file changed, 59 deletions(-) delete mode 100644 anomalib/models/draem/perlin_new.py diff --git a/anomalib/models/draem/perlin_new.py b/anomalib/models/draem/perlin_new.py deleted file mode 100644 index d20e2e6957..0000000000 --- a/anomalib/models/draem/perlin_new.py +++ /dev/null @@ -1,59 +0,0 @@ -import torch as th -from matplotlib import pyplot as plt - - -def interp(t): - # return 3 * t**2 - 2 * t ** 3 - return 6 * t**5 - 15 * t**4 + 10 * t**3 - - -def fade(t): - return 6 * t**5 - 15 * t**4 + 10 * t**3 - - -def perlin(width, height, scale=10, device=None): - gx, gy = th.randn(2, width + 1, height + 1, 1, 1, device=device) - xs = th.linspace(0, 1, scale + 1)[:-1, None].to(device) - ys = th.linspace(0, 1, scale + 1)[None, :-1].to(device) - - wx = 1 - interp(xs) - wy = 1 - interp(ys) - - dots = 0 - dots += wx * wy * (gx[:-1, :-1] * xs + gy[:-1, :-1] * ys) - dots += (1 - wx) * wy * (-gx[1:, :-1] * (1 - xs) + gy[1:, :-1] * ys) - dots += wx * (1 - wy) * (gx[:-1, 1:] * xs - gy[:-1, 1:] * (1 - ys)) - dots += (1 - wx) * (1 - wy) * (-gx[1:, 1:] * (1 - xs) - gy[1:, 1:] * (1 - ys)) - - return dots.permute(0, 2, 1, 3).contiguous().view(width * scale, height * scale) - - -# def my_perlin(width, height, scale=10): - - -def perlin_ms(octaves=[1, 1, 1, 1], width=2, height=2, device=None): - scale = 2 ** len(octaves) - out = 0 - for oct in octaves: - p = perlin(width, height, scale, device) - out += p * oct - scale //= 2 - width *= 2 - height *= 2 - return out - - -if __name__ == "__main__": - perlin = perlin(224, 224, 2) - plt.figure(figsize=(12, 12)) - plt.imshow(perlin) - plt.show() - - plt.figure(figsize=(12, 12)) - for idx, rho in enumerate([1, 2, 4, 8]): - plt.subplot(2, 2, idx + 1) - out = perlin_ms([rho**-i for i in range(4)], 6, 6).cpu().numpy() - # out = perlin(6, 6, 2**rho).cpu().numpy() - plt.imshow(out) - plt.title(f"Decay for finer grids as {rho} ** -scale") - plt.show() From 9c4e7bf6e7f05a8b036f3bde5b4579193df861e2 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Tue, 18 Oct 2022 15:35:18 +0200 Subject: [PATCH 66/96] validation_split_mode -> val_split_mode --- anomalib/data/__init__.py | 6 +++--- anomalib/models/cflow/config.yaml | 2 +- anomalib/models/dfkde/config.yaml | 2 +- anomalib/models/dfm/config.yaml | 2 +- anomalib/models/draem/config.yaml | 2 +- anomalib/models/fastflow/config.yaml | 2 +- anomalib/models/ganomaly/config.yaml | 2 +- anomalib/models/padim/config.yaml | 2 +- anomalib/models/patchcore/config.yaml | 2 +- anomalib/models/reverse_distillation/config.yaml | 2 +- anomalib/models/stfpm/config.yaml | 2 +- 11 files changed, 13 insertions(+), 13 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 0cc71d9d0b..55cdd7aa11 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -41,7 +41,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: task=config.dataset.task, transform_config_train=config.dataset.transform_config.train, 
transform_config_eval=config.dataset.transform_config.eval, - val_split_mode=config.dataset.validation_split_mode, + val_split_mode=config.dataset.val_split_mode, ) elif config.dataset.format.lower() == "btech": datamodule = BTech( @@ -54,7 +54,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: task=config.dataset.task, transform_config_train=config.dataset.transform_config.train, transform_config_eval=config.dataset.transform_config.eval, - val_split_mode=config.dataset.validation_split_mode, + val_split_mode=config.dataset.val_split_mode, ) elif config.dataset.format.lower() == "folder": datamodule = Folder( @@ -72,7 +72,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: num_workers=config.dataset.num_workers, transform_config_train=config.dataset.transform_config.train, transform_config_eval=config.dataset.transform_config.eval, - val_split_mode=config.dataset.validation_split_mode, + val_split_mode=config.dataset.val_split_mode, ) else: raise ValueError( diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index 2a823620ba..239bfddfa5 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -13,7 +13,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] model: name: cflow diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml index 7e9961f660..1c0dd3491a 100644 --- a/anomalib/models/dfkde/config.yaml +++ b/anomalib/models/dfkde/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] model: name: dfkde diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml index 807f39e5db..e9ebee1501 100755 --- a/anomalib/models/dfm/config.yaml +++ b/anomalib/models/dfm/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] model: name: dfm diff --git a/anomalib/models/draem/config.yaml b/anomalib/models/draem/config.yaml index 5f225e4cff..2435654923 100644 --- a/anomalib/models/draem/config.yaml +++ b/anomalib/models/draem/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: ./anomalib/models/draem/transform_config.yaml eval: ./anomalib/models/draem/transform_config.yaml - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null diff --git a/anomalib/models/fastflow/config.yaml b/anomalib/models/fastflow/config.yaml index d1ad0d6eae..e7fb76da45 100644 --- a/anomalib/models/fastflow/config.yaml +++ b/anomalib/models/fastflow/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml index 542f117df1..f8ddab8fba 100644 --- a/anomalib/models/ganomaly/config.yaml +++ b/anomalib/models/ganomaly/config.yaml @@ -12,7 +12,7 @@ dataset: 
transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: true tile_size: 64 diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml index bb08d58ab5..91e3cccaf8 100644 --- a/anomalib/models/padim/config.yaml +++ b/anomalib/models/padim/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml index 38fc14bb38..5392e2740e 100644 --- a/anomalib/models/patchcore/config.yaml +++ b/anomalib/models/patchcore/config.yaml @@ -11,7 +11,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null diff --git a/anomalib/models/reverse_distillation/config.yaml b/anomalib/models/reverse_distillation/config.yaml index 1a6a697f36..e8d2289ff6 100644 --- a/anomalib/models/reverse_distillation/config.yaml +++ b/anomalib/models/reverse_distillation/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: 64 diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml index a25a558f41..b08bc97387 100644 --- a/anomalib/models/stfpm/config.yaml +++ b/anomalib/models/stfpm/config.yaml @@ -12,7 +12,7 @@ dataset: transform_config: train: null eval: null - validation_split_mode: same_as_test # options: [same_as_test, from_test] + val_split_mode: same_as_test # options: [same_as_test, from_test] tiling: apply: false tile_size: null From 067d601de090419cf1b3d4010bee5b781ec1e3f3 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Wed, 19 Oct 2022 09:01:24 +0200 Subject: [PATCH 67/96] update docs --- docs/source/how_to_guides/train_custom_data.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/how_to_guides/train_custom_data.rst b/docs/source/how_to_guides/train_custom_data.rst index 9a70fdc88e..4d1652462f 100644 --- a/docs/source/how_to_guides/train_custom_data.rst +++ b/docs/source/how_to_guides/train_custom_data.rst @@ -87,7 +87,7 @@ Let's choose `Padim algorithm `_, copy the transform_config: train: null eval: null - validation_split_mode: from_test # determines how the validation set is created, options [same_as_test, from_test] + val_split_mode: from_test # determines how the validation set is created, options [same_as_test, from_test] tiling: apply: false tile_size: null From c84c99c9917cd733d30ea5eb191794805d05bd7b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 11:33:56 +0200 Subject: [PATCH 68/96] Update anomalib/data/base/dataset.py Co-authored-by: Joao P C Bertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- anomalib/data/base/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index c73c06b185..2baae26eaf 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -30,7 +30,7 @@ def __init__(self, task: str, 
pre_process: PreProcessor): super().__init__() self.task = task self.pre_process = pre_process - self._samples = None + self._samples: DataFrame = None def __len__(self) -> int: """Get length of the dataset.""" From b680d44ade9409a6627e420968fa57b75f85ab3a Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 11:41:15 +0200 Subject: [PATCH 69/96] get length from self.samples --- anomalib/data/base/dataset.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index 2baae26eaf..daec19c2ee 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -34,8 +34,7 @@ def __init__(self, task: str, pre_process: PreProcessor): def __len__(self) -> int: """Get length of the dataset.""" - assert isinstance(self._samples, DataFrame) - return len(self._samples) + return len(self.samples) def subsample(self, indices: Sequence[int], inplace=False) -> AnomalibDataset: """Subsamples the dataset at the provided indices. From 95c37b004bbd2e0ff4ea9655510a7f1fd43e5171 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 11:48:40 +0200 Subject: [PATCH 70/96] assert unique indices --- anomalib/data/base/dataset.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index daec19c2ee..af732ef26c 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -43,6 +43,7 @@ def subsample(self, indices: Sequence[int], inplace=False) -> AnomalibDataset: indices (Sequence[int]): Indices at which the dataset is to be subsampled. inplace (bool): When true, the subsampling will be performed on the instance itself. """ + assert len(set(indices)) == len(indices), "No duplicates allowed in indices." dataset = self if inplace else copy.deepcopy(self) dataset.samples = self.samples.iloc[indices].reset_index(drop=True) return dataset From 3e77014b496068ded588139d4cacf64cc1c0c37b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 11:59:01 +0200 Subject: [PATCH 71/96] check is_setup for individual datasets Co-authored-by: Joao P C Bertoldo <24547377+jpcbertoldo@users.noreply.github.com> --- anomalib/data/base/dataset.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index af732ef26c..d6bfb33af6 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -122,7 +122,8 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: def __add__(self, other_dataset: AnomalibDataset) -> AnomalibDataset: """Concatenate this dataset with another dataset.""" assert isinstance(other_dataset, self.__class__), "Cannot concatenate datasets that are not of the same type." - assert self.is_setup and other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first." + assert self.is_setup, "Cannot concatenate uninitialized datasets. Call setup first." + assert other_dataset.is_setup, "Cannot concatenate uninitialized datasets. Call setup first." 
        dataset = copy.deepcopy(self)
        dataset.samples = pd.concat([self.samples, other_dataset.samples], ignore_index=True)
        return dataset

From e8d7998c669baf9f8f1f0827c008cbafc8473c7e Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 21 Oct 2022 12:04:39 +0200
Subject: [PATCH 72/96] remove assert in __getitem__

Co-authored-by: Joao P C Bertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 anomalib/data/base/dataset.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py
index d6bfb33af6..48e0fa3489 100644
--- a/anomalib/data/base/dataset.py
+++ b/anomalib/data/base/dataset.py
@@ -89,7 +89,6 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
             Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
                 Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
         """
-        assert isinstance(self._samples, DataFrame)

         image_path = self._samples.iloc[index].image_path
         image = read_image(image_path)

From f5e2d240dbce0df4610bb3eec265aed7eb1dc6b3 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 21 Oct 2022 12:05:49 +0200
Subject: [PATCH 73/96] Update anomalib/data/btech.py

Co-authored-by: Joao P C Bertoldo <24547377+jpcbertoldo@users.noreply.github.com>
---
 anomalib/data/btech.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py
index 1636d5ec9f..271ad066ec 100644
--- a/anomalib/data/btech.py
+++ b/anomalib/data/btech.py
@@ -159,7 +159,7 @@ def __init__(
         """
         super().__init__(task, pre_process)

-        self.root_category = Path(root) / Path(category)
+        self.root_category = Path(root) / category
         self.split = split

     def _setup(self):

From d9e136905d24b2b9e46b77330c5cf0a434427f91 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 21 Oct 2022 15:44:20 +0200
Subject: [PATCH 74/96] clearer assert message

---
 anomalib/data/utils/split.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py
index bec80e3bfd..501a271c73 100644
--- a/anomalib/data/utils/split.py
+++ b/anomalib/data/utils/split.py
@@ -71,8 +71,10 @@ def random_split(
     if isinstance(split_ratio, float):
         split_ratio = [1 - split_ratio, split_ratio]

-    assert math.isclose(sum(split_ratio), 1) and sum(split_ratio) <= 1, "split ratios must sum to 1."
-    assert all(0 < ratio < 1 for ratio in split_ratio), "all split ratios must be between 0 and 1."
+ assert ( + math.isclose(sum(split_ratio), 1) and sum(split_ratio) <= 1 + ), f"split ratios must sum to 1, found {sum(split_ratio)}" + assert all(0 < ratio < 1 for ratio in split_ratio), f"all split ratios must be between 0 and 1, found {split_ratio}" # create list of source data if label_aware: @@ -102,6 +104,5 @@ def random_split( [label_dataset.subsample(subset_indices) for subset_indices in torch.split(indices, subset_lengths)] ) - # concatenate and return subsets = list(map(list, zip(*subsets))) return [concatenate_datasets(subset) for subset in subsets] From 2e6bc608c17e8df3e4672706bfd71eb90733ac50 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 15:49:29 +0200 Subject: [PATCH 75/96] clarify list inversion in comment --- anomalib/data/utils/split.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 501a271c73..60087da96a 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -104,5 +104,7 @@ def random_split( [label_dataset.subsample(subset_indices) for subset_indices in torch.split(indices, subset_lengths)] ) + # invert outer/inner lists + # outer list: subsets with the given ratio, inner list: per-label unique subsets = list(map(list, zip(*subsets))) return [concatenate_datasets(subset) for subset in subsets] From af0cd99f9df427335b4f99068254a5ddbac56c35 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 15:51:26 +0200 Subject: [PATCH 76/96] comments and typing --- anomalib/data/utils/split.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 60087da96a..1ce0ab2362 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -83,8 +83,9 @@ def random_split( else: per_label_datasets = [dataset] + # outer list: per-label unique, inner list: random subsets with the given ratio + subsets: List[List[AnomalibDataset]] = [] # split each (label-aware) subset of source data - subsets = [] for label_dataset in per_label_datasets: # get subset lengths subset_lengths = [] From 5ee8480ddf7f7f64f0945d2a056c19a2558d38d0 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 17:50:08 +0200 Subject: [PATCH 77/96] validate contents of samples dataframe before setting --- anomalib/data/base/dataset.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index 48e0fa3489..274955a49e 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -20,6 +20,13 @@ from anomalib.data.utils import read_image from anomalib.pre_processing import PreProcessor +_EXPECTED_COlS_CLASSIFICATION = ["image_path", "label", "label_index", "mask_path", "split"] +_EXPECTED_COLS_SEGMENTATION = _EXPECTED_COlS_CLASSIFICATION + ["mask_path"] +_EXPECTED_COLS_PERTASK = { + "classification": _EXPECTED_COlS_CLASSIFICATION, + "segmentation": _EXPECTED_COLS_SEGMENTATION, +} + logger = logging.getLogger(__name__) @@ -67,6 +74,13 @@ def samples(self, samples: DataFrame): Args: samples (DataFrame): DataFrame with new samples. 
""" + # validate the passed samples by checking the + assert isinstance(samples, DataFrame), f"samples must be a pandas.DataFrame, found {type(samples)}" + expected_columns = _EXPECTED_COLS_PERTASK[self.task] + assert all( + col in samples.columns for col in expected_columns + ), f"samples must have (at least) columns {expected_columns}, found {samples.columns}" + self._samples = samples.sort_values(by="image_path", ignore_index=True) @property From a5e876a4139ed845e56ccc2854dd451d0bc0a67f Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 17:55:22 +0200 Subject: [PATCH 78/96] add file paths check --- anomalib/data/base/dataset.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index 274955a49e..d86194a7de 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -8,6 +8,7 @@ import copy import logging from abc import ABC, abstractmethod +from pathlib import Path from typing import Dict, Sequence, Union import cv2 @@ -80,6 +81,7 @@ def samples(self, samples: DataFrame): assert all( col in samples.columns for col in expected_columns ), f"samples must have (at least) columns {expected_columns}, found {samples.columns}" + assert samples["image_path"].apply(lambda p: Path(p).exists()).all(), "missing file path(s) in samples" self._samples = samples.sort_values(by="image_path", ignore_index=True) From c490e30ddb525786593077908df25008d8a8c9fc Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 21 Oct 2022 18:47:02 +0200 Subject: [PATCH 79/96] add seed to random_split function --- anomalib/data/utils/split.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 1ce0ab2362..72f97ff79d 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -16,7 +16,7 @@ import math import warnings from enum import Enum -from typing import TYPE_CHECKING, List, Sequence, Union +from typing import TYPE_CHECKING, List, Optional, Sequence, Union import torch @@ -55,7 +55,10 @@ def concatenate_datasets(datasets: Sequence[AnomalibDataset]) -> AnomalibDataset def random_split( - dataset: AnomalibDataset, split_ratio: Union[float, Sequence[float]], label_aware: bool = False + dataset: AnomalibDataset, + split_ratio: Union[float, Sequence[float]], + label_aware: bool = False, + seed: Optional[int] = None, ) -> List[AnomalibDataset]: """Perform a random split of a dataset. @@ -66,6 +69,7 @@ def random_split( [1-split_ratio, split_ratio]. label_aware (bool): When True, the relative occurrence of the different class labels of the source dataset will be maintained in each of the subsets. + seed (Optional[int], optional): Seed that can be passed if results need to be reproducible """ if isinstance(split_ratio, float): @@ -99,8 +103,10 @@ def random_split( "Zero subset length encountered during splitting. This means one of your subsets might be" " empty or devoid of either normal or anomalous images." 
) + # perform random subsampling - indices = torch.randperm(len(label_dataset)) + random_state = torch.Generator().manual_seed(seed) if seed else None + indices = torch.randperm(len(label_dataset), generator=random_state) subsets.append( [label_dataset.subsample(subset_indices) for subset_indices in torch.split(indices, subset_lengths)] ) From 48082877145b23894e73f3efece34fc5a2334a06 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 24 Oct 2022 13:11:58 +0200 Subject: [PATCH 80/96] fix expected columns --- anomalib/data/base/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index d86194a7de..86ce72c96b 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -21,7 +21,7 @@ from anomalib.data.utils import read_image from anomalib.pre_processing import PreProcessor -_EXPECTED_COlS_CLASSIFICATION = ["image_path", "label", "label_index", "mask_path", "split"] +_EXPECTED_COlS_CLASSIFICATION = ["image_path", "label", "label_index", "split"] _EXPECTED_COLS_SEGMENTATION = _EXPECTED_COlS_CLASSIFICATION + ["mask_path"] _EXPECTED_COLS_PERTASK = { "classification": _EXPECTED_COlS_CLASSIFICATION, From 10bbf9c0c28b6c63ddc6f3be5f4b2d282ee7c7e1 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 24 Oct 2022 13:45:43 +0200 Subject: [PATCH 81/96] fix typo --- anomalib/data/base/dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/anomalib/data/base/dataset.py b/anomalib/data/base/dataset.py index 86ce72c96b..6b2c9aefd4 100644 --- a/anomalib/data/base/dataset.py +++ b/anomalib/data/base/dataset.py @@ -21,10 +21,10 @@ from anomalib.data.utils import read_image from anomalib.pre_processing import PreProcessor -_EXPECTED_COlS_CLASSIFICATION = ["image_path", "label", "label_index", "split"] -_EXPECTED_COLS_SEGMENTATION = _EXPECTED_COlS_CLASSIFICATION + ["mask_path"] +_EXPECTED_COLS_CLASSIFICATION = ["image_path", "label", "label_index", "split"] +_EXPECTED_COLS_SEGMENTATION = _EXPECTED_COLS_CLASSIFICATION + ["mask_path"] _EXPECTED_COLS_PERTASK = { - "classification": _EXPECTED_COlS_CLASSIFICATION, + "classification": _EXPECTED_COLS_CLASSIFICATION, "segmentation": _EXPECTED_COLS_SEGMENTATION, } From 81d3ca310c973c941a1de78f2cd80380915daaa6 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 28 Oct 2022 14:10:35 +0200 Subject: [PATCH 82/96] add seed parameter to datamodules --- anomalib/data/base/datamodule.py | 13 +++++++++++-- anomalib/data/btech.py | 6 +++--- anomalib/data/folder.py | 5 ++++- anomalib/data/mvtec.py | 2 ++ 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py index b8d8603bdd..38cb6eac61 100644 --- a/anomalib/data/base/datamodule.py +++ b/anomalib/data/base/datamodule.py @@ -27,14 +27,23 @@ class AnomalibDataModule(LightningDataModule, ABC): train_batch_size (int): Batch size used by the train dataloader. test_batch_size (int): Batch size used by the val and test dataloaders. num_workers (int): Number of workers used by the train, val and test dataloaders. + seed (Optional[int], optional): Seed used during random subset splitting. 
""" - def __init__(self, train_batch_size: int, eval_batch_size: int, num_workers: int, val_split_mode: ValSplitMode): + def __init__( + self, + train_batch_size: int, + eval_batch_size: int, + num_workers: int, + val_split_mode: ValSplitMode, + seed: Optional[int] = None, + ): super().__init__() self.train_batch_size = train_batch_size self.eval_batch_size = eval_batch_size self.num_workers = num_workers self.val_split_mode = val_split_mode + self.seed = seed self.train_data: Optional[AnomalibDataset] = None self.val_data: Optional[AnomalibDataset] = None @@ -63,7 +72,7 @@ def _setup(self, _stage: Optional[str] = None) -> None: self.train_data.setup() self.test_data.setup() if self.val_split_mode == ValSplitMode.FROM_TEST: - self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True) + self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True, seed=self.seed) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data else: diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 271ad066ec..74b341c9a4 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -124,7 +124,6 @@ def __init__( pre_process: List of pre_processing object containing albumentation compose. split: 'train', 'val' or 'test' task: ``classification`` or ``segmentation`` - seed: seed used for the random subset splitting create_validation_set: Create a validation subset in addition to the train and test subsets Examples: @@ -182,6 +181,7 @@ def __init__( transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_eval: Optional[Union[str, A.Compose]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, + seed: Optional[int] = None, ) -> None: """Instantiate BTech Lightning Data Module. @@ -195,8 +195,8 @@ def __init__( task: ``classification`` or ``segmentation`` transform_config_train: Config for pre-processing during training. transform_config_val: Config for pre-processing during validation. - seed: seed used for the random subset splitting create_validation_set: Create a validation subset in addition to the train and test subsets + seed (Optional[int], optional): Seed used during random subset splitting. Examples: >>> from anomalib.data import BTech @@ -224,7 +224,7 @@ def __init__( >>> data["image"].shape, data["mask"].shape (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256])) """ - super().__init__(train_batch_size, eval_batch_size, num_workers, val_split_mode) + super().__init__(train_batch_size, eval_batch_size, num_workers, val_split_mode, seed) self.root = Path(root) self.category = Path(category) diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index bcfd30adf6..53f5d922d7 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -231,6 +231,7 @@ class Folder(AnomalibDataModule): during validation. Defaults to None. val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + seed (Optional[int], optional): Seed used during random subset splitting. 
""" def __init__( @@ -251,12 +252,14 @@ def __init__( transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_eval: Optional[Union[str, A.Compose]] = None, val_split_mode: ValSplitMode = ValSplitMode.FROM_TEST, + seed: Optional[int] = None, ): super().__init__( train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, num_workers=num_workers, val_split_mode=val_split_mode, + seed=seed, ) self.split_ratio = split_ratio @@ -298,7 +301,7 @@ def _setup(self, _stage: Optional[str] = None): # add some normal images to the test set if not self.test_data.has_normal: - self.train_data, normal_test_data = random_split(self.train_data, self.split_ratio) + self.train_data, normal_test_data = random_split(self.train_data, self.split_ratio, seed=self.seed) self.test_data += normal_test_data super()._setup() diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 8dd70af4a1..2b21edf180 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -162,12 +162,14 @@ def __init__( transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_eval: Optional[Union[str, A.Compose]] = None, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, + seed: Optional[int] = None, ): super().__init__( train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, num_workers=num_workers, val_split_mode=val_split_mode, + seed=seed, ) self.root = Path(root) From b372dd1283864105db6606915d5e1501cde26eea Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 28 Oct 2022 14:30:22 +0200 Subject: [PATCH 83/96] set global seed in test entrypoint --- tools/test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/test.py b/tools/test.py index 5427cf9f06..b2772aaf5d 100644 --- a/tools/test.py +++ b/tools/test.py @@ -5,7 +5,7 @@ from argparse import ArgumentParser, Namespace -from pytorch_lightning import Trainer +from pytorch_lightning import Trainer, seed_everything from anomalib.config import get_configurable_parameters from anomalib.data import get_datamodule @@ -40,6 +40,9 @@ def test(): weight_file=args.weight_file, ) + if config.project.seed: + seed_everything(config.project.seed) + datamodule = get_datamodule(config) model = get_model(config) From e07a12c1cbd336ab1f3c1cf67e8557f34cef5f5a Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 28 Oct 2022 14:34:16 +0200 Subject: [PATCH 84/96] add NONE option to valsplitmode --- anomalib/data/base/datamodule.py | 13 +++++++++---- anomalib/data/utils/split.py | 1 + 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py index 38cb6eac61..e843d3450f 100644 --- a/anomalib/data/base/datamodule.py +++ b/anomalib/data/base/datamodule.py @@ -75,15 +75,20 @@ def _setup(self, _stage: Optional[str] = None) -> None: self.val_data, self.test_data = random_split(self.test_data, [0.5, 0.5], label_aware=True, seed=self.seed) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data - else: + elif self.val_split_mode != ValSplitMode.NONE: raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") @property def is_setup(self): """Checks if setup() has been called.""" - if self.train_data is None or self.val_data is None or self.test_data is None: - return False - return self.train_data.is_setup and self.val_data.is_setup and self.test_data.is_setup + # at least one of [train_data, val_data, test_data] should be setup + if self.train_data is not None and 
self.train_data.is_setup:
+            return True
+        if self.val_data is not None and self.val_data.is_setup:
+            return True
+        if self.test_data is not None and self.test_data.is_setup:
+            return True
+        return False

     def train_dataloader(self) -> TRAIN_DATALOADERS:
         """Get train dataloader."""
diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py
index 72f97ff79d..86249086c2 100644
--- a/anomalib/data/utils/split.py
+++ b/anomalib/data/utils/split.py
@@ -35,6 +35,7 @@ class Split(str, Enum):
 class ValSplitMode(str, Enum):
     """Splitting mode used to obtain validation subset."""

+    NONE = "none"
     SAME_AS_TEST = "same_as_test"
     FROM_TEST = "from_test"

From ffdb47c0b9a652ae0d235af4664e9eb462c009dc Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 28 Oct 2022 15:22:05 +0200
Subject: [PATCH 85/96] clarify setup behaviour in docstring

---
 anomalib/data/base/datamodule.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py
index e843d3450f..bfdbbfb82a 100644
--- a/anomalib/data/base/datamodule.py
+++ b/anomalib/data/base/datamodule.py
@@ -64,7 +64,12 @@ def setup(self, stage: Optional[str] = None):
     def _setup(self, _stage: Optional[str] = None) -> None:
         """Set up the datasets and perform dynamic subset splitting.

-        May be overridden in subclass for custom splitting behaviour.
+        This method may be overridden in subclass for custom splitting behaviour.
+
+        Note: The stage argument is not used here. This is because, for a given instance of an AnomalibDataModule
+        subclass, all three subsets are created at the first call of setup(). This is to accommodate the subset
+        splitting behaviour of anomaly tasks, where the validation set is usually extracted from the test set, and
+        the test set must therefore be created as early as the `fit` stage.
         """
         assert self.train_data is not None
         assert self.test_data is not None

From 63801a28bdca342effdc2d59798e28f9cc83ee24 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Mon, 5 Dec 2022 16:36:51 +0100
Subject: [PATCH 86/96] add logging message

---
 anomalib/data/synthetic.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/anomalib/data/synthetic.py b/anomalib/data/synthetic.py
index 3f8956f21b..e3c88d8537 100644
--- a/anomalib/data/synthetic.py
+++ b/anomalib/data/synthetic.py
@@ -3,6 +3,7 @@
 This dataset can be used when there is a lack of real anomalous data.
""" +import logging import math import os import shutil @@ -19,6 +20,8 @@ from anomalib.data.utils import Augmenter, Split, read_image from anomalib.pre_processing import PreProcessor +logger = logging.getLogger(__name__) + def make_synthetic_dataset( source_samples: DataFrame, im_dir: Union[Path, str], mask_dir: Union[Path, str], anomalous_ratio: float = 0.5 @@ -126,6 +129,7 @@ def from_dataset(cls, dataset): def _setup(self) -> None: """Create samples dataframe.""" + logger.info("Generating synthetic anomalous images for validation set") self.samples = make_synthetic_dataset(self.source_samples, self.im_dir, self.mask_dir, 0.5) def __del__(self): From 74cbc0ae56559cc6b4f83edc7b1e14c1993efbeb Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Tue, 6 Dec 2022 09:33:09 +0100 Subject: [PATCH 87/96] use val_split_ratio for synthetic validation set --- anomalib/data/base/datamodule.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py index ddff4d5177..5200e214d7 100644 --- a/anomalib/data/base/datamodule.py +++ b/anomalib/data/base/datamodule.py @@ -86,7 +86,7 @@ def _setup(self, _stage: Optional[str] = None) -> None: elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: self.val_data = self.test_data elif self.val_split_mode == ValSplitMode.SYNTHETIC: - self.train_data, normal_val_data = random_split(self.train_data, 0.3) + self.train_data, normal_val_data = random_split(self.train_data, self.val_split_ratio) self.val_data = SyntheticValidationSet.from_dataset(normal_val_data) elif self.val_split_mode != ValSplitMode.NONE: raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") From 090cec265c073394df164e1876763aea334540d0 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Tue, 6 Dec 2022 13:26:45 +0100 Subject: [PATCH 88/96] pathlib --- anomalib/data/synthetic.py | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/anomalib/data/synthetic.py b/anomalib/data/synthetic.py index e3c88d8537..a4ce480ff6 100644 --- a/anomalib/data/synthetic.py +++ b/anomalib/data/synthetic.py @@ -5,10 +5,8 @@ import logging import math -import os import shutil from pathlib import Path -from typing import Union import albumentations as A import cv2 @@ -24,7 +22,7 @@ def make_synthetic_dataset( - source_samples: DataFrame, im_dir: Union[Path, str], mask_dir: Union[Path, str], anomalous_ratio: float = 0.5 + source_samples: DataFrame, im_dir: Path, mask_dir: Path, anomalous_ratio: float = 0.5 ) -> DataFrame: """Convert a set of normal samples into a mixed set of normal and synthetic anomalous samples. @@ -33,13 +31,13 @@ def make_synthetic_dataset( Args: source_samples (DataFrame): Normal images that will be used as source for the synthetic anomalous images. - im_dir (Union[Path, str]): Directory to which the synthetic anomalous image files will be written. - mask_dir (Union[Path, str]): Directory to which the ground truth anomaly masks will be written. + im_dir (Path): Directory to which the synthetic anomalous image files will be written. + mask_dir (Path): Directory to which the ground truth anomaly masks will be written. anomalous_ratio (float): Fraction of source samples that will be converted into anomalous samples. """ assert 1 not in source_samples.label_index.values, "All source images must be normal." - assert os.path.isdir(im_dir), f"{im_dir} is not a folder." 
- assert os.path.isdir(mask_dir), f"{mask_dir} is not a folder" + assert im_dir.is_dir(), f"{im_dir} is not a folder." + assert mask_dir.is_dir(), f"{mask_dir} is not a folder" # filter relevant columns source_samples = source_samples.filter(["image_path", "label", "label_index", "mask_path", "split"]) @@ -77,13 +75,13 @@ def augment(sample: Series) -> Series: # write image aug_im = (aug_im.squeeze().permute((1, 2, 0)) * 255).numpy() aug_im = cv2.cvtColor(aug_im, cv2.COLOR_RGB2BGR) - im_path = str(Path(im_dir) / file_name) - cv2.imwrite(im_path, aug_im) + im_path = im_dir / file_name + cv2.imwrite(str(im_path), aug_im) # write mask mask = (mask.squeeze() * 255).numpy() - mask_path = str(Path(mask_dir) / file_name) - cv2.imwrite(mask_path, mask) - out = dict(image_path=im_path, label="abnormal", label_index=1, mask_path=mask_path, split=Split.VAL) + mask_path = mask_dir / file_name + cv2.imwrite(str(mask_path), mask) + out = dict(image_path=str(im_path), label="abnormal", label_index=1, mask_path=str(mask_path), split=Split.VAL) return Series(out) anomalous_samples = anomalous_samples.apply(augment, axis=1) @@ -113,12 +111,12 @@ def __init__(self, task: str, pre_process: PreProcessor, source_samples: DataFra self.mask_dir = self.root / "ground_truth" # clean up any existing data that may be left over from previous run - if os.path.exists(self.root): + if self.root.exists(): shutil.rmtree(self.root) # create directories - os.makedirs(self.im_dir) - os.makedirs(self.mask_dir) + self.im_dir.mkdir(parents=True) + self.mask_dir.mkdir() self.setup() From 2a8df7b3923f008a8b4143f22c1707897717e16b Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 9 Dec 2022 17:08:18 +0100 Subject: [PATCH 89/96] make synthetic anomaly available for test set --- anomalib/data/__init__.py | 7 ++++- anomalib/data/avenue.py | 6 ++-- anomalib/data/base/__init__.py | 4 +-- anomalib/data/base/datamodule.py | 51 ++++++++++++++++++++++++++++++-- anomalib/data/base/video.py | 25 +++++++++++++++- anomalib/data/btech.py | 17 ++++++++++- anomalib/data/folder.py | 24 +++++---------- anomalib/data/mvtec.py | 36 ++++++++++++++++++++-- anomalib/data/synthetic.py | 44 ++++++++++++++++++++------- anomalib/data/ucsd_ped.py | 7 +++-- anomalib/data/utils/__init__.py | 11 ++++++- anomalib/data/utils/split.py | 21 ++++++++++++- 12 files changed, 210 insertions(+), 43 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 2ec51182e9..0d1c4f1d75 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -44,6 +44,8 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: task=config.dataset.task, transform_config_train=config.dataset.transform_config.train, transform_config_eval=config.dataset.transform_config.eval, + test_split_mode=config.dataset.test_split_mode, + test_split_ratio=config.dataset.test_split_ratio, val_split_mode=config.dataset.val_split_mode, val_split_ratio=config.dataset.val_split_ratio, ) @@ -58,6 +60,8 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: task=config.dataset.task, transform_config_train=config.dataset.transform_config.train, transform_config_eval=config.dataset.transform_config.eval, + test_split_mode=config.dataset.test_split_mode, + test_split_ratio=config.dataset.test_split_ratio, val_split_mode=config.dataset.val_split_mode, val_split_ratio=config.dataset.val_split_ratio, ) @@ -70,13 +74,14 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> AnomalibDataModule: 
normal_test_dir=config.dataset.normal_test_dir, mask_dir=config.dataset.mask, extensions=config.dataset.extensions, - normal_split_ratio=config.dataset.normal_split_ratio, image_size=(config.dataset.image_size[0], config.dataset.image_size[1]), train_batch_size=config.dataset.train_batch_size, eval_batch_size=config.dataset.eval_batch_size, num_workers=config.dataset.num_workers, transform_config_train=config.dataset.transform_config.train, transform_config_eval=config.dataset.transform_config.eval, + test_split_mode=config.dataset.test_split_mode, + test_split_ratio=config.dataset.test_split_ratio, val_split_mode=config.dataset.val_split_mode, val_split_ratio=config.dataset.val_split_ratio, ) diff --git a/anomalib/data/avenue.py b/anomalib/data/avenue.py index 792d2f663b..459f42677c 100644 --- a/anomalib/data/avenue.py +++ b/anomalib/data/avenue.py @@ -26,7 +26,7 @@ from pandas import DataFrame from torch import Tensor -from anomalib.data.base import AnomalibDataModule, VideoAnomalibDataset +from anomalib.data.base import VideoAnomalibDataModule, VideoAnomalibDataset from anomalib.data.task_type import TaskType from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check from anomalib.data.utils.video import ClipsIndexer @@ -156,7 +156,7 @@ def _setup(self): self.samples = make_avenue_dataset(self.root, self.gt_dir, self.split) -class Avenue(AnomalibDataModule): +class Avenue(VideoAnomalibDataModule): """Avenue DataModule class. Args: @@ -177,6 +177,8 @@ class Avenue(AnomalibDataModule): during validation. Defaults to None. val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + val_split_ratio (float): Fraction of train or test images that will be reserved for validation. + seed (Optional[int], optional): Seed which may be set to a fixed value for reproducibility. 
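
A minimal sketch of the dataset config section that get_datamodule reads after this change; the keys mirror the accesses above, and the values are illustrative defaults:

from omegaconf import OmegaConf

# Illustrative fragment; a real config also carries name, format, path, image_size, etc.
config = OmegaConf.create(
    {
        "dataset": {
            "test_split_mode": "from_dir",  # or "synthetic"
            "test_split_ratio": 0.2,
            "val_split_mode": "same_as_test",
            "val_split_ratio": 0.5,
        }
    }
)
assert config.dataset.test_split_ratio == 0.2
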
""" def __init__( diff --git a/anomalib/data/base/__init__.py b/anomalib/data/base/__init__.py index e158357fc8..0c8fe84257 100644 --- a/anomalib/data/base/__init__.py +++ b/anomalib/data/base/__init__.py @@ -6,6 +6,6 @@ from .datamodule import AnomalibDataModule from .dataset import AnomalibDataset -from .video import VideoAnomalibDataset +from .video import VideoAnomalibDataModule, VideoAnomalibDataset -__all__ = ["AnomalibDataset", "AnomalibDataModule", "VideoAnomalibDataset"] +__all__ = ["AnomalibDataset", "AnomalibDataModule", "VideoAnomalibDataset", "VideoAnomalibDataModule"] diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py index b9e5bf720b..019da5e3ec 100644 --- a/anomalib/data/base/datamodule.py +++ b/anomalib/data/base/datamodule.py @@ -6,6 +6,7 @@ from __future__ import annotations import logging +import warnings from abc import ABC from typing import Any, Dict, List, Optional @@ -15,8 +16,13 @@ from torch.utils.data import DataLoader, default_collate from anomalib.data.base.dataset import AnomalibDataset -from anomalib.data.synthetic import SyntheticValidationSet -from anomalib.data.utils import ValSplitMode, random_split +from anomalib.data.synthetic import SyntheticAnomalyDataset +from anomalib.data.utils import ( + TestSplitMode, + ValSplitMode, + random_split, + split_normal_and_anomalous, +) logger = logging.getLogger(__name__) @@ -61,12 +67,16 @@ def __init__( num_workers: int, val_split_mode: ValSplitMode, val_split_ratio: float, + test_split_mode: Optional[TestSplitMode] = None, + test_split_ratio: Optional[float] = None, seed: Optional[int] = None, ): super().__init__() self.train_batch_size = train_batch_size self.eval_batch_size = eval_batch_size self.num_workers = num_workers + self.test_split_mode = test_split_mode + self.test_split_ratio = test_split_ratio self.val_split_mode = val_split_mode self.val_split_ratio = val_split_ratio self.seed = seed @@ -102,6 +112,41 @@ def _setup(self, _stage: Optional[str] = None) -> None: self.train_data.setup() self.test_data.setup() + + self._create_test_split() + self._create_val_split() + + def _create_test_split(self): + # perform subset splitting for test set + if self.test_split_mode == TestSplitMode.FROM_DIR: + # normal data taken from normal_test_dir if available, otherwise sampled from training set + if not self.test_data.has_normal: + logger.info( + "No normal test images found. Sampling from training set using a split ratio of %d", + self.test_split_ratio, + ) + self.train_data, normal_test_data = random_split(self.train_data, self.test_split_ratio) + self.test_data += normal_test_data + # anomalous data taken from abnormal_dir if available, otherwise raise warning + if not self.test_data.has_anomalous: + warnings.warn( + "Your test set does not contain any anomalous images, which may lead to unreliable " + "evaluation results. To fix, please include anomalous images in your dataset, or set " + "`test_split_mode` to `synthetic`." + ) + elif self.test_split_mode == TestSplitMode.SYNTHETIC: + if not self.test_data.has_normal: + logger.info( + "No normal test images found. 
Sampling from training set using a split ratio of %d", + self.test_split_ratio, + ) + self.train_data, normal_test_data = random_split(self.train_data, self.test_split_ratio) + else: + normal_test_data, _ = split_normal_and_anomalous(self.test_data) + self.test_data = SyntheticAnomalyDataset.from_dataset(normal_test_data) + + def _create_val_split(self): + # perform subset splitting for validation set if self.val_split_mode == ValSplitMode.FROM_TEST: self.test_data, self.val_data = random_split( self.test_data, self.val_split_ratio, label_aware=True, seed=self.seed @@ -110,7 +155,7 @@ def _setup(self, _stage: Optional[str] = None) -> None: self.val_data = self.test_data elif self.val_split_mode == ValSplitMode.SYNTHETIC: self.train_data, normal_val_data = random_split(self.train_data, self.val_split_ratio) - self.val_data = SyntheticValidationSet.from_dataset(normal_val_data) + self.val_data = SyntheticAnomalyDataset.from_dataset(normal_val_data) elif self.val_split_mode != ValSplitMode.NONE: raise ValueError(f"Unknown validation split mode: {self.val_split_mode}") diff --git a/anomalib/data/base/video.py b/anomalib/data/base/video.py index 65735dab05..538b45e6a9 100644 --- a/anomalib/data/base/video.py +++ b/anomalib/data/base/video.py @@ -6,9 +6,10 @@ import torch from torch import Tensor +from anomalib.data.base.datamodule import AnomalibDataModule from anomalib.data.base.dataset import AnomalibDataset from anomalib.data.task_type import TaskType -from anomalib.data.utils import masks_to_boxes +from anomalib.data.utils import ValSplitMode, masks_to_boxes from anomalib.data.utils.video import ClipsIndexer from anomalib.pre_processing import PreProcessor @@ -93,3 +94,25 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: item.pop("mask") return item + + +class VideoAnomalibDataModule(AnomalibDataModule): + """Base class for video data modules.""" + + def _setup(self, _stage: Optional[str] = None) -> None: + """Set up the datasets and perform dynamic subset splitting. + + This method may be overridden in subclass for custom splitting behaviour. + + Video datamodules are not compatible with synthetic anomaly generation. 
+ """ + assert self.train_data is not None + assert self.test_data is not None + + self.train_data.setup() + self.test_data.setup() + + if self.val_split_mode == ValSplitMode.SYNTHETIC: + raise ValueError(f"Val split mode {self.test_split_mode} not supported for video datasets.") + + self._create_val_split() diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 61696c31d0..7d1b0fc75c 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -25,7 +25,13 @@ from anomalib.data.base import AnomalibDataModule, AnomalibDataset from anomalib.data.task_type import TaskType -from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check +from anomalib.data.utils import ( + DownloadProgressBar, + Split, + TestSplitMode, + ValSplitMode, + hash_check, +) from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) @@ -181,6 +187,8 @@ def __init__( task: TaskType = TaskType.SEGMENTATION, transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_eval: Optional[Union[str, A.Compose]] = None, + test_split_mode: TestSplitMode = TestSplitMode.FROM_DIR, + test_split_ratio: float = 0.2, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, val_split_ratio: float = 0.5, seed: Optional[int] = None, @@ -199,6 +207,11 @@ def __init__( transform_config_val: Config for pre-processing during validation. create_validation_set: Create a validation subset in addition to the train and test subsets seed (Optional[int], optional): Seed used during random subset splitting. + test_split_mode (TestSplitMode): Setting that determines how the testing subset is obtained. + test_split_ratio (float): Fraction of images from the train set that will be reserved for testing. + val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + val_split_ratio (float): Fraction of train or test images that will be reserved for validation. + seed (Optional[int], optional): Seed which may be set to a fixed value for reproducibility. Examples: >>> from anomalib.data import BTech @@ -230,6 +243,8 @@ def __init__( train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, num_workers=num_workers, + test_split_mode=test_split_mode, + test_split_ratio=test_split_ratio, val_split_mode=val_split_mode, val_split_ratio=val_split_ratio, seed=seed, diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py index 957b0e1274..ddb327e7a9 100644 --- a/anomalib/data/folder.py +++ b/anomalib/data/folder.py @@ -15,7 +15,7 @@ from anomalib.data.base import AnomalibDataModule, AnomalibDataset from anomalib.data.task_type import TaskType -from anomalib.data.utils import Split, ValSplitMode, random_split +from anomalib.data.utils import Split, TestSplitMode, ValSplitMode from anomalib.pre_processing.pre_process import PreProcessor @@ -237,7 +237,10 @@ class Folder(AnomalibDataModule): transform_config_val (Optional[Union[str, A.Compose]], optional): Config for pre-processing during validation. Defaults to None. + test_split_mode (TestSplitMode): Setting that determines how the testing subset is obtained. + test_split_ratio (float): Fraction of images from the train set that will be reserved for testing. val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + val_split_ratio (float): Fraction of train or test images that will be reserved for validation. seed (Optional[int], optional): Seed used during random subset splitting. 
""" @@ -258,6 +261,8 @@ def __init__( task: TaskType = TaskType.SEGMENTATION, transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_eval: Optional[Union[str, A.Compose]] = None, + test_split_mode: TestSplitMode = TestSplitMode.FROM_DIR, + test_split_ratio: float = 0.2, val_split_mode: ValSplitMode = ValSplitMode.FROM_TEST, val_split_ratio: float = 0.5, seed: Optional[int] = None, @@ -266,6 +271,8 @@ def __init__( train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, num_workers=num_workers, + test_split_mode=test_split_mode, + test_split_ratio=test_split_ratio, val_split_mode=val_split_mode, val_split_ratio=val_split_ratio, seed=seed, @@ -299,18 +306,3 @@ def __init__( mask_dir=mask_dir, extensions=extensions, ) - - def _setup(self, _stage: Optional[str] = None): - """Set up the datasets for the Folder Data Module.""" - assert self.train_data is not None - assert self.test_data is not None - - self.train_data.setup() - self.test_data.setup() - - # add some normal images to the test set - if not self.test_data.has_normal: - self.train_data, normal_test_data = random_split(self.train_data, self.normal_split_ratio, seed=self.seed) - self.test_data += normal_test_data - - super()._setup() diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index 6aa2424cec..e70bb823a4 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -34,7 +34,13 @@ from anomalib.data.base import AnomalibDataModule, AnomalibDataset from anomalib.data.task_type import TaskType -from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check +from anomalib.data.utils import ( + DownloadProgressBar, + Split, + TestSplitMode, + ValSplitMode, + hash_check, +) from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) @@ -149,7 +155,29 @@ def _setup(self): class MVTec(AnomalibDataModule): - """MVTec Datamodule.""" + """MVTec Datamodule. + + Args: + root (str): Path to the root of the dataset + category (str): Category of the MVTec dataset (e.g. "bottle" or "cable"). + image_size (Optional[Union[int, Tuple[int, int]]], optional): Size of the input image. + Defaults to None. + train_batch_size (int, optional): Training batch size. Defaults to 32. + eval_batch_size (int, optional): Test batch size. Defaults to 32. + num_workers (int, optional): Number of workers. Defaults to 8. + task TaskType): Task type, 'classification', 'detection' or 'segmentation' + transform_config_train (Optional[Union[str, A.Compose]], optional): Config for pre-processing + during training. + Defaults to None. + transform_config_val (Optional[Union[str, A.Compose]], optional): Config for pre-processing + during validation. + Defaults to None. + test_split_mode (TestSplitMode): Setting that determines how the testing subset is obtained. + test_split_ratio (float): Fraction of images from the train set that will be reserved for testing. + val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + val_split_ratio (float): Fraction of train or test images that will be reserved for validation. + seed (Optional[int], optional): Seed which may be set to a fixed value for reproducibility. 
+ """ def __init__( self, @@ -162,6 +190,8 @@ def __init__( task: TaskType = TaskType.SEGMENTATION, transform_config_train: Optional[Union[str, A.Compose]] = None, transform_config_eval: Optional[Union[str, A.Compose]] = None, + test_split_mode: TestSplitMode = TestSplitMode.FROM_DIR, + test_split_ratio: float = 0.2, val_split_mode: ValSplitMode = ValSplitMode.SAME_AS_TEST, val_split_ratio: float = 0.5, seed: Optional[int] = None, @@ -170,6 +200,8 @@ def __init__( train_batch_size=train_batch_size, eval_batch_size=eval_batch_size, num_workers=num_workers, + test_split_mode=test_split_mode, + test_split_ratio=test_split_ratio, val_split_mode=val_split_mode, val_split_ratio=val_split_ratio, seed=seed, diff --git a/anomalib/data/synthetic.py b/anomalib/data/synthetic.py index 534ec9a64f..42c4cc7cec 100644 --- a/anomalib/data/synthetic.py +++ b/anomalib/data/synthetic.py @@ -6,7 +6,10 @@ import logging import math import shutil +from copy import deepcopy from pathlib import Path +from tempfile import mkdtemp +from typing import Dict import albumentations as A import cv2 @@ -14,14 +17,17 @@ from albumentations.pytorch import ToTensorV2 from pandas import DataFrame, Series -from anomalib.data import TaskType from anomalib.data.base.dataset import AnomalibDataset +from anomalib.data.task_type import TaskType from anomalib.data.utils import Augmenter, Split, read_image from anomalib.pre_processing import PreProcessor logger = logging.getLogger(__name__) +ROOT = "./.tmp/synthetic_anomaly" + + def make_synthetic_dataset( source_samples: DataFrame, im_dir: Path, mask_dir: Path, anomalous_ratio: float = 0.5 ) -> DataFrame: @@ -92,7 +98,7 @@ def augment(sample: Series) -> Series: return samples -class SyntheticValidationSet(AnomalibDataset): +class SyntheticAnomalyDataset(AnomalibDataset): """Dataset which reads synthetically generated anomalous images from a temporary folder. 
Args: @@ -107,18 +113,18 @@ def __init__(self, task: TaskType, pre_process: PreProcessor, source_samples: Da self.source_samples = source_samples # Files will be written to a temporary directory in the workdir, which is cleaned up after code execution - self.root = Path("./.tmp/synthetic_anomaly") - self.im_dir = self.root / "images" - self.mask_dir = self.root / "ground_truth" + root = Path(ROOT) + root.mkdir(parents=True, exist_ok=True) - # clean up any existing data that may be left over from previous run - if self.root.exists(): - shutil.rmtree(self.root) + self.root = Path(mkdtemp(dir=root)) + self.im_dir = self.root / "abnormal" + self.mask_dir = self.root / "ground_truth" # create directories - self.im_dir.mkdir(parents=True) + self.im_dir.mkdir() self.mask_dir.mkdir() + self._cleanup = True # flag that determines if temp dir is cleaned up when instance is deleted self.setup() @classmethod @@ -126,6 +132,23 @@ def from_dataset(cls, dataset): """Create a synthetic anomaly dataset from an existing dataset of normal images.""" return cls(task=dataset.task, pre_process=dataset.pre_process, source_samples=dataset.samples) + def __copy__(self) -> "SyntheticAnomalyDataset": + """Returns a shallow copy of the dataset object and prevents cleanup when original object is deleted.""" + cls = self.__class__ + new = cls.__new__(cls) + new.__dict__.update(self.__dict__) + self._cleanup = False + return new + + def __deepcopy__(self, _memo: Dict) -> "SyntheticAnomalyDataset": + """Returns a deep copy of the dataset object and prevents cleanup when original object is deleted.""" + cls = self.__class__ + new = cls.__new__(cls) + for key, value in self.__dict__.items(): + setattr(new, key, deepcopy(value)) + self._cleanup = False + return new + def _setup(self) -> None: """Create samples dataframe.""" logger.info("Generating synthetic anomalous images for validation set") @@ -133,4 +156,5 @@ def _setup(self) -> None: def __del__(self): """Make sure the temporary directory is cleaned up when the dataset object is deleted.""" - shutil.rmtree(self.root) + if self._cleanup: + shutil.rmtree(self.root) diff --git a/anomalib/data/ucsd_ped.py b/anomalib/data/ucsd_ped.py index 0ce32ce8b7..fd810c4402 100644 --- a/anomalib/data/ucsd_ped.py +++ b/anomalib/data/ucsd_ped.py @@ -14,8 +14,7 @@ from pandas import DataFrame from torch import Tensor -from anomalib.data.base import AnomalibDataModule -from anomalib.data.base.video import VideoAnomalibDataset +from anomalib.data.base import VideoAnomalibDataModule, VideoAnomalibDataset from anomalib.data.task_type import TaskType from anomalib.data.utils import ( DownloadProgressBar, @@ -169,7 +168,7 @@ def _setup(self): self.samples = make_ucsd_dataset(self.root_category, self.split) -class UCSDped(AnomalibDataModule): +class UCSDped(VideoAnomalibDataModule): """UCSDped DataModule class. Args: @@ -190,6 +189,8 @@ class UCSDped(AnomalibDataModule): during validation. Defaults to None. val_split_mode (ValSplitMode): Setting that determines how the validation subset is obtained. + val_split_ratio (float): Fraction of train or test images that will be reserved for validation. + seed (Optional[int], optional): Seed which may be set to a fixed value for reproducibility. 
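
To make the copy semantics above concrete, a small sketch; normal_dataset stands in for any existing AnomalibDataset of normal images and is hypothetical:

from copy import copy

from anomalib.data.synthetic import SyntheticAnomalyDataset

synthetic = SyntheticAnomalyDataset.from_dataset(normal_dataset)  # normal_dataset is a placeholder
shallow = copy(synthetic)  # shares the temp dir; the original relinquishes cleanup duty

del synthetic                 # temp dir survives because _cleanup was disabled on the original
assert shallow.root.exists()  # the copy now owns the generated files
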
""" def __init__( diff --git a/anomalib/data/utils/__init__.py b/anomalib/data/utils/__init__.py index d274577f54..4add078386 100644 --- a/anomalib/data/utils/__init__.py +++ b/anomalib/data/utils/__init__.py @@ -13,7 +13,14 @@ get_image_height_and_width, read_image, ) -from .split import Split, ValSplitMode, concatenate_datasets, random_split +from .split import ( + Split, + TestSplitMode, + ValSplitMode, + concatenate_datasets, + random_split, + split_normal_and_anomalous, +) __all__ = [ "generate_output_image_filename", @@ -24,9 +31,11 @@ "read_image", "DownloadProgressBar", "random_split", + "split_normal_and_anomalous", "concatenate_datasets", "Split", "ValSplitMode", + "TestSplitMode", "Augmenter", "masks_to_boxes", "boxes_to_masks", diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index acf4338b3e..56cc15a736 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -16,7 +16,7 @@ import math import warnings from enum import Enum -from typing import TYPE_CHECKING, List, Optional, Sequence, Union +from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple, Union import torch @@ -32,6 +32,14 @@ class Split(str, Enum): TEST = "test" +class TestSplitMode(str, Enum): + """Splitting mode used to obtain subset.""" + + NONE = "none" + FROM_DIR = "from_dir" + SYNTHETIC = "synthetic" + + class ValSplitMode(str, Enum): """Splitting mode used to obtain validation subset.""" @@ -117,3 +125,14 @@ def random_split( # outer list: subsets with the given ratio, inner list: per-label unique subsets = list(map(list, zip(*subsets))) return [concatenate_datasets(subset) for subset in subsets] + + +def split_normal_and_anomalous(dataset: AnomalibDataset) -> Tuple[AnomalibDataset, AnomalibDataset]: + """Splits the dataset into the normal and anomalous subsets.""" + samples = dataset.samples + normal_indices = samples[samples.label_index == 0].index + anomalous_indices = samples[samples.label_index == 1].index + + normal_subset = dataset.subsample(list(normal_indices)) + anomalous_subset = dataset.subsample(list(anomalous_indices)) + return normal_subset, anomalous_subset From ea004423c8f5792e3a8bfecc5615834bf41bdeba Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Fri, 9 Dec 2022 17:16:16 +0100 Subject: [PATCH 90/96] update configs --- anomalib/models/cflow/config.yaml | 6 ++++-- anomalib/models/dfkde/config.yaml | 6 ++++-- anomalib/models/dfm/config.yaml | 6 ++++-- anomalib/models/draem/config.yaml | 6 ++++-- anomalib/models/fastflow/config.yaml | 6 ++++-- anomalib/models/ganomaly/config.yaml | 6 ++++-- anomalib/models/padim/config.yaml | 6 ++++-- anomalib/models/patchcore/config.yaml | 6 ++++-- anomalib/models/reverse_distillation/config.yaml | 6 ++++-- anomalib/models/stfpm/config.yaml | 6 ++++-- 10 files changed, 40 insertions(+), 20 deletions(-) diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index 6261a8b0dd..9da02805c4 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -12,8 +12,10 @@ dataset: transform_config: train: null eval: null - val_split_mode: same_as_test # options: [same_as_test, from_test] - val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode) + test_split_mode: from_dir # options: [from_dir, synthetic] + test_split_ratio: 0.2 # fraction of train images held out testing (usage depends on test_split_mode) + val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic] + val_split_ratio: 0.5 

 model:
   name: cflow
diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml
index 3fe5dcdeaa..82d4a159c9 100644
--- a/anomalib/models/dfkde/config.yaml
+++ b/anomalib/models/dfkde/config.yaml
@@ -11,8 +11,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)

 model:
   name: dfkde
diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml
index fec94106e8..ecc59aac36 100755
--- a/anomalib/models/dfm/config.yaml
+++ b/anomalib/models/dfm/config.yaml
@@ -11,8 +11,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)

 model:
   name: dfm
diff --git a/anomalib/models/draem/config.yaml b/anomalib/models/draem/config.yaml
index 495b6444bf..31727abe5e 100644
--- a/anomalib/models/draem/config.yaml
+++ b/anomalib/models/draem/config.yaml
@@ -11,8 +11,10 @@ dataset:
   transform_config:
     train: ./anomalib/models/draem/transform_config.yaml
     eval: ./anomalib/models/draem/transform_config.yaml
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
   tiling:
     apply: false
     tile_size: null
diff --git a/anomalib/models/fastflow/config.yaml b/anomalib/models/fastflow/config.yaml
index 05aa838fe2..93953a9ef0 100644
--- a/anomalib/models/fastflow/config.yaml
+++ b/anomalib/models/fastflow/config.yaml
@@ -11,8 +11,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
   tiling:
     apply: false
     tile_size: null
diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml
index a41d9d2421..bbe5738830 100644
--- a/anomalib/models/ganomaly/config.yaml
+++ b/anomalib/models/ganomaly/config.yaml
@@ -12,8 +12,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
   tiling:
     apply: true
     tile_size: 64
diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml
index 315f3e6691..3861bca8a3 100644
--- a/anomalib/models/padim/config.yaml
+++ b/anomalib/models/padim/config.yaml
@@ -11,8 +11,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
   tiling:
     apply: false
     tile_size: null
diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml
index d29a5a39f9..fad1104931 100644
--- a/anomalib/models/patchcore/config.yaml
+++ b/anomalib/models/patchcore/config.yaml
@@ -11,8 +11,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
   tiling:
     apply: false
     tile_size: null
diff --git a/anomalib/models/reverse_distillation/config.yaml b/anomalib/models/reverse_distillation/config.yaml
index 1e5c3f8f82..2deafe3ca3 100644
--- a/anomalib/models/reverse_distillation/config.yaml
+++ b/anomalib/models/reverse_distillation/config.yaml
@@ -12,8 +12,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
   tiling:
     apply: false
     tile_size: 64
diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml
index 504998ec72..1284156847 100644
--- a/anomalib/models/stfpm/config.yaml
+++ b/anomalib/models/stfpm/config.yaml
@@ -12,8 +12,10 @@ dataset:
   transform_config:
     train: null
     eval: null
-  val_split_mode: same_as_test # options: [same_as_test, from_test]
-  val_split_ratio: 0.5 # fraction of test images that will be used for validation (not used in 'same_as_test' mode)
+  test_split_mode: from_dir # options: [from_dir, synthetic]
+  test_split_ratio: 0.2 # fraction of train images held out for testing (usage depends on test_split_mode)
+  val_split_mode: same_as_test # options: [same_as_test, from_test, synthetic]
+  val_split_ratio: 0.5 # fraction of train/test images held out for validation (usage depends on val_split_mode)
   tiling:
     apply: false
     tile_size: null

From dfd2d80266698758bb96c696ad5fa1dd7a98c619 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 9 Dec 2022 17:17:32 +0100
Subject: [PATCH 91/96] add tests

---
 tests/pre_merge/datasets/test_datamodule.py   | 67 ++++++++++++--
 .../pre_merge/datasets/test_synthetic_data.py | 93 +++++++++++++++++++
 2 files changed, 153 insertions(+), 7 deletions(-)
 create mode 100644 tests/pre_merge/datasets/test_synthetic_data.py

diff --git a/tests/pre_merge/datasets/test_datamodule.py b/tests/pre_merge/datasets/test_datamodule.py
index d39c385912..063155de99 100644
--- a/tests/pre_merge/datasets/test_datamodule.py
+++ b/tests/pre_merge/datasets/test_datamodule.py
@@ -37,7 +37,7 @@ def make_avenue_data_module(task="classification", batch_size=1, val_split_mode=
     return data_module


-def make_mvtec_data_module(task="classification", batch_size=1, val_split_mode="from_test"):
+def make_mvtec_data_module(task="classification", batch_size=1, test_split_mode="from_dir", val_split_mode="from_test"):
     data_module = MVTec(
         root=get_dataset_path(dataset="MVTec"),
         category="leather",
@@ -46,6 +46,7 @@ def make_mvtec_data_module(task="classification", batch_size=1, val_split_mode="
         eval_batch_size=batch_size,
         num_workers=0,
         task=task,
+        test_split_mode=test_split_mode,
         val_split_mode=val_split_mode,
     )
     data_module.prepare_data()
@@ -53,7 +54,7 @@ def make_mvtec_data_module(task="classification", batch_size=1, val_split_mode="
     return data_module


-def make_btech_data_module(task="classification", batch_size=1, val_split_mode="from_test"):
+def make_btech_data_module(task="classification", batch_size=1, test_split_mode="from_dir", val_split_mode="from_test"):
     """Create BTech Data Module."""
     data_module = BTech(
         root=get_dataset_path(dataset="BTech"),
@@ -63,6 +64,7 @@
         eval_batch_size=batch_size,
         num_workers=0,
         task=task,
+        test_split_mode=test_split_mode,
         val_split_mode=val_split_mode,
     )
     data_module.prepare_data()
@@ -70,13 +72,22 @@
     return data_module


-def make_folder_data_module(task="classification", batch_size=1, val_split_mode="from_test"):
+def make_folder_data_module(
+    task="classification",
+    batch_size=1,
+    test_split_mode="from_dir",
+    val_split_mode="from_test",
+    normal_dir="good",
+    abnormal_dir="broken_large",
+    normal_test_dir="good_test",
+):
     """Create Folder Data Module."""
     root = get_dataset_path(dataset="bottle")
     data_module = Folder(
         root=root,
-        normal_dir="good",
-        abnormal_dir="broken_large",
+        normal_dir=normal_dir,
+        abnormal_dir=abnormal_dir,
+        normal_test_dir=normal_test_dir,
         mask_dir=os.path.join(root, "ground_truth/broken_large"),
         normal_split_ratio=0.2,
         image_size=(256, 256),
@@ -84,6 +95,7 @@ def make_folder_data_module(task="classification", batch_size=1, val_split_mode=
         eval_batch_size=batch_size,
         num_workers=8,
         task=task,
+        test_split_mode=test_split_mode,
         val_split_mode=val_split_mode,
     )
     data_module.setup()
@@ -116,8 +128,8 @@ def make_ucsdped_data_module(task="classification", batch_size=1, val_split_mode=

 @pytest.fixture(autouse=True)
 def make_data_module():
-    def make(dataset="folder", task="classification", batch_size=1, val_split_mode="from_test"):
-        return DATASETS[dataset](task=task, batch_size=batch_size, val_split_mode=val_split_mode)
+    def make(dataset="folder", **kwargs):
+        return DATASETS[dataset](**kwargs)

     return make

@@ -271,3 +283,44 @@ def test_image_size(self, input_size, effective_image_size, category="shapes", p
         data_module = get_datamodule(configurable_parameters)
         data_module.setup()
         assert next(iter(data_module.train_dataloader()))["image"].shape[-2:] == effective_image_size
+
+
+class TestSubsetSplitting:
+    @pytest.mark.parametrize("dataset", ["folder", "mvtec", "btech"])
+    @pytest.mark.parametrize("test_split_mode", ("from_dir", "synthetic"))
+    @pytest.mark.parametrize("val_split_mode", ("from_test", "synthetic"))
+    def test_non_overlapping_splits(self, make_data_module, dataset, test_split_mode, val_split_mode):
+        """Tests if train, test and val splits are non-overlapping."""
+        data_module = make_data_module(dataset, test_split_mode=test_split_mode, val_split_mode=val_split_mode)
+        train_samples = data_module.train_data.samples
+        val_samples = data_module.val_data.samples
+        test_samples = data_module.test_data.samples
+        assert len(set(train_samples.image_path).intersection(set(test_samples.image_path))) == 0
+        assert len(set(val_samples.image_path).intersection(set(test_samples.image_path))) == 0
+
+    @pytest.mark.parametrize("dataset", ["folder", "mvtec", "btech"])
+    @pytest.mark.parametrize("test_split_mode", ("from_dir", "synthetic"))
+    def test_equal_splits(self, make_data_module, dataset, test_split_mode):
+        """Tests if test and val splits are equal and non-overlapping with train when val_split_mode == same_as_test."""
+        data_module = make_data_module(dataset, test_split_mode=test_split_mode, val_split_mode="same_as_test")
+        train_samples = data_module.train_data.samples
+        val_samples = data_module.val_data.samples
+        test_samples = data_module.test_data.samples
+        assert len(set(train_samples.image_path).intersection(set(test_samples.image_path))) == 0
+        assert len(set(val_samples.image_path).intersection(set(test_samples.image_path))) == len(val_samples)
+
+    @pytest.mark.parametrize("test_split_mode", ("from_dir", "synthetic"))
+    def test_normal_test_dir_omitted(self, make_data_module, test_split_mode):
+        """The test set should always contain normal samples even when no normal_test_dir is provided."""
+        data_module = make_data_module(dataset="folder", test_split_mode=test_split_mode, normal_test_dir=None)
+        assert data_module.test_data.has_normal
+
+    def test_abnormal_dir_omitted_from_dir(self, make_data_module):
+        """The test set should not contain anomalous samples if no abnormal_dir provided and split mode is from_dir."""
+        data_module = make_data_module(dataset="folder", test_split_mode="from_dir", abnormal_dir=None)
+        assert not data_module.test_data.has_anomalous
+
+    def test_abnormal_dir_omitted_synthetic(self, make_data_module):
+        """The test set should contain anomalous samples if no abnormal_dir provided and split mode is 
synthetic.""" + data_module = make_data_module(dataset="folder", test_split_mode="synthetic", abnormal_dir=None) + assert data_module.test_data.has_anomalous diff --git a/tests/pre_merge/datasets/test_synthetic_data.py b/tests/pre_merge/datasets/test_synthetic_data.py new file mode 100644 index 0000000000..f7b97d768b --- /dev/null +++ b/tests/pre_merge/datasets/test_synthetic_data.py @@ -0,0 +1,93 @@ +"""Tests for synthetic anomalous dataset.""" +import os +from copy import copy, deepcopy +from pathlib import Path + +import pytest + +from anomalib.data import TaskType +from anomalib.data.folder import FolderDataset +from anomalib.data.synthetic import SyntheticAnomalyDataset +from anomalib.pre_processing import PreProcessor +from tests.helpers.dataset import get_dataset_path + + +def get_folder_dataset(): + """Create Folder Dataset.""" + root = get_dataset_path(dataset="bottle") + pre_process = PreProcessor(image_size=(256, 256)) + dataset = FolderDataset( + task="segmentation", + pre_process=pre_process, + root=root, + normal_dir="good", + abnormal_dir="broken_large", + mask_dir=os.path.join(root, "ground_truth/broken_large"), + split="train", + ) + dataset.setup() + + return dataset + + +@pytest.fixture(autouse=True) +def make_synthetic_dataset(): + """Create synthetic anomaly dataset from folder dataset.""" + + def make(): + folder_dataset = get_folder_dataset() + synthetic_dataset = SyntheticAnomalyDataset.from_dataset(folder_dataset) + return synthetic_dataset + + return make + + +@pytest.fixture(autouse=True) +def synthetic_dataset_from_samples(): + """Create synthetic anomaly dataset by passing a samples dataframe.""" + folder_dataset = get_folder_dataset() + pre_process = PreProcessor(image_size=(256, 256)) + synthetic_dataset = SyntheticAnomalyDataset( + task=folder_dataset.task, pre_process=pre_process, source_samples=folder_dataset.samples + ) + return synthetic_dataset + + +def test_create_synthetic_dataset(make_synthetic_dataset): + """Tests if the image and mask files listed in the synthetic dataset exist.""" + synthetic_dataset = make_synthetic_dataset() + assert all(Path(path).exists() for path in synthetic_dataset.samples.image_path) + assert all(Path(path).exists() for path in synthetic_dataset.samples.mask_path) + + +def test_create_from_dataset(synthetic_dataset_from_samples): + """Tests if the image and mask files listed in the synthetic dataset exist, when instantiated from samples df.""" + synthetic_dataset = synthetic_dataset_from_samples + assert all(Path(path).exists() for path in synthetic_dataset.samples.image_path) + assert all(Path(path).exists() for path in synthetic_dataset.samples.mask_path) + + +def test_cleanup(make_synthetic_dataset): + """Tests if the temporary directory is cleaned up when the instance is deleted.""" + synthetic_dataset = make_synthetic_dataset() + root = synthetic_dataset.root + del synthetic_dataset + assert not root.exists() + + +def test_copy(make_synthetic_dataset): + """Tests if the dataset is copied correctly, and files still exist after original instance is deleted.""" + synthetic_dataset = make_synthetic_dataset() + synthetic_dataset_cp = copy(synthetic_dataset) + assert all(synthetic_dataset.samples == synthetic_dataset_cp.samples) + del synthetic_dataset + assert synthetic_dataset_cp.root.exists() + + +def test_deepcopy(make_synthetic_dataset): + """Tests if the dataset is deep-copied correctly, and files still exist after original instance is deleted.""" + synthetic_dataset = make_synthetic_dataset() + synthetic_dataset_cp = 
deepcopy(synthetic_dataset)
+    assert all(synthetic_dataset.samples == synthetic_dataset_cp.samples)
+    del synthetic_dataset
+    assert synthetic_dataset_cp.root.exists()

From ce43e091ada82ae03729772d9ebf424948e48a00 Mon Sep 17 00:00:00 2001
From: Dick Ameln
Date: Fri, 9 Dec 2022 17:46:02 +0100
Subject: [PATCH 92/96] simplify test set splitting logic

---
 anomalib/data/base/datamodule.py | 45 ++++++++++++------------------
 1 file changed, 19 insertions(+), 26 deletions(-)

diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py
index 019da5e3ec..e53f358f74 100644
--- a/anomalib/data/base/datamodule.py
+++ b/anomalib/data/base/datamodule.py
@@ -6,7 +6,6 @@ from __future__ import annotations

 import logging
-import warnings
 from abc import ABC
 from typing import Any, Dict, List, Optional

@@ -117,43 +116,37 @@ def _setup(self, _stage: Optional[str] = None) -> None:

         self._create_val_split()

     def _create_test_split(self):
-        # perform subset splitting for test set
+        """Obtain the test set based on the settings in the config."""
+        if self.test_data.has_normal:
+            # split the test data into normal and anomalous so these can be processed separately
+            normal_test_data, self.test_data = split_normal_and_anomalous(self.test_data)
+        else:
+            # when the user did not provide any normal images for testing, we sample some from the training set
+            logger.info(
+                "No normal test images found. Sampling from training set using a split ratio of %0.2f",
+                self.test_split_ratio,
+            )
+            self.train_data, normal_test_data = random_split(self.train_data, self.test_split_ratio)
+
         if self.test_split_mode == TestSplitMode.FROM_DIR:
-            # normal data taken from normal_test_dir if available, otherwise sampled from training set
-            if not self.test_data.has_normal:
-                logger.info(
-                    "No normal test images found. 
Sampling from training set using a split ratio of %d", - self.test_split_ratio, - ) - self.train_data, normal_test_data = random_split(self.train_data, self.test_split_ratio) - else: - normal_test_data, _ = split_normal_and_anomalous(self.test_data) self.test_data = SyntheticAnomalyDataset.from_dataset(normal_test_data) + else: + raise ValueError(f"Unsupported Test Split Mode: {self.test_split_mode}") def _create_val_split(self): - # perform subset splitting for validation set + """Obtain the validation set based on the settings in the config.""" if self.val_split_mode == ValSplitMode.FROM_TEST: + # randomly sampled from test set self.test_data, self.val_data = random_split( self.test_data, self.val_split_ratio, label_aware=True, seed=self.seed ) elif self.val_split_mode == ValSplitMode.SAME_AS_TEST: + # equal to test set self.val_data = self.test_data elif self.val_split_mode == ValSplitMode.SYNTHETIC: + # converted from random training sample self.train_data, normal_val_data = random_split(self.train_data, self.val_split_ratio) self.val_data = SyntheticAnomalyDataset.from_dataset(normal_val_data) elif self.val_split_mode != ValSplitMode.NONE: From 8b2d35640400f83ea162f095d19965cb64531686 Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 12 Dec 2022 14:49:49 +0100 Subject: [PATCH 93/96] update docstring --- anomalib/data/utils/augmenter.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anomalib/data/utils/augmenter.py b/anomalib/data/utils/augmenter.py index b08bb11898..b72b9f2d78 100644 --- a/anomalib/data/utils/augmenter.py +++ b/anomalib/data/utils/augmenter.py @@ -36,6 +36,8 @@ class Augmenter: Args: anomaly_source_path (Optional[str]): Path to a folder of images that will be used as source of the anomalous noise. If not specified, random noise will be used instead. + p_anomalous (float): Probability that the anomalous perturbation will be applied to a given image. + beta (float): Parameter that determines the opacity of the noise mask. """ def __init__( From a126af1d90a41ae58f811fd847db4157c05496ea Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 12 Dec 2022 17:49:39 +0100 Subject: [PATCH 94/96] add missing licence --- anomalib/data/synthetic.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/anomalib/data/synthetic.py b/anomalib/data/synthetic.py index 42c4cc7cec..561e0fdc3f 100644 --- a/anomalib/data/synthetic.py +++ b/anomalib/data/synthetic.py @@ -3,6 +3,9 @@ This dataset can be used when there is a lack of real anomalous data. 
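
A hedged sketch of how the two newly documented Augmenter parameters are exercised, assuming the existing Augmenter.augment_batch(batch) entry point; the batch is a placeholder tensor:

import torch

from anomalib.data.utils import Augmenter

augmenter = Augmenter(anomaly_source_path=None, p_anomalous=0.5, beta=0.5)

batch = torch.rand(8, 3, 256, 256)  # placeholder batch of normal images
augmented, masks = augmenter.augment_batch(batch)  # roughly half the images are perturbed
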
""" +# Copyright (C) 2022 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + import logging import math import shutil From b2879c8cefe302c059c315c1c70ce5050188e77f Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 12 Dec 2022 17:55:31 +0100 Subject: [PATCH 95/96] split_normal_and_anomalous -> split_by_label --- anomalib/data/base/datamodule.py | 4 ++-- anomalib/data/utils/__init__.py | 4 ++-- anomalib/data/utils/split.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/anomalib/data/base/datamodule.py b/anomalib/data/base/datamodule.py index e53f358f74..5870ab56ea 100644 --- a/anomalib/data/base/datamodule.py +++ b/anomalib/data/base/datamodule.py @@ -20,7 +20,7 @@ TestSplitMode, ValSplitMode, random_split, - split_normal_and_anomalous, + split_by_label, ) logger = logging.getLogger(__name__) @@ -119,7 +119,7 @@ def _create_test_split(self): """Obtain the test set based on the settings in the config.""" if self.test_data.has_normal: # split the test data into normal and anomalous so these can be processed separately - normal_test_data, self.test_data = split_normal_and_anomalous(self.test_data) + normal_test_data, self.test_data = split_by_label(self.test_data) else: # when the user did not provide any normal images for testing, we sample some from the training set logger.info( diff --git a/anomalib/data/utils/__init__.py b/anomalib/data/utils/__init__.py index 4add078386..288e167762 100644 --- a/anomalib/data/utils/__init__.py +++ b/anomalib/data/utils/__init__.py @@ -19,7 +19,7 @@ ValSplitMode, concatenate_datasets, random_split, - split_normal_and_anomalous, + split_by_label, ) __all__ = [ @@ -31,7 +31,7 @@ "read_image", "DownloadProgressBar", "random_split", - "split_normal_and_anomalous", + "split_by_label", "concatenate_datasets", "Split", "ValSplitMode", diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py index 56cc15a736..60f8b7f0e1 100644 --- a/anomalib/data/utils/split.py +++ b/anomalib/data/utils/split.py @@ -127,7 +127,7 @@ def random_split( return [concatenate_datasets(subset) for subset in subsets] -def split_normal_and_anomalous(dataset: AnomalibDataset) -> Tuple[AnomalibDataset, AnomalibDataset]: +def split_by_label(dataset: AnomalibDataset) -> Tuple[AnomalibDataset, AnomalibDataset]: """Splits the dataset into the normal and anomalous subsets.""" samples = dataset.samples normal_indices = samples[samples.label_index == 0].index From 532ff8be8894ae66c6e53a1e76fd311cf989c61f Mon Sep 17 00:00:00 2001 From: Dick Ameln Date: Mon, 12 Dec 2022 18:04:20 +0100 Subject: [PATCH 96/96] VideoAnomalib -> AnomalibVideo --- anomalib/data/avenue.py | 6 +++--- anomalib/data/base/__init__.py | 4 ++-- anomalib/data/base/video.py | 6 +++--- anomalib/data/ucsd_ped.py | 6 +++--- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/anomalib/data/avenue.py b/anomalib/data/avenue.py index 459f42677c..ca58b37cde 100644 --- a/anomalib/data/avenue.py +++ b/anomalib/data/avenue.py @@ -26,7 +26,7 @@ from pandas import DataFrame from torch import Tensor -from anomalib.data.base import VideoAnomalibDataModule, VideoAnomalibDataset +from anomalib.data.base import AnomalibVideoDataModule, AnomalibVideoDataset from anomalib.data.task_type import TaskType from anomalib.data.utils import DownloadProgressBar, Split, ValSplitMode, hash_check from anomalib.data.utils.video import ClipsIndexer @@ -121,7 +121,7 @@ def get_mask(self, idx) -> Optional[Tensor]: return masks -class AvenueDataset(VideoAnomalibDataset): +class 
AvenueDataset(AnomalibVideoDataset): """Avenue Dataset class. Args: @@ -156,7 +156,7 @@ def _setup(self): self.samples = make_avenue_dataset(self.root, self.gt_dir, self.split) -class Avenue(VideoAnomalibDataModule): +class Avenue(AnomalibVideoDataModule): """Avenue DataModule class. Args: diff --git a/anomalib/data/base/__init__.py b/anomalib/data/base/__init__.py index 0c8fe84257..936388b228 100644 --- a/anomalib/data/base/__init__.py +++ b/anomalib/data/base/__init__.py @@ -6,6 +6,6 @@ from .datamodule import AnomalibDataModule from .dataset import AnomalibDataset -from .video import VideoAnomalibDataModule, VideoAnomalibDataset +from .video import AnomalibVideoDataModule, AnomalibVideoDataset -__all__ = ["AnomalibDataset", "AnomalibDataModule", "VideoAnomalibDataset", "VideoAnomalibDataModule"] +__all__ = ["AnomalibDataset", "AnomalibDataModule", "AnomalibVideoDataset", "AnomalibVideoDataModule"] diff --git a/anomalib/data/base/video.py b/anomalib/data/base/video.py index 538b45e6a9..b2f3b3678b 100644 --- a/anomalib/data/base/video.py +++ b/anomalib/data/base/video.py @@ -14,7 +14,7 @@ from anomalib.pre_processing import PreProcessor -class VideoAnomalibDataset(AnomalibDataset, ABC): +class AnomalibVideoDataset(AnomalibDataset, ABC): """Base video anomalib dataset class. Args: @@ -49,7 +49,7 @@ def samples(self): @samples.setter def samples(self, samples): """Overwrite samples and re-index subvideos.""" - super(VideoAnomalibDataset, self.__class__).samples.fset(self, samples) + super(AnomalibVideoDataset, self.__class__).samples.fset(self, samples) self._setup_clips() def _setup_clips(self) -> None: @@ -96,7 +96,7 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: return item -class VideoAnomalibDataModule(AnomalibDataModule): +class AnomalibVideoDataModule(AnomalibDataModule): """Base class for video data modules.""" def _setup(self, _stage: Optional[str] = None) -> None: diff --git a/anomalib/data/ucsd_ped.py b/anomalib/data/ucsd_ped.py index fd810c4402..996d786e19 100644 --- a/anomalib/data/ucsd_ped.py +++ b/anomalib/data/ucsd_ped.py @@ -14,7 +14,7 @@ from pandas import DataFrame from torch import Tensor -from anomalib.data.base import VideoAnomalibDataModule, VideoAnomalibDataset +from anomalib.data.base import AnomalibVideoDataModule, AnomalibVideoDataset from anomalib.data.task_type import TaskType from anomalib.data.utils import ( DownloadProgressBar, @@ -134,7 +134,7 @@ def get_clip(self, idx: int) -> Tuple[Tensor, Tensor, Dict[str, Any], int]: return video, torch.empty((1, 0)), {}, video_idx -class UCSDpedDataset(VideoAnomalibDataset): +class UCSDpedDataset(AnomalibVideoDataset): """UCSDped Dataset class. Args: @@ -168,7 +168,7 @@ def _setup(self): self.samples = make_ucsd_dataset(self.root_category, self.split) -class UCSDped(VideoAnomalibDataModule): +class UCSDped(AnomalibVideoDataModule): """UCSDped DataModule class. Args:
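
Finally, a short usage sketch with the renamed video base classes; the root path and category are placeholder values:

from anomalib.data.ucsd_ped import UCSDped

# UCSDped now derives from AnomalibVideoDataModule after the rename.
datamodule = UCSDped(
    root="./datasets/ucsd",  # placeholder path
    category="UCSDped2",
    task="detection",
)
datamodule.setup()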