From f175a24c459cf5eacd0e6c4d8512e2353ae6b2f1 Mon Sep 17 00:00:00 2001
From: Samet Akcay
Date: Thu, 24 Feb 2022 04:24:17 -0700
Subject: [PATCH 01/24] renamed download-progress-bar as download

---
 .../{download_progress_bar.py => download.py} | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)
 rename anomalib/data/utils/{download_progress_bar.py => download.py} (88%)

diff --git a/anomalib/data/utils/download_progress_bar.py b/anomalib/data/utils/download.py
similarity index 88%
rename from anomalib/data/utils/download_progress_bar.py
rename to anomalib/data/utils/download.py
index 26af24834a..9f0ec4980f 100644
--- a/anomalib/data/utils/download_progress_bar.py
+++ b/anomalib/data/utils/download.py
@@ -18,7 +18,11 @@
 # and limitations under the License.

 import io
+import tarfile
+import zipfile
+from pathlib import Path
 from typing import Dict, Iterable, Optional, Union
+from urllib.request import urlretrieve

 from tqdm import tqdm

@@ -193,3 +197,48 @@ def update_to(self, chunk_number: int = 1, max_chunk_size: int = 1, total_size=N
         if total_size is not None:
             self.total = total_size
         self.update(chunk_number * max_chunk_size - self.n)
+
+
+def download(url: str, filename: Union[str, Path], description: Optional[str] = None) -> None:
+    """Download the dataset from the given url.
+
+    This function downloads the dataset from url to the given filename.
+
+    Args:
+        url (str): Dataset URL
+        filename (str): Filename to save the file locally.
+        description (Optional[str], optional): Description shown next to the progress bar. Defaults to None.
+    """
+
+    with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=description) as progress_bar:
+        urlretrieve(url=url, filename=filename, reporthook=progress_bar.update_to)  # nosec
+
+
+def extract(filename: Path, path: Optional[Path] = None) -> None:
+    """Extract a zip or tar archive.
+
+    Args:
+        filename (Path): Name of the tar/zip file
+        path (Optional[Path], optional): Path to which tar/zip file is extracted. Defaults to None.
+
+    Note:
+        Archives that are neither zip nor a tar format readable by ``tarfile`` are reported as unknown.
+
+    """
+    if path is None:
+        path = Path(".")
+
+    if filename.suffix == ".zip":
+        with zipfile.ZipFile(filename, "r") as zip_file:
+            zip_file.extractall(path)
+    else:
+        try:
+            with tarfile.open(filename) as tar_file:
+                tar_file.extractall(path)
+        except (ValueError, tarfile.ReadError):
+            print("Unknown file extension to extract")
+
+
+def clean(filename: Path) -> None:
+    """Clean up the downloaded archive file."""
+    filename.unlink()
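Taken together, the three helpers compose into the usual fetch/unpack/cleanup flow. A minimal sketch, using the BTech archive URL that appears later in this series for illustration (note that patch 10 removes these helpers again in favour of inlining the same steps in each datamodule's `prepare_data`):

```python
from pathlib import Path

from anomalib.data.utils.download import clean, download, extract

archive = Path("./datasets/btad.zip")
download(url="https://avires.dimi.uniud.it/papers/btad/btad.zip", filename=archive, description="BTech")
extract(archive, path=Path("./datasets"))  # dispatches on suffix: zip via zipfile, everything else via tarfile
clean(archive)  # deletes the archive once its contents are extracted
```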
From f841f51811f9f6cdc8bbf02b1facc3ad20a9c153 Mon Sep 17 00:00:00 2001
From: Samet Akcay
Date: Thu, 24 Feb 2022 04:26:46 -0700
Subject: [PATCH 02/24] added new download functions to init

---
 anomalib/data/utils/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/anomalib/data/utils/__init__.py b/anomalib/data/utils/__init__.py
index 01c8f98459..8b22ad4405 100644
--- a/anomalib/data/utils/__init__.py
+++ b/anomalib/data/utils/__init__.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions
 # and limitations under the License.

-from .download_progress_bar import DownloadProgressBar
+from .download import DownloadProgressBar, clean, download, extract
 from .image import get_image_filenames, read_image

-__all__ = ["get_image_filenames", "read_image", "DownloadProgressBar"]
+__all__ = ["get_image_filenames", "read_image", "DownloadProgressBar", "download", "extract", "clean"]

From 12cd8ee572091d5e67c96e794e991e3e1c2e9526 Mon Sep 17 00:00:00 2001
From: Samet Akcay
Date: Thu, 24 Feb 2022 23:15:19 -0700
Subject: [PATCH 03/24] Added Btech data module

---
 anomalib/data/btech.py | 424 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 424 insertions(+)
 create mode 100644 anomalib/data/btech.py

diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py
new file mode 100644
index 0000000000..13874cb337
--- /dev/null
+++ b/anomalib/data/btech.py
@@ -0,0 +1,424 @@
+"""BTech Dataset.
+
+This script contains the PyTorch Lightning DataModule for the BTech dataset.
+
+If the dataset is not on the file system, the script downloads and
+extracts the dataset and creates PyTorch data objects.
+"""
+
+# Copyright (C) 2020 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+
+import logging
+import shutil
+import zipfile
+from pathlib import Path
+from typing import Dict, Optional, Tuple, Union
+from urllib.request import urlretrieve
+
+import albumentations as A
+import cv2
+import numpy as np
+import pandas as pd
+from pandas.core.frame import DataFrame
+from pytorch_lightning.core.datamodule import LightningDataModule
+from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
+from torch import Tensor
+from torch.utils.data import DataLoader
+from torch.utils.data.dataset import Dataset
+from torchvision.datasets.folder import VisionDataset
+
+from anomalib.data.inference import InferenceDataset
+from anomalib.data.utils import DownloadProgressBar, read_image
+from anomalib.data.utils.split import (
+    create_validation_set_from_test_set,
+    split_normal_images_in_train_set,
+)
+from anomalib.pre_processing import PreProcessor
+
+logger = logging.getLogger(name="Dataset: BTech")
+logger.setLevel(logging.DEBUG)
+
+
+def make_btech_dataset(
+    path: Path,
+    split: Optional[str] = None,
+    split_ratio: float = 0.1,
+    seed: int = 0,
+    create_validation_set: bool = False,
+) -> DataFrame:
+    """Create BTech samples by parsing the BTech data file structure.
+
+    The files are expected to follow the structure:
+        path/to/dataset/split/label/image_filename.bmp
+        path/to/dataset/ground_truth/label/mask_filename.png
+
+    Args:
+        path (Path): Path to dataset
+        split (str, optional): Dataset split (i.e., either train or test). Defaults to None.
+        split_ratio (float, optional): Ratio to split normal training images and add to the
+            test set in case test set doesn't contain any normal images.
+            Defaults to 0.1.
+        seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0.
+        create_validation_set (bool, optional): Boolean to create a validation set from the test set.
+            BTech dataset does not contain a validation set. Those wanting to create a validation set
+            could set this flag to ``True``.
+
+    Example:
+        The following example shows how to get training samples from BTech 01 category:
+
+        >>> root = Path('./BTech')
+        >>> category = '01'
+        >>> path = root / category
+        >>> path
+        PosixPath('BTech/01')
+
+        >>> samples = make_btech_dataset(path, split='train', split_ratio=0.1, seed=0)
+        >>> samples.head()
+           path     split label image_path                  mask_path                          label_index
+        0  BTech/01 train 01  BTech/01/train/good/105.bmp BTech/01/ground_truth/good/105.bmp 0
+        1  BTech/01 train 01  BTech/01/train/good/017.bmp BTech/01/ground_truth/good/017.bmp 0
+        2  BTech/01 train 01  BTech/01/train/good/137.bmp BTech/01/ground_truth/good/137.bmp 0
+        3  BTech/01 train 01  BTech/01/train/good/152.bmp BTech/01/ground_truth/good/152.bmp 0
+        4  BTech/01 train 01  BTech/01/train/good/109.bmp BTech/01/ground_truth/good/109.bmp 0
+
+    Returns:
+        DataFrame: an output dataframe containing samples for the requested split (i.e., train or test)
+    """
+    samples_list = [(str(path),) + filename.parts[-3:] for filename in path.glob("**/*.bmp")]
+    if len(samples_list) == 0:
+        raise RuntimeError(f"Found 0 images in {path}")
+
+    samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"])
+    samples = samples[samples.split != "ground_truth"]
+
+    # Create mask_path column
+    samples["mask_path"] = (
+        samples.path
+        + "/ground_truth/"
+        + samples.label
+        + "/"
+        + samples.image_path.str.rstrip("bmp").str.rstrip(".")
+        + ".png"
+    )
+
+    # Modify image_path column by converting to absolute path
+    samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path
+
+    # Split the normal images in training set if test set doesn't
+    # contain any normal images. This is needed because AUC score
+    # cannot be computed based on 1-class
+    if sum((samples.split == "test") & (samples.label == "ok")) == 0:
+        samples = split_normal_images_in_train_set(samples, split_ratio, seed, normal_label="ok")
+
+    # Good images don't have mask
+    samples.loc[(samples.split == "test") & (samples.label == "ok"), "mask_path"] = ""
+
+    # Create label index for normal (0) and anomalous (1) images.
+    samples.loc[(samples.label == "ok"), "label_index"] = 0
+    samples.loc[(samples.label != "ok"), "label_index"] = 1
+    samples.label_index = samples.label_index.astype(int)
+
+    if create_validation_set:
+        samples = create_validation_set_from_test_set(samples, seed=seed, normal_label="ok")
+
+    # Get the data frame for the split.
+    if split is not None and split in ["train", "val", "test"]:
+        samples = samples[samples.split == split]
+        samples = samples.reset_index(drop=True)
+
+    return samples
+
+
+class BTech(VisionDataset):
+    """BTech PyTorch Dataset."""
+
+    def __init__(
+        self,
+        root: Union[Path, str],
+        category: str,
+        pre_process: PreProcessor,
+        split: str,
+        task: str = "segmentation",
+        seed: int = 0,
+        create_validation_set: bool = False,
+    ) -> None:
+        """BTech Dataset class.
+
+        Args:
+            root: Path to the BTech dataset
+            category: Name of the BTech category.
+            pre_process: Pre-processing object containing the albumentations compose.
+            split: 'train', 'val' or 'test'
+            task: ``classification`` or ``segmentation``
+            seed: seed used for the random subset splitting
+            create_validation_set: Create a validation subset in addition to the train and test subsets
+
+        Examples:
+            >>> from anomalib.data.btech import BTech
+            >>> from anomalib.pre_processing import PreProcessor
+            >>> pre_process = PreProcessor(image_size=256)
+            >>> dataset = BTech(
+            ...     root='./datasets/BTech',
+            ...     category='01',
+            ...     pre_process=pre_process,
+            ...     task="classification",
+            ...     split="train",
+            ... )
+            >>> dataset[0].keys()
+            dict_keys(['image'])
+
+            >>> dataset.split = "test"
+            >>> dataset[0].keys()
+            dict_keys(['image', 'image_path', 'label'])
+
+            >>> dataset.task = "segmentation"
+            >>> dataset.split = "train"
+            >>> dataset[0].keys()
+            dict_keys(['image'])
+
+            >>> dataset.split = "test"
+            >>> dataset[0].keys()
+            dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask'])
+
+            >>> dataset[0]["image"].shape, dataset[0]["mask"].shape
+            (torch.Size([3, 256, 256]), torch.Size([256, 256]))
+        """
+        super().__init__(root)
+        self.root = Path(root) if isinstance(root, str) else root
+        self.category: str = category
+        self.split = split
+        self.task = task
+
+        self.pre_process = pre_process
+
+        self.samples = make_btech_dataset(
+            path=self.root / category,
+            split=self.split,
+            seed=seed,
+            create_validation_set=create_validation_set,
+        )
+
+    def __len__(self) -> int:
+        """Get length of the dataset."""
+        return len(self.samples)
+
+    def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
+        """Get dataset item for the index ``index``.
+
+        Args:
+            index (int): Index to get the item.
+
+        Returns:
+            Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
+                Otherwise, Dict containing image path, mask path, image tensor, label and mask.
+        """
+        item: Dict[str, Union[str, Tensor]] = {}
+
+        image_path = self.samples.image_path[index]
+        image = read_image(image_path)
+
+        if self.split == "train" or self.task == "classification":
+            pre_processed = self.pre_process(image=image)
+            item = {"image": pre_processed["image"]}
+        elif self.split in ["val", "test"]:
+            label_index = self.samples.label_index[index]
+
+            item["image_path"] = image_path
+            item["label"] = label_index
+
+            if self.task == "segmentation":
+                mask_path = self.samples.mask_path[index]
+
+                # Only anomalous (1) images have masks in the BTech dataset.
+                # Therefore, create an empty mask for normal (0) images.
+                if label_index == 0:
+                    mask = np.zeros(shape=image.shape[:2])
+                else:
+                    mask = cv2.imread(mask_path, flags=0) / 255.0
+
+                pre_processed = self.pre_process(image=image, mask=mask)
+
+                item["mask_path"] = mask_path
+                item["image"] = pre_processed["image"]
+                item["mask"] = pre_processed["mask"]
+
+        return item
+
+
+class BTechDataModule(LightningDataModule):
+    """BTechDataModule Lightning Data Module."""
+
+    def __init__(
+        self,
+        root: str,
+        category: str,
+        # TODO: Remove default values. IAAALD-211
+        image_size: Optional[Union[int, Tuple[int, int]]] = None,
+        train_batch_size: int = 32,
+        test_batch_size: int = 32,
+        num_workers: int = 8,
+        transform_config: Optional[Union[str, A.Compose]] = None,
+        seed: int = 0,
+        create_validation_set: bool = False,
+    ) -> None:
+        """Instantiate BTech Lightning Data Module.
+
+        Args:
+            root: Path to the BTech dataset
+            category: Name of the BTech category.
+            image_size: Variable to which image is resized.
+            train_batch_size: Training batch size.
+            test_batch_size: Testing batch size.
+            num_workers: Number of workers.
+            transform_config: Config for pre-processing.
+            seed: seed used for the random subset splitting
+            create_validation_set: Create a validation subset in addition to the train and test subsets
+
+        Examples:
+            >>> from anomalib.data import BTechDataModule
+            >>> datamodule = BTechDataModule(
+            ...     root="./datasets/BTech",
+            ...     category="01",
+            ...     image_size=256,
+            ...     train_batch_size=32,
+            ...     test_batch_size=32,
+            ...     num_workers=8,
+            ...     transform_config=None,
+            ... )
+            >>> datamodule.setup()
+
+            >>> i, data = next(enumerate(datamodule.train_dataloader()))
+            >>> data.keys()
+            dict_keys(['image'])
+            >>> data["image"].shape
+            torch.Size([32, 3, 256, 256])
+
+            >>> i, data = next(enumerate(datamodule.val_dataloader()))
+            >>> data.keys()
+            dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask'])
+            >>> data["image"].shape, data["mask"].shape
+            (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256]))
+        """
+        super().__init__()
+
+        self.root = root if isinstance(root, Path) else Path(root)
+        self.category = category
+        self.dataset_path = self.root / self.category
+        self.transform_config = transform_config
+        self.image_size = image_size
+
+        self.pre_process = PreProcessor(config=self.transform_config, image_size=self.image_size)
+
+        self.train_batch_size = train_batch_size
+        self.test_batch_size = test_batch_size
+        self.num_workers = num_workers
+
+        self.create_validation_set = create_validation_set
+        self.seed = seed
+
+        self.train_data: Dataset
+        self.test_data: Dataset
+        if create_validation_set:
+            self.val_data: Dataset
+        self.inference_data: Dataset
+
+    def prepare_data(self) -> None:
+        """Download the dataset if it is not available."""
+        if (self.root / self.category).is_dir():
+            logging.info("Found the dataset.")
+        else:
+            self.root.mkdir(parents=True, exist_ok=True)
+            zip_filename = self.root / "btad.zip"
+
+            logging.info("Downloading the dataset.")
+            with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc="BTech") as progress_bar:
+                urlretrieve(
+                    url="https://avires.dimi.uniud.it/papers/btad/btad.zip",
+                    filename=zip_filename,
+                    reporthook=progress_bar.update_to,
+                )  # nosec
+
+            logging.info("Extracting the dataset.")
+            with zipfile.ZipFile(zip_filename, "r") as zip_file:
+                zip_file.extractall(self.root)
+
+            logging.info("Renaming the dataset directory")
+            shutil.move(src=self.root / "BTech_Dataset_transformed", dst=self.root / "BTech")
+
+            logging.info("Cleaning the zip file")
+            zip_filename.unlink()
+
+    def setup(self, stage: Optional[str] = None) -> None:
+        """Setup train, validation and test data.
+
+        The BTech dataset follows its own directory structure, which is why the
+        `anomalib.data.btech.BTech` class is used to create the dataset items.
+
+        Args:
+            stage: Optional[str]: Train/Val/Test stages.
(Default value = None) + + """ + if stage in (None, "fit"): + self.train_data = BTech( + root=self.root, + category=self.category, + pre_process=self.pre_process, + split="train", + seed=self.seed, + create_validation_set=self.create_validation_set, + ) + + if self.create_validation_set: + self.val_data = BTech( + root=self.root, + category=self.category, + pre_process=self.pre_process, + split="val", + seed=self.seed, + create_validation_set=self.create_validation_set, + ) + + self.test_data = BTech( + root=self.root, + category=self.category, + pre_process=self.pre_process, + split="test", + seed=self.seed, + create_validation_set=self.create_validation_set, + ) + + if stage == "predict": + self.inference_data = InferenceDataset( + path=self.root, image_size=self.image_size, transform_config=self.transform_config + ) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + """Get train dataloader.""" + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) + + def val_dataloader(self) -> EVAL_DATALOADERS: + """Get validation dataloader.""" + dataset = self.val_data if self.create_validation_set else self.test_data + return DataLoader(dataset=dataset, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + + def test_dataloader(self) -> EVAL_DATALOADERS: + """Get test dataloader.""" + return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + + def predict_dataloader(self) -> EVAL_DATALOADERS: + """Get predict dataloader.""" + return DataLoader( + self.inference_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers + ) From 7bc453f0f4ef8e919a0f4268d0664a2aa08740f4 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Feb 2022 23:18:06 -0700 Subject: [PATCH 04/24] Added btech tests --- .gitignore | 2 +- tests/pre_merge/datasets/test_dataset.py | 59 +++++++++++++++++++----- 2 files changed, 48 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index ba0b6d5a3b..b34903cc2e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # Project related datasets !anomalib/datasets -!tests/datasets +!tests/pre_merge/datasets results !anomalib/core/results diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index 608c12f3af..efb5b179ba 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -3,13 +3,13 @@ import numpy as np import pytest -from anomalib.data.mvtec import MVTecDataModule +from anomalib.data import BTechDataModule, MVTecDataModule from anomalib.pre_processing.transforms import Denormalize, ToNumpy from tests.helpers.dataset import get_dataset_path @pytest.fixture(autouse=True) -def data_module(): +def mvtec_data_module(): datamodule = MVTecDataModule( root=get_dataset_path(), category="leather", @@ -25,24 +25,59 @@ def data_module(): @pytest.fixture(autouse=True) -def data_sample(data_module): - _, data = next(enumerate(data_module.train_dataloader())) +def btech_data_module(): + """Create BTech Data Module.""" + datamodule = BTechDataModule( + root=get_dataset_path(path="./datasets/BTech"), + category="01", + image_size=(256, 256), + train_batch_size=1, + test_batch_size=1, + num_workers=0, + ) + datamodule.prepare_data() + datamodule.setup() + + return datamodule + + +@pytest.fixture(autouse=True) +def data_sample(mvtec_data_module): + _, data = next(enumerate(mvtec_data_module.train_dataloader())) return data 
 class TestMVTecDataModule:
-    def test_batch_size(self, data_module):
+    def test_batch_size(self, mvtec_data_module):
         """test_mvtec_datamodule [summary]"""
-        _, train_data_sample = next(enumerate(data_module.train_dataloader()))
-        _, val_data_sample = next(enumerate(data_module.val_dataloader()))
+        _, train_data_sample = next(enumerate(mvtec_data_module.train_dataloader()))
+        _, val_data_sample = next(enumerate(mvtec_data_module.val_dataloader()))
+        assert train_data_sample["image"].shape[0] == 1
+        assert val_data_sample["image"].shape[0] == 1
+
+    def test_val_and_test_dataloaders_has_mask_and_gt(self, mvtec_data_module):
+        """Validation and test dataloaders should return filenames, image, mask and label."""
+        _, val_data = next(enumerate(mvtec_data_module.val_dataloader()))
+        _, test_data = next(enumerate(mvtec_data_module.test_dataloader()))
+
+        assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys())
+        assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys())
+
+
+class TestBTechDataModule:
+    """Test BTech Data Module."""
+
+    def test_batch_size(self, btech_data_module):
+        """Test the BTech datamodule batch size."""
+        _, train_data_sample = next(enumerate(btech_data_module.train_dataloader()))
+        _, val_data_sample = next(enumerate(btech_data_module.val_dataloader()))
         assert train_data_sample["image"].shape[0] == 1
         assert val_data_sample["image"].shape[0] == 1

-    def test_val_and_test_dataloaders_has_mask_and_gt(self, data_module):
-        """Validation and Test dataloaders should return filenames, image, mask
-        and label."""
-        _, val_data = next(enumerate(data_module.val_dataloader()))
-        _, test_data = next(enumerate(data_module.test_dataloader()))
+    def test_val_and_test_dataloaders_has_mask_and_gt(self, btech_data_module):
+        """Validation and test dataloaders should return filenames, image, mask and label."""
+        _, val_data = next(enumerate(btech_data_module.val_dataloader()))
+        _, test_data = next(enumerate(btech_data_module.test_dataloader()))

         assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys())
         assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys())
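The next patch factors the normal/abnormal split helpers out of `mvtec.py` into a shared `anomalib.data.utils.split` module. A toy sketch of the intended behaviour, using hypothetical filenames:

```python
import pandas as pd

from anomalib.data.utils.split import split_normal_images_in_train_set

samples = pd.DataFrame(
    {
        "image_path": [f"train/good/{i:03d}.png" for i in range(10)],
        "split": ["train"] * 10,
        "label": ["good"] * 10,
    }
)
# With split_ratio=0.2, two of the ten normal training images are reassigned
# to the test split so that AUC can be computed on both classes.
samples = split_normal_images_in_train_set(samples, split_ratio=0.2, seed=42)
print(samples.split.value_counts())  # train: 8, test: 2
```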
From 3a3244357b449c53791634bf639963effa1bea93 Mon Sep 17 00:00:00 2001
From: Samet Akcay
Date: Thu, 24 Feb 2022 23:30:25 -0700
Subject: [PATCH 05/24] Move split functions into a util module

---
 anomalib/data/utils/split.py | 94 ++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 anomalib/data/utils/split.py

diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py
new file mode 100644
index 0000000000..559d11c9c6
--- /dev/null
+++ b/anomalib/data/utils/split.py
@@ -0,0 +1,94 @@
+"""Dataset Split Utils.
+
+This module contains functions for splitting normal images in a training set
+and creating validation sets from test sets.
+
+These functions are useful
+    - when the test set does not contain any normal images.
+    - when the dataset doesn't have a validation set.
+"""
+
+# Copyright (C) 2020 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+
+import random
+
+from pandas.core.frame import DataFrame
+
+
+def split_normal_images_in_train_set(
+    samples: DataFrame, split_ratio: float = 0.1, seed: int = 0, normal_label: str = "good"
+) -> DataFrame:
+    """Split normal images in train set.
+
+    This function splits the normal images in the training set and assigns the
+    values to the test set. This is particularly useful when the
+    test set does not contain any normal images.
+
+    This is important because when the test set doesn't have any normal images,
+    AUC computation fails due to having a single class.
+
+    Args:
+        samples (DataFrame): Dataframe containing dataset info such as filenames, splits etc.
+        split_ratio (float, optional): Train-Test normal image split ratio. Defaults to 0.1.
+        seed (int, optional): Random seed to ensure reproducibility. Defaults to 0.
+        normal_label (str): Name of the normal label. For MVTec, for instance, this is ``good``.
+
+    Returns:
+        DataFrame: Output dataframe where the part of the training set is assigned to test set.
+    """
+
+    if seed > 0:
+        random.seed(seed)
+
+    normal_train_image_indices = samples.index[(samples.split == "train") & (samples.label == normal_label)].to_list()
+    num_normal_train_images = len(normal_train_image_indices)
+    num_normal_valid_images = int(num_normal_train_images * split_ratio)
+
+    indices_to_split_from_train_set = random.sample(population=normal_train_image_indices, k=num_normal_valid_images)
+    samples.loc[indices_to_split_from_train_set, "split"] = "test"
+
+    return samples
+
+
+def create_validation_set_from_test_set(samples: DataFrame, seed: int = 0, normal_label: str = "good") -> DataFrame:
+    """Create a validation set from the test set.
+
+    This function creates a validation set from the test set by splitting both
+    normal and abnormal samples in two.
+
+    Args:
+        samples (DataFrame): Dataframe containing dataset info such as filenames, splits etc.
+        seed (int, optional): Random seed to ensure reproducibility. Defaults to 0.
+        normal_label (str): Name of the normal label. For MVTec, for instance, this is ``good``.
+    """
+
+    if seed > 0:
+        random.seed(seed)
+
+    # Split normal images.
+    normal_test_image_indices = samples.index[(samples.split == "test") & (samples.label == normal_label)].to_list()
+    num_normal_valid_images = len(normal_test_image_indices) // 2
+
+    indices_to_sample = random.sample(population=normal_test_image_indices, k=num_normal_valid_images)
+    samples.loc[indices_to_sample, "split"] = "val"
+
+    # Split abnormal images.
+ abnormal_test_image_indices = samples.index[(samples.split == "test") & (samples.label != normal_label)].to_list() + num_abnormal_valid_images = len(abnormal_test_image_indices) // 2 + + indices_to_sample = random.sample(population=abnormal_test_image_indices, k=num_abnormal_valid_images) + samples.loc[indices_to_sample, "split"] = "val" + + return samples From 132ceb13cbcd7964f9e1b1db5e16118017a6c8a6 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Feb 2022 23:42:35 -0700 Subject: [PATCH 06/24] Modified mvtec --- anomalib/data/mvtec.py | 134 ++++++++++------------------------------- 1 file changed, 31 insertions(+), 103 deletions(-) diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index e29e02b8b7..f2343ac908 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -22,7 +22,6 @@ # and limitations under the License. import logging -import random import tarfile from pathlib import Path from typing import Dict, Optional, Tuple, Union @@ -42,76 +41,15 @@ from anomalib.data.inference import InferenceDataset from anomalib.data.utils import DownloadProgressBar, read_image +from anomalib.data.utils.split import ( + create_validation_set_from_test_set, + split_normal_images_in_train_set, +) from anomalib.pre_processing import PreProcessor logger = logging.getLogger(name="Dataset: MVTec") logger.setLevel(logging.DEBUG) -__all__ = ["MVTec", "MVTecDataModule"] - - -def split_normal_images_in_train_set(samples: DataFrame, split_ratio: float = 0.1, seed: int = 0) -> DataFrame: - """Split normal images in train set. - - This function splits the normal images in training set and assigns the - values to the test set. This is particularly useful especially when the - test set does not contain any normal images. - - This is important because when the test set doesn't have any normal images, - AUC computation fails due to having single class. - - Args: - samples (DataFrame): Dataframe containing dataset info such as filenames, splits etc. - split_ratio (float, optional): Train-Test normal image split ratio. Defaults to 0.1. - seed (int, optional): Random seed to ensure reproducibility. Defaults to 0. - - Returns: - DataFrame: Output dataframe where the part of the training set is assigned to test set. - """ - - if seed > 0: - random.seed(seed) - - normal_train_image_indices = samples.index[(samples.split == "train") & (samples.label == "good")].to_list() - num_normal_train_images = len(normal_train_image_indices) - num_normal_valid_images = int(num_normal_train_images * split_ratio) - - indices_to_split_from_train_set = random.sample(population=normal_train_image_indices, k=num_normal_valid_images) - samples.loc[indices_to_split_from_train_set, "split"] = "test" - - return samples - - -def create_validation_set_from_test_set(samples: DataFrame, seed: int = 0) -> DataFrame: - """Craete Validation Set from Test Set. - - This function creates a validation set from test set by splitting both - normal and abnormal samples to two. - - Args: - samples (DataFrame): Dataframe containing dataset info such as filenames, splits etc. - seed (int, optional): Random seed to ensure reproducibility. Defaults to 0. - """ - - if seed > 0: - random.seed(seed) - - # Split normal images. 
- normal_test_image_indices = samples.index[(samples.split == "test") & (samples.label == "good")].to_list() - num_normal_valid_images = len(normal_test_image_indices) // 2 - - indices_to_sample = random.sample(population=normal_test_image_indices, k=num_normal_valid_images) - samples.loc[indices_to_sample, "split"] = "val" - - # Split abnormal images. - abnormal_test_image_indices = samples.index[(samples.split == "test") & (samples.label != "good")].to_list() - num_abnormal_valid_images = len(abnormal_test_image_indices) // 2 - - indices_to_sample = random.sample(population=abnormal_test_image_indices, k=num_abnormal_valid_images) - samples.loc[indices_to_sample, "split"] = "val" - - return samples - def make_mvtec_dataset( path: Path, @@ -220,7 +158,6 @@ def __init__( pre_process: PreProcessor, split: str, task: str = "segmentation", - download: bool = False, seed: int = 0, create_validation_set: bool = False, ) -> None: @@ -232,7 +169,6 @@ def __init__( pre_process: List of pre_processing object containing albumentation compose. split: 'train', 'val' or 'test' task: ``classification`` or ``segmentation`` - download: Boolean to download the MVTec dataset. seed: seed used for the random subset splitting create_validation_set: Create a validation subset in addition to the train and test subsets @@ -274,44 +210,13 @@ def __init__( self.pre_process = pre_process - if download: - self._download() - self.samples = make_mvtec_dataset( - path=self.root / category, split=self.split, seed=seed, create_validation_set=create_validation_set + path=self.root / category, + split=self.split, + seed=seed, + create_validation_set=create_validation_set, ) - def _download(self) -> None: - """Download the MVTec dataset.""" - if (self.root / self.category).is_dir(): - logger.warning("Dataset directory exists.") - else: - self.root.mkdir(parents=True, exist_ok=True) - dataset_name = "mvtec_anomaly_detection.tar.xz" - self.filename = self.root / dataset_name - - logger.info("Downloading MVTec Dataset") - with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=dataset_name) as progress_bar: - urlretrieve( # nosec - url=f"ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/{dataset_name}", - filename=self.filename, - reporthook=progress_bar.update_to, - ) # nosec - - self._extract() - self._clean() - - def _extract(self) -> None: - """Extract MVTec Dataset.""" - logger.info("Extracting MVTec dataset") - with tarfile.open(self.filename) as file: - file.extractall(self.root) - - def _clean(self) -> None: - """Cleanup MVTec Dataset tar file.""" - logger.info("Cleaning up the tar file") - self.filename.unlink() - def __len__(self) -> int: """Get length of the dataset.""" return len(self.samples) @@ -436,6 +341,29 @@ def __init__( self.val_data: Dataset self.inference_data: Dataset + def prepare_data(self) -> None: + """Download the dataset if not available.""" + if (self.root / self.category).is_dir(): + logging.info("Found the dataset.") + else: + self.root.mkdir(parents=True, exist_ok=True) + dataset_name = "mvtec_anomaly_detection.tar.xz" + + logging.info("Downloading the dataset.") + with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc="MVTec") as progress_bar: + urlretrieve( + url=f"ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/{dataset_name}", + filename=self.root / dataset_name, + reporthook=progress_bar.update_to, + ) + + logging.info("Extracting the dataset.") + with tarfile.open(self.root / dataset_name) as tar_file: + 
tar_file.extractall(self.root) + + logging.info("Cleaning the tar file") + (self.root / dataset_name).unlink() + def setup(self, stage: Optional[str] = None) -> None: """Setup train, validation and test data. From 907281fda4506ad05ca8ed2945b3829e4286a1b5 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Feb 2022 23:51:44 -0700 Subject: [PATCH 07/24] added btech to get-datamodule --- anomalib/data/__init__.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 302475d3e8..2deed94ba5 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -19,6 +19,7 @@ from omegaconf import DictConfig, ListConfig from pytorch_lightning import LightningDataModule +from .btech import BTechDataModule from .inference import InferenceDataset from .mvtec import MVTecDataModule @@ -34,21 +35,27 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule """ datamodule: LightningDataModule - if config.dataset.format.lower() == "mvtec": - datamodule = MVTecDataModule( - # TODO: Remove config values. IAAALD-211 - root=config.dataset.path, - category=config.dataset.category, - image_size=(config.dataset.image_size[0], config.dataset.image_size[0]), - train_batch_size=config.dataset.train_batch_size, - test_batch_size=config.dataset.test_batch_size, - num_workers=config.dataset.num_workers, - seed=config.project.seed, - ) + if config.dataset.name.lower() == "mvtec": + datamodule = MVTecDataModule + elif config.dataset.name.lower() == "btech": + datamodule = BTechDataModule else: - raise ValueError("Unknown dataset!") + raise ValueError( + "Unknown dataset! \n" + "If you use a custom dataset make sure you initialize it " + "in `get_datamodule` in `anomalib.data.__init__.py" + ) - return datamodule + return datamodule( + # TODO: Remove config values. IAAALD-211 + root=config.dataset.path, + category=config.dataset.category, + image_size=(config.dataset.image_size[0], config.dataset.image_size[0]), + train_batch_size=config.dataset.train_batch_size, + test_batch_size=config.dataset.test_batch_size, + num_workers=config.dataset.num_workers, + seed=config.project.seed, + ) __all__ = ["get_datamodule", "InferenceDataset"] From 16de223a2042fd945f736afc6eef8e7f9afbff47 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Feb 2022 23:54:44 -0700 Subject: [PATCH 08/24] fix typo in btech docstring --- anomalib/data/__init__.py | 4 ++-- anomalib/data/btech.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 2deed94ba5..48ff4f9c64 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -42,8 +42,8 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule else: raise ValueError( "Unknown dataset! 
\n" - "If you use a custom dataset make sure you initialize it " - "in `get_datamodule` in `anomalib.data.__init__.py" + "If you use a custom dataset make sure you initialize it in" + "`get_datamodule` in `anomalib.data.__init__.py" ) return datamodule( diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 13874cb337..8c43566b95 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -87,11 +87,11 @@ def make_btech_dataset( >>> samples = make_btech_dataset(path, split='train', split_ratio=0.1, seed=0) >>> samples.head() path split label image_path mask_path label_index - 0 BTech/01 train 01 BTech/01/train/good/105.bmp BTech/01/ground_truth/good/105.bmp 0 - 1 BTech/01 train 01 BTech/01/train/good/017.bmp BTech/01/ground_truth/good/017.bmp 0 - 2 BTech/01 train 01 BTech/01/train/good/137.bmp BTech/01/ground_truth/good/137.bmp 0 - 3 BTech/01 train 01 BTech/01/train/good/152.bmp BTech/01/ground_truth/good/152.bmp 0 - 4 BTech/01 train 01 BTech/01/train/good/109.bmp BTech/01/ground_truth/good/109.bmp 0 + 0 BTech/01 train 01 BTech/01/train/good/105.bmp BTech/01/ground_truth/good/105.png 0 + 1 BTech/01 train 01 BTech/01/train/good/017.bmp BTech/01/ground_truth/good/017.png 0 + 2 BTech/01 train 01 BTech/01/train/good/137.bmp BTech/01/ground_truth/good/137.png 0 + 3 BTech/01 train 01 BTech/01/train/good/152.bmp BTech/01/ground_truth/good/152.png 0 + 4 BTech/01 train 01 BTech/01/train/good/109.bmp BTech/01/ground_truth/good/109.png 0 Returns: DataFrame: an output dataframe containing samples for the requested split (ie., train or test) From c2353db26bdce3fc65020881963f590967d956a1 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Feb 2022 23:56:04 -0700 Subject: [PATCH 09/24] update docstring --- anomalib/data/btech.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 8c43566b95..f28e66a5fc 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -87,11 +87,9 @@ def make_btech_dataset( >>> samples = make_btech_dataset(path, split='train', split_ratio=0.1, seed=0) >>> samples.head() path split label image_path mask_path label_index - 0 BTech/01 train 01 BTech/01/train/good/105.bmp BTech/01/ground_truth/good/105.png 0 - 1 BTech/01 train 01 BTech/01/train/good/017.bmp BTech/01/ground_truth/good/017.png 0 - 2 BTech/01 train 01 BTech/01/train/good/137.bmp BTech/01/ground_truth/good/137.png 0 - 3 BTech/01 train 01 BTech/01/train/good/152.bmp BTech/01/ground_truth/good/152.png 0 - 4 BTech/01 train 01 BTech/01/train/good/109.bmp BTech/01/ground_truth/good/109.png 0 + 0 BTech/01 train 01 BTech/01/train/ok/105.bmp BTech/01/ground_truth/ok/105.png 0 + 1 BTech/01 train 01 BTech/01/train/ok/017.bmp BTech/01/ground_truth/ok/017.png 0 + ... Returns: DataFrame: an output dataframe containing samples for the requested split (ie., train or test) From 287c974c8aa51c2673c6fd1b306e5b9ec7d6f5f3 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Fri, 25 Feb 2022 00:01:36 -0700 Subject: [PATCH 10/24] cleanedup dataset download utils --- anomalib/data/utils/__init__.py | 4 +-- anomalib/data/utils/download.py | 49 --------------------------------- 2 files changed, 2 insertions(+), 51 deletions(-) diff --git a/anomalib/data/utils/__init__.py b/anomalib/data/utils/__init__.py index 8b22ad4405..c493058051 100644 --- a/anomalib/data/utils/__init__.py +++ b/anomalib/data/utils/__init__.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions # and limitations under the License. 
-from .download import DownloadProgressBar, clean, download, extract +from .download import DownloadProgressBar from .image import get_image_filenames, read_image -__all__ = ["get_image_filenames", "read_image", "DownloadProgressBar", "download", "extract", "clean"] +__all__ = ["get_image_filenames", "read_image", "DownloadProgressBar"] diff --git a/anomalib/data/utils/download.py b/anomalib/data/utils/download.py index 9f0ec4980f..26af24834a 100644 --- a/anomalib/data/utils/download.py +++ b/anomalib/data/utils/download.py @@ -18,11 +18,7 @@ # and limitations under the License. import io -import tarfile -import zipfile -from pathlib import Path from typing import Dict, Iterable, Optional, Union -from urllib.request import urlretrieve from tqdm import tqdm @@ -197,48 +193,3 @@ def update_to(self, chunk_number: int = 1, max_chunk_size: int = 1, total_size=N if total_size is not None: self.total = total_size self.update(chunk_number * max_chunk_size - self.n) - - -def download(url: str, filename: Union[str, Path], description: Optional[str] = None) -> None: - """Download the dataset from the given url. - - This function downloads the dataset from url to the given filename. - - Args: - url (str): Dataset URL - filename (str): Filename to save the file locally. - description (Optional[str], optional): _description_. Defaults to None. - """ - - with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=description) as progress_bar: - urlretrieve(url=url, filename=filename, reporthook=progress_bar.update_to) # nosec - - -def extract(filename: Path, path: Optional[Path] = None) -> None: - """Extract file from tar file. - - Args: - filename (Path): Name of the tar/zip file - path (Optional[Path], optional): Path to which tar/zip file is extracted. Defaults to None. - - Raises: - ValueError: When the file extension is not ".tar", ".gzip", ".bz2", ".lzma" or ".zip" - - """ - if path is None: - path = Path(".") - - if filename.suffix == ".zip": - with zipfile.ZipFile(filename, "r") as zip_file: - zip_file.extractall(path) - else: - try: - with tarfile.open(filename) as tar_file: - tar_file.extractall(path) - except ValueError: - print("Unknown file extension to extract") - - -def clean(filename: Path) -> None: - """Cleanup Dataset tar file.""" - filename.unlink() From df8b655ade1f588c57b10e40fd9b426b0fbbbc74 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Fri, 25 Feb 2022 00:20:06 -0700 Subject: [PATCH 11/24] Address mypy --- anomalib/data/__init__.py | 33 +++++++++++++++++++++------------ anomalib/data/btech.py | 2 +- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 48ff4f9c64..0f53101e9a 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -36,9 +36,27 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule datamodule: LightningDataModule if config.dataset.name.lower() == "mvtec": - datamodule = MVTecDataModule + datamodule = MVTecDataModule( + # TODO: Remove config values. IAAALD-211 + root=config.dataset.path, + category=config.dataset.category, + image_size=(config.dataset.image_size[0], config.dataset.image_size[0]), + train_batch_size=config.dataset.train_batch_size, + test_batch_size=config.dataset.test_batch_size, + num_workers=config.dataset.num_workers, + seed=config.project.seed, + ) elif config.dataset.name.lower() == "btech": - datamodule = BTechDataModule + datamodule = BTechDataModule( + # TODO: Remove config values. 
IAAALD-211 + root=config.dataset.path, + category=config.dataset.category, + image_size=(config.dataset.image_size[0], config.dataset.image_size[0]), + train_batch_size=config.dataset.train_batch_size, + test_batch_size=config.dataset.test_batch_size, + num_workers=config.dataset.num_workers, + seed=config.project.seed, + ) else: raise ValueError( "Unknown dataset! \n" @@ -46,16 +64,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule "`get_datamodule` in `anomalib.data.__init__.py" ) - return datamodule( - # TODO: Remove config values. IAAALD-211 - root=config.dataset.path, - category=config.dataset.category, - image_size=(config.dataset.image_size[0], config.dataset.image_size[0]), - train_batch_size=config.dataset.train_batch_size, - test_batch_size=config.dataset.test_batch_size, - num_workers=config.dataset.num_workers, - seed=config.project.seed, - ) + return datamodule __all__ = ["get_datamodule", "InferenceDataset"] diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index f28e66a5fc..2bdc942f80 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -353,7 +353,7 @@ def prepare_data(self) -> None: zip_file.extractall(self.root) logging.info("Renaming the dataset directory") - shutil.move(src=self.root / "BTech_Dataset_transformed", dst=self.root / "BTech") + shutil.move(src=str(self.root / "BTech_Dataset_transformed"), dst=str(self.root / "BTech")) logging.info("Cleaning the tar file") zip_filename.unlink() From 966ad94721eaeb2d40ab256b9a6b5f9d5b50b9d9 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Fri, 25 Feb 2022 00:27:54 -0700 Subject: [PATCH 12/24] modify config files and update readme.md --- README.md | 1 + anomalib/models/cflow/config.yaml | 3 +-- anomalib/models/dfkde/config.yaml | 3 +-- anomalib/models/dfm/config.yaml | 3 +-- anomalib/models/ganomaly/config.yaml | 3 +-- anomalib/models/padim/config.yaml | 3 +-- anomalib/models/patchcore/config.yaml | 3 +-- anomalib/models/stfpm/config.yaml | 3 +-- 8 files changed, 8 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 762c2b8673..9b7ac1cd9b 100644 --- a/README.md +++ b/README.md @@ -150,6 +150,7 @@ python tools/inference.py \ ___ ## Datasets +The `development` branch supports MVTec and BeanTech datasets. 
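+
+For example, to train on BTech instead of MVTec, point the `dataset` section of a
+model's `config.yaml` at the BTech dataset (a sketch; `category` is quoted so that
+YAML keeps it a string, and the remaining keys are unchanged):
+
+```yaml
+dataset:
+  name: btech
+  path: ./datasets/BTech
+  category: "01"
+```
+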
### [MVTec Dataset](https://www.mvtec.com/company/research/datasets/mvtec-ad) diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index d1d8c26bda..5dd6a47e5a 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -1,8 +1,7 @@ dataset: - name: mvtec + name: mvtec #options: [mvtec, btech] format: mvtec path: ./datasets/MVTec - url: ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz category: bottle task: segmentation label_format: None diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml index e73b636208..56c74ecd5f 100644 --- a/anomalib/models/dfkde/config.yaml +++ b/anomalib/models/dfkde/config.yaml @@ -1,8 +1,7 @@ dataset: - name: mvtec + name: mvtec #options: [mvtec, btech] format: mvtec path: ./datasets/MVTec - url: ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz category: bottle task: classification label_format: None diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml index 9ab2b04161..83548f1139 100755 --- a/anomalib/models/dfm/config.yaml +++ b/anomalib/models/dfm/config.yaml @@ -1,8 +1,7 @@ dataset: - name: mvtec + name: mvtec #options: [mvtec, btech] format: mvtec path: ./datasets/MVTec - url: ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz category: bottle task: classification label_format: None diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml index 2837bf13d1..0fcacf309f 100644 --- a/anomalib/models/ganomaly/config.yaml +++ b/anomalib/models/ganomaly/config.yaml @@ -1,8 +1,7 @@ dataset: - name: mvtec + name: mvtec #options: [mvtec, btech] format: mvtec path: ./datasets/MVTec - url: ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz category: bottle task: classification label_format: None diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml index 8b5df3cc37..86cb2d1aac 100644 --- a/anomalib/models/padim/config.yaml +++ b/anomalib/models/padim/config.yaml @@ -1,8 +1,7 @@ dataset: - name: mvtec + name: mvtec #options: [mvtec, btech] format: mvtec path: ./datasets/MVTec - url: ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz category: bottle task: segmentation label_format: None diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml index ebb1755993..599d957f9d 100644 --- a/anomalib/models/patchcore/config.yaml +++ b/anomalib/models/patchcore/config.yaml @@ -1,8 +1,7 @@ dataset: - name: mvtec + name: mvtec #options: [mvtec, btech] format: mvtec path: ./datasets/MVTec - url: ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz task: segmentation category: bottle label_format: None diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml index 1f476e9f41..958d7d6a58 100644 --- a/anomalib/models/stfpm/config.yaml +++ b/anomalib/models/stfpm/config.yaml @@ -1,8 +1,7 @@ dataset: - name: mvtec + name: mvtec #options: [mvtec, btech] format: mvtec path: ./datasets/MVTec - url: ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz category: bottle task: segmentation label_format: None From 97d98fafc952acdced2537791609dff7499b2ee1 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Fri, 25 Feb 2022 05:44:22 -0700 Subject: [PATCH 
13/24] Fix dataset path

---
 .github/workflows/nightly.yml            |  2 +-
 .github/workflows/pre_merge.yml          |  2 +-
 tests/helpers/dataset.py                 | 27 +++++++++++++++++-------
 tests/pre_merge/datasets/test_dataset.py |  4 ++--
 4 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 1813b0c49d..4124507cfb 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -20,7 +20,7 @@ jobs:
         run: pip install tox
       - name: Coverage
         run: |
-          export ANOMALIB_DATASET_PATH=/media/data1/datasets/MVTec
+          export ANOMALIB_DATASET_PATH=/media/data1/datasets/
           export CUDA_VISIBLE_DEVICES=2
           tox -e nightly
       - name: Upload coverage result
diff --git a/.github/workflows/pre_merge.yml b/.github/workflows/pre_merge.yml
index f6151ca73e..0f0e8c2223 100644
--- a/.github/workflows/pre_merge.yml
+++ b/.github/workflows/pre_merge.yml
@@ -22,7 +22,7 @@ jobs:
         run: tox -e black,isort,flake8,pylint,mypy,pydocstyle
       - name: Coverage
         run: |
-          export ANOMALIB_DATASET_PATH=/media/data1/datasets/MVTec
+          export ANOMALIB_DATASET_PATH=/media/data1/datasets/
           export CUDA_VISIBLE_DEVICES=3
           tox -e pre_merge
       - name: Upload coverage result
diff --git a/tests/helpers/dataset.py b/tests/helpers/dataset.py
index 401704e53d..08db75d824 100644
--- a/tests/helpers/dataset.py
+++ b/tests/helpers/dataset.py
@@ -12,18 +12,29 @@
 from .shapes import random_shapes


-def get_dataset_path(path: Union[str, Path] = "./datasets/MVTec"):
+def get_dataset_path(dataset: str = "MVTec") -> str:
     """Selects path based on tests in local system or docker image.

-    Local install assumes dataset is downloaded to
-    anomaly/datasets/MVTec. In either case, if the location is empty,
-    the dataset is downloaded again. This speeds up tests in docker
-    images where dataset is already stored in /tmp/anomalib
+    Local install assumes datasets are located in anomalib/datasets/.
+    In either case, if the location is empty, the dataset is downloaded again.
+    This speeds up tests in docker images where the dataset is already stored in /tmp/anomalib.
+
+    Example:
+        Assume that the `datasets` directory exists in `~/anomalib/`:
+
+        >>> get_dataset_path(dataset="MVTec")
+        './datasets/MVTec'
+
+    """
-    # when running locally
-    path = str(path)
+    # Initially check if the `datasets` directory exists locally and look
+    # for the `dataset`. This is useful for local testing.
+    path = os.path.join("./datasets", dataset)
+
+    # For docker deployment or a CI that runs on a server, the dataset directory
+    # may not necessarily be located in the repo. Therefore, check the anomalib
+    # dataset path environment variable.
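+    # NOTE: ``os.environ[...]`` raises a ``KeyError`` if ANOMALIB_DATASET_PATH is
+    # not set; the workflow changes above export it explicitly on the CI runners.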
if not os.path.isdir(path): - path = os.environ["ANOMALIB_DATASET_PATH"] + path = os.path.join(os.environ["ANOMALIB_DATASET_PATH"], dataset) return path diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index efb5b179ba..c849422ed2 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -11,7 +11,7 @@ @pytest.fixture(autouse=True) def mvtec_data_module(): datamodule = MVTecDataModule( - root=get_dataset_path(), + root=get_dataset_path(dataset="MVTec"), category="leather", image_size=(256, 256), train_batch_size=1, @@ -28,7 +28,7 @@ def mvtec_data_module(): def btech_data_module(): """Create BTech Data Module.""" datamodule = BTechDataModule( - root=get_dataset_path(path="./datasets/BTech"), + root=get_dataset_path(dataset="BTech"), category="01", image_size=(256, 256), train_batch_size=1, From b71f4d36c249c91ec74168c7f29388dc3ca02a36 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Tue, 15 Mar 2022 01:57:35 -0700 Subject: [PATCH 14/24] WiP: Created make_dataset function --- anomalib/data/folder.py | 170 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 anomalib/data/folder.py diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py new file mode 100644 index 0000000000..eeb2524a34 --- /dev/null +++ b/anomalib/data/folder.py @@ -0,0 +1,170 @@ +"""Custom Folder Dataset. + +This script creates a custom dataset from a folder. +""" + +# Copyright (C) 2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. + +import logging +import tarfile +from distutils import extension +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from urllib.request import urlretrieve + +import albumentations as A +import cv2 +import numpy as np +import pandas as pd +from pandas.core.frame import DataFrame +from pytorch_lightning.core.datamodule import LightningDataModule +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from torch import Tensor +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataset import Dataset +from torchvision.datasets.folder import IMG_EXTENSIONS + +from anomalib.data.inference import InferenceDataset +from anomalib.data.utils import DownloadProgressBar, read_image +from anomalib.data.utils.split import ( + create_validation_set_from_test_set, + split_normal_images_in_train_set, +) +from anomalib.pre_processing import PreProcessor + +logger = logging.getLogger(name="Dataset: MVTec") +logger.setLevel(logging.DEBUG) + + +def __check_and_convert_path(path: Union[str, Path]) -> Path: + """Check an input path, and convert to Pathlib object. + + Args: + path (Union[str, Path]): Input path. + + Returns: + Path: Output path converted to pathlib object. 
+ """ + if not isinstance(path, Path): + path = Path(path) + return path + + +def make_dataset( + normal_dir: Path, + abnormal_dir: Path, + split: Optional[str] = None, + split_ratio: float = 0.1, + seed: int = 0, + create_validation_set: bool = False, + extensions: Optional[Tuple[str, ...]] = None, +) -> DataFrame: + """Create a folder dataset.""" + + normal_dir = __check_and_convert_path(normal_dir) + abnormal_dir = __check_and_convert_path(abnormal_dir) + + if extensions is None: + extensions = IMG_EXTENSIONS + + normal_filenames = [f for f in normal_dir.glob(r"**/*") if f.suffix in extensions] + abnormal_filenames = [f for f in abnormal_dir.glob(r"**/*") if f.suffix in extensions] + + # TODO: Create a pd dataframe based on the above filenames. + + # samples_list = [(str(path),) + filename.parts[-3:] for filename in path.glob("**/*.png")] + # if len(samples_list) == 0: + # raise RuntimeError(f"Found 0 images in {path}") + + # samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"]) + # samples = samples[samples.split != "ground_truth"] + + # # Create mask_path column + # samples["mask_path"] = ( + # samples.path + # + "/ground_truth/" + # + samples.label + # + "/" + # + samples.image_path.str.rstrip("png").str.rstrip(".") + # + "_mask.png" + # ) + + # # Modify image_path column by converting to absolute path + # samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path + + # # Split the normal images in training set if test set doesn't + # # contain any normal images. This is needed because AUC score + # # cannot be computed based on 1-class + # if sum((samples.split == "test") & (samples.label == "good")) == 0: + # samples = split_normal_images_in_train_set(samples, split_ratio, seed) + + # # Good images don't have mask + # samples.loc[(samples.split == "test") & (samples.label == "good"), "mask_path"] = "" + + # # Create label index for normal (0) and anomalous (1) images. + # samples.loc[(samples.label == "good"), "label_index"] = 0 + # samples.loc[(samples.label != "good"), "label_index"] = 1 + # samples.label_index = samples.label_index.astype(int) + + # if create_validation_set: + # samples = create_validation_set_from_test_set(samples, seed=seed) + + # # Get the data frame for the split. + # if split is not None and split in ["train", "val", "test"]: + # samples = samples[samples.split == split] + # samples = samples.reset_index(drop=True) + + # return samples + + +class FolderDataset(Dataset): + """Folder Dataset.""" + + def __init__( + self, + root: Union[str, Path], + normal: Union[Path, str], + abnormal: Union[Path, str], + split: str, + mask: Optional[Union[Path, str]] = None, + pre_process: Optional[PreProcessor] = None, + extensions: Optional[Sequence[str]] = None, + task: str = "segmentation", + seed: int = 0, + create_validation_set: bool = False, + ) -> None: + pass + + def __len__(self) -> int: + """Get length of the dataset.""" + pass + + def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: + """Get dataset item for the index ``index``. + + Args: + index (int): Index to get the item. + + Returns: + Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training. + Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box. 
+ """ + pass + + +samples = make_dataset( + normal_dir="/home/sakcay/projects/anomalib/datasets/MVTec/bottle/test/good", + abnormal_dir="/home/sakcay/projects/anomalib/datasets/MVTec/bottle/test/broken_large", +) From 28f7d3ec76af642281d0788e2c0092167e894b5f Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Mon, 21 Mar 2022 22:23:03 -0700 Subject: [PATCH 15/24] Renamed folder dataset into custom --- anomalib/data/custom.py | 474 ++++++++++++++++++++++++++++++++++++++++ anomalib/data/folder.py | 170 -------------- 2 files changed, 474 insertions(+), 170 deletions(-) create mode 100644 anomalib/data/custom.py delete mode 100644 anomalib/data/folder.py diff --git a/anomalib/data/custom.py b/anomalib/data/custom.py new file mode 100644 index 0000000000..b7dba3bbce --- /dev/null +++ b/anomalib/data/custom.py @@ -0,0 +1,474 @@ +"""Custom Folder Dataset. + +This script creates a custom dataset from a folder. +""" + +# Copyright (C) 2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. + +import logging +from pathlib import Path +from typing import Dict, Optional, Tuple, Union + +import albumentations as A +import cv2 +import numpy as np +from pandas.core.frame import DataFrame +from pytorch_lightning.core.datamodule import LightningDataModule +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from torch import Tensor +from torch.utils.data import DataLoader, Dataset +from torchvision.datasets.folder import IMG_EXTENSIONS + +from anomalib.data.inference import InferenceDataset +from anomalib.data.utils import read_image +from anomalib.data.utils.split import ( + create_validation_set_from_test_set, + split_normal_images_in_train_set, +) +from anomalib.pre_processing import PreProcessor + +logger = logging.getLogger(name="Dataset: Custom Dataset") +logger.setLevel(logging.DEBUG) + + +def _check_and_convert_path(path: Union[str, Path]) -> Path: + """Check an input path, and convert to Pathlib object. + + Args: + path (Union[str, Path]): Input path. + + Returns: + Path: Output path converted to pathlib object. + """ + if not isinstance(path, Path): + path = Path(path) + return path + + +def make_dataset( + normal_dir: Union[str, Path], + abnormal_dir: Union[str, Path], + mask_dir: Optional[Union[str, Path]] = None, + split: Optional[str] = None, + split_ratio: float = 0.2, + seed: int = 0, + create_validation_set: bool = True, + extensions: Optional[Tuple[str, ...]] = None, +): + """Make Custom Dataset. + + Args: + normal_dir (Union[str, Path]): Path to the directory containing normal images. + abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images. + mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing + the mask annotations. Defaults to None. + split (Optional[str], optional): Dataset split (ie., either train or test). Defaults to None. + split_ratio (float, optional): Ratio to split normal training images and add to the + test set in case test set doesn't contain any normal images. 
+ Defaults to 0.2. + seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0. + create_validation_set (bool, optional):Boolean to create a validation set from the test set. + MVTec dataset does not contain a validation set. Those wanting to create a validation set + could set this flag to ``True``. + extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the + directory. + + Returns: + DataFrame: an output dataframe containing samples for the requested split (ie., train or test) + """ + normal_dir = _check_and_convert_path(normal_dir) + abnormal_dir = _check_and_convert_path(abnormal_dir) + + if extensions is None: + extensions = IMG_EXTENSIONS + + # Get filenames from normal and abnormal directory. + normal_filenames = [f for f in normal_dir.glob(r"**/*") if f.suffix in extensions] + abnormal_filenames = [f for f in abnormal_dir.glob(r"**/*") if f.suffix in extensions] + filenames = normal_filenames + abnormal_filenames + + # Add normal and abnormal labels to the samples as `label` column. + normal_labels = ["normal"] * len(normal_filenames) + abnormal_labels = ["abnormal"] * len(abnormal_filenames) + labels = normal_labels + abnormal_labels + + samples = DataFrame({"image_path": filenames, "label": labels}) + + # Create label index for normal (0) and abnormal (1) images. + samples.loc[(samples.label == "normal"), "label_index"] = 0 + samples.loc[(samples.label == "abnormal"), "label_index"] = 1 + samples.label_index = samples.label_index.astype(int) + + # If a path to mask is provided, add it to the sample dataframe. + if mask_dir is not None: + mask_dir = _check_and_convert_path(mask_dir) + normal_gt = ["" for f in normal_filenames] + abnormal_gt = [str(mask_dir / f.name) for f in abnormal_filenames] + gt_filenames = normal_gt + abnormal_gt + + samples["mask_path"] = gt_filenames + + # Ensure the pathlib objects are converted to str. + # This is because torch dataloader doesn't like pathlib. + samples = samples.astype({"image_path": "str"}) + + # Create train/test split. + # By default, all the normal samples are assigned as train. + # and all the abnormal samples are test. + samples.loc[(samples.label == "normal"), "split"] = "train" + samples.loc[(samples.label == "abnormal"), "split"] = "test" + samples = split_normal_images_in_train_set( + samples=samples, split_ratio=split_ratio, seed=seed, normal_label="normal" + ) + + # If `create_validation_set` is set to True, the test set is split into half. + if create_validation_set: + samples = create_validation_set_from_test_set(samples, seed=seed, normal_label="normal") + + # Get the data frame for the split. + if split is not None and split in ["train", "val", "test"]: + samples = samples[samples.split == split] + samples = samples.reset_index(drop=True) + + return samples + + +class CustomDataset(Dataset): + """Custom Dataset.""" + + def __init__( + self, + normal_dir: Union[Path, str], + abnormal_dir: Union[Path, str], + split: str, + pre_process: PreProcessor, + split_ratio: float = 0.2, + mask_dir: Optional[Union[Path, str]] = None, + extensions: Optional[Tuple[str, ...]] = None, + task: Optional[str] = None, + seed: int = 0, + create_validation_set: bool = False, + ) -> None: + """Create Custom Folder Dataset. + + Args: + normal_dir (Union[str, Path]): Path to the directory containing normal images. + abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images. + split (Optional[str], optional): Dataset split (ie., either train or test). 
Defaults to None.
+            pre_process (Optional[PreProcessor], optional): Image pre-processor to apply transform.
+                Defaults to None.
+            split_ratio (float, optional): Ratio to split normal training images and add to the
+                test set in case test set doesn't contain any normal images.
+                Defaults to 0.2.
+            mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing
+                the mask annotations. Defaults to None.
+            extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the
+                directory.
+            task (Optional[str], optional): Task type (classification or segmentation). Defaults to None.
+            seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0.
+            create_validation_set (bool, optional): Boolean to create a validation set from the test set.
+                MVTec dataset does not contain a validation set. Those wanting to create a validation set
+                could set this flag to ``True``.
+
+        Raises:
+            ValueError: When task is set to classification and `mask_dir` is provided. When `mask_dir` is
+                provided, `task` should be set to `segmentation`.
+
+        """
+        self.split = split
+
+        if task == "classification" and mask_dir:
+            raise ValueError(
+                "Classification task is requested, but a mask directory is provided. "
+                "If a mask directory is provided, the task must be segmentation."
+            )
+        if task is None or mask_dir is None:
+            self.task = "classification"
+        else:
+            self.task = task
+
+        self.pre_process = pre_process
+        self.samples = make_dataset(
+            normal_dir=normal_dir,
+            abnormal_dir=abnormal_dir,
+            mask_dir=mask_dir,
+            split=split,
+            split_ratio=split_ratio,
+            seed=seed,
+            create_validation_set=create_validation_set,
+            extensions=extensions,
+        )
+
+    def __len__(self) -> int:
+        """Get length of the dataset."""
+        return len(self.samples)
+
+    def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
+        """Get dataset item for the index ``index``.
+
+        Args:
+            index (int): Index to get the item.
+
+        Returns:
+            Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
+            Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
+        """
+        item: Dict[str, Union[str, Tensor]] = {}
+
+        image_path = self.samples.image_path[index]
+        image = read_image(image_path)
+
+        if self.split == "train" or self.task == "classification":
+            pre_processed = self.pre_process(image=image)
+            item = {"image": pre_processed["image"]}
+        elif self.split in ["val", "test"]:
+            label_index = self.samples.label_index[index]
+
+            item["image_path"] = image_path
+            item["label"] = label_index
+
+            if self.task == "segmentation":
+                mask_path = self.samples.mask_path[index]
+
+                # Only Anomalous (1) images have masks in the MVTec dataset.
+                # Therefore, create an empty mask for Normal (0) images.
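+                # The zero mask below matches the image's spatial size; stored
+                # masks are read as grayscale and rescaled from 0-255 to [0, 1],
+                # so downstream transforms receive a float mask in either branch.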
+ if label_index == 0: + mask = np.zeros(shape=image.shape[:2]) + else: + mask = cv2.imread(mask_path, flags=0) / 255.0 + + pre_processed = self.pre_process(image=image, mask=mask) + + item["mask_path"] = mask_path + item["image"] = pre_processed["image"] + item["mask"] = pre_processed["mask"] + + return item + + +class CustomDataModule(LightningDataModule): + """Custom Lightning Data Module.""" + + def __init__( + self, + root: Union[str, Path], + normal: str = "normal", + abnormal: str = "abnormal", + mask_dir: Optional[Union[Path, str]] = None, + extensions: Optional[Tuple[str, ...]] = None, + split_ratio: float = 0.2, + seed: int = 0, + image_size: Optional[Union[int, Tuple[int, int]]] = None, + train_batch_size: int = 32, + test_batch_size: int = 32, + num_workers: int = 8, + transform_config: Optional[Union[str, A.Compose]] = None, + create_validation_set: bool = False, + ) -> None: + """Custom Dataset PL Datamodule. + + Args: + root (Union[str, Path]): Path to the root folder containing normal and abnormal dirs. + normal (str, optional): Name of the directory containing normal images. + Defaults to "normal". + abnormal (str, optional): Name of the directory containing abnormal images. + Defaults to "abnormal". + mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing + the mask annotations. Defaults to None. + extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the + directory. Defaults to None. + split_ratio (float, optional): Ratio to split normal training images and add to the + test set in case test set doesn't contain any normal images. + Defaults to 0.2. + seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0. + image_size (Optional[Union[int, Tuple[int, int]]], optional): Size of the input image. + Defaults to None. + train_batch_size (int, optional): Training batch size. Defaults to 32. + test_batch_size (int, optional): Test batch size. Defaults to 32. + num_workers (int, optional): Number of workers. Defaults to 8. + transform_config (Optional[Union[str, A.Compose]], optional): Config for pre-processing. + Defaults to None. + create_validation_set (bool, optional):Boolean to create a validation set from the test set. + MVTec dataset does not contain a validation set. Those wanting to create a validation set + could set this flag to ``True``. + + Examples: + Assume that we use Custom Dataset for the MVTec/bottle/broken_large category. We would do: + >>> from anomalib.data import CustomDataModule + >>> datamodule = CustomDataModule( + ... root="./datasets/MVTec/bottle/test", + ... normal="good", + ... abnormal="broken_large", + ... image_size=256 + ... ) + >>> datamodule.setup() + >>> i, data = next(enumerate(datamodule.train_dataloader())) + >>> data["image"].shape + torch.Size([16, 3, 256, 256]) + + >>> i, test_data = next(enumerate(datamodule.test_dataloader())) + >>> test_data.keys() + dict_keys(['image']) + + We could also create a Custom DataModule for datasets containing mask annotations. + The dataset expects that mask annotation filenames must be same as the original filename. + To show an example, we therefore need to modify the mask filenames in MVTec dataset. + + >>> # Rename MVTec mask annotations so that they are the same as image filanames + >>> folder = Path("./datasets/bottle/ground_truth/") + >>> for f in folder.glob(r"**/*.png"): + ... 
f.rename(f.parent / (f.stem.split("_")[0] + f.suffix)) + + Now we could try custom data module using the mvtec bottle broken large category + >>> datamodule = CustomDataModule( + ... root="./datasets/bottle/test", + ... normal="good", + ... abnormal="broken_large", + ... mask_dir="./datasets/bottle/ground_truth/broken_large", + ... image_size=256 + ... ) + + >>> i , train_data = next(enumerate(datamodule.train_dataloader())) + >>> train_data.keys() + dict_keys(['image']) + >>> train_data["image"].shape + torch.Size([16, 3, 256, 256]) + + >>> i, test_data = next(enumerate(datamodule.test_dataloader())) + dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask']) + >>> print(test_data["image"].shape, test_data["mask"].shape) + torch.Size([24, 3, 256, 256]) torch.Size([24, 256, 256]) + + By default, Custom Data Module does not create a validation set. If a validation set + is needed it could be set as follows: + + >>> datamodule = CustomDataModule( + ... root="./datasets/bottle/test", + ... normal="good", + ... abnormal="broken_large", + ... mask_dir="./datasets/bottle/ground_truth/broken_large", + ... image_size=256, + ... create_validation_set=True, + ... ) + + >>> i, val_data = next(enumerate(datamodule.val_dataloader())) + >>> val_data.keys() + dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask']) + >>> print(val_data["image"].shape, val_data["mask"].shape) + torch.Size([12, 3, 256, 256]) torch.Size([12, 256, 256]) + + >>> i, test_data = next(enumerate(datamodule.test_dataloader())) + >>> print(test_data["image"].shape, test_data["mask"].shape) + torch.Size([12, 3, 256, 256]) torch.Size([12, 256, 256]) + + """ + super().__init__() + + self.root = _check_and_convert_path(root) + self.normal_dir = self.root / normal + self.abnormal_dir = self.root / abnormal + self.mask_dir = mask_dir + self.extensions = extensions + self.split_ratio = split_ratio + self.task = "classification" if mask_dir is None else "segmentation" + self.transform_config = transform_config + self.image_size = image_size + + self.pre_process = PreProcessor(config=self.transform_config, image_size=self.image_size) + + self.train_batch_size = train_batch_size + self.test_batch_size = test_batch_size + self.num_workers = num_workers + + self.create_validation_set = create_validation_set + self.seed = seed + + self.train_data: Dataset + self.test_data: Dataset + if create_validation_set: + self.val_data: Dataset + self.inference_data: Dataset + + def setup(self, stage: Optional[str] = None) -> None: + """Setup train, validation and test data. + + Args: + stage: Optional[str]: Train/Val/Test stages. 
(Default value = None) + + """ + if stage in (None, "fit"): + self.train_data = CustomDataset( + normal_dir=self.normal_dir, + abnormal_dir=self.abnormal_dir, + split="train", + split_ratio=self.split_ratio, + mask_dir=self.mask_dir, + pre_process=self.pre_process, + extensions=self.extensions, + task=self.task, + seed=self.seed, + create_validation_set=self.create_validation_set, + ) + + if self.create_validation_set: + self.val_data = CustomDataset( + normal_dir=self.normal_dir, + abnormal_dir=self.abnormal_dir, + split="val", + split_ratio=self.split_ratio, + mask_dir=self.mask_dir, + pre_process=self.pre_process, + extensions=self.extensions, + task=self.task, + seed=self.seed, + create_validation_set=self.create_validation_set, + ) + + self.test_data = CustomDataset( + normal_dir=self.normal_dir, + abnormal_dir=self.abnormal_dir, + split="test", + split_ratio=self.split_ratio, + mask_dir=self.mask_dir, + pre_process=self.pre_process, + extensions=self.extensions, + task=self.task, + seed=self.seed, + create_validation_set=self.create_validation_set, + ) + + if stage == "predict": + self.inference_data = InferenceDataset( + path=self.root, image_size=self.image_size, transform_config=self.transform_config + ) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + """Get train dataloader.""" + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) + + def val_dataloader(self) -> EVAL_DATALOADERS: + """Get validation dataloader.""" + dataset = self.val_data if self.create_validation_set else self.test_data + return DataLoader(dataset=dataset, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + + def test_dataloader(self) -> EVAL_DATALOADERS: + """Get test dataloader.""" + return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + + def predict_dataloader(self) -> EVAL_DATALOADERS: + """Get predict dataloader.""" + return DataLoader( + self.inference_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers + ) diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py deleted file mode 100644 index eeb2524a34..0000000000 --- a/anomalib/data/folder.py +++ /dev/null @@ -1,170 +0,0 @@ -"""Custom Folder Dataset. - -This script creates a custom dataset from a folder. -""" - -# Copyright (C) 2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions -# and limitations under the License. 
- -import logging -import tarfile -from distutils import extension -from pathlib import Path -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union -from urllib.request import urlretrieve - -import albumentations as A -import cv2 -import numpy as np -import pandas as pd -from pandas.core.frame import DataFrame -from pytorch_lightning.core.datamodule import LightningDataModule -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from torch import Tensor -from torch.utils.data import DataLoader, Dataset -from torch.utils.data.dataset import Dataset -from torchvision.datasets.folder import IMG_EXTENSIONS - -from anomalib.data.inference import InferenceDataset -from anomalib.data.utils import DownloadProgressBar, read_image -from anomalib.data.utils.split import ( - create_validation_set_from_test_set, - split_normal_images_in_train_set, -) -from anomalib.pre_processing import PreProcessor - -logger = logging.getLogger(name="Dataset: MVTec") -logger.setLevel(logging.DEBUG) - - -def __check_and_convert_path(path: Union[str, Path]) -> Path: - """Check an input path, and convert to Pathlib object. - - Args: - path (Union[str, Path]): Input path. - - Returns: - Path: Output path converted to pathlib object. - """ - if not isinstance(path, Path): - path = Path(path) - return path - - -def make_dataset( - normal_dir: Path, - abnormal_dir: Path, - split: Optional[str] = None, - split_ratio: float = 0.1, - seed: int = 0, - create_validation_set: bool = False, - extensions: Optional[Tuple[str, ...]] = None, -) -> DataFrame: - """Create a folder dataset.""" - - normal_dir = __check_and_convert_path(normal_dir) - abnormal_dir = __check_and_convert_path(abnormal_dir) - - if extensions is None: - extensions = IMG_EXTENSIONS - - normal_filenames = [f for f in normal_dir.glob(r"**/*") if f.suffix in extensions] - abnormal_filenames = [f for f in abnormal_dir.glob(r"**/*") if f.suffix in extensions] - - # TODO: Create a pd dataframe based on the above filenames. - - # samples_list = [(str(path),) + filename.parts[-3:] for filename in path.glob("**/*.png")] - # if len(samples_list) == 0: - # raise RuntimeError(f"Found 0 images in {path}") - - # samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"]) - # samples = samples[samples.split != "ground_truth"] - - # # Create mask_path column - # samples["mask_path"] = ( - # samples.path - # + "/ground_truth/" - # + samples.label - # + "/" - # + samples.image_path.str.rstrip("png").str.rstrip(".") - # + "_mask.png" - # ) - - # # Modify image_path column by converting to absolute path - # samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path - - # # Split the normal images in training set if test set doesn't - # # contain any normal images. This is needed because AUC score - # # cannot be computed based on 1-class - # if sum((samples.split == "test") & (samples.label == "good")) == 0: - # samples = split_normal_images_in_train_set(samples, split_ratio, seed) - - # # Good images don't have mask - # samples.loc[(samples.split == "test") & (samples.label == "good"), "mask_path"] = "" - - # # Create label index for normal (0) and anomalous (1) images. 
-    # samples.loc[(samples.label == "good"), "label_index"] = 0
-    # samples.loc[(samples.label != "good"), "label_index"] = 1
-    # samples.label_index = samples.label_index.astype(int)
-
-    # if create_validation_set:
-    #     samples = create_validation_set_from_test_set(samples, seed=seed)
-
-    # # Get the data frame for the split.
-    # if split is not None and split in ["train", "val", "test"]:
-    #     samples = samples[samples.split == split]
-    #     samples = samples.reset_index(drop=True)
-
-    # return samples
-
-
-class FolderDataset(Dataset):
-    """Folder Dataset."""
-
-    def __init__(
-        self,
-        root: Union[str, Path],
-        normal: Union[Path, str],
-        abnormal: Union[Path, str],
-        split: str,
-        mask: Optional[Union[Path, str]] = None,
-        pre_process: Optional[PreProcessor] = None,
-        extensions: Optional[Sequence[str]] = None,
-        task: str = "segmentation",
-        seed: int = 0,
-        create_validation_set: bool = False,
-    ) -> None:
-        pass
-
-    def __len__(self) -> int:
-        """Get length of the dataset."""
-        pass
-
-    def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
-        """Get dataset item for the index ``index``.
-
-        Args:
-            index (int): Index to get the item.
-
-        Returns:
-            Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
-            Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
-        """
-        pass
-
-
-samples = make_dataset(
-    normal_dir="/home/sakcay/projects/anomalib/datasets/MVTec/bottle/test/good",
-    abnormal_dir="/home/sakcay/projects/anomalib/datasets/MVTec/bottle/test/broken_large",
-)
From 83c138496a2ef3828e5cf6f421d64d70be7a2abe Mon Sep 17 00:00:00 2001
From: Samet Akcay
Date: Mon, 21 Mar 2022 23:08:16 -0700
Subject: [PATCH 16/24] Added custom dataset tests

---
 anomalib/data/custom.py                  |  6 +++
 tests/pre_merge/datasets/test_dataset.py | 53 +++++++++++++++++++++++-
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/anomalib/data/custom.py b/anomalib/data/custom.py
index b7dba3bbce..33f620e27b 100644
--- a/anomalib/data/custom.py
+++ b/anomalib/data/custom.py
@@ -99,6 +99,12 @@ def make_dataset(
     abnormal_filenames = [f for f in abnormal_dir.glob(r"**/*") if f.suffix in extensions]
     filenames = normal_filenames + abnormal_filenames
 
+    if len(normal_filenames) == 0:
+        raise RuntimeError(f"Found 0 normal images in {normal_dir}")
+
+    if len(abnormal_filenames) == 0:
+        raise RuntimeError(f"Found 0 abnormal images in {abnormal_dir}")
+
     # Add normal and abnormal labels to the samples as `label` column.
normal_labels = ["normal"] * len(normal_filenames) abnormal_labels = ["abnormal"] * len(abnormal_filenames) diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index ab9844c843..f1203a7d61 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -1,10 +1,17 @@ """Test Dataset.""" +import os + import numpy as np import pytest from anomalib.config import get_configurable_parameters, update_input_size_config -from anomalib.data import BTechDataModule, MVTecDataModule, get_datamodule +from anomalib.data import ( + BTechDataModule, + CustomDataModule, + MVTecDataModule, + get_datamodule, +) from anomalib.pre_processing.transforms import Denormalize, ToNumpy from tests.helpers.dataset import TestDataset, get_dataset_path @@ -42,6 +49,27 @@ def btech_data_module(): return datamodule +@pytest.fixture(autouse=True) +def custom_data_module(): + """Create Custom Data Module.""" + datamodule = CustomDataModule( + root="./datasets/bottle/test", + normal="good", + abnormal="broken_large", + mask_dir="./datasets/bottle/ground_truth/broken_large", + split_ratio=0.2, + seed=0, + image_size=(256, 256), + train_batch_size=32, + test_batch_size=32, + num_workers=8, + create_validation_set=True, + ) + datamodule.setup() + + return datamodule + + @pytest.fixture(autouse=True) def data_sample(mvtec_data_module): _, data = next(enumerate(mvtec_data_module.train_dataloader())) @@ -49,6 +77,8 @@ def data_sample(mvtec_data_module): class TestMVTecDataModule: + """Test MVTec Data Module.""" + def test_batch_size(self, mvtec_data_module): """test_mvtec_datamodule [summary]""" _, train_data_sample = next(enumerate(mvtec_data_module.train_dataloader())) @@ -69,7 +99,7 @@ class TestBTechDataModule: """Test BTech Data Module.""" def test_batch_size(self, btech_data_module): - """test_btech_datamodule [summary]""" + """Test batch size.""" _, train_data_sample = next(enumerate(btech_data_module.train_dataloader())) _, val_data_sample = next(enumerate(btech_data_module.val_dataloader())) assert train_data_sample["image"].shape[0] == 1 @@ -84,6 +114,25 @@ def test_val_and_test_dataloaders_has_mask_and_gt(self, btech_data_module): assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) +class TestCustomDataModule: + """Test Custom Data Module.""" + + def test_batch_size(self, custom_data_module): + """Test batch size.""" + _, train_data_sample = next(enumerate(custom_data_module.train_dataloader())) + _, val_data_sample = next(enumerate(custom_data_module.val_dataloader())) + assert train_data_sample["image"].shape[0] == 16 + assert val_data_sample["image"].shape[0] == 12 + + def test_val_and_test_dataloaders_has_mask_and_gt(self, custom_data_module): + """Test Validation and Test dataloaders should return filenames, image, mask and label.""" + _, val_data = next(enumerate(custom_data_module.val_dataloader())) + _, test_data = next(enumerate(custom_data_module.test_dataloader())) + + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) + + class TestDenormalize: """Test Denormalize Util.""" From 09908b0915155f64a55863ef82495985f3a6f740 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Mon, 21 Mar 2022 23:48:03 -0700 Subject: [PATCH 17/24] updated config.yaml file to show custom dataset is available --- anomalib/models/cflow/config.yaml | 2 +- 
anomalib/models/dfkde/config.yaml | 2 +- anomalib/models/dfm/config.yaml | 2 +- anomalib/models/ganomaly/config.yaml | 2 +- anomalib/models/padim/config.yaml | 32 +++++++++++++++++++-------- anomalib/models/patchcore/config.yaml | 2 +- anomalib/models/stfpm/config.yaml | 2 +- 7 files changed, 29 insertions(+), 15 deletions(-) diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index 5dd6a47e5a..5eb3ed5178 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech] + name: mvtec #options: [mvtec, btech, custom] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml index 56c74ecd5f..53163fcec0 100644 --- a/anomalib/models/dfkde/config.yaml +++ b/anomalib/models/dfkde/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech] + name: mvtec #options: [mvtec, btech, custom] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml index 83548f1139..587ea95331 100755 --- a/anomalib/models/dfm/config.yaml +++ b/anomalib/models/dfm/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech] + name: mvtec #options: [mvtec, btech, custom] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml index 0fcacf309f..fba8949228 100644 --- a/anomalib/models/ganomaly/config.yaml +++ b/anomalib/models/ganomaly/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech] + name: mvtec #options: [mvtec, btech, custom] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml index 86cb2d1aac..2bcaa5f019 100644 --- a/anomalib/models/padim/config.yaml +++ b/anomalib/models/padim/config.yaml @@ -1,10 +1,24 @@ dataset: - name: mvtec #options: [mvtec, btech] - format: mvtec - path: ./datasets/MVTec - category: bottle + name: custom #options: [mvtec, btech, custom] + path: ./datasets/bottle/test + normal: good + abnormal: broken_large + mask: ./datasets/bottle/ground_truth/broken_large + extensions: null + split_ratio: 0.2 + seed: 0 + image_size: 256 + train_batch_size: 32 + test_batch_size: 32 + num_workers: 8 + transform_config: null + create_validation_set: true + # name: mvtec #options: [mvtec, btech, custom] + # format: mvtec + # path: ./datasets/MVTec + # category: bottle task: segmentation - label_format: None + # label_format: None tiling: apply: false tile_size: null @@ -12,10 +26,10 @@ dataset: remove_border_count: 0 use_random_tiling: False random_tile_count: 16 - image_size: 256 - train_batch_size: 32 - test_batch_size: 32 - num_workers: 36 + # image_size: 256 + # train_batch_size: 32 + # test_batch_size: 32 + # num_workers: 36 model: name: padim diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml index 599d957f9d..64d6092405 100644 --- a/anomalib/models/patchcore/config.yaml +++ b/anomalib/models/patchcore/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech] + name: mvtec #options: [mvtec, btech, custom] format: mvtec path: ./datasets/MVTec task: segmentation diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml index 958d7d6a58..77d983fab2 100644 --- a/anomalib/models/stfpm/config.yaml +++ b/anomalib/models/stfpm/config.yaml @@ 
-1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech] + name: mvtec #options: [mvtec, btech, custom] format: mvtec path: ./datasets/MVTec category: bottle From 215df462dfdf065d417450aeca812d078ad69fee Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Tue, 22 Mar 2022 00:12:21 -0700 Subject: [PATCH 18/24] Added custom dataset to get_datamodule --- README.md | 27 +++++++++++++++++++++++++++ anomalib/config/config.py | 3 ++- anomalib/data/__init__.py | 20 +++++++++++++++++++- anomalib/data/custom.py | 12 +++++++++++- 4 files changed, 59 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6f5a463658..934bf5f09e 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,33 @@ where the currently available models are: - [DFKDE](anomalib/models/dfkde) - [GANomaly](anomalib/models/ganomaly) +### Custom Dataset +It is also possible to train on a custom dataset. To do so, `data` section in `config.yaml` is to be modified as follows: +```yaml +dataset: + name: custom + path: + normal: normal # name of the folder containing normal images. + abnormal: abnormal # name of the folder containing abnormal images. + task: segmentation # classification or segmentation + mask: #optional + extensions: null + split_ratio: 0.2 + seed: 0 + image_size: 256 + train_batch_size: 32 + test_batch_size: 32 + num_workers: 8 + transform_config: null + create_validation_set: true + tiling: + apply: false + tile_size: null + stride: null + remove_border_count: 0 + use_random_tiling: False + random_tile_count: 16 +``` ## Inference Anomalib contains several tools that can be used to perform inference with a trained model. The script in [`tools/inference`](tools/inference.py) contains an example of how the inference tools can be used to generate a prediction for an input image. diff --git a/anomalib/config/config.py b/anomalib/config/config.py index 27c652d7a6..ed82f4b480 100644 --- a/anomalib/config/config.py +++ b/anomalib/config/config.py @@ -177,7 +177,8 @@ def get_configurable_parameters( config = update_input_size_config(config) # Project Configs - project_path = Path(config.project.path) / config.model.name / config.dataset.name / config.dataset.category + category = config.dataset.category if "category" in config.dataset.keys() else "" + project_path = Path(config.project.path) / config.model.name / config.dataset.name / category (project_path / "weights").mkdir(parents=True, exist_ok=True) (project_path / "images").mkdir(parents=True, exist_ok=True) config.project.path = str(project_path) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 6aa8fab510..55a60af20e 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -20,6 +20,7 @@ from pytorch_lightning import LightningDataModule from .btech import BTechDataModule +from .custom import CustomDataModule from .inference import InferenceDataset from .mvtec import MVTecDataModule @@ -51,12 +52,29 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule # TODO: Remove config values. 
IAAALD-211
             root=config.dataset.path,
             category=config.dataset.category,
-            image_size=(config.dataset.image_size[0], config.dataset.image_size[0]),
+            image_size=(config.dataset.image_size[0], config.dataset.image_size[1]),
             train_batch_size=config.dataset.train_batch_size,
             test_batch_size=config.dataset.test_batch_size,
             num_workers=config.dataset.num_workers,
             seed=config.project.seed,
         )
+    elif config.dataset.name.lower() == "custom":
+        datamodule = CustomDataModule(
+            root=config.dataset.path,
+            normal=config.dataset.normal,
+            abnormal=config.dataset.abnormal,
+            task=config.dataset.task,
+            mask_dir=config.dataset.mask,
+            extensions=config.dataset.extensions,
+            split_ratio=config.dataset.split_ratio,
+            seed=config.dataset.seed,
+            image_size=(config.dataset.image_size[0], config.dataset.image_size[1]),
+            train_batch_size=config.dataset.train_batch_size,
+            test_batch_size=config.dataset.test_batch_size,
+            num_workers=config.dataset.num_workers,
+            transform_config=config.dataset.transform_config,
+            create_validation_set=config.dataset.create_validation_set,
+        )
     else:
         raise ValueError(
             "Unknown dataset! \n"
diff --git a/anomalib/data/custom.py b/anomalib/data/custom.py
index 33f620e27b..96c2f5daca 100644
--- a/anomalib/data/custom.py
+++ b/anomalib/data/custom.py
@@ -272,6 +272,7 @@ def __init__(
         root: Union[str, Path],
         normal: str = "normal",
         abnormal: str = "abnormal",
+        task: str = "classification",
         mask_dir: Optional[Union[Path, str]] = None,
         extensions: Optional[Tuple[str, ...]] = None,
         split_ratio: float = 0.2,
@@ -291,6 +292,8 @@ def __init__(
                 Defaults to "normal".
             abnormal (str, optional): Name of the directory containing abnormal images.
                 Defaults to "abnormal".
+            task (str, optional): Task type. Could be either classification or segmentation.
+                Defaults to "classification".
             mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing
                 the mask annotations. Defaults to None.
             extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the
@@ -388,7 +391,14 @@ def __init__(
         self.mask_dir = mask_dir
         self.extensions = extensions
         self.split_ratio = split_ratio
-        self.task = "classification" if mask_dir is None else "segmentation"
+
+        if task == "classification" and mask_dir is not None:
+            raise ValueError(
+                "Classification task is set, but a mask directory is provided. "
+                "If a mask directory is provided, the task type must be segmentation. "
+                "Check your configuration."
+ ) + self.task = task self.transform_config = transform_config self.image_size = image_size From cf22594dbf44b70bee4577780de91360385f632d Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Wed, 23 Mar 2022 06:03:44 -0700 Subject: [PATCH 19/24] Address PR comments --- README.md | 9 ++--- anomalib/config/config.py | 6 ++-- anomalib/data/__init__.py | 18 ++++++---- anomalib/data/{custom.py => folder.py} | 46 +++++++++++------------- anomalib/models/cflow/config.yaml | 2 +- anomalib/models/dfkde/config.yaml | 2 +- anomalib/models/dfm/config.yaml | 2 +- anomalib/models/ganomaly/config.yaml | 2 +- anomalib/models/padim/config.yaml | 2 +- anomalib/models/patchcore/config.yaml | 2 +- anomalib/models/stfpm/config.yaml | 2 +- tests/pre_merge/datasets/test_dataset.py | 25 ++++++------- 12 files changed, 61 insertions(+), 57 deletions(-) rename anomalib/data/{custom.py => folder.py} (93%) diff --git a/README.md b/README.md index 934bf5f09e..0ca8329af9 100644 --- a/README.md +++ b/README.md @@ -103,17 +103,18 @@ where the currently available models are: - [GANomaly](anomalib/models/ganomaly) ### Custom Dataset -It is also possible to train on a custom dataset. To do so, `data` section in `config.yaml` is to be modified as follows: +It is also possible to train on a custom folder dataset. To do so, `data` section in `config.yaml` is to be modified as follows: ```yaml dataset: - name: custom - path: + name: + format: folder + path: normal: normal # name of the folder containing normal images. abnormal: abnormal # name of the folder containing abnormal images. task: segmentation # classification or segmentation mask: #optional extensions: null - split_ratio: 0.2 + split_ratio: 0.2 # ratio of the normal images that will be used to create a test split seed: 0 image_size: 256 train_batch_size: 32 diff --git a/anomalib/config/config.py b/anomalib/config/config.py index ed82f4b480..ee2e3177bd 100644 --- a/anomalib/config/config.py +++ b/anomalib/config/config.py @@ -177,8 +177,10 @@ def get_configurable_parameters( config = update_input_size_config(config) # Project Configs - category = config.dataset.category if "category" in config.dataset.keys() else "" - project_path = Path(config.project.path) / config.model.name / config.dataset.name / category + project_path = Path(config.project.path) / config.model.name / config.dataset.name + if config.dataset.format.lower() in ("btech", "mvtec"): + project_path = project_path / config.dataset.category + (project_path / "weights").mkdir(parents=True, exist_ok=True) (project_path / "images").mkdir(parents=True, exist_ok=True) config.project.path = str(project_path) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 55a60af20e..a6f3f3b1f1 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -20,7 +20,7 @@ from pytorch_lightning import LightningDataModule from .btech import BTechDataModule -from .custom import CustomDataModule +from .folder import FolderDataModule from .inference import InferenceDataset from .mvtec import MVTecDataModule @@ -36,7 +36,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule """ datamodule: LightningDataModule - if config.dataset.name.lower() == "mvtec": + if config.dataset.format.lower() == "mvtec": datamodule = MVTecDataModule( # TODO: Remove config values. 
IAAALD-211
             root=config.dataset.path,
@@ -47,7 +47,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule
             num_workers=config.dataset.num_workers,
             seed=config.project.seed,
         )
-    elif config.dataset.name.lower() == "btech":
+    elif config.dataset.format.lower() == "btech":
         datamodule = BTechDataModule(
             # TODO: Remove config values. IAAALD-211
             root=config.dataset.path,
@@ -58,8 +58,8 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule
             num_workers=config.dataset.num_workers,
             seed=config.project.seed,
         )
-    elif config.dataset.name.lower() == "custom":
-        datamodule = CustomDataModule(
+    elif config.dataset.format.lower() == "folder":
+        datamodule = FolderDataModule(
             root=config.dataset.path,
             normal=config.dataset.normal,
             abnormal=config.dataset.abnormal,
@@ -85,4 +85,10 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule
     return datamodule
 
 
-__all__ = ["get_datamodule", "InferenceDataset"]
+__all__ = [
+    "get_datamodule",
+    "BTechDataModule",
+    "FolderDataModule",
+    "InferenceDataset",
+    "MVTecDataModule",
+]
diff --git a/anomalib/data/custom.py b/anomalib/data/folder.py
similarity index 93%
rename from anomalib/data/custom.py
rename to anomalib/data/folder.py
index 96c2f5daca..d935d75292 100644
--- a/anomalib/data/custom.py
+++ b/anomalib/data/folder.py
@@ -39,7 +39,7 @@
 )
 from anomalib.pre_processing import PreProcessor
 
-logger = logging.getLogger(name="Dataset: Custom Dataset")
+logger = logging.getLogger(name="Dataset: Folder Dataset")
 logger.setLevel(logging.DEBUG)
 
 
@@ -67,7 +67,7 @@ def make_dataset(
     create_validation_set: bool = True,
     extensions: Optional[Tuple[str, ...]] = None,
 ):
-    """Make Custom Dataset.
+    """Make Folder Dataset.
 
     Args:
         normal_dir (Union[str, Path]): Path to the directory containing normal images.
@@ -151,8 +151,8 @@ def make_dataset(
     return samples
 
 
-class CustomDataset(Dataset):
-    """Custom Dataset."""
+class FolderDataset(Dataset):
+    """Folder Dataset."""
 
     def __init__(
         self,
@@ -167,7 +167,7 @@ def __init__(
         seed: int = 0,
         create_validation_set: bool = False,
     ) -> None:
-        """Create Custom Folder Dataset.
+        """Create Folder Dataset.
 
         Args:
             normal_dir (Union[str, Path]): Path to the directory containing normal images.
@@ -264,8 +264,8 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
         return item
 
 
-class CustomDataModule(LightningDataModule):
-    """Custom Lightning Data Module."""
+class FolderDataModule(LightningDataModule):
+    """Folder Lightning Data Module."""
 
     def __init__(
         self,
@@ -284,7 +284,7 @@ def __init__(
         transform_config: Optional[Union[str, A.Compose]] = None,
         create_validation_set: bool = False,
     ) -> None:
-        """Custom Dataset PL Datamodule.
+        """Folder Dataset PL Datamodule.
 
         Args:
             root (Union[str, Path]): Path to the root folder containing normal and abnormal dirs.
@@ -314,9 +314,9 @@ def __init__(
             could set this flag to ``True``.
 
         Examples:
-            Assume that we use Custom Dataset for the MVTec/bottle/broken_large category. We would do:
-            >>> from anomalib.data import CustomDataModule
-            >>> datamodule = CustomDataModule(
+            Assume that we use Folder Dataset for the MVTec/bottle/broken_large category. We would do:
+            >>> from anomalib.data import FolderDataModule
+            >>> datamodule = FolderDataModule(
             ...     root="./datasets/MVTec/bottle/test",
             ...     normal="good",
             ...     abnormal="broken_large",
@@ -331,17 +331,11 @@ def __init__(
             >>> test_data.keys()
             dict_keys(['image'])
 
-            We could also create a Custom DataModule for datasets containing mask annotations.
+ We could also create a Folder DataModule for datasets containing mask annotations. The dataset expects that mask annotation filenames must be same as the original filename. - To show an example, we therefore need to modify the mask filenames in MVTec dataset. - - >>> # Rename MVTec mask annotations so that they are the same as image filanames - >>> folder = Path("./datasets/bottle/ground_truth/") - >>> for f in folder.glob(r"**/*.png"): - ... f.rename(f.parent / (f.stem.split("_")[0] + f.suffix)) - - Now we could try custom data module using the mvtec bottle broken large category - >>> datamodule = CustomDataModule( + To this end, we modified mask filenames in MVTec bottle category. + Now we could try folder data module using the mvtec bottle broken large category + >>> datamodule = FolderDataModule( ... root="./datasets/bottle/test", ... normal="good", ... abnormal="broken_large", @@ -360,10 +354,10 @@ def __init__( >>> print(test_data["image"].shape, test_data["mask"].shape) torch.Size([24, 3, 256, 256]) torch.Size([24, 256, 256]) - By default, Custom Data Module does not create a validation set. If a validation set + By default, Folder Data Module does not create a validation set. If a validation set is needed it could be set as follows: - >>> datamodule = CustomDataModule( + >>> datamodule = FolderDataModule( ... root="./datasets/bottle/test", ... normal="good", ... abnormal="broken_large", @@ -425,7 +419,7 @@ def setup(self, stage: Optional[str] = None) -> None: """ if stage in (None, "fit"): - self.train_data = CustomDataset( + self.train_data = FolderDataset( normal_dir=self.normal_dir, abnormal_dir=self.abnormal_dir, split="train", @@ -439,7 +433,7 @@ def setup(self, stage: Optional[str] = None) -> None: ) if self.create_validation_set: - self.val_data = CustomDataset( + self.val_data = FolderDataset( normal_dir=self.normal_dir, abnormal_dir=self.abnormal_dir, split="val", @@ -452,7 +446,7 @@ def setup(self, stage: Optional[str] = None) -> None: create_validation_set=self.create_validation_set, ) - self.test_data = CustomDataset( + self.test_data = FolderDataset( normal_dir=self.normal_dir, abnormal_dir=self.abnormal_dir, split="test", diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index 5eb3ed5178..915e371745 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech, custom] + name: mvtec #options: [mvtec, btech, folder] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml index 53163fcec0..abd2fba02f 100644 --- a/anomalib/models/dfkde/config.yaml +++ b/anomalib/models/dfkde/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech, custom] + name: mvtec #options: [mvtec, btech, folder] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml index 587ea95331..6740e86f38 100755 --- a/anomalib/models/dfm/config.yaml +++ b/anomalib/models/dfm/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech, custom] + name: mvtec #options: [mvtec, btech, folder] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml index fba8949228..774537b903 100644 --- a/anomalib/models/ganomaly/config.yaml +++ b/anomalib/models/ganomaly/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: 
[mvtec, btech, custom] + name: mvtec #options: [mvtec, btech, folder] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml index 86cb2d1aac..b6ac798373 100644 --- a/anomalib/models/padim/config.yaml +++ b/anomalib/models/padim/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech] + name: mvtec #options: [mvtec, btech, folder] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml index 64d6092405..10cfe9abf2 100644 --- a/anomalib/models/patchcore/config.yaml +++ b/anomalib/models/patchcore/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech, custom] + name: mvtec #options: [mvtec, btech, folder] format: mvtec path: ./datasets/MVTec task: segmentation diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml index 77d983fab2..ab17e5fd29 100644 --- a/anomalib/models/stfpm/config.yaml +++ b/anomalib/models/stfpm/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech, custom] + name: mvtec #options: [mvtec, btech, folder] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index f1203a7d61..9e428d9113 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -8,7 +8,7 @@ from anomalib.config import get_configurable_parameters, update_input_size_config from anomalib.data import ( BTechDataModule, - CustomDataModule, + FolderDataModule, MVTecDataModule, get_datamodule, ) @@ -50,13 +50,14 @@ def btech_data_module(): @pytest.fixture(autouse=True) -def custom_data_module(): - """Create Custom Data Module.""" - datamodule = CustomDataModule( +def folder_data_module(): + """Create Folder Data Module.""" + datamodule = FolderDataModule( root="./datasets/bottle/test", normal="good", abnormal="broken_large", mask_dir="./datasets/bottle/ground_truth/broken_large", + task="segmentation", split_ratio=0.2, seed=0, image_size=(256, 256), @@ -114,20 +115,20 @@ def test_val_and_test_dataloaders_has_mask_and_gt(self, btech_data_module): assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) -class TestCustomDataModule: - """Test Custom Data Module.""" +class TestFolderDataModule: + """Test Folder Data Module.""" - def test_batch_size(self, custom_data_module): + def test_batch_size(self, folder_data_module): """Test batch size.""" - _, train_data_sample = next(enumerate(custom_data_module.train_dataloader())) - _, val_data_sample = next(enumerate(custom_data_module.val_dataloader())) + _, train_data_sample = next(enumerate(folder_data_module.train_dataloader())) + _, val_data_sample = next(enumerate(folder_data_module.val_dataloader())) assert train_data_sample["image"].shape[0] == 16 assert val_data_sample["image"].shape[0] == 12 - def test_val_and_test_dataloaders_has_mask_and_gt(self, custom_data_module): + def test_val_and_test_dataloaders_has_mask_and_gt(self, folder_data_module): """Test Validation and Test dataloaders should return filenames, image, mask and label.""" - _, val_data = next(enumerate(custom_data_module.val_dataloader())) - _, test_data = next(enumerate(custom_data_module.test_dataloader())) + _, val_data = next(enumerate(folder_data_module.val_dataloader())) + _, test_data = next(enumerate(folder_data_module.test_dataloader())) assert 
sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) From 6646c3b93df2be23bb561864ae12f6dafe0de3a9 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Wed, 23 Mar 2022 07:56:38 -0700 Subject: [PATCH 20/24] fix dataset path --- tests/pre_merge/datasets/test_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index 9e428d9113..b74d07dac5 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -53,7 +53,7 @@ def btech_data_module(): def folder_data_module(): """Create Folder Data Module.""" datamodule = FolderDataModule( - root="./datasets/bottle/test", + root=get_dataset_path(dataset="bottle/test"), normal="good", abnormal="broken_large", mask_dir="./datasets/bottle/ground_truth/broken_large", From b3cf100c94f37bb4121b04e5f036cd8a4b27be4d Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Mar 2022 01:13:18 -0700 Subject: [PATCH 21/24] Debugging the ci --- .github/workflows/pre_merge.yml | 4 ++-- tox.ini | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pre_merge.yml b/.github/workflows/pre_merge.yml index 0f0e8c2223..d7fd00c8a8 100644 --- a/.github/workflows/pre_merge.yml +++ b/.github/workflows/pre_merge.yml @@ -18,8 +18,8 @@ jobs: uses: actions/checkout@v2 - name: Install Tox run: pip install tox - - name: Code quality checks - run: tox -e black,isort,flake8,pylint,mypy,pydocstyle + # - name: Code quality checks + # run: tox -e black,isort,flake8,pylint,mypy,pydocstyle - name: Coverage run: | export ANOMALIB_DATASET_PATH=/media/data1/datasets/ diff --git a/tox.ini b/tox.ini index 74aee167b3..e6e765b2ee 100644 --- a/tox.ini +++ b/tox.ini @@ -72,11 +72,12 @@ deps = -r{toxinidir}/requirements/base.txt -r{toxinidir}/requirements/openvino.txt commands = - coverage erase - coverage run --include=anomalib/* -m pytest tests/pre_merge/ -ra --showlocals - ; https://github.com/openvinotoolkit/anomalib/issues/94 - coverage report -m --fail-under=85 - coverage xml -o {toxworkdir}/coverage.xml + python -m pytest tests/pre_merge/datasets/test_dataset.py + ; coverage erase + ; coverage run --include=anomalib/* -m pytest tests/pre_merge/ -ra --showlocals + ; ; https://github.com/openvinotoolkit/anomalib/issues/94 + ; coverage report -m --fail-under=85 + ; coverage xml -o {toxworkdir}/coverage.xml [testenv:nightly] basepython = python3 From 00e8020d7255cb5b6870128e5bbb1ff7ef675cf1 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Mar 2022 02:37:12 -0700 Subject: [PATCH 22/24] Fixed folder dataset tests --- tests/pre_merge/datasets/test_dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index b74d07dac5..fad7a48e0b 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -52,11 +52,12 @@ def btech_data_module(): @pytest.fixture(autouse=True) def folder_data_module(): """Create Folder Data Module.""" + root = get_dataset_path(dataset="bottle") datamodule = FolderDataModule( - root=get_dataset_path(dataset="bottle/test"), + root=root, normal="good", abnormal="broken_large", - mask_dir="./datasets/bottle/ground_truth/broken_large", + mask_dir=os.path.join(root, "ground_truth/broken_large"), task="segmentation", 
split_ratio=0.2, seed=0, From 8e47bd35b223bfd61e21a264fff6c1d5e93f41b7 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Mar 2022 02:38:13 -0700 Subject: [PATCH 23/24] Added code quality checks back to the ci --- .github/workflows/pre_merge.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pre_merge.yml b/.github/workflows/pre_merge.yml index d7fd00c8a8..0f0e8c2223 100644 --- a/.github/workflows/pre_merge.yml +++ b/.github/workflows/pre_merge.yml @@ -18,8 +18,8 @@ jobs: uses: actions/checkout@v2 - name: Install Tox run: pip install tox - # - name: Code quality checks - # run: tox -e black,isort,flake8,pylint,mypy,pydocstyle + - name: Code quality checks + run: tox -e black,isort,flake8,pylint,mypy,pydocstyle - name: Coverage run: | export ANOMALIB_DATASET_PATH=/media/data1/datasets/ From 314b16404799c6dcae948e5b284f77d8ccc4bf96 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Mar 2022 02:38:36 -0700 Subject: [PATCH 24/24] Added code coverage back to pre-merge tests --- tox.ini | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tox.ini b/tox.ini index e6e765b2ee..74aee167b3 100644 --- a/tox.ini +++ b/tox.ini @@ -72,12 +72,11 @@ deps = -r{toxinidir}/requirements/base.txt -r{toxinidir}/requirements/openvino.txt commands = - python -m pytest tests/pre_merge/datasets/test_dataset.py - ; coverage erase - ; coverage run --include=anomalib/* -m pytest tests/pre_merge/ -ra --showlocals - ; ; https://github.com/openvinotoolkit/anomalib/issues/94 - ; coverage report -m --fail-under=85 - ; coverage xml -o {toxworkdir}/coverage.xml + coverage erase + coverage run --include=anomalib/* -m pytest tests/pre_merge/ -ra --showlocals + ; https://github.com/openvinotoolkit/anomalib/issues/94 + coverage report -m --fail-under=85 + coverage xml -o {toxworkdir}/coverage.xml [testenv:nightly] basepython = python3
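
For reference, a minimal end-to-end sketch of the folder datamodule introduced in this series, mirroring the doctest examples in `anomalib/data/folder.py`. The paths below are illustrative placeholders (an MVTec-style "bottle" layout is assumed); any root with separate normal/abnormal sub-folders, and optionally a mask directory with matching filenames, should work:

```python
# Sketch only: follows the FolderDataModule API as of PATCH 19; the
# dataset paths are placeholders, not part of the repository.
from anomalib.data import FolderDataModule

datamodule = FolderDataModule(
    root="./datasets/bottle/test",
    normal="good",            # sub-folder of `root` with normal images
    abnormal="broken_large",  # sub-folder of `root` with abnormal images
    task="segmentation",      # must be segmentation when mask_dir is given
    mask_dir="./datasets/bottle/ground_truth/broken_large",
    split_ratio=0.2,          # share of normal images moved to the test split
    image_size=(256, 256),
    train_batch_size=32,
    test_batch_size=32,
    num_workers=8,
    create_validation_set=True,
)
datamodule.setup()

train_batch = next(iter(datamodule.train_dataloader()))
print(train_batch["image"].shape)  # e.g. torch.Size([16, 3, 256, 256])

test_batch = next(iter(datamodule.test_dataloader()))
print(sorted(test_batch.keys()))
# ['image', 'image_path', 'label', 'mask', 'mask_path']
```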