From f175a24c459cf5eacd0e6c4d8512e2353ae6b2f1 Mon Sep 17 00:00:00 2001
From: Samet Akcay
Date: Thu, 24 Feb 2022 04:24:17 -0700
Subject: [PATCH 01/24] renamed download-progress-bar as download

---
 .../{download_progress_bar.py => download.py} | 49 +++++++++++++++++++
 1 file changed, 49 insertions(+)
 rename anomalib/data/utils/{download_progress_bar.py => download.py} (88%)

diff --git a/anomalib/data/utils/download_progress_bar.py b/anomalib/data/utils/download.py
similarity index 88%
rename from anomalib/data/utils/download_progress_bar.py
rename to anomalib/data/utils/download.py
index 26af24834a..9f0ec4980f 100644
--- a/anomalib/data/utils/download_progress_bar.py
+++ b/anomalib/data/utils/download.py
@@ -18,7 +18,11 @@
 # and limitations under the License.

 import io
+import tarfile
+import zipfile
+from pathlib import Path
 from typing import Dict, Iterable, Optional, Union
+from urllib.request import urlretrieve

 from tqdm import tqdm

@@ -193,3 +197,48 @@ def update_to(self, chunk_number: int = 1, max_chunk_size: int = 1, total_size=N
         if total_size is not None:
             self.total = total_size
         self.update(chunk_number * max_chunk_size - self.n)
+
+
+def download(url: str, filename: Union[str, Path], description: Optional[str] = None) -> None:
+    """Download the dataset from the given url.
+
+    This function downloads the dataset from url to the given filename.
+
+    Args:
+        url (str): Dataset URL
+        filename (str): Filename to save the file locally.
+        description (Optional[str], optional): Description shown next to the progress bar. Defaults to None.
+    """
+
+    with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=description) as progress_bar:
+        urlretrieve(url=url, filename=filename, reporthook=progress_bar.update_to)  # nosec
+
+
+def extract(filename: Path, path: Optional[Path] = None) -> None:
+    """Extract a zip or tar archive.
+
+    Args:
+        filename (Path): Name of the tar/zip file
+        path (Optional[Path], optional): Path to which tar/zip file is extracted. Defaults to None.
+
+    Note:
+        Archives that are neither zip nor a tar format readable by ``tarfile`` are reported as unknown.
+
+    """
+    if path is None:
+        path = Path(".")
+
+    if filename.suffix == ".zip":
+        with zipfile.ZipFile(filename, "r") as zip_file:
+            zip_file.extractall(path)
+    else:
+        try:
+            with tarfile.open(filename) as tar_file:
+                tar_file.extractall(path)
+        except (ValueError, tarfile.ReadError):
+            print("Unknown file extension to extract")
+
+
+def clean(filename: Path) -> None:
+    """Clean up the downloaded archive file."""
+    filename.unlink()
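Taken together, the three helpers compose into the usual fetch/unpack/cleanup flow. A minimal sketch, using the BTech archive URL that appears later in this series for illustration (note that patch 10 removes these helpers again in favour of inlining the same steps in each datamodule's `prepare_data`):

```python
from pathlib import Path

from anomalib.data.utils.download import clean, download, extract

archive = Path("./datasets/btad.zip")
download(url="https://avires.dimi.uniud.it/papers/btad/btad.zip", filename=archive, description="BTech")
extract(archive, path=Path("./datasets"))  # dispatches on suffix: zip via zipfile, everything else via tarfile
clean(archive)  # deletes the archive once its contents are extracted
```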
From f841f51811f9f6cdc8bbf02b1facc3ad20a9c153 Mon Sep 17 00:00:00 2001
From: Samet Akcay
Date: Thu, 24 Feb 2022 04:26:46 -0700
Subject: [PATCH 02/24] added new download functions to init

---
 anomalib/data/utils/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/anomalib/data/utils/__init__.py b/anomalib/data/utils/__init__.py
index 01c8f98459..8b22ad4405 100644
--- a/anomalib/data/utils/__init__.py
+++ b/anomalib/data/utils/__init__.py
@@ -14,7 +14,7 @@
 # See the License for the specific language governing permissions
 # and limitations under the License.

-from .download_progress_bar import DownloadProgressBar
+from .download import DownloadProgressBar, clean, download, extract
 from .image import get_image_filenames, read_image

-__all__ = ["get_image_filenames", "read_image", "DownloadProgressBar"]
+__all__ = ["get_image_filenames", "read_image", "DownloadProgressBar", "download", "extract", "clean"]

From 12cd8ee572091d5e67c96e794e991e3e1c2e9526 Mon Sep 17 00:00:00 2001
From: Samet Akcay
Date: Thu, 24 Feb 2022 23:15:19 -0700
Subject: [PATCH 03/24] Added Btech data module

---
 anomalib/data/btech.py | 424 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 424 insertions(+)
 create mode 100644 anomalib/data/btech.py

diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py
new file mode 100644
index 0000000000..13874cb337
--- /dev/null
+++ b/anomalib/data/btech.py
@@ -0,0 +1,424 @@
+"""BTech Dataset.
+
+This script contains the PyTorch Lightning DataModule for the BTech dataset.
+
+If the dataset is not on the file system, the script downloads and
+extracts the dataset and creates PyTorch data objects.
+"""
+
+# Copyright (C) 2020 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+
+import logging
+import shutil
+import zipfile
+from pathlib import Path
+from typing import Dict, Optional, Tuple, Union
+from urllib.request import urlretrieve
+
+import albumentations as A
+import cv2
+import numpy as np
+import pandas as pd
+from pandas.core.frame import DataFrame
+from pytorch_lightning.core.datamodule import LightningDataModule
+from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS
+from torch import Tensor
+from torch.utils.data import DataLoader
+from torch.utils.data.dataset import Dataset
+from torchvision.datasets.folder import VisionDataset
+
+from anomalib.data.inference import InferenceDataset
+from anomalib.data.utils import DownloadProgressBar, read_image
+from anomalib.data.utils.split import (
+    create_validation_set_from_test_set,
+    split_normal_images_in_train_set,
+)
+from anomalib.pre_processing import PreProcessor
+
+logger = logging.getLogger(name="Dataset: BTech")
+logger.setLevel(logging.DEBUG)
+
+
+def make_btech_dataset(
+    path: Path,
+    split: Optional[str] = None,
+    split_ratio: float = 0.1,
+    seed: int = 0,
+    create_validation_set: bool = False,
+) -> DataFrame:
+    """Create BTech samples by parsing the BTech data file structure.
+
+    The files are expected to follow the structure:
+        path/to/dataset/split/label/image_filename.bmp
+        path/to/dataset/ground_truth/label/mask_filename.png
+
+    Args:
+        path (Path): Path to dataset
+        split (str, optional): Dataset split (i.e., either train or test). Defaults to None.
+        split_ratio (float, optional): Ratio to split normal training images and add to the
+            test set in case test set doesn't contain any normal images.
+            Defaults to 0.1.
+        seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0.
+        create_validation_set (bool, optional): Boolean to create a validation set from the test set.
+            BTech dataset does not contain a validation set. Those wanting to create a validation set
+            could set this flag to ``True``.
+
+    Example:
+        The following example shows how to get training samples from BTech 01 category:
+
+        >>> root = Path('./BTech')
+        >>> category = '01'
+        >>> path = root / category
+        >>> path
+        PosixPath('BTech/01')
+
+        >>> samples = make_btech_dataset(path, split='train', split_ratio=0.1, seed=0)
+        >>> samples.head()
+           path     split label image_path                  mask_path                          label_index
+        0  BTech/01 train 01  BTech/01/train/good/105.bmp BTech/01/ground_truth/good/105.bmp 0
+        1  BTech/01 train 01  BTech/01/train/good/017.bmp BTech/01/ground_truth/good/017.bmp 0
+        2  BTech/01 train 01  BTech/01/train/good/137.bmp BTech/01/ground_truth/good/137.bmp 0
+        3  BTech/01 train 01  BTech/01/train/good/152.bmp BTech/01/ground_truth/good/152.bmp 0
+        4  BTech/01 train 01  BTech/01/train/good/109.bmp BTech/01/ground_truth/good/109.bmp 0
+
+    Returns:
+        DataFrame: an output dataframe containing samples for the requested split (i.e., train or test)
+    """
+    samples_list = [(str(path),) + filename.parts[-3:] for filename in path.glob("**/*.bmp")]
+    if len(samples_list) == 0:
+        raise RuntimeError(f"Found 0 images in {path}")
+
+    samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"])
+    samples = samples[samples.split != "ground_truth"]
+
+    # Create mask_path column
+    samples["mask_path"] = (
+        samples.path
+        + "/ground_truth/"
+        + samples.label
+        + "/"
+        + samples.image_path.str.rstrip("bmp").str.rstrip(".")
+        + ".png"
+    )
+
+    # Modify image_path column by converting to absolute path
+    samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path
+
+    # Split the normal images in training set if test set doesn't
+    # contain any normal images. This is needed because AUC score
+    # cannot be computed based on 1-class
+    if sum((samples.split == "test") & (samples.label == "ok")) == 0:
+        samples = split_normal_images_in_train_set(samples, split_ratio, seed, normal_label="ok")
+
+    # Good images don't have mask
+    samples.loc[(samples.split == "test") & (samples.label == "ok"), "mask_path"] = ""
+
+    # Create label index for normal (0) and anomalous (1) images.
+    samples.loc[(samples.label == "ok"), "label_index"] = 0
+    samples.loc[(samples.label != "ok"), "label_index"] = 1
+    samples.label_index = samples.label_index.astype(int)
+
+    if create_validation_set:
+        samples = create_validation_set_from_test_set(samples, seed=seed, normal_label="ok")
+
+    # Get the data frame for the split.
+    if split is not None and split in ["train", "val", "test"]:
+        samples = samples[samples.split == split]
+        samples = samples.reset_index(drop=True)
+
+    return samples
+
+
+class BTech(VisionDataset):
+    """BTech PyTorch Dataset."""
+
+    def __init__(
+        self,
+        root: Union[Path, str],
+        category: str,
+        pre_process: PreProcessor,
+        split: str,
+        task: str = "segmentation",
+        seed: int = 0,
+        create_validation_set: bool = False,
+    ) -> None:
+        """BTech Dataset class.
+
+        Args:
+            root: Path to the BTech dataset
+            category: Name of the BTech category.
+            pre_process: Pre-processing object containing the albumentations compose.
+            split: 'train', 'val' or 'test'
+            task: ``classification`` or ``segmentation``
+            seed: seed used for the random subset splitting
+            create_validation_set: Create a validation subset in addition to the train and test subsets
+
+        Examples:
+            >>> from anomalib.data.btech import BTech
+            >>> from anomalib.pre_processing import PreProcessor
+            >>> pre_process = PreProcessor(image_size=256)
+            >>> dataset = BTech(
+            ...     root='./datasets/BTech',
+            ...     category='01',
+            ...     pre_process=pre_process,
+            ...     task="classification",
+            ...     split="train",
+            ... )
+            >>> dataset[0].keys()
+            dict_keys(['image'])
+
+            >>> dataset.split = "test"
+            >>> dataset[0].keys()
+            dict_keys(['image', 'image_path', 'label'])
+
+            >>> dataset.task = "segmentation"
+            >>> dataset.split = "train"
+            >>> dataset[0].keys()
+            dict_keys(['image'])
+
+            >>> dataset.split = "test"
+            >>> dataset[0].keys()
+            dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask'])
+
+            >>> dataset[0]["image"].shape, dataset[0]["mask"].shape
+            (torch.Size([3, 256, 256]), torch.Size([256, 256]))
+        """
+        super().__init__(root)
+        self.root = Path(root) if isinstance(root, str) else root
+        self.category: str = category
+        self.split = split
+        self.task = task
+
+        self.pre_process = pre_process
+
+        self.samples = make_btech_dataset(
+            path=self.root / category,
+            split=self.split,
+            seed=seed,
+            create_validation_set=create_validation_set,
+        )
+
+    def __len__(self) -> int:
+        """Get length of the dataset."""
+        return len(self.samples)
+
+    def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
+        """Get dataset item for the index ``index``.
+
+        Args:
+            index (int): Index to get the item.
+
+        Returns:
+            Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
+                Otherwise, Dict containing image path, mask path, image tensor, label and mask.
+        """
+        item: Dict[str, Union[str, Tensor]] = {}
+
+        image_path = self.samples.image_path[index]
+        image = read_image(image_path)
+
+        if self.split == "train" or self.task == "classification":
+            pre_processed = self.pre_process(image=image)
+            item = {"image": pre_processed["image"]}
+        elif self.split in ["val", "test"]:
+            label_index = self.samples.label_index[index]
+
+            item["image_path"] = image_path
+            item["label"] = label_index
+
+            if self.task == "segmentation":
+                mask_path = self.samples.mask_path[index]
+
+                # Only anomalous (1) images have masks in the BTech dataset.
+                # Therefore, create an empty mask for normal (0) images.
+                if label_index == 0:
+                    mask = np.zeros(shape=image.shape[:2])
+                else:
+                    mask = cv2.imread(mask_path, flags=0) / 255.0
+
+                pre_processed = self.pre_process(image=image, mask=mask)
+
+                item["mask_path"] = mask_path
+                item["image"] = pre_processed["image"]
+                item["mask"] = pre_processed["mask"]
+
+        return item
+
+
+class BTechDataModule(LightningDataModule):
+    """BTechDataModule Lightning Data Module."""
+
+    def __init__(
+        self,
+        root: str,
+        category: str,
+        # TODO: Remove default values. IAAALD-211
+        image_size: Optional[Union[int, Tuple[int, int]]] = None,
+        train_batch_size: int = 32,
+        test_batch_size: int = 32,
+        num_workers: int = 8,
+        transform_config: Optional[Union[str, A.Compose]] = None,
+        seed: int = 0,
+        create_validation_set: bool = False,
+    ) -> None:
+        """Instantiate BTech Lightning Data Module.
+
+        Args:
+            root: Path to the BTech dataset
+            category: Name of the BTech category.
+            image_size: Variable to which image is resized.
+            train_batch_size: Training batch size.
+            test_batch_size: Testing batch size.
+            num_workers: Number of workers.
+            transform_config: Config for pre-processing.
+            seed: seed used for the random subset splitting
+            create_validation_set: Create a validation subset in addition to the train and test subsets
+
+        Examples:
+            >>> from anomalib.data import BTechDataModule
+            >>> datamodule = BTechDataModule(
+            ...     root="./datasets/BTech",
+            ...     category="01",
+            ...     image_size=256,
+            ...     train_batch_size=32,
+            ...     test_batch_size=32,
+            ...     num_workers=8,
+            ...     transform_config=None,
+            ... )
+            >>> datamodule.setup()
+
+            >>> i, data = next(enumerate(datamodule.train_dataloader()))
+            >>> data.keys()
+            dict_keys(['image'])
+            >>> data["image"].shape
+            torch.Size([32, 3, 256, 256])
+
+            >>> i, data = next(enumerate(datamodule.val_dataloader()))
+            >>> data.keys()
+            dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask'])
+            >>> data["image"].shape, data["mask"].shape
+            (torch.Size([32, 3, 256, 256]), torch.Size([32, 256, 256]))
+        """
+        super().__init__()
+
+        self.root = root if isinstance(root, Path) else Path(root)
+        self.category = category
+        self.dataset_path = self.root / self.category
+        self.transform_config = transform_config
+        self.image_size = image_size
+
+        self.pre_process = PreProcessor(config=self.transform_config, image_size=self.image_size)
+
+        self.train_batch_size = train_batch_size
+        self.test_batch_size = test_batch_size
+        self.num_workers = num_workers
+
+        self.create_validation_set = create_validation_set
+        self.seed = seed
+
+        self.train_data: Dataset
+        self.test_data: Dataset
+        if create_validation_set:
+            self.val_data: Dataset
+        self.inference_data: Dataset
+
+    def prepare_data(self) -> None:
+        """Download the dataset if it is not available."""
+        if (self.root / self.category).is_dir():
+            logging.info("Found the dataset.")
+        else:
+            self.root.mkdir(parents=True, exist_ok=True)
+            zip_filename = self.root / "btad.zip"
+
+            logging.info("Downloading the dataset.")
+            with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc="BTech") as progress_bar:
+                urlretrieve(
+                    url="https://avires.dimi.uniud.it/papers/btad/btad.zip",
+                    filename=zip_filename,
+                    reporthook=progress_bar.update_to,
+                )  # nosec
+
+            logging.info("Extracting the dataset.")
+            with zipfile.ZipFile(zip_filename, "r") as zip_file:
+                zip_file.extractall(self.root)
+
+            logging.info("Renaming the dataset directory")
+            shutil.move(src=self.root / "BTech_Dataset_transformed", dst=self.root / "BTech")
+
+            logging.info("Cleaning the zip file")
+            zip_filename.unlink()
+
+    def setup(self, stage: Optional[str] = None) -> None:
+        """Setup train, validation and test data.
+
+        The BTech dataset follows its own directory structure, which is why the
+        `anomalib.data.btech.BTech` class is used to create the dataset items.
+
+        Args:
+            stage: Optional[str]: Train/Val/Test stages.
(Default value = None) + + """ + if stage in (None, "fit"): + self.train_data = BTech( + root=self.root, + category=self.category, + pre_process=self.pre_process, + split="train", + seed=self.seed, + create_validation_set=self.create_validation_set, + ) + + if self.create_validation_set: + self.val_data = BTech( + root=self.root, + category=self.category, + pre_process=self.pre_process, + split="val", + seed=self.seed, + create_validation_set=self.create_validation_set, + ) + + self.test_data = BTech( + root=self.root, + category=self.category, + pre_process=self.pre_process, + split="test", + seed=self.seed, + create_validation_set=self.create_validation_set, + ) + + if stage == "predict": + self.inference_data = InferenceDataset( + path=self.root, image_size=self.image_size, transform_config=self.transform_config + ) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + """Get train dataloader.""" + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) + + def val_dataloader(self) -> EVAL_DATALOADERS: + """Get validation dataloader.""" + dataset = self.val_data if self.create_validation_set else self.test_data + return DataLoader(dataset=dataset, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + + def test_dataloader(self) -> EVAL_DATALOADERS: + """Get test dataloader.""" + return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + + def predict_dataloader(self) -> EVAL_DATALOADERS: + """Get predict dataloader.""" + return DataLoader( + self.inference_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers + ) From 7bc453f0f4ef8e919a0f4268d0664a2aa08740f4 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Feb 2022 23:18:06 -0700 Subject: [PATCH 04/24] Added btech tests --- .gitignore | 2 +- tests/pre_merge/datasets/test_dataset.py | 59 +++++++++++++++++++----- 2 files changed, 48 insertions(+), 13 deletions(-) diff --git a/.gitignore b/.gitignore index ba0b6d5a3b..b34903cc2e 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # Project related datasets !anomalib/datasets -!tests/datasets +!tests/pre_merge/datasets results !anomalib/core/results diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index 608c12f3af..efb5b179ba 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -3,13 +3,13 @@ import numpy as np import pytest -from anomalib.data.mvtec import MVTecDataModule +from anomalib.data import BTechDataModule, MVTecDataModule from anomalib.pre_processing.transforms import Denormalize, ToNumpy from tests.helpers.dataset import get_dataset_path @pytest.fixture(autouse=True) -def data_module(): +def mvtec_data_module(): datamodule = MVTecDataModule( root=get_dataset_path(), category="leather", @@ -25,24 +25,59 @@ def data_module(): @pytest.fixture(autouse=True) -def data_sample(data_module): - _, data = next(enumerate(data_module.train_dataloader())) +def btech_data_module(): + """Create BTech Data Module.""" + datamodule = BTechDataModule( + root=get_dataset_path(path="./datasets/BTech"), + category="01", + image_size=(256, 256), + train_batch_size=1, + test_batch_size=1, + num_workers=0, + ) + datamodule.prepare_data() + datamodule.setup() + + return datamodule + + +@pytest.fixture(autouse=True) +def data_sample(mvtec_data_module): + _, data = next(enumerate(mvtec_data_module.train_dataloader())) return data 
 class TestMVTecDataModule:
-    def test_batch_size(self, data_module):
+    def test_batch_size(self, mvtec_data_module):
         """test_mvtec_datamodule [summary]"""
-        _, train_data_sample = next(enumerate(data_module.train_dataloader()))
-        _, val_data_sample = next(enumerate(data_module.val_dataloader()))
+        _, train_data_sample = next(enumerate(mvtec_data_module.train_dataloader()))
+        _, val_data_sample = next(enumerate(mvtec_data_module.val_dataloader()))
+        assert train_data_sample["image"].shape[0] == 1
+        assert val_data_sample["image"].shape[0] == 1
+
+    def test_val_and_test_dataloaders_has_mask_and_gt(self, mvtec_data_module):
+        """Validation and test dataloaders should return filenames, image, mask and label."""
+        _, val_data = next(enumerate(mvtec_data_module.val_dataloader()))
+        _, test_data = next(enumerate(mvtec_data_module.test_dataloader()))
+
+        assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys())
+        assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys())
+
+
+class TestBTechDataModule:
+    """Test BTech Data Module."""
+
+    def test_batch_size(self, btech_data_module):
+        """Test the BTech datamodule batch size."""
+        _, train_data_sample = next(enumerate(btech_data_module.train_dataloader()))
+        _, val_data_sample = next(enumerate(btech_data_module.val_dataloader()))
         assert train_data_sample["image"].shape[0] == 1
         assert val_data_sample["image"].shape[0] == 1

-    def test_val_and_test_dataloaders_has_mask_and_gt(self, data_module):
-        """Validation and Test dataloaders should return filenames, image, mask
-        and label."""
-        _, val_data = next(enumerate(data_module.val_dataloader()))
-        _, test_data = next(enumerate(data_module.test_dataloader()))
+    def test_val_and_test_dataloaders_has_mask_and_gt(self, btech_data_module):
+        """Validation and test dataloaders should return filenames, image, mask and label."""
+        _, val_data = next(enumerate(btech_data_module.val_dataloader()))
+        _, test_data = next(enumerate(btech_data_module.test_dataloader()))

         assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys())
         assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys())
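The next patch factors the normal/abnormal split helpers out of `mvtec.py` into a shared `anomalib.data.utils.split` module. A toy sketch of the intended behaviour, using hypothetical filenames:

```python
import pandas as pd

from anomalib.data.utils.split import split_normal_images_in_train_set

samples = pd.DataFrame(
    {
        "image_path": [f"train/good/{i:03d}.png" for i in range(10)],
        "split": ["train"] * 10,
        "label": ["good"] * 10,
    }
)
# With split_ratio=0.2, two of the ten normal training images are reassigned
# to the test split so that AUC can be computed on both classes.
samples = split_normal_images_in_train_set(samples, split_ratio=0.2, seed=42)
print(samples.split.value_counts())  # train: 8, test: 2
```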
From 3a3244357b449c53791634bf639963effa1bea93 Mon Sep 17 00:00:00 2001
From: Samet Akcay
Date: Thu, 24 Feb 2022 23:30:25 -0700
Subject: [PATCH 05/24] Move split functions into a util module

---
 anomalib/data/utils/split.py | 94 ++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)
 create mode 100644 anomalib/data/utils/split.py

diff --git a/anomalib/data/utils/split.py b/anomalib/data/utils/split.py
new file mode 100644
index 0000000000..559d11c9c6
--- /dev/null
+++ b/anomalib/data/utils/split.py
@@ -0,0 +1,94 @@
+"""Dataset Split Utils.
+
+This module contains functions for splitting normal images in a training set
+and creating validation sets from test sets.
+
+These functions are useful
+    - when the test set does not contain any normal images.
+    - when the dataset doesn't have a validation set.
+"""
+
+# Copyright (C) 2020 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+
+import random
+
+from pandas.core.frame import DataFrame
+
+
+def split_normal_images_in_train_set(
+    samples: DataFrame, split_ratio: float = 0.1, seed: int = 0, normal_label: str = "good"
+) -> DataFrame:
+    """Split normal images in train set.
+
+    This function splits the normal images in the training set and assigns the
+    values to the test set. This is particularly useful when the
+    test set does not contain any normal images.
+
+    This is important because when the test set doesn't have any normal images,
+    AUC computation fails due to having a single class.
+
+    Args:
+        samples (DataFrame): Dataframe containing dataset info such as filenames, splits etc.
+        split_ratio (float, optional): Train-Test normal image split ratio. Defaults to 0.1.
+        seed (int, optional): Random seed to ensure reproducibility. Defaults to 0.
+        normal_label (str): Name of the normal label. For MVTec, for instance, this is ``good``.
+
+    Returns:
+        DataFrame: Output dataframe where the part of the training set is assigned to test set.
+    """
+
+    if seed > 0:
+        random.seed(seed)
+
+    normal_train_image_indices = samples.index[(samples.split == "train") & (samples.label == normal_label)].to_list()
+    num_normal_train_images = len(normal_train_image_indices)
+    num_normal_valid_images = int(num_normal_train_images * split_ratio)
+
+    indices_to_split_from_train_set = random.sample(population=normal_train_image_indices, k=num_normal_valid_images)
+    samples.loc[indices_to_split_from_train_set, "split"] = "test"
+
+    return samples
+
+
+def create_validation_set_from_test_set(samples: DataFrame, seed: int = 0, normal_label: str = "good") -> DataFrame:
+    """Create a validation set from the test set.
+
+    This function creates a validation set from the test set by splitting both
+    normal and abnormal samples in two.
+
+    Args:
+        samples (DataFrame): Dataframe containing dataset info such as filenames, splits etc.
+        seed (int, optional): Random seed to ensure reproducibility. Defaults to 0.
+        normal_label (str): Name of the normal label. For MVTec, for instance, this is ``good``.
+    """
+
+    if seed > 0:
+        random.seed(seed)
+
+    # Split normal images.
+    normal_test_image_indices = samples.index[(samples.split == "test") & (samples.label == normal_label)].to_list()
+    num_normal_valid_images = len(normal_test_image_indices) // 2
+
+    indices_to_sample = random.sample(population=normal_test_image_indices, k=num_normal_valid_images)
+    samples.loc[indices_to_sample, "split"] = "val"
+
+    # Split abnormal images.
+ abnormal_test_image_indices = samples.index[(samples.split == "test") & (samples.label != normal_label)].to_list() + num_abnormal_valid_images = len(abnormal_test_image_indices) // 2 + + indices_to_sample = random.sample(population=abnormal_test_image_indices, k=num_abnormal_valid_images) + samples.loc[indices_to_sample, "split"] = "val" + + return samples From 132ceb13cbcd7964f9e1b1db5e16118017a6c8a6 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Feb 2022 23:42:35 -0700 Subject: [PATCH 06/24] Modified mvtec --- anomalib/data/mvtec.py | 134 ++++++++++------------------------------- 1 file changed, 31 insertions(+), 103 deletions(-) diff --git a/anomalib/data/mvtec.py b/anomalib/data/mvtec.py index e29e02b8b7..f2343ac908 100644 --- a/anomalib/data/mvtec.py +++ b/anomalib/data/mvtec.py @@ -22,7 +22,6 @@ # and limitations under the License. import logging -import random import tarfile from pathlib import Path from typing import Dict, Optional, Tuple, Union @@ -42,76 +41,15 @@ from anomalib.data.inference import InferenceDataset from anomalib.data.utils import DownloadProgressBar, read_image +from anomalib.data.utils.split import ( + create_validation_set_from_test_set, + split_normal_images_in_train_set, +) from anomalib.pre_processing import PreProcessor logger = logging.getLogger(name="Dataset: MVTec") logger.setLevel(logging.DEBUG) -__all__ = ["MVTec", "MVTecDataModule"] - - -def split_normal_images_in_train_set(samples: DataFrame, split_ratio: float = 0.1, seed: int = 0) -> DataFrame: - """Split normal images in train set. - - This function splits the normal images in training set and assigns the - values to the test set. This is particularly useful especially when the - test set does not contain any normal images. - - This is important because when the test set doesn't have any normal images, - AUC computation fails due to having single class. - - Args: - samples (DataFrame): Dataframe containing dataset info such as filenames, splits etc. - split_ratio (float, optional): Train-Test normal image split ratio. Defaults to 0.1. - seed (int, optional): Random seed to ensure reproducibility. Defaults to 0. - - Returns: - DataFrame: Output dataframe where the part of the training set is assigned to test set. - """ - - if seed > 0: - random.seed(seed) - - normal_train_image_indices = samples.index[(samples.split == "train") & (samples.label == "good")].to_list() - num_normal_train_images = len(normal_train_image_indices) - num_normal_valid_images = int(num_normal_train_images * split_ratio) - - indices_to_split_from_train_set = random.sample(population=normal_train_image_indices, k=num_normal_valid_images) - samples.loc[indices_to_split_from_train_set, "split"] = "test" - - return samples - - -def create_validation_set_from_test_set(samples: DataFrame, seed: int = 0) -> DataFrame: - """Craete Validation Set from Test Set. - - This function creates a validation set from test set by splitting both - normal and abnormal samples to two. - - Args: - samples (DataFrame): Dataframe containing dataset info such as filenames, splits etc. - seed (int, optional): Random seed to ensure reproducibility. Defaults to 0. - """ - - if seed > 0: - random.seed(seed) - - # Split normal images. 
- normal_test_image_indices = samples.index[(samples.split == "test") & (samples.label == "good")].to_list() - num_normal_valid_images = len(normal_test_image_indices) // 2 - - indices_to_sample = random.sample(population=normal_test_image_indices, k=num_normal_valid_images) - samples.loc[indices_to_sample, "split"] = "val" - - # Split abnormal images. - abnormal_test_image_indices = samples.index[(samples.split == "test") & (samples.label != "good")].to_list() - num_abnormal_valid_images = len(abnormal_test_image_indices) // 2 - - indices_to_sample = random.sample(population=abnormal_test_image_indices, k=num_abnormal_valid_images) - samples.loc[indices_to_sample, "split"] = "val" - - return samples - def make_mvtec_dataset( path: Path, @@ -220,7 +158,6 @@ def __init__( pre_process: PreProcessor, split: str, task: str = "segmentation", - download: bool = False, seed: int = 0, create_validation_set: bool = False, ) -> None: @@ -232,7 +169,6 @@ def __init__( pre_process: List of pre_processing object containing albumentation compose. split: 'train', 'val' or 'test' task: ``classification`` or ``segmentation`` - download: Boolean to download the MVTec dataset. seed: seed used for the random subset splitting create_validation_set: Create a validation subset in addition to the train and test subsets @@ -274,44 +210,13 @@ def __init__( self.pre_process = pre_process - if download: - self._download() - self.samples = make_mvtec_dataset( - path=self.root / category, split=self.split, seed=seed, create_validation_set=create_validation_set + path=self.root / category, + split=self.split, + seed=seed, + create_validation_set=create_validation_set, ) - def _download(self) -> None: - """Download the MVTec dataset.""" - if (self.root / self.category).is_dir(): - logger.warning("Dataset directory exists.") - else: - self.root.mkdir(parents=True, exist_ok=True) - dataset_name = "mvtec_anomaly_detection.tar.xz" - self.filename = self.root / dataset_name - - logger.info("Downloading MVTec Dataset") - with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=dataset_name) as progress_bar: - urlretrieve( # nosec - url=f"ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/{dataset_name}", - filename=self.filename, - reporthook=progress_bar.update_to, - ) # nosec - - self._extract() - self._clean() - - def _extract(self) -> None: - """Extract MVTec Dataset.""" - logger.info("Extracting MVTec dataset") - with tarfile.open(self.filename) as file: - file.extractall(self.root) - - def _clean(self) -> None: - """Cleanup MVTec Dataset tar file.""" - logger.info("Cleaning up the tar file") - self.filename.unlink() - def __len__(self) -> int: """Get length of the dataset.""" return len(self.samples) @@ -436,6 +341,29 @@ def __init__( self.val_data: Dataset self.inference_data: Dataset + def prepare_data(self) -> None: + """Download the dataset if not available.""" + if (self.root / self.category).is_dir(): + logging.info("Found the dataset.") + else: + self.root.mkdir(parents=True, exist_ok=True) + dataset_name = "mvtec_anomaly_detection.tar.xz" + + logging.info("Downloading the dataset.") + with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc="MVTec") as progress_bar: + urlretrieve( + url=f"ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/{dataset_name}", + filename=self.root / dataset_name, + reporthook=progress_bar.update_to, + ) + + logging.info("Extracting the dataset.") + with tarfile.open(self.root / dataset_name) as tar_file: + 
tar_file.extractall(self.root) + + logging.info("Cleaning the tar file") + (self.root / dataset_name).unlink() + def setup(self, stage: Optional[str] = None) -> None: """Setup train, validation and test data. From 907281fda4506ad05ca8ed2945b3829e4286a1b5 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Feb 2022 23:51:44 -0700 Subject: [PATCH 07/24] added btech to get-datamodule --- anomalib/data/__init__.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 302475d3e8..2deed94ba5 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -19,6 +19,7 @@ from omegaconf import DictConfig, ListConfig from pytorch_lightning import LightningDataModule +from .btech import BTechDataModule from .inference import InferenceDataset from .mvtec import MVTecDataModule @@ -34,21 +35,27 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule """ datamodule: LightningDataModule - if config.dataset.format.lower() == "mvtec": - datamodule = MVTecDataModule( - # TODO: Remove config values. IAAALD-211 - root=config.dataset.path, - category=config.dataset.category, - image_size=(config.dataset.image_size[0], config.dataset.image_size[0]), - train_batch_size=config.dataset.train_batch_size, - test_batch_size=config.dataset.test_batch_size, - num_workers=config.dataset.num_workers, - seed=config.project.seed, - ) + if config.dataset.name.lower() == "mvtec": + datamodule = MVTecDataModule + elif config.dataset.name.lower() == "btech": + datamodule = BTechDataModule else: - raise ValueError("Unknown dataset!") + raise ValueError( + "Unknown dataset! \n" + "If you use a custom dataset make sure you initialize it " + "in `get_datamodule` in `anomalib.data.__init__.py" + ) - return datamodule + return datamodule( + # TODO: Remove config values. IAAALD-211 + root=config.dataset.path, + category=config.dataset.category, + image_size=(config.dataset.image_size[0], config.dataset.image_size[0]), + train_batch_size=config.dataset.train_batch_size, + test_batch_size=config.dataset.test_batch_size, + num_workers=config.dataset.num_workers, + seed=config.project.seed, + ) __all__ = ["get_datamodule", "InferenceDataset"] From 16de223a2042fd945f736afc6eef8e7f9afbff47 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Feb 2022 23:54:44 -0700 Subject: [PATCH 08/24] fix typo in btech docstring --- anomalib/data/__init__.py | 4 ++-- anomalib/data/btech.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 2deed94ba5..48ff4f9c64 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -42,8 +42,8 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule else: raise ValueError( "Unknown dataset! 
\n" - "If you use a custom dataset make sure you initialize it " - "in `get_datamodule` in `anomalib.data.__init__.py" + "If you use a custom dataset make sure you initialize it in" + "`get_datamodule` in `anomalib.data.__init__.py" ) return datamodule( diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 13874cb337..8c43566b95 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -87,11 +87,11 @@ def make_btech_dataset( >>> samples = make_btech_dataset(path, split='train', split_ratio=0.1, seed=0) >>> samples.head() path split label image_path mask_path label_index - 0 BTech/01 train 01 BTech/01/train/good/105.bmp BTech/01/ground_truth/good/105.bmp 0 - 1 BTech/01 train 01 BTech/01/train/good/017.bmp BTech/01/ground_truth/good/017.bmp 0 - 2 BTech/01 train 01 BTech/01/train/good/137.bmp BTech/01/ground_truth/good/137.bmp 0 - 3 BTech/01 train 01 BTech/01/train/good/152.bmp BTech/01/ground_truth/good/152.bmp 0 - 4 BTech/01 train 01 BTech/01/train/good/109.bmp BTech/01/ground_truth/good/109.bmp 0 + 0 BTech/01 train 01 BTech/01/train/good/105.bmp BTech/01/ground_truth/good/105.png 0 + 1 BTech/01 train 01 BTech/01/train/good/017.bmp BTech/01/ground_truth/good/017.png 0 + 2 BTech/01 train 01 BTech/01/train/good/137.bmp BTech/01/ground_truth/good/137.png 0 + 3 BTech/01 train 01 BTech/01/train/good/152.bmp BTech/01/ground_truth/good/152.png 0 + 4 BTech/01 train 01 BTech/01/train/good/109.bmp BTech/01/ground_truth/good/109.png 0 Returns: DataFrame: an output dataframe containing samples for the requested split (ie., train or test) From c2353db26bdce3fc65020881963f590967d956a1 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Feb 2022 23:56:04 -0700 Subject: [PATCH 09/24] update docstring --- anomalib/data/btech.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index 8c43566b95..f28e66a5fc 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -87,11 +87,9 @@ def make_btech_dataset( >>> samples = make_btech_dataset(path, split='train', split_ratio=0.1, seed=0) >>> samples.head() path split label image_path mask_path label_index - 0 BTech/01 train 01 BTech/01/train/good/105.bmp BTech/01/ground_truth/good/105.png 0 - 1 BTech/01 train 01 BTech/01/train/good/017.bmp BTech/01/ground_truth/good/017.png 0 - 2 BTech/01 train 01 BTech/01/train/good/137.bmp BTech/01/ground_truth/good/137.png 0 - 3 BTech/01 train 01 BTech/01/train/good/152.bmp BTech/01/ground_truth/good/152.png 0 - 4 BTech/01 train 01 BTech/01/train/good/109.bmp BTech/01/ground_truth/good/109.png 0 + 0 BTech/01 train 01 BTech/01/train/ok/105.bmp BTech/01/ground_truth/ok/105.png 0 + 1 BTech/01 train 01 BTech/01/train/ok/017.bmp BTech/01/ground_truth/ok/017.png 0 + ... Returns: DataFrame: an output dataframe containing samples for the requested split (ie., train or test) From 287c974c8aa51c2673c6fd1b306e5b9ec7d6f5f3 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Fri, 25 Feb 2022 00:01:36 -0700 Subject: [PATCH 10/24] cleanedup dataset download utils --- anomalib/data/utils/__init__.py | 4 +-- anomalib/data/utils/download.py | 49 --------------------------------- 2 files changed, 2 insertions(+), 51 deletions(-) diff --git a/anomalib/data/utils/__init__.py b/anomalib/data/utils/__init__.py index 8b22ad4405..c493058051 100644 --- a/anomalib/data/utils/__init__.py +++ b/anomalib/data/utils/__init__.py @@ -14,7 +14,7 @@ # See the License for the specific language governing permissions # and limitations under the License. 
-from .download import DownloadProgressBar, clean, download, extract +from .download import DownloadProgressBar from .image import get_image_filenames, read_image -__all__ = ["get_image_filenames", "read_image", "DownloadProgressBar", "download", "extract", "clean"] +__all__ = ["get_image_filenames", "read_image", "DownloadProgressBar"] diff --git a/anomalib/data/utils/download.py b/anomalib/data/utils/download.py index 9f0ec4980f..26af24834a 100644 --- a/anomalib/data/utils/download.py +++ b/anomalib/data/utils/download.py @@ -18,11 +18,7 @@ # and limitations under the License. import io -import tarfile -import zipfile -from pathlib import Path from typing import Dict, Iterable, Optional, Union -from urllib.request import urlretrieve from tqdm import tqdm @@ -197,48 +193,3 @@ def update_to(self, chunk_number: int = 1, max_chunk_size: int = 1, total_size=N if total_size is not None: self.total = total_size self.update(chunk_number * max_chunk_size - self.n) - - -def download(url: str, filename: Union[str, Path], description: Optional[str] = None) -> None: - """Download the dataset from the given url. - - This function downloads the dataset from url to the given filename. - - Args: - url (str): Dataset URL - filename (str): Filename to save the file locally. - description (Optional[str], optional): _description_. Defaults to None. - """ - - with DownloadProgressBar(unit="B", unit_scale=True, miniters=1, desc=description) as progress_bar: - urlretrieve(url=url, filename=filename, reporthook=progress_bar.update_to) # nosec - - -def extract(filename: Path, path: Optional[Path] = None) -> None: - """Extract file from tar file. - - Args: - filename (Path): Name of the tar/zip file - path (Optional[Path], optional): Path to which tar/zip file is extracted. Defaults to None. - - Raises: - ValueError: When the file extension is not ".tar", ".gzip", ".bz2", ".lzma" or ".zip" - - """ - if path is None: - path = Path(".") - - if filename.suffix == ".zip": - with zipfile.ZipFile(filename, "r") as zip_file: - zip_file.extractall(path) - else: - try: - with tarfile.open(filename) as tar_file: - tar_file.extractall(path) - except ValueError: - print("Unknown file extension to extract") - - -def clean(filename: Path) -> None: - """Cleanup Dataset tar file.""" - filename.unlink() From df8b655ade1f588c57b10e40fd9b426b0fbbbc74 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Fri, 25 Feb 2022 00:20:06 -0700 Subject: [PATCH 11/24] Address mypy --- anomalib/data/__init__.py | 33 +++++++++++++++++++++------------ anomalib/data/btech.py | 2 +- 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 48ff4f9c64..0f53101e9a 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -36,9 +36,27 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule datamodule: LightningDataModule if config.dataset.name.lower() == "mvtec": - datamodule = MVTecDataModule + datamodule = MVTecDataModule( + # TODO: Remove config values. IAAALD-211 + root=config.dataset.path, + category=config.dataset.category, + image_size=(config.dataset.image_size[0], config.dataset.image_size[0]), + train_batch_size=config.dataset.train_batch_size, + test_batch_size=config.dataset.test_batch_size, + num_workers=config.dataset.num_workers, + seed=config.project.seed, + ) elif config.dataset.name.lower() == "btech": - datamodule = BTechDataModule + datamodule = BTechDataModule( + # TODO: Remove config values. 
IAAALD-211 + root=config.dataset.path, + category=config.dataset.category, + image_size=(config.dataset.image_size[0], config.dataset.image_size[0]), + train_batch_size=config.dataset.train_batch_size, + test_batch_size=config.dataset.test_batch_size, + num_workers=config.dataset.num_workers, + seed=config.project.seed, + ) else: raise ValueError( "Unknown dataset! \n" @@ -46,16 +64,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule "`get_datamodule` in `anomalib.data.__init__.py" ) - return datamodule( - # TODO: Remove config values. IAAALD-211 - root=config.dataset.path, - category=config.dataset.category, - image_size=(config.dataset.image_size[0], config.dataset.image_size[0]), - train_batch_size=config.dataset.train_batch_size, - test_batch_size=config.dataset.test_batch_size, - num_workers=config.dataset.num_workers, - seed=config.project.seed, - ) + return datamodule __all__ = ["get_datamodule", "InferenceDataset"] diff --git a/anomalib/data/btech.py b/anomalib/data/btech.py index f28e66a5fc..2bdc942f80 100644 --- a/anomalib/data/btech.py +++ b/anomalib/data/btech.py @@ -353,7 +353,7 @@ def prepare_data(self) -> None: zip_file.extractall(self.root) logging.info("Renaming the dataset directory") - shutil.move(src=self.root / "BTech_Dataset_transformed", dst=self.root / "BTech") + shutil.move(src=str(self.root / "BTech_Dataset_transformed"), dst=str(self.root / "BTech")) logging.info("Cleaning the tar file") zip_filename.unlink() From 966ad94721eaeb2d40ab256b9a6b5f9d5b50b9d9 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Fri, 25 Feb 2022 00:27:54 -0700 Subject: [PATCH 12/24] modify config files and update readme.md --- README.md | 1 + anomalib/models/cflow/config.yaml | 3 +-- anomalib/models/dfkde/config.yaml | 3 +-- anomalib/models/dfm/config.yaml | 3 +-- anomalib/models/ganomaly/config.yaml | 3 +-- anomalib/models/padim/config.yaml | 3 +-- anomalib/models/patchcore/config.yaml | 3 +-- anomalib/models/stfpm/config.yaml | 3 +-- 8 files changed, 8 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index 762c2b8673..9b7ac1cd9b 100644 --- a/README.md +++ b/README.md @@ -150,6 +150,7 @@ python tools/inference.py \ ___ ## Datasets +The `development` branch supports MVTec and BeanTech datasets. 
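+
+For example, to train on BTech instead of MVTec, point the `dataset` section of a
+model's `config.yaml` at the BTech dataset (a sketch; `category` is quoted so that
+YAML keeps it a string, and the remaining keys are unchanged):
+
+```yaml
+dataset:
+  name: btech
+  path: ./datasets/BTech
+  category: "01"
+```
+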
### [MVTec Dataset](https://www.mvtec.com/company/research/datasets/mvtec-ad) diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index d1d8c26bda..5dd6a47e5a 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -1,8 +1,7 @@ dataset: - name: mvtec + name: mvtec #options: [mvtec, btech] format: mvtec path: ./datasets/MVTec - url: ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz category: bottle task: segmentation label_format: None diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml index e73b636208..56c74ecd5f 100644 --- a/anomalib/models/dfkde/config.yaml +++ b/anomalib/models/dfkde/config.yaml @@ -1,8 +1,7 @@ dataset: - name: mvtec + name: mvtec #options: [mvtec, btech] format: mvtec path: ./datasets/MVTec - url: ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz category: bottle task: classification label_format: None diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml index 9ab2b04161..83548f1139 100755 --- a/anomalib/models/dfm/config.yaml +++ b/anomalib/models/dfm/config.yaml @@ -1,8 +1,7 @@ dataset: - name: mvtec + name: mvtec #options: [mvtec, btech] format: mvtec path: ./datasets/MVTec - url: ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz category: bottle task: classification label_format: None diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml index 2837bf13d1..0fcacf309f 100644 --- a/anomalib/models/ganomaly/config.yaml +++ b/anomalib/models/ganomaly/config.yaml @@ -1,8 +1,7 @@ dataset: - name: mvtec + name: mvtec #options: [mvtec, btech] format: mvtec path: ./datasets/MVTec - url: ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz category: bottle task: classification label_format: None diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml index 8b5df3cc37..86cb2d1aac 100644 --- a/anomalib/models/padim/config.yaml +++ b/anomalib/models/padim/config.yaml @@ -1,8 +1,7 @@ dataset: - name: mvtec + name: mvtec #options: [mvtec, btech] format: mvtec path: ./datasets/MVTec - url: ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz category: bottle task: segmentation label_format: None diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml index ebb1755993..599d957f9d 100644 --- a/anomalib/models/patchcore/config.yaml +++ b/anomalib/models/patchcore/config.yaml @@ -1,8 +1,7 @@ dataset: - name: mvtec + name: mvtec #options: [mvtec, btech] format: mvtec path: ./datasets/MVTec - url: ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz task: segmentation category: bottle label_format: None diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml index 1f476e9f41..958d7d6a58 100644 --- a/anomalib/models/stfpm/config.yaml +++ b/anomalib/models/stfpm/config.yaml @@ -1,8 +1,7 @@ dataset: - name: mvtec + name: mvtec #options: [mvtec, btech] format: mvtec path: ./datasets/MVTec - url: ftp://guest:GU.205dldo@ftp.softronics.ch/mvtec_anomaly_detection/mvtec_anomaly_detection.tar.xz category: bottle task: segmentation label_format: None From 97d98fafc952acdced2537791609dff7499b2ee1 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Fri, 25 Feb 2022 05:44:22 -0700 Subject: [PATCH 
13/24] Fix dataset path

---
 .github/workflows/nightly.yml            |  2 +-
 .github/workflows/pre_merge.yml          |  2 +-
 tests/helpers/dataset.py                 | 27 +++++++++++++++++-------
 tests/pre_merge/datasets/test_dataset.py |  4 ++--
 4 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
index 1813b0c49d..4124507cfb 100644
--- a/.github/workflows/nightly.yml
+++ b/.github/workflows/nightly.yml
@@ -20,7 +20,7 @@ jobs:
         run: pip install tox
       - name: Coverage
         run: |
-          export ANOMALIB_DATASET_PATH=/media/data1/datasets/MVTec
+          export ANOMALIB_DATASET_PATH=/media/data1/datasets/
           export CUDA_VISIBLE_DEVICES=2
           tox -e nightly
       - name: Upload coverage result
diff --git a/.github/workflows/pre_merge.yml b/.github/workflows/pre_merge.yml
index f6151ca73e..0f0e8c2223 100644
--- a/.github/workflows/pre_merge.yml
+++ b/.github/workflows/pre_merge.yml
@@ -22,7 +22,7 @@ jobs:
         run: tox -e black,isort,flake8,pylint,mypy,pydocstyle
       - name: Coverage
         run: |
-          export ANOMALIB_DATASET_PATH=/media/data1/datasets/MVTec
+          export ANOMALIB_DATASET_PATH=/media/data1/datasets/
           export CUDA_VISIBLE_DEVICES=3
           tox -e pre_merge
       - name: Upload coverage result
diff --git a/tests/helpers/dataset.py b/tests/helpers/dataset.py
index 401704e53d..08db75d824 100644
--- a/tests/helpers/dataset.py
+++ b/tests/helpers/dataset.py
@@ -12,18 +12,29 @@
 from .shapes import random_shapes


-def get_dataset_path(path: Union[str, Path] = "./datasets/MVTec"):
+def get_dataset_path(dataset: str = "MVTec") -> str:
     """Selects path based on tests in local system or docker image.

-    Local install assumes dataset is downloaded to
-    anomaly/datasets/MVTec. In either case, if the location is empty,
-    the dataset is downloaded again. This speeds up tests in docker
-    images where dataset is already stored in /tmp/anomalib
+    Local install assumes datasets are located in anomalib/datasets/.
+    In either case, if the location is empty, the dataset is downloaded again.
+    This speeds up tests in docker images where the dataset is already stored in /tmp/anomalib.
+
+    Example:
+        Assume that the `datasets` directory exists in `~/anomalib/`:
+
+        >>> get_dataset_path(dataset="MVTec")
+        './datasets/MVTec'
+
+    """
-    # when running locally
-    path = str(path)
+    # Initially check if the `datasets` directory exists locally and look
+    # for the `dataset`. This is useful for local testing.
+    path = os.path.join("./datasets", dataset)
+
+    # For docker deployment or a CI that runs on a server, the dataset directory
+    # may not necessarily be located in the repo. Therefore, check the anomalib
+    # dataset path environment variable.
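+    # NOTE: ``os.environ[...]`` raises a ``KeyError`` if ANOMALIB_DATASET_PATH is
+    # not set; the workflow changes above export it explicitly on the CI runners.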
if not os.path.isdir(path): - path = os.environ["ANOMALIB_DATASET_PATH"] + path = os.path.join(os.environ["ANOMALIB_DATASET_PATH"], dataset) return path diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index efb5b179ba..c849422ed2 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -11,7 +11,7 @@ @pytest.fixture(autouse=True) def mvtec_data_module(): datamodule = MVTecDataModule( - root=get_dataset_path(), + root=get_dataset_path(dataset="MVTec"), category="leather", image_size=(256, 256), train_batch_size=1, @@ -28,7 +28,7 @@ def mvtec_data_module(): def btech_data_module(): """Create BTech Data Module.""" datamodule = BTechDataModule( - root=get_dataset_path(path="./datasets/BTech"), + root=get_dataset_path(dataset="BTech"), category="01", image_size=(256, 256), train_batch_size=1, From b71f4d36c249c91ec74168c7f29388dc3ca02a36 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Tue, 15 Mar 2022 01:57:35 -0700 Subject: [PATCH 14/24] WiP: Created make_dataset function --- anomalib/data/folder.py | 170 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 170 insertions(+) create mode 100644 anomalib/data/folder.py diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py new file mode 100644 index 0000000000..eeb2524a34 --- /dev/null +++ b/anomalib/data/folder.py @@ -0,0 +1,170 @@ +"""Custom Folder Dataset. + +This script creates a custom dataset from a folder. +""" + +# Copyright (C) 2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. + +import logging +import tarfile +from distutils import extension +from pathlib import Path +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union +from urllib.request import urlretrieve + +import albumentations as A +import cv2 +import numpy as np +import pandas as pd +from pandas.core.frame import DataFrame +from pytorch_lightning.core.datamodule import LightningDataModule +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from torch import Tensor +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataset import Dataset +from torchvision.datasets.folder import IMG_EXTENSIONS + +from anomalib.data.inference import InferenceDataset +from anomalib.data.utils import DownloadProgressBar, read_image +from anomalib.data.utils.split import ( + create_validation_set_from_test_set, + split_normal_images_in_train_set, +) +from anomalib.pre_processing import PreProcessor + +logger = logging.getLogger(name="Dataset: MVTec") +logger.setLevel(logging.DEBUG) + + +def __check_and_convert_path(path: Union[str, Path]) -> Path: + """Check an input path, and convert to Pathlib object. + + Args: + path (Union[str, Path]): Input path. + + Returns: + Path: Output path converted to pathlib object. 
+ """ + if not isinstance(path, Path): + path = Path(path) + return path + + +def make_dataset( + normal_dir: Path, + abnormal_dir: Path, + split: Optional[str] = None, + split_ratio: float = 0.1, + seed: int = 0, + create_validation_set: bool = False, + extensions: Optional[Tuple[str, ...]] = None, +) -> DataFrame: + """Create a folder dataset.""" + + normal_dir = __check_and_convert_path(normal_dir) + abnormal_dir = __check_and_convert_path(abnormal_dir) + + if extensions is None: + extensions = IMG_EXTENSIONS + + normal_filenames = [f for f in normal_dir.glob(r"**/*") if f.suffix in extensions] + abnormal_filenames = [f for f in abnormal_dir.glob(r"**/*") if f.suffix in extensions] + + # TODO: Create a pd dataframe based on the above filenames. + + # samples_list = [(str(path),) + filename.parts[-3:] for filename in path.glob("**/*.png")] + # if len(samples_list) == 0: + # raise RuntimeError(f"Found 0 images in {path}") + + # samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"]) + # samples = samples[samples.split != "ground_truth"] + + # # Create mask_path column + # samples["mask_path"] = ( + # samples.path + # + "/ground_truth/" + # + samples.label + # + "/" + # + samples.image_path.str.rstrip("png").str.rstrip(".") + # + "_mask.png" + # ) + + # # Modify image_path column by converting to absolute path + # samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path + + # # Split the normal images in training set if test set doesn't + # # contain any normal images. This is needed because AUC score + # # cannot be computed based on 1-class + # if sum((samples.split == "test") & (samples.label == "good")) == 0: + # samples = split_normal_images_in_train_set(samples, split_ratio, seed) + + # # Good images don't have mask + # samples.loc[(samples.split == "test") & (samples.label == "good"), "mask_path"] = "" + + # # Create label index for normal (0) and anomalous (1) images. + # samples.loc[(samples.label == "good"), "label_index"] = 0 + # samples.loc[(samples.label != "good"), "label_index"] = 1 + # samples.label_index = samples.label_index.astype(int) + + # if create_validation_set: + # samples = create_validation_set_from_test_set(samples, seed=seed) + + # # Get the data frame for the split. + # if split is not None and split in ["train", "val", "test"]: + # samples = samples[samples.split == split] + # samples = samples.reset_index(drop=True) + + # return samples + + +class FolderDataset(Dataset): + """Folder Dataset.""" + + def __init__( + self, + root: Union[str, Path], + normal: Union[Path, str], + abnormal: Union[Path, str], + split: str, + mask: Optional[Union[Path, str]] = None, + pre_process: Optional[PreProcessor] = None, + extensions: Optional[Sequence[str]] = None, + task: str = "segmentation", + seed: int = 0, + create_validation_set: bool = False, + ) -> None: + pass + + def __len__(self) -> int: + """Get length of the dataset.""" + pass + + def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]: + """Get dataset item for the index ``index``. + + Args: + index (int): Index to get the item. + + Returns: + Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training. + Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box. 
+ """ + pass + + +samples = make_dataset( + normal_dir="/home/sakcay/projects/anomalib/datasets/MVTec/bottle/test/good", + abnormal_dir="/home/sakcay/projects/anomalib/datasets/MVTec/bottle/test/broken_large", +) From 28f7d3ec76af642281d0788e2c0092167e894b5f Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Mon, 21 Mar 2022 22:23:03 -0700 Subject: [PATCH 15/24] Renamed folder dataset into custom --- anomalib/data/custom.py | 474 ++++++++++++++++++++++++++++++++++++++++ anomalib/data/folder.py | 170 -------------- 2 files changed, 474 insertions(+), 170 deletions(-) create mode 100644 anomalib/data/custom.py delete mode 100644 anomalib/data/folder.py diff --git a/anomalib/data/custom.py b/anomalib/data/custom.py new file mode 100644 index 0000000000..b7dba3bbce --- /dev/null +++ b/anomalib/data/custom.py @@ -0,0 +1,474 @@ +"""Custom Folder Dataset. + +This script creates a custom dataset from a folder. +""" + +# Copyright (C) 2020 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. + +import logging +from pathlib import Path +from typing import Dict, Optional, Tuple, Union + +import albumentations as A +import cv2 +import numpy as np +from pandas.core.frame import DataFrame +from pytorch_lightning.core.datamodule import LightningDataModule +from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS +from torch import Tensor +from torch.utils.data import DataLoader, Dataset +from torchvision.datasets.folder import IMG_EXTENSIONS + +from anomalib.data.inference import InferenceDataset +from anomalib.data.utils import read_image +from anomalib.data.utils.split import ( + create_validation_set_from_test_set, + split_normal_images_in_train_set, +) +from anomalib.pre_processing import PreProcessor + +logger = logging.getLogger(name="Dataset: Custom Dataset") +logger.setLevel(logging.DEBUG) + + +def _check_and_convert_path(path: Union[str, Path]) -> Path: + """Check an input path, and convert to Pathlib object. + + Args: + path (Union[str, Path]): Input path. + + Returns: + Path: Output path converted to pathlib object. + """ + if not isinstance(path, Path): + path = Path(path) + return path + + +def make_dataset( + normal_dir: Union[str, Path], + abnormal_dir: Union[str, Path], + mask_dir: Optional[Union[str, Path]] = None, + split: Optional[str] = None, + split_ratio: float = 0.2, + seed: int = 0, + create_validation_set: bool = True, + extensions: Optional[Tuple[str, ...]] = None, +): + """Make Custom Dataset. + + Args: + normal_dir (Union[str, Path]): Path to the directory containing normal images. + abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images. + mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing + the mask annotations. Defaults to None. + split (Optional[str], optional): Dataset split (ie., either train or test). Defaults to None. + split_ratio (float, optional): Ratio to split normal training images and add to the + test set in case test set doesn't contain any normal images. 
+ Defaults to 0.2. + seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0. + create_validation_set (bool, optional):Boolean to create a validation set from the test set. + MVTec dataset does not contain a validation set. Those wanting to create a validation set + could set this flag to ``True``. + extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the + directory. + + Returns: + DataFrame: an output dataframe containing samples for the requested split (ie., train or test) + """ + normal_dir = _check_and_convert_path(normal_dir) + abnormal_dir = _check_and_convert_path(abnormal_dir) + + if extensions is None: + extensions = IMG_EXTENSIONS + + # Get filenames from normal and abnormal directory. + normal_filenames = [f for f in normal_dir.glob(r"**/*") if f.suffix in extensions] + abnormal_filenames = [f for f in abnormal_dir.glob(r"**/*") if f.suffix in extensions] + filenames = normal_filenames + abnormal_filenames + + # Add normal and abnormal labels to the samples as `label` column. + normal_labels = ["normal"] * len(normal_filenames) + abnormal_labels = ["abnormal"] * len(abnormal_filenames) + labels = normal_labels + abnormal_labels + + samples = DataFrame({"image_path": filenames, "label": labels}) + + # Create label index for normal (0) and abnormal (1) images. + samples.loc[(samples.label == "normal"), "label_index"] = 0 + samples.loc[(samples.label == "abnormal"), "label_index"] = 1 + samples.label_index = samples.label_index.astype(int) + + # If a path to mask is provided, add it to the sample dataframe. + if mask_dir is not None: + mask_dir = _check_and_convert_path(mask_dir) + normal_gt = ["" for f in normal_filenames] + abnormal_gt = [str(mask_dir / f.name) for f in abnormal_filenames] + gt_filenames = normal_gt + abnormal_gt + + samples["mask_path"] = gt_filenames + + # Ensure the pathlib objects are converted to str. + # This is because torch dataloader doesn't like pathlib. + samples = samples.astype({"image_path": "str"}) + + # Create train/test split. + # By default, all the normal samples are assigned as train. + # and all the abnormal samples are test. + samples.loc[(samples.label == "normal"), "split"] = "train" + samples.loc[(samples.label == "abnormal"), "split"] = "test" + samples = split_normal_images_in_train_set( + samples=samples, split_ratio=split_ratio, seed=seed, normal_label="normal" + ) + + # If `create_validation_set` is set to True, the test set is split into half. + if create_validation_set: + samples = create_validation_set_from_test_set(samples, seed=seed, normal_label="normal") + + # Get the data frame for the split. + if split is not None and split in ["train", "val", "test"]: + samples = samples[samples.split == split] + samples = samples.reset_index(drop=True) + + return samples + + +class CustomDataset(Dataset): + """Custom Dataset.""" + + def __init__( + self, + normal_dir: Union[Path, str], + abnormal_dir: Union[Path, str], + split: str, + pre_process: PreProcessor, + split_ratio: float = 0.2, + mask_dir: Optional[Union[Path, str]] = None, + extensions: Optional[Tuple[str, ...]] = None, + task: Optional[str] = None, + seed: int = 0, + create_validation_set: bool = False, + ) -> None: + """Create Custom Folder Dataset. + + Args: + normal_dir (Union[str, Path]): Path to the directory containing normal images. + abnormal_dir (Union[str, Path]): Path to the directory containing abnormal images. + split (Optional[str], optional): Dataset split (ie., either train or test). 
Defaults to None.
+            pre_process (Optional[PreProcessor], optional): Image pre-processor to apply transform.
+                Defaults to None.
+            split_ratio (float, optional): Ratio to split normal training images and add to the
+                test set in case test set doesn't contain any normal images.
+                Defaults to 0.2.
+            mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing
+                the mask annotations. Defaults to None.
+            extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the
+                directory.
+            task (Optional[str], optional): Task type (classification or segmentation). Defaults to None.
+            seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0.
+            create_validation_set (bool, optional): Boolean to create a validation set from the test set.
+                MVTec dataset does not contain a validation set. Those wanting to create a validation set
+                could set this flag to ``True``.
+
+        Raises:
+            ValueError: When task is set to classification and `mask_dir` is provided. When `mask_dir` is
+                provided, `task` should be set to `segmentation`.
+
+        """
+        self.split = split
+
+        if task == "classification" and mask_dir:
+            raise ValueError(
+                "Classification task is requested, but a mask directory is provided. "
+                "If a mask directory is provided, the task must be segmentation."
+            )
+        if task is None or mask_dir is None:
+            self.task = "classification"
+        else:
+            self.task = task
+
+        self.pre_process = pre_process
+        self.samples = make_dataset(
+            normal_dir=normal_dir,
+            abnormal_dir=abnormal_dir,
+            mask_dir=mask_dir,
+            split=split,
+            split_ratio=split_ratio,
+            seed=seed,
+            create_validation_set=create_validation_set,
+            extensions=extensions,
+        )
+
+    def __len__(self) -> int:
+        """Get length of the dataset."""
+        return len(self.samples)
+
+    def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
+        """Get dataset item for the index ``index``.
+
+        Args:
+            index (int): Index to get the item.
+
+        Returns:
+            Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
+            Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
+        """
+        item: Dict[str, Union[str, Tensor]] = {}
+
+        image_path = self.samples.image_path[index]
+        image = read_image(image_path)
+
+        if self.split == "train" or self.task == "classification":
+            pre_processed = self.pre_process(image=image)
+            item = {"image": pre_processed["image"]}
+        elif self.split in ["val", "test"]:
+            label_index = self.samples.label_index[index]
+
+            item["image_path"] = image_path
+            item["label"] = label_index
+
+            if self.task == "segmentation":
+                mask_path = self.samples.mask_path[index]
+
+                # Only Anomalous (1) images have masks in the MVTec dataset.
+                # Therefore, create an empty mask for Normal (0) images.
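+                # The zero mask below matches the image's spatial size; stored
+                # masks are read as grayscale and rescaled from 0-255 to [0, 1],
+                # so downstream transforms receive a float mask in either branch.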
+ if label_index == 0: + mask = np.zeros(shape=image.shape[:2]) + else: + mask = cv2.imread(mask_path, flags=0) / 255.0 + + pre_processed = self.pre_process(image=image, mask=mask) + + item["mask_path"] = mask_path + item["image"] = pre_processed["image"] + item["mask"] = pre_processed["mask"] + + return item + + +class CustomDataModule(LightningDataModule): + """Custom Lightning Data Module.""" + + def __init__( + self, + root: Union[str, Path], + normal: str = "normal", + abnormal: str = "abnormal", + mask_dir: Optional[Union[Path, str]] = None, + extensions: Optional[Tuple[str, ...]] = None, + split_ratio: float = 0.2, + seed: int = 0, + image_size: Optional[Union[int, Tuple[int, int]]] = None, + train_batch_size: int = 32, + test_batch_size: int = 32, + num_workers: int = 8, + transform_config: Optional[Union[str, A.Compose]] = None, + create_validation_set: bool = False, + ) -> None: + """Custom Dataset PL Datamodule. + + Args: + root (Union[str, Path]): Path to the root folder containing normal and abnormal dirs. + normal (str, optional): Name of the directory containing normal images. + Defaults to "normal". + abnormal (str, optional): Name of the directory containing abnormal images. + Defaults to "abnormal". + mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing + the mask annotations. Defaults to None. + extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the + directory. Defaults to None. + split_ratio (float, optional): Ratio to split normal training images and add to the + test set in case test set doesn't contain any normal images. + Defaults to 0.2. + seed (int, optional): Random seed to ensure reproducibility when splitting. Defaults to 0. + image_size (Optional[Union[int, Tuple[int, int]]], optional): Size of the input image. + Defaults to None. + train_batch_size (int, optional): Training batch size. Defaults to 32. + test_batch_size (int, optional): Test batch size. Defaults to 32. + num_workers (int, optional): Number of workers. Defaults to 8. + transform_config (Optional[Union[str, A.Compose]], optional): Config for pre-processing. + Defaults to None. + create_validation_set (bool, optional):Boolean to create a validation set from the test set. + MVTec dataset does not contain a validation set. Those wanting to create a validation set + could set this flag to ``True``. + + Examples: + Assume that we use Custom Dataset for the MVTec/bottle/broken_large category. We would do: + >>> from anomalib.data import CustomDataModule + >>> datamodule = CustomDataModule( + ... root="./datasets/MVTec/bottle/test", + ... normal="good", + ... abnormal="broken_large", + ... image_size=256 + ... ) + >>> datamodule.setup() + >>> i, data = next(enumerate(datamodule.train_dataloader())) + >>> data["image"].shape + torch.Size([16, 3, 256, 256]) + + >>> i, test_data = next(enumerate(datamodule.test_dataloader())) + >>> test_data.keys() + dict_keys(['image']) + + We could also create a Custom DataModule for datasets containing mask annotations. + The dataset expects that mask annotation filenames must be same as the original filename. + To show an example, we therefore need to modify the mask filenames in MVTec dataset. + + >>> # Rename MVTec mask annotations so that they are the same as image filanames + >>> folder = Path("./datasets/bottle/ground_truth/") + >>> for f in folder.glob(r"**/*.png"): + ... 
f.rename(f.parent / (f.stem.split("_")[0] + f.suffix)) + + Now we could try custom data module using the mvtec bottle broken large category + >>> datamodule = CustomDataModule( + ... root="./datasets/bottle/test", + ... normal="good", + ... abnormal="broken_large", + ... mask_dir="./datasets/bottle/ground_truth/broken_large", + ... image_size=256 + ... ) + + >>> i , train_data = next(enumerate(datamodule.train_dataloader())) + >>> train_data.keys() + dict_keys(['image']) + >>> train_data["image"].shape + torch.Size([16, 3, 256, 256]) + + >>> i, test_data = next(enumerate(datamodule.test_dataloader())) + dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask']) + >>> print(test_data["image"].shape, test_data["mask"].shape) + torch.Size([24, 3, 256, 256]) torch.Size([24, 256, 256]) + + By default, Custom Data Module does not create a validation set. If a validation set + is needed it could be set as follows: + + >>> datamodule = CustomDataModule( + ... root="./datasets/bottle/test", + ... normal="good", + ... abnormal="broken_large", + ... mask_dir="./datasets/bottle/ground_truth/broken_large", + ... image_size=256, + ... create_validation_set=True, + ... ) + + >>> i, val_data = next(enumerate(datamodule.val_dataloader())) + >>> val_data.keys() + dict_keys(['image_path', 'label', 'mask_path', 'image', 'mask']) + >>> print(val_data["image"].shape, val_data["mask"].shape) + torch.Size([12, 3, 256, 256]) torch.Size([12, 256, 256]) + + >>> i, test_data = next(enumerate(datamodule.test_dataloader())) + >>> print(test_data["image"].shape, test_data["mask"].shape) + torch.Size([12, 3, 256, 256]) torch.Size([12, 256, 256]) + + """ + super().__init__() + + self.root = _check_and_convert_path(root) + self.normal_dir = self.root / normal + self.abnormal_dir = self.root / abnormal + self.mask_dir = mask_dir + self.extensions = extensions + self.split_ratio = split_ratio + self.task = "classification" if mask_dir is None else "segmentation" + self.transform_config = transform_config + self.image_size = image_size + + self.pre_process = PreProcessor(config=self.transform_config, image_size=self.image_size) + + self.train_batch_size = train_batch_size + self.test_batch_size = test_batch_size + self.num_workers = num_workers + + self.create_validation_set = create_validation_set + self.seed = seed + + self.train_data: Dataset + self.test_data: Dataset + if create_validation_set: + self.val_data: Dataset + self.inference_data: Dataset + + def setup(self, stage: Optional[str] = None) -> None: + """Setup train, validation and test data. + + Args: + stage: Optional[str]: Train/Val/Test stages. 
(Default value = None) + + """ + if stage in (None, "fit"): + self.train_data = CustomDataset( + normal_dir=self.normal_dir, + abnormal_dir=self.abnormal_dir, + split="train", + split_ratio=self.split_ratio, + mask_dir=self.mask_dir, + pre_process=self.pre_process, + extensions=self.extensions, + task=self.task, + seed=self.seed, + create_validation_set=self.create_validation_set, + ) + + if self.create_validation_set: + self.val_data = CustomDataset( + normal_dir=self.normal_dir, + abnormal_dir=self.abnormal_dir, + split="val", + split_ratio=self.split_ratio, + mask_dir=self.mask_dir, + pre_process=self.pre_process, + extensions=self.extensions, + task=self.task, + seed=self.seed, + create_validation_set=self.create_validation_set, + ) + + self.test_data = CustomDataset( + normal_dir=self.normal_dir, + abnormal_dir=self.abnormal_dir, + split="test", + split_ratio=self.split_ratio, + mask_dir=self.mask_dir, + pre_process=self.pre_process, + extensions=self.extensions, + task=self.task, + seed=self.seed, + create_validation_set=self.create_validation_set, + ) + + if stage == "predict": + self.inference_data = InferenceDataset( + path=self.root, image_size=self.image_size, transform_config=self.transform_config + ) + + def train_dataloader(self) -> TRAIN_DATALOADERS: + """Get train dataloader.""" + return DataLoader(self.train_data, shuffle=True, batch_size=self.train_batch_size, num_workers=self.num_workers) + + def val_dataloader(self) -> EVAL_DATALOADERS: + """Get validation dataloader.""" + dataset = self.val_data if self.create_validation_set else self.test_data + return DataLoader(dataset=dataset, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + + def test_dataloader(self) -> EVAL_DATALOADERS: + """Get test dataloader.""" + return DataLoader(self.test_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers) + + def predict_dataloader(self) -> EVAL_DATALOADERS: + """Get predict dataloader.""" + return DataLoader( + self.inference_data, shuffle=False, batch_size=self.test_batch_size, num_workers=self.num_workers + ) diff --git a/anomalib/data/folder.py b/anomalib/data/folder.py deleted file mode 100644 index eeb2524a34..0000000000 --- a/anomalib/data/folder.py +++ /dev/null @@ -1,170 +0,0 @@ -"""Custom Folder Dataset. - -This script creates a custom dataset from a folder. -""" - -# Copyright (C) 2020 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions -# and limitations under the License. 
- -import logging -import tarfile -from distutils import extension -from pathlib import Path -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union -from urllib.request import urlretrieve - -import albumentations as A -import cv2 -import numpy as np -import pandas as pd -from pandas.core.frame import DataFrame -from pytorch_lightning.core.datamodule import LightningDataModule -from pytorch_lightning.utilities.types import EVAL_DATALOADERS, TRAIN_DATALOADERS -from torch import Tensor -from torch.utils.data import DataLoader, Dataset -from torch.utils.data.dataset import Dataset -from torchvision.datasets.folder import IMG_EXTENSIONS - -from anomalib.data.inference import InferenceDataset -from anomalib.data.utils import DownloadProgressBar, read_image -from anomalib.data.utils.split import ( - create_validation_set_from_test_set, - split_normal_images_in_train_set, -) -from anomalib.pre_processing import PreProcessor - -logger = logging.getLogger(name="Dataset: MVTec") -logger.setLevel(logging.DEBUG) - - -def __check_and_convert_path(path: Union[str, Path]) -> Path: - """Check an input path, and convert to Pathlib object. - - Args: - path (Union[str, Path]): Input path. - - Returns: - Path: Output path converted to pathlib object. - """ - if not isinstance(path, Path): - path = Path(path) - return path - - -def make_dataset( - normal_dir: Path, - abnormal_dir: Path, - split: Optional[str] = None, - split_ratio: float = 0.1, - seed: int = 0, - create_validation_set: bool = False, - extensions: Optional[Tuple[str, ...]] = None, -) -> DataFrame: - """Create a folder dataset.""" - - normal_dir = __check_and_convert_path(normal_dir) - abnormal_dir = __check_and_convert_path(abnormal_dir) - - if extensions is None: - extensions = IMG_EXTENSIONS - - normal_filenames = [f for f in normal_dir.glob(r"**/*") if f.suffix in extensions] - abnormal_filenames = [f for f in abnormal_dir.glob(r"**/*") if f.suffix in extensions] - - # TODO: Create a pd dataframe based on the above filenames. - - # samples_list = [(str(path),) + filename.parts[-3:] for filename in path.glob("**/*.png")] - # if len(samples_list) == 0: - # raise RuntimeError(f"Found 0 images in {path}") - - # samples = pd.DataFrame(samples_list, columns=["path", "split", "label", "image_path"]) - # samples = samples[samples.split != "ground_truth"] - - # # Create mask_path column - # samples["mask_path"] = ( - # samples.path - # + "/ground_truth/" - # + samples.label - # + "/" - # + samples.image_path.str.rstrip("png").str.rstrip(".") - # + "_mask.png" - # ) - - # # Modify image_path column by converting to absolute path - # samples["image_path"] = samples.path + "/" + samples.split + "/" + samples.label + "/" + samples.image_path - - # # Split the normal images in training set if test set doesn't - # # contain any normal images. This is needed because AUC score - # # cannot be computed based on 1-class - # if sum((samples.split == "test") & (samples.label == "good")) == 0: - # samples = split_normal_images_in_train_set(samples, split_ratio, seed) - - # # Good images don't have mask - # samples.loc[(samples.split == "test") & (samples.label == "good"), "mask_path"] = "" - - # # Create label index for normal (0) and anomalous (1) images. 
-    # samples.loc[(samples.label == "good"), "label_index"] = 0
-    # samples.loc[(samples.label != "good"), "label_index"] = 1
-    # samples.label_index = samples.label_index.astype(int)
-
-    # if create_validation_set:
-    #     samples = create_validation_set_from_test_set(samples, seed=seed)
-
-    # # Get the data frame for the split.
-    # if split is not None and split in ["train", "val", "test"]:
-    #     samples = samples[samples.split == split]
-    #     samples = samples.reset_index(drop=True)
-
-    # return samples
-
-
-class FolderDataset(Dataset):
-    """Folder Dataset."""
-
-    def __init__(
-        self,
-        root: Union[str, Path],
-        normal: Union[Path, str],
-        abnormal: Union[Path, str],
-        split: str,
-        mask: Optional[Union[Path, str]] = None,
-        pre_process: Optional[PreProcessor] = None,
-        extensions: Optional[Sequence[str]] = None,
-        task: str = "segmentation",
-        seed: int = 0,
-        create_validation_set: bool = False,
-    ) -> None:
-        pass
-
-    def __len__(self) -> int:
-        """Get length of the dataset."""
-        pass
-
-    def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
-        """Get dataset item for the index ``index``.
-
-        Args:
-            index (int): Index to get the item.
-
-        Returns:
-            Union[Dict[str, Tensor], Dict[str, Union[str, Tensor]]]: Dict of image tensor during training.
-            Otherwise, Dict containing image path, target path, image tensor, label and transformed bounding box.
-        """
-        pass
-
-
-samples = make_dataset(
-    normal_dir="/home/sakcay/projects/anomalib/datasets/MVTec/bottle/test/good",
-    abnormal_dir="/home/sakcay/projects/anomalib/datasets/MVTec/bottle/test/broken_large",
-)
From 83c138496a2ef3828e5cf6f421d64d70be7a2abe Mon Sep 17 00:00:00 2001
From: Samet Akcay
Date: Mon, 21 Mar 2022 23:08:16 -0700
Subject: [PATCH 16/24] Added custom dataset tests

---
 anomalib/data/custom.py                  |  6 +++
 tests/pre_merge/datasets/test_dataset.py | 53 +++++++++++++++++++++++-
 2 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/anomalib/data/custom.py b/anomalib/data/custom.py
index b7dba3bbce..33f620e27b 100644
--- a/anomalib/data/custom.py
+++ b/anomalib/data/custom.py
@@ -99,6 +99,12 @@ def make_dataset(
     abnormal_filenames = [f for f in abnormal_dir.glob(r"**/*") if f.suffix in extensions]
     filenames = normal_filenames + abnormal_filenames
 
+    if len(normal_filenames) == 0:
+        raise RuntimeError(f"Found 0 normal images in {normal_dir}")
+
+    if len(abnormal_filenames) == 0:
+        raise RuntimeError(f"Found 0 abnormal images in {abnormal_dir}")
+
     # Add normal and abnormal labels to the samples as `label` column.
normal_labels = ["normal"] * len(normal_filenames) abnormal_labels = ["abnormal"] * len(abnormal_filenames) diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index ab9844c843..f1203a7d61 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -1,10 +1,17 @@ """Test Dataset.""" +import os + import numpy as np import pytest from anomalib.config import get_configurable_parameters, update_input_size_config -from anomalib.data import BTechDataModule, MVTecDataModule, get_datamodule +from anomalib.data import ( + BTechDataModule, + CustomDataModule, + MVTecDataModule, + get_datamodule, +) from anomalib.pre_processing.transforms import Denormalize, ToNumpy from tests.helpers.dataset import TestDataset, get_dataset_path @@ -42,6 +49,27 @@ def btech_data_module(): return datamodule +@pytest.fixture(autouse=True) +def custom_data_module(): + """Create Custom Data Module.""" + datamodule = CustomDataModule( + root="./datasets/bottle/test", + normal="good", + abnormal="broken_large", + mask_dir="./datasets/bottle/ground_truth/broken_large", + split_ratio=0.2, + seed=0, + image_size=(256, 256), + train_batch_size=32, + test_batch_size=32, + num_workers=8, + create_validation_set=True, + ) + datamodule.setup() + + return datamodule + + @pytest.fixture(autouse=True) def data_sample(mvtec_data_module): _, data = next(enumerate(mvtec_data_module.train_dataloader())) @@ -49,6 +77,8 @@ def data_sample(mvtec_data_module): class TestMVTecDataModule: + """Test MVTec Data Module.""" + def test_batch_size(self, mvtec_data_module): """test_mvtec_datamodule [summary]""" _, train_data_sample = next(enumerate(mvtec_data_module.train_dataloader())) @@ -69,7 +99,7 @@ class TestBTechDataModule: """Test BTech Data Module.""" def test_batch_size(self, btech_data_module): - """test_btech_datamodule [summary]""" + """Test batch size.""" _, train_data_sample = next(enumerate(btech_data_module.train_dataloader())) _, val_data_sample = next(enumerate(btech_data_module.val_dataloader())) assert train_data_sample["image"].shape[0] == 1 @@ -84,6 +114,25 @@ def test_val_and_test_dataloaders_has_mask_and_gt(self, btech_data_module): assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) +class TestCustomDataModule: + """Test Custom Data Module.""" + + def test_batch_size(self, custom_data_module): + """Test batch size.""" + _, train_data_sample = next(enumerate(custom_data_module.train_dataloader())) + _, val_data_sample = next(enumerate(custom_data_module.val_dataloader())) + assert train_data_sample["image"].shape[0] == 16 + assert val_data_sample["image"].shape[0] == 12 + + def test_val_and_test_dataloaders_has_mask_and_gt(self, custom_data_module): + """Test Validation and Test dataloaders should return filenames, image, mask and label.""" + _, val_data = next(enumerate(custom_data_module.val_dataloader())) + _, test_data = next(enumerate(custom_data_module.test_dataloader())) + + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) + assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) + + class TestDenormalize: """Test Denormalize Util.""" From 09908b0915155f64a55863ef82495985f3a6f740 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Mon, 21 Mar 2022 23:48:03 -0700 Subject: [PATCH 17/24] updated config.yaml file to show custom dataset is available --- anomalib/models/cflow/config.yaml | 2 +- 
anomalib/models/dfkde/config.yaml | 2 +- anomalib/models/dfm/config.yaml | 2 +- anomalib/models/ganomaly/config.yaml | 2 +- anomalib/models/padim/config.yaml | 32 +++++++++++++++++++-------- anomalib/models/patchcore/config.yaml | 2 +- anomalib/models/stfpm/config.yaml | 2 +- 7 files changed, 29 insertions(+), 15 deletions(-) diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index 5dd6a47e5a..5eb3ed5178 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech] + name: mvtec #options: [mvtec, btech, custom] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml index 56c74ecd5f..53163fcec0 100644 --- a/anomalib/models/dfkde/config.yaml +++ b/anomalib/models/dfkde/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech] + name: mvtec #options: [mvtec, btech, custom] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml index 83548f1139..587ea95331 100755 --- a/anomalib/models/dfm/config.yaml +++ b/anomalib/models/dfm/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech] + name: mvtec #options: [mvtec, btech, custom] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml index 0fcacf309f..fba8949228 100644 --- a/anomalib/models/ganomaly/config.yaml +++ b/anomalib/models/ganomaly/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech] + name: mvtec #options: [mvtec, btech, custom] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml index 86cb2d1aac..2bcaa5f019 100644 --- a/anomalib/models/padim/config.yaml +++ b/anomalib/models/padim/config.yaml @@ -1,10 +1,24 @@ dataset: - name: mvtec #options: [mvtec, btech] - format: mvtec - path: ./datasets/MVTec - category: bottle + name: custom #options: [mvtec, btech, custom] + path: ./datasets/bottle/test + normal: good + abnormal: broken_large + mask: ./datasets/bottle/ground_truth/broken_large + extensions: null + split_ratio: 0.2 + seed: 0 + image_size: 256 + train_batch_size: 32 + test_batch_size: 32 + num_workers: 8 + transform_config: null + create_validation_set: true + # name: mvtec #options: [mvtec, btech, custom] + # format: mvtec + # path: ./datasets/MVTec + # category: bottle task: segmentation - label_format: None + # label_format: None tiling: apply: false tile_size: null @@ -12,10 +26,10 @@ dataset: remove_border_count: 0 use_random_tiling: False random_tile_count: 16 - image_size: 256 - train_batch_size: 32 - test_batch_size: 32 - num_workers: 36 + # image_size: 256 + # train_batch_size: 32 + # test_batch_size: 32 + # num_workers: 36 model: name: padim diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml index 599d957f9d..64d6092405 100644 --- a/anomalib/models/patchcore/config.yaml +++ b/anomalib/models/patchcore/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech] + name: mvtec #options: [mvtec, btech, custom] format: mvtec path: ./datasets/MVTec task: segmentation diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml index 958d7d6a58..77d983fab2 100644 --- a/anomalib/models/stfpm/config.yaml +++ b/anomalib/models/stfpm/config.yaml @@ 
-1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech] + name: mvtec #options: [mvtec, btech, custom] format: mvtec path: ./datasets/MVTec category: bottle From 215df462dfdf065d417450aeca812d078ad69fee Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Tue, 22 Mar 2022 00:12:21 -0700 Subject: [PATCH 18/24] Added custom dataset to get_datamodule --- README.md | 27 +++++++++++++++++++++++++++ anomalib/config/config.py | 3 ++- anomalib/data/__init__.py | 20 +++++++++++++++++++- anomalib/data/custom.py | 12 +++++++++++- 4 files changed, 59 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 6f5a463658..934bf5f09e 100644 --- a/README.md +++ b/README.md @@ -102,6 +102,33 @@ where the currently available models are: - [DFKDE](anomalib/models/dfkde) - [GANomaly](anomalib/models/ganomaly) +### Custom Dataset +It is also possible to train on a custom dataset. To do so, `data` section in `config.yaml` is to be modified as follows: +```yaml +dataset: + name: custom + path: + normal: normal # name of the folder containing normal images. + abnormal: abnormal # name of the folder containing abnormal images. + task: segmentation # classification or segmentation + mask: #optional + extensions: null + split_ratio: 0.2 + seed: 0 + image_size: 256 + train_batch_size: 32 + test_batch_size: 32 + num_workers: 8 + transform_config: null + create_validation_set: true + tiling: + apply: false + tile_size: null + stride: null + remove_border_count: 0 + use_random_tiling: False + random_tile_count: 16 +``` ## Inference Anomalib contains several tools that can be used to perform inference with a trained model. The script in [`tools/inference`](tools/inference.py) contains an example of how the inference tools can be used to generate a prediction for an input image. diff --git a/anomalib/config/config.py b/anomalib/config/config.py index 27c652d7a6..ed82f4b480 100644 --- a/anomalib/config/config.py +++ b/anomalib/config/config.py @@ -177,7 +177,8 @@ def get_configurable_parameters( config = update_input_size_config(config) # Project Configs - project_path = Path(config.project.path) / config.model.name / config.dataset.name / config.dataset.category + category = config.dataset.category if "category" in config.dataset.keys() else "" + project_path = Path(config.project.path) / config.model.name / config.dataset.name / category (project_path / "weights").mkdir(parents=True, exist_ok=True) (project_path / "images").mkdir(parents=True, exist_ok=True) config.project.path = str(project_path) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 6aa8fab510..55a60af20e 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -20,6 +20,7 @@ from pytorch_lightning import LightningDataModule from .btech import BTechDataModule +from .custom import CustomDataModule from .inference import InferenceDataset from .mvtec import MVTecDataModule @@ -51,12 +52,29 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule # TODO: Remove config values. 
IAAALD-211
             root=config.dataset.path,
             category=config.dataset.category,
-            image_size=(config.dataset.image_size[0], config.dataset.image_size[0]),
+            image_size=(config.dataset.image_size[0], config.dataset.image_size[1]),
             train_batch_size=config.dataset.train_batch_size,
             test_batch_size=config.dataset.test_batch_size,
             num_workers=config.dataset.num_workers,
             seed=config.project.seed,
         )
+    elif config.dataset.name.lower() == "custom":
+        datamodule = CustomDataModule(
+            root=config.dataset.path,
+            normal=config.dataset.normal,
+            abnormal=config.dataset.abnormal,
+            task=config.dataset.task,
+            mask_dir=config.dataset.mask,
+            extensions=config.dataset.extensions,
+            split_ratio=config.dataset.split_ratio,
+            seed=config.dataset.seed,
+            image_size=(config.dataset.image_size[0], config.dataset.image_size[1]),
+            train_batch_size=config.dataset.train_batch_size,
+            test_batch_size=config.dataset.test_batch_size,
+            num_workers=config.dataset.num_workers,
+            transform_config=config.dataset.transform_config,
+            create_validation_set=config.dataset.create_validation_set,
+        )
     else:
         raise ValueError(
             "Unknown dataset! \n"
diff --git a/anomalib/data/custom.py b/anomalib/data/custom.py
index 33f620e27b..96c2f5daca 100644
--- a/anomalib/data/custom.py
+++ b/anomalib/data/custom.py
@@ -272,6 +272,7 @@ def __init__(
         root: Union[str, Path],
         normal: str = "normal",
         abnormal: str = "abnormal",
+        task: str = "classification",
         mask_dir: Optional[Union[Path, str]] = None,
         extensions: Optional[Tuple[str, ...]] = None,
         split_ratio: float = 0.2,
@@ -291,6 +292,8 @@ def __init__(
                 Defaults to "normal".
             abnormal (str, optional): Name of the directory containing abnormal images.
                 Defaults to "abnormal".
+            task (str, optional): Task type. Could be either classification or segmentation.
+                Defaults to "classification".
             mask_dir (Optional[Union[str, Path]], optional): Path to the directory containing
                 the mask annotations. Defaults to None.
             extensions (Optional[Tuple[str, ...]], optional): Type of the image extensions to read from the
@@ -388,7 +391,14 @@ def __init__(
         self.mask_dir = mask_dir
         self.extensions = extensions
         self.split_ratio = split_ratio
-        self.task = "classification" if mask_dir is None else "segmentation"
+
+        if task == "classification" and mask_dir is not None:
+            raise ValueError(
+                "Classification task is set, but a mask directory is provided. "
+                "If a mask directory is provided, the task type must be segmentation. "
+                "Check your configuration."
+ ) + self.task = task self.transform_config = transform_config self.image_size = image_size From cf22594dbf44b70bee4577780de91360385f632d Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Wed, 23 Mar 2022 06:03:44 -0700 Subject: [PATCH 19/24] Address PR comments --- README.md | 9 ++--- anomalib/config/config.py | 6 ++-- anomalib/data/__init__.py | 18 ++++++---- anomalib/data/{custom.py => folder.py} | 46 +++++++++++------------- anomalib/models/cflow/config.yaml | 2 +- anomalib/models/dfkde/config.yaml | 2 +- anomalib/models/dfm/config.yaml | 2 +- anomalib/models/ganomaly/config.yaml | 2 +- anomalib/models/padim/config.yaml | 2 +- anomalib/models/patchcore/config.yaml | 2 +- anomalib/models/stfpm/config.yaml | 2 +- tests/pre_merge/datasets/test_dataset.py | 25 ++++++------- 12 files changed, 61 insertions(+), 57 deletions(-) rename anomalib/data/{custom.py => folder.py} (93%) diff --git a/README.md b/README.md index 934bf5f09e..0ca8329af9 100644 --- a/README.md +++ b/README.md @@ -103,17 +103,18 @@ where the currently available models are: - [GANomaly](anomalib/models/ganomaly) ### Custom Dataset -It is also possible to train on a custom dataset. To do so, `data` section in `config.yaml` is to be modified as follows: +It is also possible to train on a custom folder dataset. To do so, `data` section in `config.yaml` is to be modified as follows: ```yaml dataset: - name: custom - path: + name: + format: folder + path: normal: normal # name of the folder containing normal images. abnormal: abnormal # name of the folder containing abnormal images. task: segmentation # classification or segmentation mask: #optional extensions: null - split_ratio: 0.2 + split_ratio: 0.2 # ratio of the normal images that will be used to create a test split seed: 0 image_size: 256 train_batch_size: 32 diff --git a/anomalib/config/config.py b/anomalib/config/config.py index ed82f4b480..ee2e3177bd 100644 --- a/anomalib/config/config.py +++ b/anomalib/config/config.py @@ -177,8 +177,10 @@ def get_configurable_parameters( config = update_input_size_config(config) # Project Configs - category = config.dataset.category if "category" in config.dataset.keys() else "" - project_path = Path(config.project.path) / config.model.name / config.dataset.name / category + project_path = Path(config.project.path) / config.model.name / config.dataset.name + if config.dataset.format.lower() in ("btech", "mvtec"): + project_path = project_path / config.dataset.category + (project_path / "weights").mkdir(parents=True, exist_ok=True) (project_path / "images").mkdir(parents=True, exist_ok=True) config.project.path = str(project_path) diff --git a/anomalib/data/__init__.py b/anomalib/data/__init__.py index 55a60af20e..a6f3f3b1f1 100644 --- a/anomalib/data/__init__.py +++ b/anomalib/data/__init__.py @@ -20,7 +20,7 @@ from pytorch_lightning import LightningDataModule from .btech import BTechDataModule -from .custom import CustomDataModule +from .folder import FolderDataModule from .inference import InferenceDataset from .mvtec import MVTecDataModule @@ -36,7 +36,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule """ datamodule: LightningDataModule - if config.dataset.name.lower() == "mvtec": + if config.dataset.format.lower() == "mvtec": datamodule = MVTecDataModule( # TODO: Remove config values. 
IAAALD-211
             root=config.dataset.path,
@@ -47,7 +47,7 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule
             num_workers=config.dataset.num_workers,
             seed=config.project.seed,
         )
-    elif config.dataset.name.lower() == "btech":
+    elif config.dataset.format.lower() == "btech":
         datamodule = BTechDataModule(
             # TODO: Remove config values. IAAALD-211
             root=config.dataset.path,
@@ -58,8 +58,8 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule
             num_workers=config.dataset.num_workers,
             seed=config.project.seed,
         )
-    elif config.dataset.name.lower() == "custom":
-        datamodule = CustomDataModule(
+    elif config.dataset.format.lower() == "folder":
+        datamodule = FolderDataModule(
             root=config.dataset.path,
             normal=config.dataset.normal,
             abnormal=config.dataset.abnormal,
@@ -85,4 +85,10 @@ def get_datamodule(config: Union[DictConfig, ListConfig]) -> LightningDataModule
     return datamodule
 
 
-__all__ = ["get_datamodule", "InferenceDataset"]
+__all__ = [
+    "get_datamodule",
+    "BTechDataModule",
+    "FolderDataModule",
+    "InferenceDataset",
+    "MVTecDataModule",
+]
diff --git a/anomalib/data/custom.py b/anomalib/data/folder.py
similarity index 93%
rename from anomalib/data/custom.py
rename to anomalib/data/folder.py
index 96c2f5daca..d935d75292 100644
--- a/anomalib/data/custom.py
+++ b/anomalib/data/folder.py
@@ -39,7 +39,7 @@
 )
 from anomalib.pre_processing import PreProcessor
 
-logger = logging.getLogger(name="Dataset: Custom Dataset")
+logger = logging.getLogger(name="Dataset: Folder Dataset")
 logger.setLevel(logging.DEBUG)
 
 
@@ -67,7 +67,7 @@ def make_dataset(
     create_validation_set: bool = True,
     extensions: Optional[Tuple[str, ...]] = None,
 ):
-    """Make Custom Dataset.
+    """Make Folder Dataset.
 
     Args:
         normal_dir (Union[str, Path]): Path to the directory containing normal images.
@@ -151,8 +151,8 @@ def make_dataset(
     return samples
 
 
-class CustomDataset(Dataset):
-    """Custom Dataset."""
+class FolderDataset(Dataset):
+    """Folder Dataset."""
 
     def __init__(
         self,
@@ -167,7 +167,7 @@ def __init__(
         seed: int = 0,
         create_validation_set: bool = False,
     ) -> None:
-        """Create Custom Folder Dataset.
+        """Create Folder Dataset.
 
         Args:
             normal_dir (Union[str, Path]): Path to the directory containing normal images.
@@ -264,8 +264,8 @@ def __getitem__(self, index: int) -> Dict[str, Union[str, Tensor]]:
         return item
 
 
-class CustomDataModule(LightningDataModule):
-    """Custom Lightning Data Module."""
+class FolderDataModule(LightningDataModule):
+    """Folder Lightning Data Module."""
 
     def __init__(
         self,
@@ -284,7 +284,7 @@ def __init__(
         transform_config: Optional[Union[str, A.Compose]] = None,
         create_validation_set: bool = False,
     ) -> None:
-        """Custom Dataset PL Datamodule.
+        """Folder Dataset PL Datamodule.
 
         Args:
             root (Union[str, Path]): Path to the root folder containing normal and abnormal dirs.
@@ -314,9 +314,9 @@ def __init__(
             could set this flag to ``True``.
 
         Examples:
-            Assume that we use Custom Dataset for the MVTec/bottle/broken_large category. We would do:
-            >>> from anomalib.data import CustomDataModule
-            >>> datamodule = CustomDataModule(
+            Assume that we use Folder Dataset for the MVTec/bottle/broken_large category. We would do:
+            >>> from anomalib.data import FolderDataModule
+            >>> datamodule = FolderDataModule(
             ...     root="./datasets/MVTec/bottle/test",
             ...     normal="good",
             ...     abnormal="broken_large",
@@ -331,17 +331,11 @@ def __init__(
             >>> test_data.keys()
             dict_keys(['image'])
 
-            We could also create a Custom DataModule for datasets containing mask annotations.
+ We could also create a Folder DataModule for datasets containing mask annotations. The dataset expects that mask annotation filenames must be same as the original filename. - To show an example, we therefore need to modify the mask filenames in MVTec dataset. - - >>> # Rename MVTec mask annotations so that they are the same as image filanames - >>> folder = Path("./datasets/bottle/ground_truth/") - >>> for f in folder.glob(r"**/*.png"): - ... f.rename(f.parent / (f.stem.split("_")[0] + f.suffix)) - - Now we could try custom data module using the mvtec bottle broken large category - >>> datamodule = CustomDataModule( + To this end, we modified mask filenames in MVTec bottle category. + Now we could try folder data module using the mvtec bottle broken large category + >>> datamodule = FolderDataModule( ... root="./datasets/bottle/test", ... normal="good", ... abnormal="broken_large", @@ -360,10 +354,10 @@ def __init__( >>> print(test_data["image"].shape, test_data["mask"].shape) torch.Size([24, 3, 256, 256]) torch.Size([24, 256, 256]) - By default, Custom Data Module does not create a validation set. If a validation set + By default, Folder Data Module does not create a validation set. If a validation set is needed it could be set as follows: - >>> datamodule = CustomDataModule( + >>> datamodule = FolderDataModule( ... root="./datasets/bottle/test", ... normal="good", ... abnormal="broken_large", @@ -425,7 +419,7 @@ def setup(self, stage: Optional[str] = None) -> None: """ if stage in (None, "fit"): - self.train_data = CustomDataset( + self.train_data = FolderDataset( normal_dir=self.normal_dir, abnormal_dir=self.abnormal_dir, split="train", @@ -439,7 +433,7 @@ def setup(self, stage: Optional[str] = None) -> None: ) if self.create_validation_set: - self.val_data = CustomDataset( + self.val_data = FolderDataset( normal_dir=self.normal_dir, abnormal_dir=self.abnormal_dir, split="val", @@ -452,7 +446,7 @@ def setup(self, stage: Optional[str] = None) -> None: create_validation_set=self.create_validation_set, ) - self.test_data = CustomDataset( + self.test_data = FolderDataset( normal_dir=self.normal_dir, abnormal_dir=self.abnormal_dir, split="test", diff --git a/anomalib/models/cflow/config.yaml b/anomalib/models/cflow/config.yaml index 5eb3ed5178..915e371745 100644 --- a/anomalib/models/cflow/config.yaml +++ b/anomalib/models/cflow/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech, custom] + name: mvtec #options: [mvtec, btech, folder] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/dfkde/config.yaml b/anomalib/models/dfkde/config.yaml index 53163fcec0..abd2fba02f 100644 --- a/anomalib/models/dfkde/config.yaml +++ b/anomalib/models/dfkde/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech, custom] + name: mvtec #options: [mvtec, btech, folder] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/dfm/config.yaml b/anomalib/models/dfm/config.yaml index 587ea95331..6740e86f38 100755 --- a/anomalib/models/dfm/config.yaml +++ b/anomalib/models/dfm/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech, custom] + name: mvtec #options: [mvtec, btech, folder] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/ganomaly/config.yaml b/anomalib/models/ganomaly/config.yaml index fba8949228..774537b903 100644 --- a/anomalib/models/ganomaly/config.yaml +++ b/anomalib/models/ganomaly/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: 
[mvtec, btech, custom] + name: mvtec #options: [mvtec, btech, folder] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/padim/config.yaml b/anomalib/models/padim/config.yaml index 86cb2d1aac..b6ac798373 100644 --- a/anomalib/models/padim/config.yaml +++ b/anomalib/models/padim/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech] + name: mvtec #options: [mvtec, btech, folder] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/anomalib/models/patchcore/config.yaml b/anomalib/models/patchcore/config.yaml index 64d6092405..10cfe9abf2 100644 --- a/anomalib/models/patchcore/config.yaml +++ b/anomalib/models/patchcore/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech, custom] + name: mvtec #options: [mvtec, btech, folder] format: mvtec path: ./datasets/MVTec task: segmentation diff --git a/anomalib/models/stfpm/config.yaml b/anomalib/models/stfpm/config.yaml index 77d983fab2..ab17e5fd29 100644 --- a/anomalib/models/stfpm/config.yaml +++ b/anomalib/models/stfpm/config.yaml @@ -1,5 +1,5 @@ dataset: - name: mvtec #options: [mvtec, btech, custom] + name: mvtec #options: [mvtec, btech, folder] format: mvtec path: ./datasets/MVTec category: bottle diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index f1203a7d61..9e428d9113 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -8,7 +8,7 @@ from anomalib.config import get_configurable_parameters, update_input_size_config from anomalib.data import ( BTechDataModule, - CustomDataModule, + FolderDataModule, MVTecDataModule, get_datamodule, ) @@ -50,13 +50,14 @@ def btech_data_module(): @pytest.fixture(autouse=True) -def custom_data_module(): - """Create Custom Data Module.""" - datamodule = CustomDataModule( +def folder_data_module(): + """Create Folder Data Module.""" + datamodule = FolderDataModule( root="./datasets/bottle/test", normal="good", abnormal="broken_large", mask_dir="./datasets/bottle/ground_truth/broken_large", + task="segmentation", split_ratio=0.2, seed=0, image_size=(256, 256), @@ -114,20 +115,20 @@ def test_val_and_test_dataloaders_has_mask_and_gt(self, btech_data_module): assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) -class TestCustomDataModule: - """Test Custom Data Module.""" +class TestFolderDataModule: + """Test Folder Data Module.""" - def test_batch_size(self, custom_data_module): + def test_batch_size(self, folder_data_module): """Test batch size.""" - _, train_data_sample = next(enumerate(custom_data_module.train_dataloader())) - _, val_data_sample = next(enumerate(custom_data_module.val_dataloader())) + _, train_data_sample = next(enumerate(folder_data_module.train_dataloader())) + _, val_data_sample = next(enumerate(folder_data_module.val_dataloader())) assert train_data_sample["image"].shape[0] == 16 assert val_data_sample["image"].shape[0] == 12 - def test_val_and_test_dataloaders_has_mask_and_gt(self, custom_data_module): + def test_val_and_test_dataloaders_has_mask_and_gt(self, folder_data_module): """Test Validation and Test dataloaders should return filenames, image, mask and label.""" - _, val_data = next(enumerate(custom_data_module.val_dataloader())) - _, test_data = next(enumerate(custom_data_module.test_dataloader())) + _, val_data = next(enumerate(folder_data_module.val_dataloader())) + _, test_data = next(enumerate(folder_data_module.test_dataloader())) assert 
sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(val_data.keys()) assert sorted(["image_path", "mask_path", "image", "label", "mask"]) == sorted(test_data.keys()) From 6646c3b93df2be23bb561864ae12f6dafe0de3a9 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Wed, 23 Mar 2022 07:56:38 -0700 Subject: [PATCH 20/24] fix dataset path --- tests/pre_merge/datasets/test_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index 9e428d9113..b74d07dac5 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -53,7 +53,7 @@ def btech_data_module(): def folder_data_module(): """Create Folder Data Module.""" datamodule = FolderDataModule( - root="./datasets/bottle/test", + root=get_dataset_path(dataset="bottle/test"), normal="good", abnormal="broken_large", mask_dir="./datasets/bottle/ground_truth/broken_large", From b3cf100c94f37bb4121b04e5f036cd8a4b27be4d Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Mar 2022 01:13:18 -0700 Subject: [PATCH 21/24] Debugging the ci --- .github/workflows/pre_merge.yml | 4 ++-- tox.ini | 11 ++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pre_merge.yml b/.github/workflows/pre_merge.yml index 0f0e8c2223..d7fd00c8a8 100644 --- a/.github/workflows/pre_merge.yml +++ b/.github/workflows/pre_merge.yml @@ -18,8 +18,8 @@ jobs: uses: actions/checkout@v2 - name: Install Tox run: pip install tox - - name: Code quality checks - run: tox -e black,isort,flake8,pylint,mypy,pydocstyle + # - name: Code quality checks + # run: tox -e black,isort,flake8,pylint,mypy,pydocstyle - name: Coverage run: | export ANOMALIB_DATASET_PATH=/media/data1/datasets/ diff --git a/tox.ini b/tox.ini index 74aee167b3..e6e765b2ee 100644 --- a/tox.ini +++ b/tox.ini @@ -72,11 +72,12 @@ deps = -r{toxinidir}/requirements/base.txt -r{toxinidir}/requirements/openvino.txt commands = - coverage erase - coverage run --include=anomalib/* -m pytest tests/pre_merge/ -ra --showlocals - ; https://github.com/openvinotoolkit/anomalib/issues/94 - coverage report -m --fail-under=85 - coverage xml -o {toxworkdir}/coverage.xml + python -m pytest tests/pre_merge/datasets/test_dataset.py + ; coverage erase + ; coverage run --include=anomalib/* -m pytest tests/pre_merge/ -ra --showlocals + ; ; https://github.com/openvinotoolkit/anomalib/issues/94 + ; coverage report -m --fail-under=85 + ; coverage xml -o {toxworkdir}/coverage.xml [testenv:nightly] basepython = python3 From 00e8020d7255cb5b6870128e5bbb1ff7ef675cf1 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Mar 2022 02:37:12 -0700 Subject: [PATCH 22/24] Fixed folder dataset tests --- tests/pre_merge/datasets/test_dataset.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/pre_merge/datasets/test_dataset.py b/tests/pre_merge/datasets/test_dataset.py index b74d07dac5..fad7a48e0b 100644 --- a/tests/pre_merge/datasets/test_dataset.py +++ b/tests/pre_merge/datasets/test_dataset.py @@ -52,11 +52,12 @@ def btech_data_module(): @pytest.fixture(autouse=True) def folder_data_module(): """Create Folder Data Module.""" + root = get_dataset_path(dataset="bottle") datamodule = FolderDataModule( - root=get_dataset_path(dataset="bottle/test"), + root=root, normal="good", abnormal="broken_large", - mask_dir="./datasets/bottle/ground_truth/broken_large", + mask_dir=os.path.join(root, "ground_truth/broken_large"), task="segmentation", 
split_ratio=0.2, seed=0, From 8e47bd35b223bfd61e21a264fff6c1d5e93f41b7 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Mar 2022 02:38:13 -0700 Subject: [PATCH 23/24] Added code quality checks back to the ci --- .github/workflows/pre_merge.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/pre_merge.yml b/.github/workflows/pre_merge.yml index d7fd00c8a8..0f0e8c2223 100644 --- a/.github/workflows/pre_merge.yml +++ b/.github/workflows/pre_merge.yml @@ -18,8 +18,8 @@ jobs: uses: actions/checkout@v2 - name: Install Tox run: pip install tox - # - name: Code quality checks - # run: tox -e black,isort,flake8,pylint,mypy,pydocstyle + - name: Code quality checks + run: tox -e black,isort,flake8,pylint,mypy,pydocstyle - name: Coverage run: | export ANOMALIB_DATASET_PATH=/media/data1/datasets/ From 314b16404799c6dcae948e5b284f77d8ccc4bf96 Mon Sep 17 00:00:00 2001 From: Samet Akcay Date: Thu, 24 Mar 2022 02:38:36 -0700 Subject: [PATCH 24/24] Added code coverage back to pre-merge tests --- tox.ini | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tox.ini b/tox.ini index e6e765b2ee..74aee167b3 100644 --- a/tox.ini +++ b/tox.ini @@ -72,12 +72,11 @@ deps = -r{toxinidir}/requirements/base.txt -r{toxinidir}/requirements/openvino.txt commands = - python -m pytest tests/pre_merge/datasets/test_dataset.py - ; coverage erase - ; coverage run --include=anomalib/* -m pytest tests/pre_merge/ -ra --showlocals - ; ; https://github.com/openvinotoolkit/anomalib/issues/94 - ; coverage report -m --fail-under=85 - ; coverage xml -o {toxworkdir}/coverage.xml + coverage erase + coverage run --include=anomalib/* -m pytest tests/pre_merge/ -ra --showlocals + ; https://github.com/openvinotoolkit/anomalib/issues/94 + coverage report -m --fail-under=85 + coverage xml -o {toxworkdir}/coverage.xml [testenv:nightly] basepython = python3
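
For reference, a minimal end-to-end sketch of the folder datamodule introduced in this series, mirroring the doctest examples in `anomalib/data/folder.py`. The paths below are illustrative placeholders (an MVTec-style "bottle" layout is assumed); any root with separate normal/abnormal sub-folders, and optionally a mask directory with matching filenames, should work:

```python
# Sketch only: follows the FolderDataModule API as of PATCH 19; the
# dataset paths are placeholders, not part of the repository.
from anomalib.data import FolderDataModule

datamodule = FolderDataModule(
    root="./datasets/bottle/test",
    normal="good",            # sub-folder of `root` with normal images
    abnormal="broken_large",  # sub-folder of `root` with abnormal images
    task="segmentation",      # must be segmentation when mask_dir is given
    mask_dir="./datasets/bottle/ground_truth/broken_large",
    split_ratio=0.2,          # share of normal images moved to the test split
    image_size=(256, 256),
    train_batch_size=32,
    test_batch_size=32,
    num_workers=8,
    create_validation_set=True,
)
datamodule.setup()

train_batch = next(iter(datamodule.train_dataloader()))
print(train_batch["image"].shape)  # e.g. torch.Size([16, 3, 256, 256])

test_batch = next(iter(datamodule.test_dataloader()))
print(sorted(test_batch.keys()))
# ['image', 'image_path', 'label', 'mask', 'mask_path']
```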