Imagenet21 (#34)

* include root as argument * update * add docstrings. Cache download * update smoke test * update test * add comment * update workflow
visual-layer · Jun 13, 2023 · 945974e · 945974e
1 parent 61d88d5
commit 945974e
Show file tree

Hide file tree

Showing 11 changed files with 494 additions and 279 deletions.
diff --git a/.github/workflows/smoke_test.yml b/.github/workflows/smoke_test.yml
@@ -5,7 +5,7 @@ on:
     branches: ["main"]
   workflow_dispatch:
   pull_request:
-    types: [opened, synchronize, ready_for_review]
+    types: [opened, synchronize]
 
 jobs:
     test_imports:
@@ -14,7 +14,8 @@ jobs:
         SENTRY_OPT_OUT: True
       strategy:
         matrix:
-            os: [ubuntu-latest, macos-latest, windows-latest]
+            os: [ubuntu-latest]
+            python: ['3.9']
       steps:
         - name: Checkout code
           uses: actions/checkout@v3
@@ -24,7 +25,7 @@ jobs:
         - name: Set up Python
           uses: actions/setup-python@v3
           with:
-            python-version: '3.9'
+            python-version: ${{ matrix.python }}
 
         - name: Install dependencies
           run: |
@@ -37,9 +38,32 @@ jobs:
             echo $SENTRY_OPT_OUT
             python .github/workflows/test_imports.py
         
+    test_pets_example:
+      runs-on: ${{ matrix.os }}
+      env:
+        SENTRY_OPT_OUT: True
+      strategy:
+        matrix:
+            os: [ubuntu-latest]
+            python: ['3.7', '3.11']
+      steps:
+        - name: Checkout code
+          uses: actions/checkout@v3
+          with:
+           fetch-depth: 0
+
+        - name: Set up Python
+          uses: actions/setup-python@v3
+          with:
+            python-version: ${{ matrix.python }}
+
+        - name: Install dependencies
+          run: |
+            python -m pip install --upgrade pip
+            pip install -e .
+        
         - name: Test pets
           run: |
             echo "Sentry opt out: "
             echo $SENTRY_OPT_OUT
-            python .github/workflows/test_pets.py
-      
+            python .github/workflows/test_pets.py
diff --git a/.github/workflows/test_imports.py b/.github/workflows/test_imports.py
@@ -1,5 +1,5 @@
 import visuallayer
-from visuallayer.datasets.zoo import VLFood101, VLOxfordIIITPet, VLImageNet1k
+from visuallayer.datasets.zoo import VLFood101, VLOxfordIIITPet, VLImageNet1k, VLImageNet21k
 
 print(f'Running on version: {visuallayer.__version__}')
 print("Successfully imported all modules")
diff --git a/.github/workflows/test_pets.py b/.github/workflows/test_pets.py
@@ -18,6 +18,10 @@
 train_dataset = my_pets.export(output_format="pytorch", split="train")
 test_dataset = my_pets.export(output_format="pytorch", split="test")
 
+print("Exporting to csv")
+train_dataset = my_pets.export(output_format="csv", split="train")
+test_dataset = my_pets.export(output_format="csv", split="test")
+
 print("Exporting issues")
 my_pets.export_issues("./issues-pets.csv")
 

diff --git a/notebooks/dataset-test.ipynb b/notebooks/dataset-test.ipynb
diff --git a/notebooks/train-fastai.ipynb b/notebooks/train-fastai.ipynb
diff --git a/visuallayer/datasets/dataset.py b/visuallayer/datasets/dataset.py
@@ -1,16 +1,61 @@
 import pandas as pd
 from typing import Union, List, Tuple
+from dataclasses import dataclass
+from functools import lru_cache
 
 
+@dataclass(frozen=True)
 class Dataset:
+    root: str
+    name: str
+    homepage_url: str 
+    license: str 
+    description: str 
+    num_images: int 
+    filelist_csv_url: str 
+    issue_count_csv_url: str
+
+    @staticmethod
+    @lru_cache(maxsize=None)
+    def _read_csv_cached(url: str) -> pd.DataFrame:
+        """
+        Downloads the CSV file at `url` and memoises the parsed DataFrame, so repeated calls with the same
+        URL do not cause additional network requests.
+
+        The returned object is the cache entry itself and is shared by every caller, so it must never be
+        mutated in place. Use `_get_csv` to obtain a frame that is safe to modify.
+
+        Args:
+            url (str): The URL of the CSV file.
+
+        Returns:
+            pd.DataFrame: The shared, cached DataFrame for `url`.
+        """
+        return pd.read_csv(url)
+
+    @staticmethod
+    def _get_csv(url: str) -> pd.DataFrame:
+        """
+        Returns the CSV file at a given URL as a pandas DataFrame. The download is cached per URL, so
+        calling the method multiple times with the same URL will not cause additional network requests.
+
+        Each call returns an independent copy of the cached data, so callers may add or modify columns
+        without affecting what later callers receive.
+
+        Args:
+            url (str): The URL of the CSV file.
+
+        Returns:
+            pd.DataFrame: A fresh copy of the data from the CSV file.
+        """
+        # Hand out a copy: `explore` assigns new columns onto the frame it receives, and doing that on
+        # the cached object would leak those columns into every later caller.
+        return Dataset._read_csv_cached(url).copy()
+
     @property
     def num_images_with_issues(self) -> int:
-        df = pd.read_csv(self.filelist_csv_url)
+        """
+        Computes the number of images with issues in the dataset. The number is calculated as the number of unique 
+        entries in the "filename" column of the DataFrame obtained from the `filelist_csv_url`.
+
+        Returns:
+            int: The number of images with issues.
+        """
+        df = self._get_csv(self.filelist_csv_url)
         return len(df["filename"].unique())
 
     @property
     def info(self) -> None:
-        # Get all attributes and methods of the class
+        """
+        Prints the metadata information for the dataset.
+
+        The following information is printed:
+        - Name
+        - Description
+        - License
+        - Homepage URL
+        - Number of Images
+        - Number of Images with Issues
+        """
+
         dataset_metadata: List[Tuple[str, Union[str, int]]] = [
             ("Name", self.name),
             ("Description", self.description),
@@ -26,7 +71,14 @@ def info(self) -> None:
 
     @property
     def report(self) -> pd.DataFrame:
-        df = pd.read_csv(self.issue_count_csv_url)
+        """
+        Creates a summary report for the dataset. The report is a DataFrame that contains the reason, count, and 
+        percentage of issues for each type of issue in the dataset.
+
+        Returns:
+            pd.DataFrame: The report DataFrame.
+        """
+        df = self._get_csv(self.issue_count_csv_url)
         df = df.loc[df["split"] == "all"].drop("split", axis=1).reset_index(drop=True)
 
         # Calculate the total sum per column
@@ -40,6 +92,15 @@ def report(self) -> pd.DataFrame:
         return df
 
     def explore(self) -> pd.DataFrame:
+        """
+        Creates a DataFrame that can be used to visually explore the dataset. This DataFrame contains several columns 
+        related to the images and their issues, including previews of the images.
+
+        Returns:
+            pd.DataFrame: The exploration DataFrame.
+        """
+
+        # Interactive table in jupyter notebook
         import base64
         from itables import init_notebook_mode
         init_notebook_mode(all_interactive=True)
@@ -54,7 +115,7 @@ def to_img_tag(path):
                 return path  # Return the original value if it's not a string
 
 
-        df = pd.read_csv(self.filelist_csv_url)
+        df = self._get_csv(self.filelist_csv_url)
         df["filename_preview"] = df["filename"]
         df["prototype_preview"] = df["prototype"]
         df = df.loc[
@@ -74,6 +135,12 @@ def to_img_tag(path):
         return df
 
     def export_issues(self, filename: str) -> None:
-        df = pd.read_csv(self.issue_count_csv_url)
+        """
+        Exports the issue count data to a CSV file.
+
+        Args:
+            filename (str): The path where the CSV file will be saved.
+        """
+        df = self._get_csv(self.issue_count_csv_url)
         df.to_csv(filename, index=False)
 
diff --git a/visuallayer/datasets/zoo/__init__.py b/visuallayer/datasets/zoo/__init__.py
@@ -1,4 +1,4 @@
 from .vl_oxford_iiit import VLOxfordIIITPet
 from .vl_food101 import VLFood101
-from .vl_imagenet import VLImageNet1k
+from .vl_imagenet import VLImageNet1k, VLImageNet21k
 from .utils import load, list_datasets
diff --git a/visuallayer/datasets/zoo/utils.py b/visuallayer/datasets/zoo/utils.py
@@ -1,14 +1,16 @@
 from .vl_oxford_iiit import VLOxfordIIITPet, VLOriginalOxfordIIITPet
 from .vl_food101 import VLFood101, VLOriginalFood101
-from .vl_imagenet import VLImageNet1k, VLOriginalImageNet1k
+from .vl_imagenet import VLImageNet1k, VLOriginalImageNet1k, VLImageNet21k, VLOriginalImageNet21k
 
 dataset = {
     "vl-oxford-iiit-pets": VLOxfordIIITPet,
     "oxford-iiit-pets": VLOriginalOxfordIIITPet,
     "vl-food101": VLFood101,
     "food101": VLOriginalFood101,
     "vl-imagenet-1k": VLImageNet1k,
-    "imagenet-1k": VLOriginalImageNet1k
+    "imagenet-1k": VLOriginalImageNet1k,
+    "vl-imagenet-21k": VLImageNet21k,
+    "imagenet-21k": VLOriginalImageNet21k
 }
 
 
@@ -18,7 +20,7 @@ def load(dataset_name: str):
 
 def list_datasets():
+    """
+    Prints a short notice and returns the available dataset names in sorted order.
+
+    The names come from `_get_dataset_names()` (presumably the keys of the `dataset` registry above;
+    confirm against that helper).
+
+    Returns:
+        list: The dataset names, sorted ascending.
+    """
     names = _get_dataset_names()
-    print("Listing all datasets in zoo.")
+    print("Listing all datasets in the zoo.")
     return list(sorted(names))
 
 def _get_dataset_names():

diff --git a/visuallayer/datasets/zoo/vl_food101.py b/visuallayer/datasets/zoo/vl_food101.py
@@ -7,6 +7,7 @@
 
 @dataclass(frozen=True)
 class VLFood101(Dataset):
+    root: str = './'
     name: str = "vl-food101"
     homepage_url: str = "https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/"
     license: str = "Unknown"
@@ -16,43 +17,42 @@ class VLFood101(Dataset):
     issue_count_csv_url: str = "https://sharedvisuallayer.s3.us-east-2.amazonaws.com/visual-layer-sdk/food101_images_issue_count.csv"
     exclude_csv: str = None
 
-    # Hack: Download the dataset in the current dir
+    # Hack: Download the dataset in root dir
     def __post_init__(self):
-        Food101(root="./", download=True)
+        Food101(root=self.root, download=True)
 
     def export(
         self,
         output_format: str,
         variation: str = "vl",
-        root: str = "./",
         split: str = "train",
     ):
         if output_format == "pytorch":
             if variation == "vl":
                 print(
                     f"Exporting {variation.upper()} dataset into {output_format} dataset."
                 )
-                return CleanTorchvisionFood101(root=root, split=split, exclude_csv=self.exclude_csv)
+                return CleanTorchvisionFood101(root=self.root, split=split, exclude_csv=self.exclude_csv)
             elif variation == "original":
                 print(
                     f"Exporting {variation.upper()} dataset into {output_format} dataset."
                 )
-                return Food101(root=root, split=split, download=True)
+                return Food101(root=self.root, split=split, download=True)
 
         elif output_format == "csv":
             if variation == "vl":
                 print(
                     f"Exporting {variation.upper()} dataset into {output_format} dataset."
                 )
-                dataset = CleanTorchvisionFood101(root=root, split=split, exclude_csv=self.exclude_csv)
+                dataset = CleanTorchvisionFood101(root=self.root, split=split, exclude_csv=self.exclude_csv)
                 samples = {"Image": dataset._images, "Label": dataset._labels}
                 df = pd.DataFrame(samples)
                 return df
             elif variation == "original":
                 print(
                     f"Exporting {variation.upper()} dataset into {output_format} dataset."
                 )
-                dataset = Food101(root=root, split=split, download=True)
+                dataset = Food101(root=self.root, split=split, download=True)
                 samples = {"Image": dataset._images, "Label": dataset._labels}
                 df = pd.DataFrame(samples)
                 return df

diff --git a/visuallayer/datasets/zoo/vl_imagenet.py b/visuallayer/datasets/zoo/vl_imagenet.py
@@ -6,7 +6,7 @@
 
 @dataclass(frozen=True)
 class VLImageNet1k(Dataset):
-    root:str
+    root: str = "./"
     name: str = "vl-imagenet-1k"
     homepage_url: str = "https://www.image-net.org/"
     license: str = "Unknown"
@@ -56,12 +56,29 @@ def export(
                 f"Unknown output format: {output_format} or variation {variation}."
             )
 
-    # TODO
+    # TODO - fix me. Does not work because directory from csv file is different from what is expected locally in ImageFolder form. Also check self.root
     def explore(self):
         raise NotImplementedError
 
 
 @dataclass(frozen=True)
 class VLOriginalImageNet1k(VLImageNet1k):
     name: str = "imagenet-1k"
-    description: str = "The original imagenet-1k dataset."
+    description: str = "The original imagenet-1k dataset."
+
+
+@dataclass(frozen=True)
+class VLImageNet21k(VLImageNet1k):
+    """
+    Zoo entry for the cleaned ImageNet-21k dataset.
+
+    Subclasses `VLImageNet1k` and overrides its metadata defaults (name, homepage, description, image
+    count and the two issue CSV URLs). `root` and `license` restate the parent's defaults.
+    """
+    # NOTE(review): all behaviour, including `export`, is inherited from VLImageNet1k; confirm it
+    # handles the ImageNet-21k directory layout.
+    root: str = "./"
+    name: str = "vl-imagenet-21k"
+    homepage_url: str = "https://github.com/Alibaba-MIIL/ImageNet21K"
+    license: str = "Unknown"
+    description: str = "A modified version of the original ImageNet-21k dataset removing dataset issues."
+    num_images: int = 13153500
+    filelist_csv_url: str = "https://sharedvisuallayer.s3.us-east-2.amazonaws.com/visual-layer-sdk/ImageNet-21K_images_issue_file_list.csv"
+    issue_count_csv_url: str = "https://sharedvisuallayer.s3.us-east-2.amazonaws.com/visual-layer-sdk/ImageNet-21K_images_issue_count.csv"
+
+@dataclass(frozen=True)
+class VLOriginalImageNet21k(VLImageNet21k):
+    """
+    Zoo entry for the original ImageNet-21k dataset.
+
+    Overrides only `name` and `description`; every other field and method is inherited from
+    `VLImageNet21k`.
+    """
+    # NOTE(review): the issue CSV URLs are inherited from the "vl" variant; confirm that is intended
+    # for the original dataset entry.
+    name: str = "imagenet-21k"
+    description: str = "The original imagenet-21k dataset."
diff --git a/visuallayer/datasets/zoo/vl_oxford_iiit.py b/visuallayer/datasets/zoo/vl_oxford_iiit.py
@@ -7,6 +7,7 @@
 
 @dataclass(frozen=True)
 class VLOxfordIIITPet(Dataset):
+    root: str = './'
     name: str = "vl-oxford-iiit-pets"
     homepage_url: str = "https://www.robots.ox.ac.uk/~vgg/data/pets/"
     license: str = "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)"
@@ -18,41 +19,40 @@ class VLOxfordIIITPet(Dataset):
 
-    # Hack: Download the dataset in the current dir
+    # Hack: Download the dataset in root dir (self.root) as soon as the instance is created
     def __post_init__(self):
-        OxfordIIITPet(root="./", download=True)
+        OxfordIIITPet(root=self.root, download=True)
 
     def export(
         self,
         output_format: str,
         variation: str = "vl",
-        root: str = "./",
         split: str = "train",
     ):
         if output_format == "pytorch":
             if variation == "vl":
                 print(
                     f"Exporting {variation.upper()} dataset into {output_format} dataset."
                 )
-                return CleanTorchvisionOxfordIIITPet(root=root, split=split, exclude_csv=self.exclude_csv)
+                return CleanTorchvisionOxfordIIITPet(root=self.root, split=split, exclude_csv=self.exclude_csv)
             elif variation == "original":
                 print(
                     f"Exporting {variation.upper()} dataset into {output_format} dataset."
                 )
-                return OxfordIIITPet(root=root, split=split, download=True)
+                return OxfordIIITPet(root=self.root, split=split, download=True)
 
         elif output_format == "csv":
             if variation == "vl":
                 print(
                     f"Exporting {variation.upper()} dataset into {output_format} dataset."
                 )
-                dataset = CleanTorchvisionOxfordIIITPet(root=root, split=split, exclude_csv=self.exclude_csv)
+                dataset = CleanTorchvisionOxfordIIITPet(root=self.root, split=split, exclude_csv=self.exclude_csv)
                 samples = {"Image": dataset._images, "Label": dataset._labels}
                 df = pd.DataFrame(samples)
                 return df
             elif variation == "original":
                 print(
                     f"Exporting {variation.upper()} dataset into {output_format} dataset."
                 )
-                dataset = OxfordIIITPet(root=root, split=split, download=True)
+                dataset = OxfordIIITPet(root=self.root, split=split, download=True)
                 samples = {"Image": dataset._images, "Label": dataset._labels}
                 df = pd.DataFrame(samples)
                 return df