Skip to content

Commit

Permalink
Imagenet21 (#34)
Browse files Browse the repository at this point in the history
* include root as argument

* update

* add doc strings. Cache download

* update smoke test

* update test

* add comment

* update workflow
  • Loading branch information
dnth authored Jun 13, 2023
1 parent 61d88d5 commit 945974e
Show file tree
Hide file tree
Showing 11 changed files with 494 additions and 279 deletions.
34 changes: 29 additions & 5 deletions .github/workflows/smoke_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ on:
branches: ["main"]
workflow_dispatch:
pull_request:
types: [opened, synchronize, ready_for_review]
types: [opened, synchronize]

jobs:
test_imports:
Expand All @@ -14,7 +14,8 @@ jobs:
SENTRY_OPT_OUT: True
strategy:
matrix:
os: [ubuntu-latest, macos-latest, windows-latest]
os: [ubuntu-latest]
python: ['3.9']
steps:
- name: Checkout code
uses: actions/checkout@v3
Expand All @@ -24,7 +25,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: '3.9'
python-version: ${{ matrix.python }}

- name: Install dependencies
run: |
Expand All @@ -37,9 +38,32 @@ jobs:
echo $SENTRY_OPT_OUT
python .github/workflows/test_imports.py
test_pets_example:
runs-on: ${{ matrix.os }}
env:
SENTRY_OPT_OUT: True
strategy:
matrix:
os: [ubuntu-latest]
python: ['3.7', '3.11']
steps:
- name: Checkout code
uses: actions/checkout@v3
with:
fetch-depth: 0

- name: Set up Python
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python }}

- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -e .
- name: Test pets
run: |
echo "Sentry opt out: "
echo $SENTRY_OPT_OUT
python .github/workflows/test_pets.py
python .github/workflows/test_pets.py
2 changes: 1 addition & 1 deletion .github/workflows/test_imports.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import visuallayer
from visuallayer.datasets.zoo import VLFood101, VLOxfordIIITPet, VLImageNet1k
from visuallayer.datasets.zoo import VLFood101, VLOxfordIIITPet, VLImageNet1k, VLImageNet21k

print(f'Running on version: {visuallayer.__version__}')
print("Successfully imported all modules")
4 changes: 4 additions & 0 deletions .github/workflows/test_pets.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@
train_dataset = my_pets.export(output_format="pytorch", split="train")
test_dataset = my_pets.export(output_format="pytorch", split="test")

print("Exporting to csv")
train_dataset = my_pets.export(output_format="csv", split="train")
test_dataset = my_pets.export(output_format="csv", split="test")

print("Exporting issues")
my_pets.export_issues("./issues-pets.csv")

Expand Down
495 changes: 298 additions & 197 deletions notebooks/dataset-test.ipynb

Large diffs are not rendered by default.

102 changes: 51 additions & 51 deletions notebooks/train-fastai.ipynb

Large diffs are not rendered by default.

77 changes: 72 additions & 5 deletions visuallayer/datasets/dataset.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,61 @@
import pandas as pd
from typing import Union, List, Tuple
from dataclasses import dataclass
from functools import lru_cache


@dataclass(frozen=True)
class Dataset:
root: str
name: str
homepage_url: str
license: str
description: str
num_images: int
filelist_csv_url: str
issue_count_csv_url: str

@staticmethod
@lru_cache(maxsize=None)
def _get_csv(url: str) -> pd.DataFrame:
"""
Downloads a CSV file from a given URL and returns it as a pandas DataFrame. The results are cached per URL
(the cache is unbounded, `maxsize=None`), so calling the method multiple times with the same URL will not
cause additional network requests.
Note: because of the cache, every call with the same URL returns the very same DataFrame object. Callers
that add or modify columns should work on a copy (`df.copy()`); otherwise the change is visible to every
later caller that requests the same URL.
Args:
url (str): The URL of the CSV file.
Returns:
pd.DataFrame: The data from the CSV file as a pandas DataFrame (the shared, cached instance).
"""
return pd.read_csv(url)

@property
def num_images_with_issues(self) -> int:
df = pd.read_csv(self.filelist_csv_url)
"""
Computes the number of images with issues in the dataset. The number is calculated as the number of unique
entries in the "filename" column of the DataFrame obtained from the `filelist_csv_url`.
Returns:
int: The number of images with issues.
"""
df = self._get_csv(self.filelist_csv_url)
return len(df["filename"].unique())

@property
def info(self) -> None:
# Get all attributes and methods of the class
"""
Prints the metadata information for the dataset.
The following information is printed:
- Name
- Description
- License
- Homepage URL
- Number of Images
- Number of Images with Issues
"""

dataset_metadata: List[Tuple[str, Union[str, int]]] = [
("Name", self.name),
("Description", self.description),
Expand All @@ -26,7 +71,14 @@ def info(self) -> None:

@property
def report(self) -> pd.DataFrame:
df = pd.read_csv(self.issue_count_csv_url)
"""
Creates a summary report for the dataset. The report is a DataFrame that contains the reason, count, and
percentage of issues for each type of issue in the dataset.
Returns:
pd.DataFrame: The report DataFrame.
"""
df = self._get_csv(self.issue_count_csv_url)
df = df.loc[df["split"] == "all"].drop("split", axis=1).reset_index(drop=True)

# Calculate the total sum per column
Expand All @@ -40,6 +92,15 @@ def report(self) -> pd.DataFrame:
return df

def explore(self) -> pd.DataFrame:
"""
Creates a DataFrame that can be used to visually explore the dataset. This DataFrame contains several columns
related to the images and their issues, including previews of the images.
Returns:
pd.DataFrame: The exploration DataFrame.
"""

# Interactive table in jupyter notebook
import base64
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)
Expand All @@ -54,7 +115,7 @@ def to_img_tag(path):
return path # Return the original value if it's not a string


df = pd.read_csv(self.filelist_csv_url)
df = self._get_csv(self.filelist_csv_url)
df["filename_preview"] = df["filename"]
df["prototype_preview"] = df["prototype"]
df = df.loc[
Expand All @@ -74,6 +135,12 @@ def to_img_tag(path):
return df

def export_issues(self, filename: str) -> None:
df = pd.read_csv(self.issue_count_csv_url)
"""
Exports the issue count data to a CSV file.
Args:
filename (str): The path where the CSV file will be saved.
"""
df = self._get_csv(self.issue_count_csv_url)
df.to_csv(filename, index=False)

2 changes: 1 addition & 1 deletion visuallayer/datasets/zoo/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from .vl_oxford_iiit import VLOxfordIIITPet
from .vl_food101 import VLFood101
from .vl_imagenet import VLImageNet1k
from .vl_imagenet import VLImageNet1k, VLImageNet21k
from .utils import load, list_datasets
8 changes: 5 additions & 3 deletions visuallayer/datasets/zoo/utils.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
from .vl_oxford_iiit import VLOxfordIIITPet, VLOriginalOxfordIIITPet
from .vl_food101 import VLFood101, VLOriginalFood101
from .vl_imagenet import VLImageNet1k, VLOriginalImageNet1k
from .vl_imagenet import VLImageNet1k, VLOriginalImageNet1k, VLImageNet21k, VLOriginalImageNet21k

dataset = {
"vl-oxford-iiit-pets": VLOxfordIIITPet,
"oxford-iiit-pets": VLOriginalOxfordIIITPet,
"vl-food101": VLFood101,
"food101": VLOriginalFood101,
"vl-imagenet-1k": VLImageNet1k,
"imagenet-1k": VLOriginalImageNet1k
"imagenet-1k": VLOriginalImageNet1k,
"vl-imagenet-21k": VLImageNet21k,
"imagenet-21k": VLOriginalImageNet21k
}


Expand All @@ -18,7 +20,7 @@ def load(dataset_name: str):

def list_datasets():
names = _get_dataset_names()
print("Listing all datasets in zoo.")
print("Listing all datasets in the zoo.")
return list(sorted(names))

def _get_dataset_names():
Expand Down
14 changes: 7 additions & 7 deletions visuallayer/datasets/zoo/vl_food101.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

@dataclass(frozen=True)
class VLFood101(Dataset):
root: str = './'
name: str = "vl-food101"
homepage_url: str = "https://data.vision.ee.ethz.ch/cvl/datasets_extra/food-101/"
license: str = "Unknown"
Expand All @@ -16,43 +17,42 @@ class VLFood101(Dataset):
issue_count_csv_url: str = "https://sharedvisuallayer.s3.us-east-2.amazonaws.com/visual-layer-sdk/food101_images_issue_count.csv"
exclude_csv: str = None

# Hack: Download the dataset in the current dir
# Hack: Download the dataset in root dir
def __post_init__(self):
Food101(root="./", download=True)
Food101(root=self.root, download=True)

def export(
self,
output_format: str,
variation: str = "vl",
root: str = "./",
split: str = "train",
):
if output_format == "pytorch":
if variation == "vl":
print(
f"Exporting {variation.upper()} dataset into {output_format} dataset."
)
return CleanTorchvisionFood101(root=root, split=split, exclude_csv=self.exclude_csv)
return CleanTorchvisionFood101(root=self.root, split=split, exclude_csv=self.exclude_csv)
elif variation == "original":
print(
f"Exporting {variation.upper()} dataset into {output_format} dataset."
)
return Food101(root=root, split=split, download=True)
return Food101(root=self.root, split=split, download=True)

elif output_format == "csv":
if variation == "vl":
print(
f"Exporting {variation.upper()} dataset into {output_format} dataset."
)
dataset = CleanTorchvisionFood101(root=root, split=split, exclude_csv=self.exclude_csv)
dataset = CleanTorchvisionFood101(root=self.root, split=split, exclude_csv=self.exclude_csv)
samples = {"Image": dataset._images, "Label": dataset._labels}
df = pd.DataFrame(samples)
return df
elif variation == "original":
print(
f"Exporting {variation.upper()} dataset into {output_format} dataset."
)
dataset = Food101(root=root, split=split, download=True)
dataset = Food101(root=self.root, split=split, download=True)
samples = {"Image": dataset._images, "Label": dataset._labels}
df = pd.DataFrame(samples)
return df
Expand Down
23 changes: 20 additions & 3 deletions visuallayer/datasets/zoo/vl_imagenet.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

@dataclass(frozen=True)
class VLImageNet1k(Dataset):
root:str
root: str = "./"
name: str = "vl-imagenet-1k"
homepage_url: str = "https://www.image-net.org/"
license: str = "Unknown"
Expand Down Expand Up @@ -56,12 +56,29 @@ def export(
f"Unknown output format: {output_format} or variation {variation}."
)

# TODO
# TODO - fix me. Does not work because directory from csv file is different from what is expected locally in ImageFolder form. Also check self.root
def explore(self):
raise NotImplementedError


@dataclass(frozen=True)
class VLOriginalImageNet1k(VLImageNet1k):
name: str = "imagenet-1k"
description: str = "The original imagenet-1k dataset."
description: str = "The original imagenet-1k dataset."


@dataclass(frozen=True)
class VLImageNet21k(VLImageNet1k):
root: str = "./"
name: str = "vl-imagenet-21k"
homepage_url: str = "https://github.com/Alibaba-MIIL/ImageNet21K"
license: str = "Unknown"
description: str = "A modified version of the original ImageNet-21k dataset removing dataset issues."
num_images: int = 13153500
filelist_csv_url: str = "https://sharedvisuallayer.s3.us-east-2.amazonaws.com/visual-layer-sdk/ImageNet-21K_images_issue_file_list.csv"
issue_count_csv_url: str = "https://sharedvisuallayer.s3.us-east-2.amazonaws.com/visual-layer-sdk/ImageNet-21K_images_issue_count.csv"

@dataclass(frozen=True)
class VLOriginalImageNet21k(VLImageNet21k):
name: str = "imagenet-21k"
description: str = "The original imagenet-21k dataset."
12 changes: 6 additions & 6 deletions visuallayer/datasets/zoo/vl_oxford_iiit.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

@dataclass(frozen=True)
class VLOxfordIIITPet(Dataset):
root: str = './'
name: str = "vl-oxford-iiit-pets"
homepage_url: str = "https://www.robots.ox.ac.uk/~vgg/data/pets/"
license: str = "Creative Commons Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)"
Expand All @@ -18,41 +19,40 @@ class VLOxfordIIITPet(Dataset):

# Hack: Download the dataset in root dir
def __post_init__(self):
OxfordIIITPet(root="./", download=True)
OxfordIIITPet(root=self.root, download=True)

def export(
self,
output_format: str,
variation: str = "vl",
root: str = "./",
split: str = "train",
):
if output_format == "pytorch":
if variation == "vl":
print(
f"Exporting {variation.upper()} dataset into {output_format} dataset."
)
return CleanTorchvisionOxfordIIITPet(root=root, split=split, exclude_csv=self.exclude_csv)
return CleanTorchvisionOxfordIIITPet(root=self.root, split=split, exclude_csv=self.exclude_csv)
elif variation == "original":
print(
f"Exporting {variation.upper()} dataset into {output_format} dataset."
)
return OxfordIIITPet(root=root, split=split, download=True)
return OxfordIIITPet(root=self.root, split=split, download=True)

elif output_format == "csv":
if variation == "vl":
print(
f"Exporting {variation.upper()} dataset into {output_format} dataset."
)
dataset = CleanTorchvisionOxfordIIITPet(root=root, split=split, exclude_csv=self.exclude_csv)
dataset = CleanTorchvisionOxfordIIITPet(root=self.root, split=split, exclude_csv=self.exclude_csv)
samples = {"Image": dataset._images, "Label": dataset._labels}
df = pd.DataFrame(samples)
return df
elif variation == "original":
print(
f"Exporting {variation.upper()} dataset into {output_format} dataset."
)
dataset = OxfordIIITPet(root=root, split=split, download=True)
dataset = OxfordIIITPet(root=self.root, split=split, download=True)
samples = {"Image": dataset._images, "Label": dataset._labels}
df = pd.DataFrame(samples)
return df
Expand Down

0 comments on commit 945974e

Please sign in to comment.