Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/cv save local paths #704

Merged
merged 37 commits into from
Jul 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
26a0152
add mock CV dataset
bogdan-galileo Jul 6, 2023
7115748
lint mock data
bogdan-galileo Jul 6, 2023
1a7859b
make main changes
bogdan-galileo Jul 6, 2023
4179f37
add tests
bogdan-galileo Jul 6, 2023
3a77099
update older tests
bogdan-galileo Jul 6, 2023
f528157
Merge branch 'main' into feat/cv-save-local-paths
bogdan-galileo Jul 6, 2023
30746cb
lint 1 line....
bogdan-galileo Jul 6, 2023
2555106
restrict PIL version
bogdan-galileo Jul 6, 2023
d2a7acd
bring PIL back and wait for Franz' PR
bogdan-galileo Jul 6, 2023
9569e22
fix cv2
bogdan-galileo Jul 6, 2023
f94048b
temp downgrade TF
bogdan-galileo Jul 6, 2023
293f3bb
fix tests
bogdan-galileo Jul 7, 2023
07b05b4
fix if df contains col named text
bogdan-galileo Jul 7, 2023
5fa2058
fix older test
bogdan-galileo Jul 7, 2023
f8f3eea
update docstring
bogdan-galileo Jul 7, 2023
c80d87c
Merge branch 'main' into feat/cv-save-local-paths
bogdan-galileo Jul 7, 2023
e4479c2
Merge branch 'main' into feat/cv-save-local-paths
bogdan-galileo Jul 7, 2023
196e5db
rename non_meta to extra_cols
bogdan-galileo Jul 7, 2023
c293321
update remote paths colname
bogdan-galileo Jul 7, 2023
655d786
format
bogdan-galileo Jul 7, 2023
2bc04da
Merge branch 'main' into feat/cv-save-local-paths
bogdan-galileo Jul 7, 2023
b545e49
use var for local and remote colnames
bogdan-galileo Jul 7, 2023
1f02696
remove TODO
bogdan-galileo Jul 7, 2023
3799f0d
improve comments
bogdan-galileo Jul 10, 2023
2d04ad9
Merge branch 'main' into feat/cv-save-local-paths
bogdan-galileo Jul 10, 2023
dc1ceac
remove class attribute FO U ELLIOT
bogdan-galileo Jul 11, 2023
a641052
lol
bogdan-galileo Jul 11, 2023
7267ca2
fix old test
bogdan-galileo Jul 11, 2023
d56399e
fix ooolder test
bogdan-galileo Jul 11, 2023
3ca0ff8
utilize column_map
bogdan-galileo Jul 11, 2023
27d8692
some changes with bogdan :)
elboy3 Jul 12, 2023
d5332b8
Fix linting
elboy3 Jul 19, 2023
02129b2
rename remote col to text
bogdan-galileo Jul 19, 2023
4f21b6d
Merge branch 'main' into feat/cv-save-local-paths
bogdan-galileo Jul 19, 2023
8061f78
Merge branch 'main' into feat/cv-save-local-paths
bogdan-galileo Jul 20, 2023
b668bc5
bump version
bogdan-galileo Jul 21, 2023
9b8a253
Merge branch 'main' into feat/cv-save-local-paths
bogdan-galileo Jul 25, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dataquality/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"""


__version__ = "0.9.8"
__version__ = "0.10.0"

import sys
from typing import Any, List, Optional
Expand Down
32 changes: 16 additions & 16 deletions dataquality/core/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,30 +123,28 @@ def log_data_sample(*, text: str, id: int, **kwargs: Any) -> None:
def log_image_dataset(
dataset: DataSet,
*,
imgs_colname: Optional[str] = None,
imgs_location_colname: Optional[str] = None,
imgs_remote_location: Optional[str] = None,
imgs_local_colname: Optional[str] = None,
imgs_remote: Optional[str] = None,
batch_size: int = ITER_CHUNK_SIZE,
id: str = "id",
label: Union[str, int] = "label",
label: str = "label",
split: Optional[Split] = None,
inference_name: Optional[str] = None,
meta: Union[List[str], List[int], None] = None,
meta: Optional[List[str]] = None,
parallel: bool = False,
**kwargs: Any,
) -> None:
"""
Log an image dataset of input samples for image classification

:param dataset: The dataset to log. This can be a Pandas/Vaex dataframe or an
:param dataset: The dataset to log. This can be a Pandas/HF dataframe or an
ImageFolder (from Torchvision).
:param imgs_colname: If the images are passed as bytes in the dataframe, this
indicates the name of the column containing the images
:param imgs_location_colname: If the images are passed via their path in the
dataframe, this indicates the name of the column containing the paths.
These paths could be remote (skip upload) or local (upload)
:param imgs_remote_location: If the dataset is of type ImageFolder and the
images are stored remotely, pass the folder name here to avoid upload
:param imgs_local_colname: The name of the column containing the local images
(typically paths but could also be bytes for HF dataframes). Ignored for
ImageFolder where local paths are directly retrieved from the dataset.
:param imgs_remote: The name of the column containing paths to the remote images (in
the case of a df) or remote directory containing the images (in the case of
ImageFolder). Specifying this argument is required to skip uploading the images.
:param batch_size: Number of samples to log in a batch. Defaults to ITER_CHUNK_SIZE
:param id: The name of the column containing the ids (in the dataframe)
:param label: The name of the column containing the labels (in the dataframe)
Expand All @@ -165,11 +163,13 @@ def log_image_dataset(
"This method is only supported for image tasks. "
"Please use dq.log_samples for text tasks."
)

# TODO: raise warning if imgs_local_colname is None (and we provide no smart features)

data_logger.log_image_dataset(
dataset=dataset,
imgs_colname=imgs_colname,
imgs_location_colname=imgs_location_colname,
imgs_remote_location=imgs_remote_location,
imgs_local_colname=imgs_local_colname,
imgs_remote=imgs_remote,
batch_size=batch_size,
id=id,
label=label,
Expand Down
3 changes: 2 additions & 1 deletion dataquality/integrations/fastai.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from dataquality.analytics import Analytics
from dataquality.clients.api import ApiClient
from dataquality.exceptions import GalileoException
from dataquality.loggers.data_logger.image_classification import GAL_LOCAL_IMAGES_PATHS
from dataquality.loggers.logger_config.base_logger_config import BaseLoggerConfig
from dataquality.schemas.split import Split
from dataquality.utils.helpers import galileo_disabled
Expand Down Expand Up @@ -396,7 +397,7 @@ def convert_img_dl_to_df(dl: DataLoader, x_col: str = "image") -> pd.DataFrame:
a.log_function("fastai/convert_img_dl_to_df")
additional_data = {}
if x_col == "image":
additional_data["text"] = dl.items
additional_data[GAL_LOCAL_IMAGES_PATHS] = dl.items
x, y = [], []
for x_item, y_item in dl.dataset:
x.append(x_item)
Expand Down
23 changes: 22 additions & 1 deletion dataquality/loggers/data_logger/base_data_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import numpy as np
import pandas as pd
import vaex
from datasets.arrow_dataset import Dataset as HFDataset
from huggingface_hub.utils import HfHubHTTPError
from vaex.dataframe import DataFrame

Expand Down Expand Up @@ -40,7 +41,7 @@
)

DATA_FOLDERS = ["emb", "prob", "data"]
DataSet = TypeVar("DataSet", bound=Union[Iterable, pd.DataFrame, DataFrame])
DataSet = TypeVar("DataSet", bound=Union[Iterable, pd.DataFrame, HFDataset, DataFrame])
MetasType = TypeVar("MetasType", bound=Dict[str, List[Union[str, float, int]]])
MetaType = TypeVar("MetaType", bound=Dict[str, Union[str, float, int]])
ITER_CHUNK_SIZE = 100_000
Expand Down Expand Up @@ -219,6 +220,26 @@ def export_df(self, df: vaex.DataFrame) -> None:
def support_data_embs(self) -> bool:
return True

def apply_column_map(self, dataset: DataSet, column_map: Dict[str, str]) -> DataSet:
    """Rename columns in the dataset according to the column_map

    This function works for both pandas and HF datasets. Any other dataset
    type is returned unchanged.

    Entries that map a column to itself, or whose source column is not
    present in the dataset, are ignored. All remaining renames are applied
    in a single simultaneous step for both dataset types, so the result
    never depends on the order of the entries in the map.

    :param dataset: The dataset whose columns should be renamed
    :param column_map: Mapping of current column name -> new column name
    :return: The dataset with the renamed columns
    """
    # Remove any columns that are mapped to themselves
    column_map = {k: v for k, v in column_map.items() if k != v}

    if isinstance(dataset, pd.DataFrame):
        # pandas ignores keys that are not existing columns
        dataset = dataset.rename(columns=column_map)
    elif self.is_hf_dataset(dataset):
        import datasets

        # Narrow the type for the type checker; is_hf_dataset already matched
        assert isinstance(dataset, datasets.Dataset)
        # HF breaks if a col doesn't exist, so keep only applicable entries
        existing_cols = set(dataset.column_names)
        applicable = {
            old_col: new_col
            for old_col, new_col in column_map.items()
            if old_col in existing_cols
        }
        if applicable:
            # One simultaneous rename mirrors the pandas semantics; chaining
            # rename_column calls is order-dependent and fails on maps such
            # as {"a": "b", "b": "c"} where an intermediate name collides
            dataset = dataset.rename_columns(applicable)

    return dataset

def upload(
self, last_epoch: Optional[int] = None, create_data_embs: bool = False
) -> None:
Expand Down
Loading