Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add warnings in s2s for bad inputs #812

Merged
merged 17 commits into from
Dec 4, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions dataquality/core/finish.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@ def finish(
via dq.metrics.get_data_embeddings(). Default True if a GPU is
available, else default False.
:param data_embs_col: Optional text col on which to compute data embeddings.
If not set, we default to 'text' which corresponds to the input text
Can also be set to `target` or `generated_output`
If not set, we default to 'text' which corresponds to the input text.
Can also be set to `target`, `generated_output` or any other column that is
logged as metadata.
"""
a.log_function("dq/finish")
if create_data_embs is None:
Expand Down
42 changes: 25 additions & 17 deletions dataquality/integrations/seq2seq/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ def watch(
"""Seq2seq only. Log model generations for your run

Iterates over a given dataset and logs the generations for each sample.
`model` must be an instance of transformers PreTrainedModel and have a `generate`
method.
To generate outputs, a model that is an instance of transformers PreTrainedModel
must be given and it must have a `generate` method.

Unlike other watch functions, in this one we are just registering the model
and generation config and not attaching any hooks to the model. We call it 'watch'
Expand All @@ -135,25 +135,24 @@ def watch(
max_target_tokens=max_target_tokens,
)

if model:
bogdan-galileo marked this conversation as resolved.
Show resolved Hide resolved
assert isinstance(
model, PreTrainedModel
), "model must be an instance of transformers PreTrainedModel"
assert (
model.can_generate()
), "model must contain a `generate` method for seq2seq"

if model_type == Seq2SeqModelType.decoder_only and not response_template:
raise GalileoException(
"You must specify a `response_template` when using Decoder-Only models."
" This is necessary to internally isolate the target response tokens."
)

if model_type == Seq2SeqModelType.encoder_decoder and response_template:
if model_type == Seq2SeqModelType.decoder_only:
if response_template is None:
raise GalileoException(
"You must specify a `response_template` when using Decoder-Only models."
" This is necessary to internally isolate the target response tokens."
)
elif not isinstance(response_template, list) or not all(
isinstance(token, int) for token in response_template
):
raise GalileoException(
"The response template must already be tokenized and be a list of ints."
)
elif model_type == Seq2SeqModelType.encoder_decoder and response_template:
warn(
"The argument response_template is only used when working with "
"DecoderOnly models. This value will be ignored."
)

seq2seq_logger_config.response_template = response_template
seq2seq_logger_config.model = model
seq2seq_logger_config.generation_config = generation_config
Expand All @@ -170,4 +169,13 @@ def watch(

generation_splits_set.add(Split[split])

# A model of the correct type is required if we need to generate
if generation_splits:
assert isinstance(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Awesome!

model, PreTrainedModel
), "model must be an instance of transformers PreTrainedModel"
assert (
model.can_generate()
), "model must contain a `generate` method for seq2seq"

seq2seq_logger_config.generation_splits = generation_splits_set
13 changes: 6 additions & 7 deletions dataquality/schemas/seq2seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,22 +72,21 @@ class AlignedTokenData:
token_label_offsets: List[List[Tuple[int, int]]]
token_label_positions: List[List[Set[int]]]

def append(self, aligned_token_data: "AlignedTokenData") -> None:
def append(self, data: "AlignedTokenData") -> None:
"""Append offsets and positions for a *single* sample

Assumes that `sample_aligned_token_data` holds alignment info for
Assumes that `data` holds alignment info for
a *single* data sample. As such, when appending to `token_label_offsets`
and `token_label_positions` we remove the "batch" dimensions respectively.
e.g.
>> sample_aligned_token_data.token_label_offsets[0]
>> data.token_label_offsets[0]
"""
assert (
len(aligned_token_data.token_label_offsets) == 1
and len(aligned_token_data.token_label_positions) == 1
len(data.token_label_offsets) == 1 and len(data.token_label_positions) == 1
)

self.token_label_offsets.append(aligned_token_data.token_label_offsets[0])
self.token_label_positions.append(aligned_token_data.token_label_positions[0])
self.token_label_offsets.append(data.token_label_offsets[0])
self.token_label_positions.append(data.token_label_positions[0])


@dataclass
Expand Down
31 changes: 27 additions & 4 deletions dataquality/utils/vaex.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import warnings
from typing import Dict, List, Union

import numpy as np
Expand All @@ -9,7 +10,7 @@

from dataquality import config
from dataquality.clients.objectstore import ObjectStore
from dataquality.exceptions import GalileoException
from dataquality.exceptions import GalileoException, GalileoWarning
from dataquality.loggers.base_logger import BaseLoggerAttributes
from dataquality.schemas.split import Split
from dataquality.utils.cuda import (
Expand Down Expand Up @@ -185,12 +186,34 @@ def add_umap_pca_to_df(df: DataFrame, data_embs: bool = False) -> DataFrame:
def create_data_embs_df(df: DataFrame, text_col: str, lazy: bool = True) -> DataFrame:
"""Runs sentence transformer on raw text to get off the shelf data embeddings

text_col can be passed in as "input" or "target" for Seq2Seq tasks
text_col can be passed in as "input" or "target" for Seq2Seq tasks.
text_col can also be any given metadata text column.

:param df: The dataframe to get data embeddings for. Must have the text_col
:param text_col: The column to use for calculating data embeddings
:param lazy: If true, we lazily apply the model to encode the text
"""
# If the specified column doesn't exist, fall back on default columns
col_to_encode = text_col
if col_to_encode not in df.get_column_names():
col_to_encode = ""
for col in ["text", "input"]:
if col in df.get_column_names():
warnings.warn(
f"Column `{text_col}` not found, `{col}` will be used for data "
"embeddings",
GalileoWarning,
)
col_to_encode = col
break

# If user specified col and default cols don't exist, raise error
if not col_to_encode:
raise GalileoException(
f"The specified column create_data_embs={text_col} in dq.finish does not "
"exist in the dataframe. Re-run dq.finish with an existing column name."
)

# This import takes up to 25 seconds, so we don't want to eagerly import it
import transformers
from sentence_transformers import SentenceTransformer
Expand All @@ -208,15 +231,15 @@ def apply_sentence_transformer(text: pa.array) -> np.ndarray:
)

if lazy:
df_copy["emb"] = df_copy[text_col].apply_sentence_transformer()
df_copy["emb"] = df_copy[col_to_encode].apply_sentence_transformer()
df_copy = df_copy[["id", "emb"]]
else:
import torch

# Downcasts to float16 where possible, speeds up processing by 10 it/sec
with torch.autocast("cuda"):
df_copy["emb"] = data_model.encode(
df_copy[text_col].tolist(), show_progress_bar=True
df_copy[col_to_encode].tolist(), show_progress_bar=True
).astype(np.float32)

return df_copy
Expand Down
56 changes: 43 additions & 13 deletions tests/integrations/seq2seq/test_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from tokenizers.models import BPE
from transformers import PreTrainedTokenizerFast

from dataquality.exceptions import GalileoException
from dataquality.integrations.seq2seq.core import set_tokenizer, watch
from dataquality.schemas.seq2seq import Seq2SeqModelType
from tests.conftest import TestSessionVariables, tokenizer_T5, tokenizer_T5_not_auto
Expand Down Expand Up @@ -56,10 +57,9 @@ def test_set_tokenizer_other(
)
with pytest.raises(ValueError) as e:
set_tokenizer(tokenizer_T5_not_auto, "encoder_decoder")
assert str(e.value) == (
"The tokenizer must be an instance of PreTrainedTokenizerFast "
"or Tokenizer"
)
assert str(e.value) == (
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This assert is not tested in the with clause, it has to be done outside (we can put an assert False in the with clause and the test passes!)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So these tests were not passing before?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They were passing, it's just that this line was not taken into account. We can literally replace it with assert False and the test would still pass. To make it count we have to put it outside of the with clause

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Screenshot 2023-11-30 at 11 20 14 AM

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool did not know this!

"The tokenizer must be an instance of PreTrainedTokenizerFast " "or Tokenizer"
)


def test_watch_invalid_task_type(
Expand All @@ -71,22 +71,52 @@ def test_watch_invalid_task_type(
set_test_config(task_type="text_classification")
with pytest.raises(AssertionError) as e:
watch(tokenizer_T5, "encoder_decoder")
assert str(e.value) == (
"This method is only supported for seq2seq tasks. "
"Make sure to set the task type with dq.init()"
)
assert str(e.value) == (
"This method is only supported for seq2seq tasks. "
"Make sure to set the task type with dq.init()"
)


def test_watch_invalid_model_type(
set_test_config: Callable,
cleanup_after_use: Callable,
test_session_vars: TestSessionVariables,
) -> None:
"""Test that we can't watch without a tokenizer"""
"""Test that we can't watch without an appropriate model_type"""
set_test_config(task_type="seq2seq")
with pytest.raises(ValueError) as e:
watch(tokenizer_T5, "invalid_model_type")
assert str(e.value) == (
f"model_type must be one of {Seq2SeqModelType.members()}, "
"got invalid_model_type"
)
assert str(e.value) == (
f"model_type must be one of {Seq2SeqModelType.members()}, "
"got invalid_model_type"
)


def test_watch_response_template_required(
set_test_config: Callable,
cleanup_after_use: Callable,
test_session_vars: TestSessionVariables,
) -> None:
"""Test that we need response template to be passed for decoder-only"""
set_test_config(task_type="seq2seq")
with pytest.raises(GalileoException) as e:
watch(tokenizer_T5, "decoder_only")
assert str(e.value) == (
"You must specify a `response_template` when using Decoder-Only models."
" This is necessary to internally isolate the target response tokens."
)


def test_watch_response_template_not_tokenized(
set_test_config: Callable,
cleanup_after_use: Callable,
test_session_vars: TestSessionVariables,
) -> None:
"""Test that we need response template to be tokenized (for decoder-only)"""
set_test_config(task_type="seq2seq")
with pytest.raises(GalileoException) as e:
watch(tokenizer_T5, "decoder_only", response_template="###\nRESPONSE:\n")
assert (
str(e.value)
== "The response template must already be tokenized and be a list of ints."
)
95 changes: 93 additions & 2 deletions tests/loggers/test_seq2seq.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from transformers import GenerationConfig, T5ForConditionalGeneration

import dataquality as dq
from dataquality.exceptions import GalileoWarning
from dataquality.integrations.seq2seq.core import set_tokenizer, watch
from dataquality.loggers.data_logger.base_data_logger import DataSet
from dataquality.loggers.data_logger.seq2seq.seq2seq_base import Seq2SeqDataLogger
Expand All @@ -28,7 +29,7 @@
generate_sample_output,
)
from dataquality.utils.thread_pool import ThreadPoolManager
from dataquality.utils.vaex import GALILEO_DATA_EMBS_ENCODER
from dataquality.utils.vaex import GALILEO_DATA_EMBS_ENCODER, create_data_embs_df
from tests.conftest import (
LOCAL_MODEL_PATH,
TestSessionVariables,
Expand Down Expand Up @@ -340,7 +341,6 @@ def test_tokenize_input_provide_maxlength(
set_test_config: Callable,
cleanup_after_use: Generator,
) -> None:
# TODO comment!
"""
Test that as we generate output and the user provided the max_input_tokens argument,
the input is tokenized correctly to the length set by max_input_tokens.
Expand Down Expand Up @@ -558,3 +558,94 @@ def test_create_and_upload_data_embs(
assert data_embs.emb.values.ndim == 2
# mini BERT model spits out 32 dims
assert data_embs.emb.values.shape == (10, 32)


def test_create_data_embs_df_bad_text_col_name(
cleanup_after_use: Callable,
set_test_config: Callable,
test_session_vars: TestSessionVariables,
) -> None:
"""Test data embeddings flow works with bad col name

When the wrong column name is passed, test that the flow still works
using the default value of "input".
"""
set_test_config(task_type=TaskType.seq2seq)
# Use the local mini bert model
os.environ[GALILEO_DATA_EMBS_ENCODER] = LOCAL_MODEL_PATH

watch(tokenizer_T5, "encoder_decoder", generation_splits=[])

input_1, input_2 = "dog dog dog done - tricked you", "bird"
target_1, target_2 = "cat cat cat cat cat done", "cat"
ds = Dataset.from_dict(
{
"id": [0, 1],
"inpinp": [input_1, input_2],
"tongtong": [target_1, target_2],
}
)
data_logger = Seq2SeqDataLogger()
data_logger.log_dataset(ds, text="inpinp", label="tongtong", split="training")

df = vaex.open(f"{data_logger.input_data_path}/**/data*.arrow")

# Check that no exception is thrown and that data embs are created
assert "text" not in df.get_column_names()
with pytest.warns(GalileoWarning) as gw:
data_embs = create_data_embs_df(df, text_col="text")

warning_msg = "Column `text` not found, `input` will be used for data embeddings"
assert str(list(gw)[0].message) == warning_msg

assert len(data_embs) == 2
assert data_embs.get_column_names() == ["id", "emb"]
assert isinstance(data_embs.emb.values, np.ndarray)
assert data_embs.emb.values.ndim == 2
# mini BERT model spits out 32 dims
assert data_embs.emb.values.shape == (2, 32)


def test_create_data_embs_df_custom_column(
cleanup_after_use: Callable,
set_test_config: Callable,
test_session_vars: TestSessionVariables,
) -> None:
"""Test that data embeddings work with a column specified by the user"""
set_test_config(task_type=TaskType.seq2seq)
# Use the local mini bert model
os.environ[GALILEO_DATA_EMBS_ENCODER] = LOCAL_MODEL_PATH

watch(tokenizer_T5, "encoder_decoder", generation_splits=[])

input_1, input_2 = "dog dog dog done - tricked you", "bird"
target_1, target_2 = "cat cat cat cat cat done", "cat"
other = "just some text"
ds = Dataset.from_dict(
{
"id": [0, 1],
"inpinp": [input_1, input_2],
"tartar": [target_1, target_2],
"other": [other, other],
}
)
data_logger = Seq2SeqDataLogger()
data_logger.log_dataset(
ds, text="inpinp", label="tartar", split="training", meta=["other"]
)

df = vaex.open(f"{data_logger.input_data_path}/**/data*.arrow")

# Check that no exception is thrown and that data embs are created
assert "text" not in df.get_column_names()
with pytest.warns(None):
data_embs = create_data_embs_df(df, text_col="other")

assert len(data_embs) == 2
assert data_embs.get_column_names() == ["id", "emb"]
np_emb = data_embs.emb.values
assert isinstance(np_emb, np.ndarray)
# mini BERT model spits out 32 dims
assert np_emb.shape == (2, 32)
# Check that the two embeddings are the same, i.e., we used the column "other"
assert np.isclose(np_emb[0], np_emb[1]).all()