feat: add warnings in s2s for bad inputs #812

Merged: 17 commits, Dec 4, 2023

Changes from 2 commits
42 changes: 25 additions & 17 deletions dataquality/integrations/seq2seq/core.py
@@ -121,8 +121,8 @@ def watch(
"""Seq2seq only. Log model generations for your run

Iterates over a given dataset and logs the generations for each sample.
`model` must be an instance of transformers PreTrainedModel and have a `generate`
method.
To generate outputs, the model must be an instance of transformers PreTrainedModel
and it must have a `generate` method.

Unlike other watch functions, in this one we are just registering the model
and generation config and not attaching any hooks to the model. We call it 'watch'
@@ -135,25 +135,24 @@ def watch(
max_target_tokens=max_target_tokens,
)

if model:
assert isinstance(
model, PreTrainedModel
), "model must be an instance of transformers PreTrainedModel"
assert (
model.can_generate()
), "model must contain a `generate` method for seq2seq"

if model_type == Seq2SeqModelType.decoder_only and not response_template:
raise GalileoException(
"You must specify a `response_template` when using Decoder-Only models."
" This is necessary to internally isolate the target response tokens."
)

if model_type == Seq2SeqModelType.encoder_decoder and response_template:
if model_type == Seq2SeqModelType.decoder_only:
if response_template is None:
raise GalileoException(
"You must specify a `response_template` when using Decoder-Only models."
" This is necessary to internally isolate the target response tokens."
)
elif not isinstance(response_template, list) or not all(
isinstance(token, int) for token in response_template
):
raise GalileoException(
"The response template must already be tokenized and be a list of ints."
)
elif model_type == Seq2SeqModelType.encoder_decoder and response_template:
warn(
"The argument response_template is only used when working with "
"DecoderOnly models. This value will be ignored."
)

seq2seq_logger_config.response_template = response_template
seq2seq_logger_config.model = model
seq2seq_logger_config.generation_config = generation_config
@@ -170,4 +169,13 @@ def watch(

generation_splits_set.add(Split[split])

# A model of the correct type is required if we need to generate
if generation_splits:
assert isinstance(
model, PreTrainedModel
), "model must be an instance of transformers PreTrainedModel"
assert (
model.can_generate()
), "model must contain a `generate` method for seq2seq"

Contributor: Awesome!

seq2seq_logger_config.generation_splits = generation_splits_set
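
Pulling the new checks together, a minimal sketch of a call that satisfies all of them might look like the following. The gpt2 checkpoint, the `dq.init` arguments, and the exact keyword values are illustrative assumptions, not taken from this diff:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

import dataquality as dq
from dataquality.integrations.seq2seq.core import watch

dq.init(task_type="seq2seq")  # watch() asserts the run is a seq2seq task

tokenizer = AutoTokenizer.from_pretrained("gpt2")     # illustrative checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")  # a PreTrainedModel with a generate() method

# Decoder-only models need an already-tokenized response template (a list of ints);
# passing the raw string would now raise a GalileoException.
response_template = tokenizer.encode("###\nRESPONSE:\n", add_special_tokens=False)

watch(
    tokenizer,
    "decoder_only",
    model=model,  # required here because generation_splits is non-empty
    generation_config=GenerationConfig(max_new_tokens=32),
    generation_splits=["test"],
    response_template=response_template,
)
```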
8 changes: 8 additions & 0 deletions dataquality/loggers/data_logger/base_data_logger.py
@@ -258,6 +258,14 @@ def upload(
If create_data_embs is True, this will also run an off-the-shelf transformer
and upload those text embeddings alongside the model's finetuned embeddings
"""
# Check that data_embs_col is in the df and raise an error right here otherwise
df = vaex.open(f"{self.input_data_path}/**/data*.arrow")
if data_embs_col not in df.get_column_names():
raise GalileoException(
f"The specified column {data_embs_col} for creating embeddings does not"
" exist in the provided dataframe"
)

# For linting
assert (
config.current_project_id and config.current_run_id
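
The guard above leans on vaex listing column names without materializing the data, so a bad `data_embs_col` fails before any embedding work starts. A tiny standalone sketch of the same pattern, with a made-up file path and column name:

```python
import vaex

# Hypothetical arrow file logged by an earlier dq run.
df = vaex.open("/tmp/dq/input_data/training/data_0.arrow")

requested_col = "not_input"  # e.g. a typo for the real "input" column
if requested_col not in df.get_column_names():
    raise ValueError(
        f"The specified column {requested_col} for creating embeddings does not"
        " exist in the provided dataframe"
    )
```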
56 changes: 43 additions & 13 deletions tests/integrations/seq2seq/test_core.py
@@ -5,6 +5,7 @@
from tokenizers.models import BPE
from transformers import PreTrainedTokenizerFast

from dataquality.exceptions import GalileoException
from dataquality.integrations.seq2seq.core import set_tokenizer, watch
from dataquality.schemas.seq2seq import Seq2SeqModelType
from tests.conftest import TestSessionVariables, tokenizer_T5, tokenizer_T5_not_auto
@@ -56,10 +56,9 @@ def test_set_tokenizer_other(
)
with pytest.raises(ValueError) as e:
set_tokenizer(tokenizer_T5_not_auto, "encoder_decoder")
assert str(e.value) == (
"The tokenizer must be an instance of PreTrainedTokenizerFast "
"or Tokenizer"
)
assert str(e.value) == (
"The tokenizer must be an instance of PreTrainedTokenizerFast " "or Tokenizer"
)

Contributor Author: This assert is not tested in the with clause, it has to be done outside (we can put an assert False in the with clause and the test passes!)

Contributor: Interesting!

Contributor: So these tests were not passing before?

Contributor Author: They were passing, it's just that this line was not taken into account. We can literally replace it with assert False and the test would still pass. To make it count we have to put it outside of the with clause.

Contributor Author: (attached screenshot: Screenshot 2023-11-30 at 11 20 14 AM)

Contributor: Cool did not know this!
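
The behaviour described in this thread is easy to reproduce outside the repo: anything after the raising call inside a `pytest.raises` block is dead code, so assertions about the exception have to live after the block. A minimal, self-contained sketch (not from this repo):

```python
import pytest


def boom() -> None:
    raise ValueError("kaboom")


def test_exception_message() -> None:
    with pytest.raises(ValueError) as e:
        boom()
        assert False  # never reached: boom() already exited the with block

    # Assertions about the captured exception belong outside the with block.
    assert str(e.value) == "kaboom"
```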


def test_watch_invalid_task_type(
@@ -71,22 +71,52 @@ def test_watch_invalid_task_type(
set_test_config(task_type="text_classification")
with pytest.raises(AssertionError) as e:
watch(tokenizer_T5, "encoder_decoder")
assert str(e.value) == (
"This method is only supported for seq2seq tasks. "
"Make sure to set the task type with dq.init()"
)
assert str(e.value) == (
"This method is only supported for seq2seq tasks. "
"Make sure to set the task type with dq.init()"
)


def test_watch_invalid_model_type(
set_test_config: Callable,
cleanup_after_use: Callable,
test_session_vars: TestSessionVariables,
) -> None:
"""Test that we can't watch without a tokenizer"""
"""Test that we can't watch without an appropriate model_type"""
set_test_config(task_type="seq2seq")
with pytest.raises(ValueError) as e:
watch(tokenizer_T5, "invalid_model_type")
assert str(e.value) == (
f"model_type must be one of {Seq2SeqModelType.members()}, "
"got invalid_model_type"
)
assert str(e.value) == (
f"model_type must be one of {Seq2SeqModelType.members()}, "
"got invalid_model_type"
)


def test_watch_response_template_required(
set_test_config: Callable,
cleanup_after_use: Callable,
test_session_vars: TestSessionVariables,
) -> None:
"""Test that we need response template to be passed for decoder-only"""
set_test_config(task_type="seq2seq")
with pytest.raises(GalileoException) as e:
watch(tokenizer_T5, "decoder_only")
assert str(e.value) == (
"You must specify a `response_template` when using Decoder-Only models."
" This is necessary to internally isolate the target response tokens."
)


def test_watch_response_template_not_tokenized(
set_test_config: Callable,
cleanup_after_use: Callable,
test_session_vars: TestSessionVariables,
) -> None:
"""Test that we need response template to be tokenized (for decoder-only)"""
set_test_config(task_type="seq2seq")
with pytest.raises(GalileoException) as e:
watch(tokenizer_T5, "decoder_only", response_template="###\nRESPONSE:\n")
assert (
str(e.value)
== "The response template must already be tokenized and be a list of ints."
)
34 changes: 33 additions & 1 deletion tests/loggers/test_seq2seq.py
@@ -12,6 +12,7 @@
from transformers import GenerationConfig, T5ForConditionalGeneration

import dataquality as dq
from dataquality.exceptions import GalileoException
from dataquality.integrations.seq2seq.core import set_tokenizer, watch
from dataquality.loggers.data_logger.base_data_logger import DataSet
from dataquality.loggers.data_logger.seq2seq.seq2seq_base import Seq2SeqDataLogger
@@ -340,7 +341,6 @@ def test_tokenize_input_provide_maxlength(
set_test_config: Callable,
cleanup_after_use: Generator,
) -> None:
# TODO comment!
"""
Test that as we generate output and the user provided the max_input_tokens argument,
the input is tokenized correctly to the length set by max_input_tokens.
@@ -558,3 +558,35 @@ def test_create_and_upload_data_embs(
assert data_embs.emb.values.ndim == 2
# mini BERT model spits out 32 dims
assert data_embs.emb.values.shape == (10, 32)


def test_upload_wrong_data_emb_column(
cleanup_after_use: Callable,
set_test_config: Callable,
test_session_vars: TestSessionVariables,
) -> None:
"""
Test that we are prevented from uploading if the specified column name for data
embeddings does not exist in the df.
"""
set_test_config(task_type=TaskType.seq2seq)
watch(tokenizer_T5, "encoder_decoder", generation_splits=[])

input_1, input_2 = "dog dog dog done - tricked you", "bird"
target_1, target_2 = "cat cat cat cat cat done", "cat"
ds = Dataset.from_dict(
{
"id": [0, 1],
"input": [input_1, input_2],
"target": [target_1, target_2],
}
)
data_logger = Seq2SeqDataLogger()
data_logger.log_dataset(ds, text="input", label="target", split="training")

with pytest.raises(GalileoException) as e:
data_logger.upload(data_embs_col="not_input")
assert str(e.value) == (
"The specified column not_input for creating embeddings does not"
" exist in the provided dataframe"
)