feat: add warnings in s2s for bad inputs #812

Merged: 17 commits, Dec 4, 2023

Changes from 2 commits
42 changes: 25 additions & 17 deletions dataquality/integrations/seq2seq/core.py
@@ -121,8 +121,8 @@ def watch(
"""Seq2seq only. Log model generations for your run

Iterates over a given dataset and logs the generations for each sample.
`model` must be an instance of transformers PreTrainedModel and have a `generate`
method.
To generate outputs, the model must be an instance of transformers PreTrainedModel
and it must have a `generate` method.

Unlike other watch functions, in this one we are just registering the model
and generation config and not attaching any hooks to the model. We call it 'watch'
@@ -135,25 +135,24 @@ def watch(
max_target_tokens=max_target_tokens,
)

if model:
assert isinstance(
model, PreTrainedModel
), "model must be an instance of transformers PreTrainedModel"
assert (
model.can_generate()
), "model must contain a `generate` method for seq2seq"

if model_type == Seq2SeqModelType.decoder_only and not response_template:
raise GalileoException(
"You must specify a `response_template` when using Decoder-Only models."
" This is necessary to internally isolate the target response tokens."
)

if model_type == Seq2SeqModelType.encoder_decoder and response_template:
if model_type == Seq2SeqModelType.decoder_only:
if response_template is None:
raise GalileoException(
"You must specify a `response_template` when using Decoder-Only models."
" This is necessary to internally isolate the target response tokens."
)
elif not isinstance(response_template, list) or not all(
isinstance(token, int) for token in response_template
):
raise GalileoException(
"The response template must already be tokenized and be a list of ints."
)
elif model_type == Seq2SeqModelType.encoder_decoder and response_template:
warn(
"The argument response_template is only used when working with "
"DecoderOnly models. This value will be ignored."
)

seq2seq_logger_config.response_template = response_template
seq2seq_logger_config.model = model
seq2seq_logger_config.generation_config = generation_config
@@ -170,4 +169,13 @@ def watch(

generation_splits_set.add(Split[split])

# A model of the correct type is required if we need to generate
if generation_splits:
assert isinstance(
model, PreTrainedModel
), "model must be an instance of transformers PreTrainedModel"
assert (
model.can_generate()
), "model must contain a `generate` method for seq2seq"

Contributor: Awesome!

seq2seq_logger_config.generation_splits = generation_splits_set
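
Pulling the new checks together, a minimal sketch of a call that satisfies all of them might look like the following. The gpt2 checkpoint, the `dq.init` arguments, and the exact keyword values are illustrative assumptions, not taken from this diff:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

import dataquality as dq
from dataquality.integrations.seq2seq.core import watch

dq.init(task_type="seq2seq")  # watch() asserts the run is a seq2seq task

tokenizer = AutoTokenizer.from_pretrained("gpt2")     # illustrative checkpoint
model = AutoModelForCausalLM.from_pretrained("gpt2")  # a PreTrainedModel with a generate() method

# Decoder-only models need an already-tokenized response template (a list of ints);
# passing the raw string would now raise a GalileoException.
response_template = tokenizer.encode("###\nRESPONSE:\n", add_special_tokens=False)

watch(
    tokenizer,
    "decoder_only",
    model=model,  # required here because generation_splits is non-empty
    generation_config=GenerationConfig(max_new_tokens=32),
    generation_splits=["test"],
    response_template=response_template,
)
```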
8 changes: 8 additions & 0 deletions dataquality/loggers/data_logger/base_data_logger.py
@@ -258,6 +258,14 @@ def upload(
If create_data_embs is True, this will also run an off-the-shelf transformer
and upload those text embeddings alongside the model's finetuned embeddings
"""
# Check that data_embs_col is in the df and raise an error right here otherwise
df = vaex.open(f"{self.input_data_path}/**/data*.arrow")
if data_embs_col not in df.get_column_names():
raise GalileoException(
f"The specified column {data_embs_col} for creating embeddings does not"
" exist in the provided dataframe"
)

# For linting
assert (
config.current_project_id and config.current_run_id
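
The guard above leans on vaex listing column names without materializing the data, so a bad `data_embs_col` fails before any embedding work starts. A tiny standalone sketch of the same pattern, with a made-up file path and column name:

```python
import vaex

# Hypothetical arrow file logged by an earlier dq run.
df = vaex.open("/tmp/dq/input_data/training/data_0.arrow")

requested_col = "not_input"  # e.g. a typo for the real "input" column
if requested_col not in df.get_column_names():
    raise ValueError(
        f"The specified column {requested_col} for creating embeddings does not"
        " exist in the provided dataframe"
    )
```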
56 changes: 43 additions & 13 deletions tests/integrations/seq2seq/test_core.py
@@ -5,6 +5,7 @@
from tokenizers.models import BPE
from transformers import PreTrainedTokenizerFast

from dataquality.exceptions import GalileoException
from dataquality.integrations.seq2seq.core import set_tokenizer, watch
from dataquality.schemas.seq2seq import Seq2SeqModelType
from tests.conftest import TestSessionVariables, tokenizer_T5, tokenizer_T5_not_auto
@@ -56,10 +56,9 @@ def test_set_tokenizer_other(
)
with pytest.raises(ValueError) as e:
set_tokenizer(tokenizer_T5_not_auto, "encoder_decoder")
assert str(e.value) == (
"The tokenizer must be an instance of PreTrainedTokenizerFast "
"or Tokenizer"
)
assert str(e.value) == (
"The tokenizer must be an instance of PreTrainedTokenizerFast " "or Tokenizer"
)

Contributor Author: This assert is not tested in the with clause, it has to be done outside (we can put an assert False in the with clause and the test passes!)

Contributor: Interesting!

Contributor: So these tests were not passing before?

Contributor Author: They were passing, it's just that this line was not taken into account. We can literally replace it with assert False and the test would still pass. To make it count we have to put it outside of the with clause.

Contributor Author: (attached screenshot: Screenshot 2023-11-30 at 11 20 14 AM)

Contributor: Cool did not know this!
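
The behaviour described in this thread is easy to reproduce outside the repo: anything after the raising call inside a `pytest.raises` block is dead code, so assertions about the exception have to live after the block. A minimal, self-contained sketch (not from this repo):

```python
import pytest


def boom() -> None:
    raise ValueError("kaboom")


def test_exception_message() -> None:
    with pytest.raises(ValueError) as e:
        boom()
        assert False  # never reached: boom() already exited the with block

    # Assertions about the captured exception belong outside the with block.
    assert str(e.value) == "kaboom"
```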


def test_watch_invalid_task_type(
@@ -71,22 +71,52 @@ def test_watch_invalid_task_type(
set_test_config(task_type="text_classification")
with pytest.raises(AssertionError) as e:
watch(tokenizer_T5, "encoder_decoder")
assert str(e.value) == (
"This method is only supported for seq2seq tasks. "
"Make sure to set the task type with dq.init()"
)
assert str(e.value) == (
"This method is only supported for seq2seq tasks. "
"Make sure to set the task type with dq.init()"
)


def test_watch_invalid_model_type(
set_test_config: Callable,
cleanup_after_use: Callable,
test_session_vars: TestSessionVariables,
) -> None:
"""Test that we can't watch without a tokenizer"""
"""Test that we can't watch without an appropriate model_type"""
set_test_config(task_type="seq2seq")
with pytest.raises(ValueError) as e:
watch(tokenizer_T5, "invalid_model_type")
assert str(e.value) == (
f"model_type must be one of {Seq2SeqModelType.members()}, "
"got invalid_model_type"
)
assert str(e.value) == (
f"model_type must be one of {Seq2SeqModelType.members()}, "
"got invalid_model_type"
)


def test_watch_response_template_required(
set_test_config: Callable,
cleanup_after_use: Callable,
test_session_vars: TestSessionVariables,
) -> None:
"""Test that we need response template to be passed for decoder-only"""
set_test_config(task_type="seq2seq")
with pytest.raises(GalileoException) as e:
watch(tokenizer_T5, "decoder_only")
assert str(e.value) == (
"You must specify a `response_template` when using Decoder-Only models."
" This is necessary to internally isolate the target response tokens."
)


def test_watch_response_template_not_tokenized(
set_test_config: Callable,
cleanup_after_use: Callable,
test_session_vars: TestSessionVariables,
) -> None:
"""Test that we need response template to be tokenized (for decoder-only)"""
set_test_config(task_type="seq2seq")
with pytest.raises(GalileoException) as e:
watch(tokenizer_T5, "decoder_only", response_template="###\nRESPONSE:\n")
assert (
str(e.value)
== "The response template must already be tokenized and be a list of ints."
)
34 changes: 33 additions & 1 deletion tests/loggers/test_seq2seq.py
@@ -12,6 +12,7 @@
from transformers import GenerationConfig, T5ForConditionalGeneration

import dataquality as dq
from dataquality.exceptions import GalileoException
from dataquality.integrations.seq2seq.core import set_tokenizer, watch
from dataquality.loggers.data_logger.base_data_logger import DataSet
from dataquality.loggers.data_logger.seq2seq.seq2seq_base import Seq2SeqDataLogger
@@ -340,7 +341,6 @@ def test_tokenize_input_provide_maxlength(
set_test_config: Callable,
cleanup_after_use: Generator,
) -> None:
# TODO comment!
"""
Test that as we generate output and the user provided the max_input_tokens argument,
the input is tokenized correctly to the length set by max_input_tokens.
@@ -558,3 +558,35 @@ def test_create_and_upload_data_embs(
assert data_embs.emb.values.ndim == 2
# mini BERT model spits out 32 dims
assert data_embs.emb.values.shape == (10, 32)


def test_upload_wrong_data_emb_column(
cleanup_after_use: Callable,
set_test_config: Callable,
test_session_vars: TestSessionVariables,
) -> None:
"""
Test that we are prevented from uploading if the specified column name for data
embeddings does not exist in the df.
"""
set_test_config(task_type=TaskType.seq2seq)
watch(tokenizer_T5, "encoder_decoder", generation_splits=[])

input_1, input_2 = "dog dog dog done - tricked you", "bird"
target_1, target_2 = "cat cat cat cat cat done", "cat"
ds = Dataset.from_dict(
{
"id": [0, 1],
"input": [input_1, input_2],
"target": [target_1, target_2],
}
)
data_logger = Seq2SeqDataLogger()
data_logger.log_dataset(ds, text="input", label="target", split="training")

with pytest.raises(GalileoException) as e:
data_logger.upload(data_embs_col="not_input")
assert str(e.value) == (
"The specified column not_input for creating embeddings does not"
" exist in the provided dataframe"
)