From fc5ef0cacf3c9565b4a73127795963042580a399 Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Thu, 13 Feb 2025 17:26:28 +0000
Subject: [PATCH 1/8] halfway through models

---
 .../models/albert/modeling_albert.py          |  3 +-
 tests/generation/test_utils.py                | 37 ++++++++++++++++++-
 .../autoformer/test_modeling_autoformer.py    |  2 +
 tests/models/bark/test_modeling_bark.py       |  2 +
 .../models/big_bird/test_modeling_big_bird.py |  2 +
 tests/models/blip/test_modeling_blip.py       |  4 ++
 tests/models/blip_2/test_modeling_blip_2.py   |  2 +-
 tests/models/clvp/test_modeling_clvp.py       |  2 +
 .../test_modeling_conditional_detr.py         |  3 +-
 tests/models/cpmant/test_modeling_cpmant.py   |  2 +
 .../models/dab_detr/test_modeling_dab_detr.py | 12 +-----
 .../test_modeling_decision_transformer.py     |  3 +-
 .../test_modeling_deformable_detr.py          |  3 +-
 tests/models/detr/test_modeling_detr.py       |  3 +-
 tests/models/electra/test_modeling_electra.py |  2 +
 .../models/flaubert/test_modeling_flaubert.py |  2 +
 tests/models/idefics/test_modeling_idefics.py |  2 +
 .../models/informer/test_modeling_informer.py |  2 +
 .../moonshine/test_modeling_moonshine.py      |  3 +-
 .../models/musicgen/test_modeling_musicgen.py |  5 +--
 .../test_modeling_musicgen_melody.py          |  5 +--
 .../pop2piano/test_modeling_pop2piano.py      |  3 +-
 .../test_modeling_recurrent_gemma.py          |  3 +-
 .../test_modeling_seamless_m4t.py             |  5 +--
 .../test_modeling_seamless_m4t_v2.py          |  3 +-
 tests/models/udop/test_modeling_udop.py       | 11 +++++-
 tests/test_modeling_common.py                 | 27 ++++++++++++++
 27 files changed, 112 insertions(+), 41 deletions(-)

diff --git a/src/transformers/models/albert/modeling_albert.py b/src/transformers/models/albert/modeling_albert.py
index bfd8e38687ac..dca1fe7f6002 100755
--- a/src/transformers/models/albert/modeling_albert.py
+++ b/src/transformers/models/albert/modeling_albert.py
@@ -24,7 +24,6 @@
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...generation import GenerationMixin
 from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa
 from ...modeling_outputs import (
     BaseModelOutput,
@@ -984,7 +983,7 @@ def forward(self, pooled_output: torch.Tensor) -> torch.Tensor:
     "Albert Model with a `language modeling` head on top.",
     ALBERT_START_DOCSTRING,
 )
-class AlbertForMaskedLM(AlbertPreTrainedModel, GenerationMixin):
+class AlbertForMaskedLM(AlbertPreTrainedModel):
     _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]
 
     def __init__(self, config):
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index ce31cc844f19..6c6cfc48996a 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -19,6 +19,7 @@
 import datetime
 import gc
 import inspect
+import random
 import tempfile
 import unittest
 import warnings
@@ -48,7 +49,6 @@
     )
     from transformers.utils import is_ipex_available
 
-    from ..test_modeling_common import floats_tensor, ids_tensor
     from .test_framework_agnostic import GenerationIntegrationTestsMixin
 
 
@@ -2753,6 +2753,41 @@ def test_speculative_sampling_target_distribution(self):
         self.assertTrue(last_token_counts[8] > last_token_counts[3])
 
 
+global_rng = random.Random()
+
+
+def ids_tensor(shape, vocab_size, rng=None, name=None):
+    # Creates a random int32 tensor of the shape within the vocab size
+    if rng is None:
+        rng = global_rng
+
+    total_dims = 1
+    for dim in shape:
+        total_dims *= dim
+
+    values = []
+    for _ in range(total_dims):
+        values.append(rng.randint(0, vocab_size - 1))
+
+    return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
+
+
+def floats_tensor(shape, scale=1.0, rng=None, name=None):
+    """Creates a random float32 tensor"""
+    if rng is None:
+        rng = global_rng
+
+    total_dims = 1
+    for dim in shape:
+        total_dims *= dim
+
+    values = []
+    for _ in range(total_dims):
+        values.append(rng.random() * scale)
+
+    return torch.tensor(data=values, dtype=torch.float, device=torch_device).view(shape).contiguous()
+
+
 @pytest.mark.generate
 @require_torch
 class GenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMixin):
diff --git a/tests/models/autoformer/test_modeling_autoformer.py b/tests/models/autoformer/test_modeling_autoformer.py
index 3a2ed6385151..0de67549b97e 100644
--- a/tests/models/autoformer/test_modeling_autoformer.py
+++ b/tests/models/autoformer/test_modeling_autoformer.py
@@ -205,6 +205,8 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict):
 @require_torch
 class AutoformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (AutoformerModel, AutoformerForPrediction) if is_torch_available() else ()
+    # Doesn't run generation tests. Has a custom `generate` method with a different interface
+    all_generative_model_classes = ()
     pipeline_model_mapping = {"feature-extraction": AutoformerModel} if is_torch_available() else {}
     test_pruning = False
     test_head_masking = False
diff --git a/tests/models/bark/test_modeling_bark.py b/tests/models/bark/test_modeling_bark.py
index d94f6d26d6e8..44bc5a51a843 100644
--- a/tests/models/bark/test_modeling_bark.py
+++ b/tests/models/bark/test_modeling_bark.py
@@ -707,6 +707,8 @@ def test_generate_fp16(self):
 @require_torch
 class BarkFineModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (BarkFineModel,) if is_torch_available() else ()
+    # Doesn't run generation tests. Has custom generation method with a different interface
+    all_generative_model_classes = ()
 
     is_encoder_decoder = False
     fx_compatible = False
diff --git a/tests/models/big_bird/test_modeling_big_bird.py b/tests/models/big_bird/test_modeling_big_bird.py
index 6aca3cbc4109..4eae69eb125a 100644
--- a/tests/models/big_bird/test_modeling_big_bird.py
+++ b/tests/models/big_bird/test_modeling_big_bird.py
@@ -451,6 +451,8 @@ class BigBirdModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
         if is_torch_available()
         else ()
     )
+    # Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
+    all_generative_model_classes = ()
     pipeline_model_mapping = (
         {
             "feature-extraction": BigBirdModel,
diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py
index c4029e6377be..b00d4ca4a840 100644
--- a/tests/models/blip/test_modeling_blip.py
+++ b/tests/models/blip/test_modeling_blip.py
@@ -805,6 +805,8 @@ def prepare_config_and_inputs_for_common(self):
 @require_vision
 class BlipVQAModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (BlipForQuestionAnswering,) if is_torch_available() else ()
+    # Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
+    all_generative_model_classes = ()
     fx_compatible = False
     test_head_masking = False
     test_pruning = False
@@ -1112,6 +1114,8 @@ def test_model_from_pretrained(self):
 @require_torch
 class BlipTextImageModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (BlipForConditionalGeneration,) if is_torch_available() else ()
+    # Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
+    all_generative_model_classes = ()
     fx_compatible = False
     test_head_masking = False
     test_pruning = False
diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py
index e26232e3eb43..a0d91a3ba8e1 100644
--- a/tests/models/blip_2/test_modeling_blip_2.py
+++ b/tests/models/blip_2/test_modeling_blip_2.py
@@ -992,7 +992,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixin, unittest.TestCase):
+class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (Blip2ForConditionalGeneration, Blip2Model) if is_torch_available() else ()
     # Doesn't run generation tests. TODO: fix generation tests for Blip2ForConditionalGeneration
     all_generative_model_classes = ()
diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py
index 839c831eb9f6..0f59b91871c9 100644
--- a/tests/models/clvp/test_modeling_clvp.py
+++ b/tests/models/clvp/test_modeling_clvp.py
@@ -408,6 +408,8 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (ClvpModelForConditionalGeneration,) if is_torch_available() else ()
+    # Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
+    all_generative_model_classes = ()
 
     test_head_masking = False
     test_pruning = False
diff --git a/tests/models/conditional_detr/test_modeling_conditional_detr.py b/tests/models/conditional_detr/test_modeling_conditional_detr.py
index 80360e8177e7..db5afffe2131 100644
--- a/tests/models/conditional_detr/test_modeling_conditional_detr.py
+++ b/tests/models/conditional_detr/test_modeling_conditional_detr.py
@@ -22,7 +22,6 @@
 from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -173,7 +172,7 @@ def create_and_check_conditional_detr_object_detection_head_model(self, config,
 
 
 @require_torch
-class ConditionalDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class ConditionalDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (
         (
             ConditionalDetrModel,
diff --git a/tests/models/cpmant/test_modeling_cpmant.py b/tests/models/cpmant/test_modeling_cpmant.py
index e796d850a8d0..d3835ec2374b 100644
--- a/tests/models/cpmant/test_modeling_cpmant.py
+++ b/tests/models/cpmant/test_modeling_cpmant.py
@@ -136,6 +136,8 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class CpmAntModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (CpmAntModel, CpmAntForCausalLM) if is_torch_available() else ()
+    # Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
+    all_generative_model_classes = ()
     pipeline_model_mapping = (
         {"feature-extraction": CpmAntModel, "text-generation": CpmAntForCausalLM} if is_torch_available() else {}
     )
diff --git a/tests/models/dab_detr/test_modeling_dab_detr.py b/tests/models/dab_detr/test_modeling_dab_detr.py
index 35d43123bfd8..584bd1882a82 100644
--- a/tests/models/dab_detr/test_modeling_dab_detr.py
+++ b/tests/models/dab_detr/test_modeling_dab_detr.py
@@ -23,7 +23,6 @@
 from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -174,15 +173,8 @@ def create_and_check_dab_detr_object_detection_head_model(self, config, pixel_va
 
 
 @require_torch
-class DabDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (
-            DabDetrModel,
-            DabDetrForObjectDetection,
-        )
-        if is_torch_available()
-        else ()
-    )
+class DabDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    all_model_classes = (DabDetrModel, DabDetrForObjectDetection) if is_torch_available() else ()
     pipeline_model_mapping = (
         {
             "image-feature-extraction": DabDetrModel,
diff --git a/tests/models/decision_transformer/test_modeling_decision_transformer.py b/tests/models/decision_transformer/test_modeling_decision_transformer.py
index f22911db9580..e004629c9c34 100644
--- a/tests/models/decision_transformer/test_modeling_decision_transformer.py
+++ b/tests/models/decision_transformer/test_modeling_decision_transformer.py
@@ -20,7 +20,6 @@
 from transformers import DecisionTransformerConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, random_attention_mask
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -125,7 +124,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class DecisionTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class DecisionTransformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (DecisionTransformerModel,) if is_torch_available() else ()
     pipeline_model_mapping = {"feature-extraction": DecisionTransformerModel} if is_torch_available() else {}
 
diff --git a/tests/models/deformable_detr/test_modeling_deformable_detr.py b/tests/models/deformable_detr/test_modeling_deformable_detr.py
index b9404e08a9df..1e27aaabf8d8 100644
--- a/tests/models/deformable_detr/test_modeling_deformable_detr.py
+++ b/tests/models/deformable_detr/test_modeling_deformable_detr.py
@@ -31,7 +31,6 @@
     torch_device,
 )
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -188,7 +187,7 @@ def create_and_check_deformable_detr_object_detection_head_model(self, config, p
 
 
 @require_torch
-class DeformableDetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class DeformableDetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (DeformableDetrModel, DeformableDetrForObjectDetection) if is_torch_available() else ()
     pipeline_model_mapping = (
         {"image-feature-extraction": DeformableDetrModel, "object-detection": DeformableDetrForObjectDetection}
diff --git a/tests/models/detr/test_modeling_detr.py b/tests/models/detr/test_modeling_detr.py
index e92cc6ddc289..381fa1d7cd23 100644
--- a/tests/models/detr/test_modeling_detr.py
+++ b/tests/models/detr/test_modeling_detr.py
@@ -22,7 +22,6 @@
 from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device
 from transformers.utils import cached_property
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -169,7 +168,7 @@ def create_and_check_detr_object_detection_head_model(self, config, pixel_values
 
 
 @require_torch
-class DetrModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class DetrModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (
         (
             DetrModel,
diff --git a/tests/models/electra/test_modeling_electra.py b/tests/models/electra/test_modeling_electra.py
index e2aa0d41f219..08865f22aba8 100644
--- a/tests/models/electra/test_modeling_electra.py
+++ b/tests/models/electra/test_modeling_electra.py
@@ -389,6 +389,8 @@ class ElectraModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
         if is_torch_available()
         else ()
     )
+    # Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
+    all_generative_model_classes = ()
     pipeline_model_mapping = (
         {
             "feature-extraction": ElectraModel,
diff --git a/tests/models/flaubert/test_modeling_flaubert.py b/tests/models/flaubert/test_modeling_flaubert.py
index 2ba0b509e47e..4211e92e21e0 100644
--- a/tests/models/flaubert/test_modeling_flaubert.py
+++ b/tests/models/flaubert/test_modeling_flaubert.py
@@ -377,6 +377,8 @@ class FlaubertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         if is_torch_available()
         else ()
     )
+    # Doesn't run generation tests. Outdated custom `prepare_inputs_for_generation` -- TODO @gante
+    all_generative_model_classes = ()
     pipeline_model_mapping = (
         {
             "feature-extraction": FlaubertModel,
diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py
index 5d19f5b02025..d00ba28c7443 100644
--- a/tests/models/idefics/test_modeling_idefics.py
+++ b/tests/models/idefics/test_modeling_idefics.py
@@ -327,6 +327,8 @@ def test_eager_matches_sdpa_generate(self):
 @require_torch
 class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (IdeficsModel, IdeficsForVisionText2Text) if is_torch_available() else ()
+    # Doesn't run generation tests here -- idefics has a dedicated tester for generation tests below
+    all_generative_model_classes = ()
     pipeline_model_mapping = (
         {"feature-extraction": IdeficsModel, "image-text-to-text": IdeficsForVisionText2Text}
         if is_torch_available()
diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 5415717cd4b8..20446a19001d 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -190,6 +190,8 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict):
 @require_torch
 class InformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (InformerModel, InformerForPrediction) if is_torch_available() else ()
+    # Doesn't run generation tests. Has a custom `generate` with a unique interface
+    all_generative_model_classes = ()
     pipeline_model_mapping = {"feature-extraction": InformerModel} if is_torch_available() else {}
     is_encoder_decoder = True
     test_pruning = False
diff --git a/tests/models/moonshine/test_modeling_moonshine.py b/tests/models/moonshine/test_modeling_moonshine.py
index bf30f2c3d522..65feb47c6dd7 100644
--- a/tests/models/moonshine/test_modeling_moonshine.py
+++ b/tests/models/moonshine/test_modeling_moonshine.py
@@ -20,7 +20,6 @@
 from transformers import MoonshineConfig, is_torch_available
 from transformers.testing_utils import cleanup, require_torch, slow, torch_device
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
@@ -168,7 +167,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class MoonshineModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class MoonshineModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (MoonshineModel, MoonshineForConditionalGeneration) if is_torch_available() else ()
     # Doesn't run generation tests. TODO (eustache): remove this line and then make CI green
     all_generative_model_classes = ()
diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py
index 3852d8c3c4ff..69072af46f3e 100644
--- a/tests/models/musicgen/test_modeling_musicgen.py
+++ b/tests/models/musicgen/test_modeling_musicgen.py
@@ -45,7 +45,6 @@
 )
 from transformers.utils import cached_property, is_torch_bf16_available_on_device, is_torch_fp16_available_on_device
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, sdpa_kernel
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -174,7 +173,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class MusicgenDecoderTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (MusicgenModel, MusicgenForCausalLM) if is_torch_available() else ()
     # Doesn't run generation tests. See `greedy_sample_model_classes` below
     all_generative_model_classes = ()
@@ -801,7 +800,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class MusicgenTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (MusicgenForConditionalGeneration,) if is_torch_available() else ()
     # Doesn't run generation tests. See `greedy_sample_model_classes` below
     all_generative_model_classes = ()
diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
index 7cb31adaedbd..fab3979a65e1 100644
--- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
+++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
@@ -49,7 +49,6 @@
 )
 from transformers.utils import cached_property, is_torch_bf16_available_on_device, is_torch_fp16_available_on_device
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, sdpa_kernel
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -180,7 +179,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+class MusicgenMelodyDecoderTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (MusicgenMelodyModel, MusicgenMelodyForCausalLM) if is_torch_available() else ()
     # Doesn't run generation tests. See `greedy_sample_model_classes` below
     all_generative_model_classes = ()
@@ -820,7 +819,7 @@ def prepare_config_and_inputs_for_common(self):
 
 @require_torch
 # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenTest with Musicgen->MusicgenMelody, musicgen->musicgen_melody, EncoderDecoder->DecoderOnly, input_values->input_features
-class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class MusicgenMelodyTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (MusicgenMelodyForConditionalGeneration,) if is_torch_available() else ()
     # Doesn't run generation tests. See `greedy_sample_model_classes` below
     all_generative_model_classes = ()
diff --git a/tests/models/pop2piano/test_modeling_pop2piano.py b/tests/models/pop2piano/test_modeling_pop2piano.py
index 50d25aaf5cca..6c298198b8ec 100644
--- a/tests/models/pop2piano/test_modeling_pop2piano.py
+++ b/tests/models/pop2piano/test_modeling_pop2piano.py
@@ -34,7 +34,6 @@
 )
 from transformers.utils import is_essentia_available, is_librosa_available, is_scipy_available, is_torch_available
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -505,7 +504,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class Pop2PianoModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class Pop2PianoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (Pop2PianoForConditionalGeneration,) if is_torch_available() else ()
     # Doesn't run generation tests. Has custom generation method with a different interface
     all_generative_model_classes = ()
diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py
index 3519604c8c0e..19179c073449 100644
--- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py
+++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py
@@ -26,7 +26,6 @@
     torch_device,
 )
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -281,7 +280,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class RecurrentGemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class RecurrentGemmaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (RecurrentGemmaForCausalLM,) if is_torch_available() else ()
     # Doesn't run generation tests. TODO @gante not fully supported
     all_generative_model_classes = ()
diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
index a68030c86b17..7b7fdcb585e7 100644
--- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
+++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
@@ -23,7 +23,6 @@
 from transformers.trainer_utils import set_seed
 from transformers.utils import cached_property
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
@@ -580,9 +579,7 @@ def test_retain_grad_hidden_states_attentions(self):
 
 
 @require_torch
-class SeamlessM4TModelWithTextInputTest(
-    ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase
-):
+class SeamlessM4TModelWithTextInputTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     is_encoder_decoder = True
     fx_compatible = False
     test_missing_keys = False
diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
index c53bc4a8b186..0590f6a2c151 100644
--- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
+++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
@@ -23,7 +23,6 @@
 from transformers.trainer_utils import set_seed
 from transformers.utils import cached_property
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import (
     ModelTesterMixin,
@@ -595,7 +594,7 @@ def test_retain_grad_hidden_states_attentions(self):
 
 
 @require_torch
-class SeamlessM4Tv2ModelWithTextInputTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+class SeamlessM4Tv2ModelWithTextInputTest(ModelTesterMixin, unittest.TestCase):
     is_encoder_decoder = True
     fx_compatible = False
     test_missing_keys = False
diff --git a/tests/models/udop/test_modeling_udop.py b/tests/models/udop/test_modeling_udop.py
index e750d50b62d1..0c127959909f 100644
--- a/tests/models/udop/test_modeling_udop.py
+++ b/tests/models/udop/test_modeling_udop.py
@@ -30,6 +30,7 @@
 )
 from transformers.utils import cached_property
 
+from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -265,7 +266,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class UdopModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class UdopModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (
         (
             UdopModel,
@@ -419,6 +420,14 @@ def test_model_from_pretrained(self):
         model = UdopForConditionalGeneration.from_pretrained(model_name)
         self.assertIsNotNone(model)
 
+    @unittest.skip(reason="TODO: Fix me @joao")
+    def test_generate_with_head_masking(self):
+        pass
+
+    @unittest.skip(reason="TODO: Fix me @joao")
+    def test_generate_without_input_ids(self):
+        pass
+
 
 class UdopEncoderOnlyModelTester:
     def __init__(
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 9dd5877c8b90..b92d3502460c 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -110,6 +110,8 @@
 )
 from transformers.utils.generic import ContextManagers, ModelOutput
 
+from .generation.test_utils import GenerationTesterMixin
+
 
 if is_accelerate_available():
     from accelerate.utils import compute_module_sizes
@@ -4949,6 +4951,31 @@ def test_flex_attention_with_grads(self):
         # If this does not raise an error, the test passes (see https://github.com/huggingface/transformers/pull/35605)
         _ = model(inputs_dict["input_ids"].to(torch_device))
 
+    def test_generation_tester_mixin_inheritance(self):
+        """
+        Ensures that we have the generation tester mixin if the model can generate. The test will fail otherwise,
+        forcing the mixin to be added -- and ensuring proper test coverage
+        """
+        if len(self.all_generative_model_classes) > 0:
+            self.assertTrue(
+                issubclass(self.__class__, GenerationTesterMixin),
+                msg=(
+                    "This model can call `generate`, so one of two things must happen: 1) the tester must inherit "
+                    "from `GenerationTesterMixin` to run `generate` tests, or 2) the model doesn't fully support the "
+                    "original `generate` or has a custom `generate`, the tester must set "
+                    "`all_generative_model_classes = ()`."
+                ),
+            )
+        else:
+            self.assertFalse(
+                issubclass(self.__class__, GenerationTesterMixin),
+                msg=(
+                    "This model can't call `generate`, so its tester can't inherit `GenerationTesterMixin`. (If you "
+                    "think the model should be able to `generate`, the model may be missing the `GenerationMixin` "
+                    "inheritance)"
+                ),
+            )
+
 
 global_rng = random.Random()
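Taken together, the changes in this first patch enforce one rule through the new `test_generation_tester_mixin_inheritance` check: a model tester either inherits `GenerationTesterMixin` and lists generative classes, or it opts out of both. A minimal sketch of the two accepted shapes follows -- the `Dummy*` names are hypothetical stand-ins, not part of this series, and the import path assumes the test suite is run from the repository root:

import unittest

from tests.generation.test_utils import GenerationTesterMixin


class DummyModel:  # hypothetical stand-in for a model class that inherits `GenerationMixin`
    pass


# 1) The model supports the stock `generate`: the tester inherits the mixin and
#    lists the generative classes, so the shared `generate` tests run against them.
class DummyModelTest(GenerationTesterMixin, unittest.TestCase):
    all_generative_model_classes = (DummyModel,)


# 2) The model has a custom `generate` with a different interface: the tester opts
#    out explicitly (with a comment explaining why) and must not inherit the mixin.
class DummyCustomGenerateModelTest(unittest.TestCase):
    # Doesn't run generation tests. Custom `generate` with a different interface
    all_generative_model_classes = ()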
From 4ee061ecdd6e6725c8e0c1a78f73643fb3c50679 Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Fri, 14 Feb 2025 10:39:41 +0000
Subject: [PATCH 2/8] all green

---
 src/transformers/modeling_utils.py                        | 5 +----
 tests/models/autoformer/test_modeling_autoformer.py       | 2 --
 tests/models/bark/test_modeling_bark.py                   | 2 --
 tests/models/idefics/test_modeling_idefics.py             | 6 ++++++
 tests/models/informer/test_modeling_informer.py           | 2 --
 tests/models/lilt/test_modeling_lilt.py                   | 3 +--
 tests/models/megatron_bert/test_modeling_megatron_bert.py | 2 ++
 tests/models/modernbert/test_modeling_modernbert.py       | 3 +--
 tests/models/pegasus_x/test_modeling_pegasus_x.py         | 2 +-
 tests/models/qwen2_audio/test_modeling_qwen2_audio.py     | 2 ++
 tests/models/rembert/test_modeling_rembert.py             | 2 ++
 tests/models/roc_bert/test_modeling_roc_bert.py           | 2 ++
 tests/models/roformer/test_modeling_roformer.py           | 2 ++
 tests/models/speecht5/test_modeling_speecht5.py           | 2 ++
 .../table_transformer/test_modeling_table_transformer.py  | 3 +--
 tests/models/whisper/test_modeling_whisper.py             | 2 +-
 16 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index 13c8719b3603..4e2e23140060 100755
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -1645,7 +1645,7 @@ def base_model(self) -> nn.Module:
     @classmethod
     def can_generate(cls) -> bool:
         """
-        Returns whether this model can generate sequences with `.generate()`.
+        Returns whether this model can generate sequences with `.generate()` from the `GenerationMixin`.
 
         Returns:
             `bool`: Whether this model can generate sequences with `.generate()`.
@@ -1653,9 +1653,6 @@ def can_generate(cls) -> bool:
         # Directly inherits `GenerationMixin` -> can generate
         if "GenerationMixin" in str(cls.__bases__):
             return True
-        # Model class overwrites `generate` (e.g. time series models) -> can generate
-        if str(cls.__name__) in str(cls.generate):
-            return True
         # The class inherits from a class that can generate (recursive check) -> can generate
         for base in cls.__bases__:
             if not hasattr(base, "can_generate"):
diff --git a/tests/models/autoformer/test_modeling_autoformer.py b/tests/models/autoformer/test_modeling_autoformer.py
index 0de67549b97e..3a2ed6385151 100644
--- a/tests/models/autoformer/test_modeling_autoformer.py
+++ b/tests/models/autoformer/test_modeling_autoformer.py
@@ -205,8 +205,6 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict):
 @require_torch
 class AutoformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (AutoformerModel, AutoformerForPrediction) if is_torch_available() else ()
-    # Doesn't run generation tests. Has a custom `generate` method with a different interface
-    all_generative_model_classes = ()
     pipeline_model_mapping = {"feature-extraction": AutoformerModel} if is_torch_available() else {}
     test_pruning = False
     test_head_masking = False
diff --git a/tests/models/bark/test_modeling_bark.py b/tests/models/bark/test_modeling_bark.py
index 44bc5a51a843..d94f6d26d6e8 100644
--- a/tests/models/bark/test_modeling_bark.py
+++ b/tests/models/bark/test_modeling_bark.py
@@ -707,8 +707,6 @@ def test_generate_fp16(self):
 @require_torch
 class BarkFineModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (BarkFineModel,) if is_torch_available() else ()
-    # Doesn't run generation tests. Has custom generation method with a different interface
-    all_generative_model_classes = ()
 
     is_encoder_decoder = False
     fx_compatible = False
diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py
index d00ba28c7443..ca09991b1041 100644
--- a/tests/models/idefics/test_modeling_idefics.py
+++ b/tests/models/idefics/test_modeling_idefics.py
@@ -868,6 +868,12 @@ def test_training_gradient_checkpointing_use_reentrant_false(self):
     def test_sdpa_can_dispatch_non_composite_models(self):
         pass
 
+    @unittest.skip(
+        "Idefics has a separate test runner for generation tests with complex inheritance, causing this check to fail"
+    )
+    def test_generation_tester_mixin_inheritance(self):
+        pass
+
 
 @require_torch
 @require_vision
diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py
index 20446a19001d..5415717cd4b8 100644
--- a/tests/models/informer/test_modeling_informer.py
+++ b/tests/models/informer/test_modeling_informer.py
@@ -190,8 +190,6 @@ def check_encoder_decoder_model_standalone(self, config, inputs_dict):
 @require_torch
 class InformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (InformerModel, InformerForPrediction) if is_torch_available() else ()
-    # Doesn't run generation tests. Has a custom `generate` with a unique interface
-    all_generative_model_classes = ()
     pipeline_model_mapping = {"feature-extraction": InformerModel} if is_torch_available() else {}
     is_encoder_decoder = True
     test_pruning = False
diff --git a/tests/models/lilt/test_modeling_lilt.py b/tests/models/lilt/test_modeling_lilt.py
index 9bfbb1c520c8..c24d375ccc45 100644
--- a/tests/models/lilt/test_modeling_lilt.py
+++ b/tests/models/lilt/test_modeling_lilt.py
@@ -19,7 +19,6 @@
 from transformers import LiltConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -218,7 +217,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class LiltModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class LiltModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (
         (
             LiltModel,
diff --git a/tests/models/megatron_bert/test_modeling_megatron_bert.py b/tests/models/megatron_bert/test_modeling_megatron_bert.py
index ee6bedfd0ca4..09fac0e75256 100644
--- a/tests/models/megatron_bert/test_modeling_megatron_bert.py
+++ b/tests/models/megatron_bert/test_modeling_megatron_bert.py
@@ -282,6 +282,8 @@ class MegatronBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.Test
         if is_torch_available()
         else ()
     )
+    # Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
+    all_generative_model_classes = ()
     pipeline_model_mapping = (
         {
             "feature-extraction": MegatronBertModel,
diff --git a/tests/models/modernbert/test_modeling_modernbert.py b/tests/models/modernbert/test_modeling_modernbert.py
index 5cd26b352366..238999da9190 100644
--- a/tests/models/modernbert/test_modeling_modernbert.py
+++ b/tests/models/modernbert/test_modeling_modernbert.py
@@ -29,7 +29,6 @@
     torch_device,
 )
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init, ids_tensor, random_attention_mask
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -216,7 +215,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class ModernBertModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class ModernBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     test_torchscript = False
 
     all_model_classes = (
diff --git a/tests/models/pegasus_x/test_modeling_pegasus_x.py b/tests/models/pegasus_x/test_modeling_pegasus_x.py
index 97451ce766a1..cadacf716ac3 100644
--- a/tests/models/pegasus_x/test_modeling_pegasus_x.py
+++ b/tests/models/pegasus_x/test_modeling_pegasus_x.py
@@ -847,7 +847,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class PegasusXStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+class PegasusXStandaloneDecoderModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (PegasusXDecoder,) if is_torch_available() else ()
     test_pruning = False
     is_encoder_decoder = False
diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
index ef8def3caef2..c5a8cc34c357 100644
--- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
+++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py
@@ -152,6 +152,8 @@ class Qwen2AudioForConditionalGenerationModelTest(ModelTesterMixin, unittest.Tes
     """
 
     all_model_classes = (Qwen2AudioForConditionalGeneration,) if is_torch_available() else ()
+    # Doesn't run generation tests. TODO eustache/joao: some generation tests are broken, the errors seem cache-related
+    all_generative_model_classes = ()
     test_pruning = False
     test_head_masking = False
     _is_composite = True
diff --git a/tests/models/rembert/test_modeling_rembert.py b/tests/models/rembert/test_modeling_rembert.py
index d5f9e0d5eccf..dbacbd150b3b 100644
--- a/tests/models/rembert/test_modeling_rembert.py
+++ b/tests/models/rembert/test_modeling_rembert.py
@@ -373,6 +373,8 @@ class RemBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
         if is_torch_available()
         else ()
     )
+    # Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
+    all_generative_model_classes = ()
    pipeline_model_mapping = (
         {
             "feature-extraction": RemBertModel,
diff --git a/tests/models/roc_bert/test_modeling_roc_bert.py b/tests/models/roc_bert/test_modeling_roc_bert.py
index 2f13664e18b8..16d029ea4896 100644
--- a/tests/models/roc_bert/test_modeling_roc_bert.py
+++ b/tests/models/roc_bert/test_modeling_roc_bert.py
@@ -570,6 +570,8 @@ class RoCBertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
         if is_torch_available()
         else ()
     )
+    # Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
+    all_generative_model_classes = ()
     pipeline_model_mapping = (
         {
             "feature-extraction": RoCBertModel,
diff --git a/tests/models/roformer/test_modeling_roformer.py b/tests/models/roformer/test_modeling_roformer.py
index 7ad8165c8483..abd8cf1dc510 100644
--- a/tests/models/roformer/test_modeling_roformer.py
+++ b/tests/models/roformer/test_modeling_roformer.py
@@ -392,6 +392,8 @@ class RoFormerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
         if is_torch_available()
         else ()
     )
+    # Doesn't run generation tests. There are interface mismatches when using `generate` -- TODO @gante
+    all_generative_model_classes = ()
     pipeline_model_mapping = (
         {
             "feature-extraction": RoFormerModel,
diff --git a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py
index efc384e7051d..f133099ef707 100644
--- a/tests/models/speecht5/test_modeling_speecht5.py
+++ b/tests/models/speecht5/test_modeling_speecht5.py
@@ -362,6 +362,8 @@ def create_and_check_decoder_model_past_large_inputs(self, config, inputs_dict):
 @require_torch
 class SpeechT5ForSpeechToTextTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (SpeechT5ForSpeechToText,) if is_torch_available() else ()
+    # Doesn't run generation tests. TODO eustache/joao: shape checks probably need an update
+    all_generative_model_classes = ()
     is_encoder_decoder = True
     test_pruning = False
     test_headmasking = False
diff --git a/tests/models/table_transformer/test_modeling_table_transformer.py b/tests/models/table_transformer/test_modeling_table_transformer.py
index cbed595f66ff..aa4b7131f949 100644
--- a/tests/models/table_transformer/test_modeling_table_transformer.py
+++ b/tests/models/table_transformer/test_modeling_table_transformer.py
@@ -23,7 +23,6 @@
 from transformers import ResNetConfig, TableTransformerConfig, is_torch_available, is_vision_available
 from transformers.testing_utils import require_timm, require_torch, require_vision, slow, torch_device
 
-from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -189,7 +188,7 @@ def create_and_check_table_transformer_no_timm_backbone(self, config, pixel_valu
 
 
 @require_torch
-class TableTransformerModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class TableTransformerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (
         (
             TableTransformerModel,
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index 9a951c9306d3..be5e465c2bc0 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -3522,7 +3522,7 @@ def create_and_check_model_forward(self, config, inputs_dict, use_weighted_layer
 
 
 @require_torch
-class WhisperEncoderModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+class WhisperEncoderModelTest(ModelTesterMixin, unittest.TestCase):
     all_model_classes = (WhisperForAudioClassification,) if is_torch_available() else ()
     is_encoder_decoder = False
     fx_compatible = False
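Behavior note on the `can_generate` simplification at the top of this patch: defining a bare `generate` method no longer marks a model as generative (the next patch deletes the test case that relied on the removed branch). A small sketch of the new semantics, assuming `transformers` and `torch` are installed; the `DummyBert*` names mirror the ones used in `tests/utils/test_modeling_utils.py`:

from transformers import BertModel, GenerationMixin


class DummyBertWithMixin(BertModel, GenerationMixin):  # inherits the mixin
    pass


class DummyBertWithGenerate(BertModel):  # custom `generate`, no mixin
    def generate(self):
        pass


# Inheriting `GenerationMixin` (directly or through a parent class) still counts...
assert DummyBertWithMixin.can_generate()
# ...but with the branch removed above, a bare custom `generate` should no longer count.
assert not DummyBertWithGenerate.can_generate()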
From 286270a4fbf2788d38294d215d54f6a505e6a10f Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Fri, 14 Feb 2025 10:42:22 +0000
Subject: [PATCH 3/8] update test

---
 tests/utils/test_modeling_utils.py | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/tests/utils/test_modeling_utils.py b/tests/utils/test_modeling_utils.py
index 9bd34168d6ff..72e54d1ef4b7 100644
--- a/tests/utils/test_modeling_utils.py
+++ b/tests/utils/test_modeling_utils.py
@@ -1709,17 +1709,7 @@ class DummyBertWithMixin(BertModel, GenerationMixin):
         self.assertTrue("" == cl.out)
         self.assertTrue(can_generate)
 
-        # 3 - Alternatively, a model can implement a `generate` method
-        class DummyBertWithGenerate(BertModel):
-            def generate(self):
-                pass
-
-        with CaptureLogger(logger) as cl:
-            can_generate = DummyBertWithGenerate.can_generate()
-        self.assertTrue("" == cl.out)
-        self.assertTrue(can_generate)
-
-        # 4 - Finally, it can inherit from a model that can generate
+        # 3 - Finally, it can inherit from a model that can generate
         class DummyBertWithParent(DummyBertWithMixin):
             pass
 
@@ -1728,7 +1718,7 @@ class DummyBertWithParent(DummyBertWithMixin):
         self.assertTrue("" == cl.out)
         self.assertTrue(can_generate)
 
-        # 5 - BC: models with a custom `prepare_inputs_for_generation` can generate (it was assumed they inherited
+        # 4 - BC: models with a custom `prepare_inputs_for_generation` can generate (it was assumed they inherited
         # `GenerationMixin`)
         class DummyBertWithPrepareInputs(BertModel):
             def prepare_inputs_for_generation(self):

From fc634654efc2f6291ad04450a1b394a2980e9cdb Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Fri, 14 Feb 2025 10:52:35 +0000
Subject: [PATCH 4/8] better error msg

---
 tests/test_modeling_common.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index b92d3502460c..13725074b5c6 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -4960,10 +4960,12 @@ def test_generation_tester_mixin_inheritance(self):
             self.assertTrue(
                 issubclass(self.__class__, GenerationTesterMixin),
                 msg=(
-                    "This model can call `generate`, so one of two things must happen: 1) the tester must inherit "
-                    "from `GenerationTesterMixin` to run `generate` tests, or 2) the model doesn't fully support the "
-                    "original `generate` or has a custom `generate`, the tester must set "
-                    "`all_generative_model_classes = ()`."
+                    "This model can call `generate` from `GenerationMixin`, so one of two things must happen: 1) the "
+                    "tester must inherit from `GenerationTesterMixin` to run `generate` tests, or 2) if the model "
+                    "doesn't fully support the original `generate` or has a custom `generate`, the tester must "
+                    "overwrite `all_generative_model_classes` to skip the failing classes (make sure to comment "
+                    "why). If `all_generative_model_classes` is overwritten as `()`, then we need to remove the "
+                    "`GenerationTesterMixin` inheritance -- no `generate` tests are being run anyway."
                 ),
             )
         else:
From 92d65cfc386177f7eb014fccf4463b2f940122f5 Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Fri, 14 Feb 2025 11:00:52 +0000
Subject: [PATCH 5/8] musicgen needs partial inheritance

---
 .../models/musicgen/test_modeling_musicgen.py | 23 +++++++++++++++++--
 .../test_modeling_musicgen_melody.py          | 23 +++++++++++++++++--
 tests/test_modeling_common.py                 |  8 +++----
 3 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py
index 69072af46f3e..2e105f69fd79 100644
--- a/tests/models/musicgen/test_modeling_musicgen.py
+++ b/tests/models/musicgen/test_modeling_musicgen.py
@@ -45,6 +45,7 @@
 )
 from transformers.utils import cached_property, is_torch_bf16_available_on_device, is_torch_fp16_available_on_device
 
+from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, sdpa_kernel
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -174,7 +174,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class MusicgenDecoderTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (MusicgenModel, MusicgenForCausalLM) if is_torch_available() else ()
     # Doesn't run generation tests. See `greedy_sample_model_classes` below
     all_generative_model_classes = ()
@@ -671,6 +672,15 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol):
 
         self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
 
+    @unittest.skip(
+        reason=(
+            "MusicGen has a custom set of generation tests that rely on `GenerationTesterMixin`, controlled by "
+            "`greedy_sample_model_classes`"
+        )
+    )
+    def test_generation_tester_mixin_inheritance(self):
+        pass
+
 
 def prepare_musicgen_inputs_dict(
     config,
@@ -801,7 +810,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class MusicgenTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (MusicgenForConditionalGeneration,) if is_torch_available() else ()
     # Doesn't run generation tests. See `greedy_sample_model_classes` below
     all_generative_model_classes = ()
@@ -1762,6 +1772,15 @@ def test_requires_grad_with_frozen_encoders(self):
         self.assertTrue(all(audio_encoder_grads))
         self.assertFalse(all(text_encoder_grads))
 
+    @unittest.skip(
+        reason=(
+            "MusicGen has a custom set of generation tests that rely on `GenerationTesterMixin`, controlled by "
+            "`greedy_sample_model_classes`"
+        )
+    )
+    def test_generation_tester_mixin_inheritance(self):
+        pass
+
 
 def get_bip_bip(bip_duration=0.125, duration=0.5, sample_rate=32000):
     """Produces a series of 'bip bip' sounds at a given frequency."""
diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
index fab3979a65e1..57f6757a147d 100644
--- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
+++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py
@@ -49,6 +49,7 @@
 )
 from transformers.utils import cached_property, is_torch_bf16_available_on_device, is_torch_fp16_available_on_device
 
+from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor, sdpa_kernel
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -180,7 +180,7 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class MusicgenMelodyDecoderTest(ModelTesterMixin, unittest.TestCase):
+class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
     all_model_classes = (MusicgenMelodyModel, MusicgenMelodyForCausalLM) if is_torch_available() else ()
     # Doesn't run generation tests. See `greedy_sample_model_classes` below
     all_generative_model_classes = ()
@@ -688,6 +689,15 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol):
 
         self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
 
+    @unittest.skip(
+        reason=(
+            "MusicGen has a custom set of generation tests that rely on `GenerationTesterMixin`, controlled by "
+            "`greedy_sample_model_classes`"
+        )
+    )
+    def test_generation_tester_mixin_inheritance(self):
+        pass
+
 
 def prepare_musicgen_melody_inputs_dict(
     config,
@@ -819,7 +829,7 @@ def prepare_config_and_inputs_for_common(self):
 
 @require_torch
 # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenTest with Musicgen->MusicgenMelody, musicgen->musicgen_melody, EncoderDecoder->DecoderOnly, input_values->input_features
-class MusicgenMelodyTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (MusicgenMelodyForConditionalGeneration,) if is_torch_available() else ()
     # Doesn't run generation tests. See `greedy_sample_model_classes` below
     all_generative_model_classes = ()
@@ -1740,6 +1750,15 @@ def test_requires_grad_with_frozen_encoders(self):
         self.assertTrue(all(audio_encoder_grads))
         self.assertFalse(all(text_encoder_grads))
 
+    @unittest.skip(
+        reason=(
+            "MusicGen has a custom set of generation tests that rely on `GenerationTesterMixin`, controlled by "
+            "`greedy_sample_model_classes`"
+        )
+    )
+    def test_generation_tester_mixin_inheritance(self):
+        pass
+
 
 # Copied from tests.models.musicgen.test_modeling_musicgen.get_bip_bip
 def get_bip_bip(bip_duration=0.125, duration=0.5, sample_rate=32000):
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 13725074b5c6..7157a0099bc8 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -4962,10 +4962,10 @@ def test_generation_tester_mixin_inheritance(self):
                 msg=(
                     "This model can call `generate` from `GenerationMixin`, so one of two things must happen: 1) the "
                     "tester must inherit from `GenerationTesterMixin` to run `generate` tests, or 2) if the model "
-                    "doesn't fully support the original `generate` or has a custom `generate`, the tester must "
-                    "overwrite `all_generative_model_classes` to skip the failing classes (make sure to comment "
-                    "why). If `all_generative_model_classes` is overwritten as `()`, then we need to remove the "
-                    "`GenerationTesterMixin` inheritance -- no `generate` tests are being run anyway."
+                    "doesn't fully support the original `generate` or has a custom `generate` with partial feature "
+                    "support, the tester must overwrite `all_generative_model_classes` to skip the failing classes "
+                    "(make sure to comment why). If `all_generative_model_classes` is overwritten as `()`, then we "
+                    "need to remove the `GenerationTesterMixin` inheritance -- no `generate` tests are being run."
                 ),
             )
         else:
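The MusicGen testers land on a third, hybrid shape: keep `GenerationTesterMixin` for the helpers their custom greedy/sample tests call, keep `all_generative_model_classes` empty so the stock `generate` tests stay off, and skip the new inheritance check. A condensed sketch of the pattern -- the tester name and `greedy_sample_model_classes` contents are hypothetical, the skip reason is quoted from the diff, and the import path assumes the repository root:

import unittest

from tests.generation.test_utils import GenerationTesterMixin


class DummyMusicgenLikeTest(GenerationTesterMixin, unittest.TestCase):
    # Doesn't run generation tests. See `greedy_sample_model_classes` below
    all_generative_model_classes = ()
    # The custom generation tests run `GenerationTesterMixin` helpers over these classes instead.
    greedy_sample_model_classes = ()  # e.g. (MusicgenForCausalLM,) in the real tester

    @unittest.skip(
        reason=(
            "MusicGen has a custom set of generation tests that rely on `GenerationTesterMixin`, controlled by "
            "`greedy_sample_model_classes`"
        )
    )
    def test_generation_tester_mixin_inheritance(self):
        pass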
From d2c28598314f5f5266f6b31813f81bef85239d0d Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Fri, 14 Feb 2025 11:45:59 +0000
Subject: [PATCH 6/8] seamlessm4t

---
 .../models/seamless_m4t/modeling_seamless_m4t.py        | 8 ++++----
 tests/models/seamless_m4t/test_modeling_seamless_m4t.py | 2 ++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
index d8b52cf63700..04dc8ed7d106 100755
--- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
+++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
@@ -2912,7 +2912,7 @@ def _reorder_cache(past_key_values, beam_idx):
     "The speech-to-text SeamlessM4T Model transformer which can be used for S2TT.",
     SEAMLESS_M4T_START_DOCSTRING,
 )
-class SeamlessM4TForSpeechToText(SeamlessM4TPreTrainedModel):
+class SeamlessM4TForSpeechToText(SeamlessM4TPreTrainedModel, GenerationMixin):
     _keys_to_ignore_on_load_missing = ["text_decoder", "t2u_model", "vocoder"]
     main_input_name = "input_features"
 
@@ -3182,7 +3182,7 @@ def _reorder_cache(past_key_values, beam_idx):
     "The text-to-speech SeamlessM4T Model transformer which can be used for T2ST.",
     SEAMLESS_M4T_START_DOCSTRING,
 )
-class SeamlessM4TForTextToSpeech(SeamlessM4TPreTrainedModel):
+class SeamlessM4TForTextToSpeech(SeamlessM4TPreTrainedModel, GenerationMixin):
     _keys_to_ignore_on_load_missing = ["speech_encoder"]
     main_input_name = "input_ids"
 
@@ -3511,7 +3511,7 @@ def _reorder_cache(past_key_values, beam_idx):
     "The speech-to-speech SeamlessM4T Model transformer which can be used for S2ST.",
     SEAMLESS_M4T_START_DOCSTRING,
 )
-class SeamlessM4TForSpeechToSpeech(SeamlessM4TPreTrainedModel):
+class SeamlessM4TForSpeechToSpeech(SeamlessM4TPreTrainedModel, GenerationMixin):
     _keys_to_ignore_on_load_missing = ["text_encoder"]
     main_input_name = "input_features"
 
@@ -3854,7 +3854,7 @@ def _reorder_cache(past_key_values, beam_idx):
         Default modality. Used to initialize the model.
     """,
 )
-class SeamlessM4TModel(SeamlessM4TPreTrainedModel):
+class SeamlessM4TModel(SeamlessM4TPreTrainedModel, GenerationMixin):
     _tied_weights_keys = [
         "lm_head.weight",
         "text_encoder.embed_tokens.weight",
diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
index 7b7fdcb585e7..558c9f7e4a54 100644
--- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
+++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
@@ -357,6 +357,8 @@ class SeamlessM4TModelWithSpeechInputTest(ModelTesterMixin, unittest.TestCase):
         if is_torch_available()
         else ()
     )
+    # Doesn't run generation tests. Custom generation method with a different interface
+    all_generative_model_classes = ()
 
     def setUp(self):
         self.model_tester = SeamlessM4TModelTester(self, input_modality="speech")

From e97104993124e23ca224d50da27778be18215280 Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Fri, 14 Feb 2025 12:04:10 +0000
Subject: [PATCH 7/8] seamlessm4t v2

---
 .../models/seamless_m4t_v2/modeling_seamless_m4t_v2.py | 8 ++++----
 .../seamless_m4t_v2/test_modeling_seamless_m4t_v2.py   | 2 ++
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
index da9a4ecdadb4..3bf8edd2a68b 100644
--- a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
+++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
@@ -3192,7 +3192,7 @@ def _reorder_cache(past_key_values, beam_idx):
     "The speech-to-text SeamlessM4Tv2 Model transformer which can be used for S2TT.",
     SEAMLESS_M4T_V2_START_DOCSTRING,
 )
-class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel):
+class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
     _keys_to_ignore_on_load_missing = ["text_decoder", "t2u_model", "vocoder"]
     main_input_name = "input_features"
 
@@ -3473,7 +3473,7 @@ def _reorder_cache(past_key_values, beam_idx):
     "The text-to-speech SeamlessM4Tv2 Model transformer which can be used for T2ST.",
     SEAMLESS_M4T_V2_START_DOCSTRING,
 )
-class SeamlessM4Tv2ForTextToSpeech(SeamlessM4Tv2PreTrainedModel):
+class SeamlessM4Tv2ForTextToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
     _keys_to_ignore_on_load_missing = ["speech_encoder"]
     main_input_name = "input_ids"
 
@@ -3844,7 +3844,7 @@ def _reorder_cache(past_key_values, beam_idx):
     "The speech-to-speech SeamlessM4Tv2 Model transformer which can be used for S2ST.",
     SEAMLESS_M4T_V2_START_DOCSTRING,
 )
-class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel):
+class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
     _keys_to_ignore_on_load_missing = ["text_encoder"]
     main_input_name = "input_features"
 
@@ -4229,7 +4229,7 @@ def _reorder_cache(past_key_values, beam_idx):
         This will be updated automatically according to the modality passed to the forward and generate passes (`input_ids` for text and `input_features` for audio).
     """,
 )
-class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel):
+class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
     _tied_weights_keys = [
         "lm_head.weight",
         "text_encoder.embed_tokens.weight",
diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
index 0590f6a2c151..ecf5363fd821 100644
--- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
+++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
@@ -373,6 +373,8 @@ class SeamlessM4Tv2ModelWithSpeechInputTest(ModelTesterMixin, unittest.TestCase)
         if is_torch_available()
         else ()
     )
+    # Doesn't run generation tests. Has custom generation method with a different interface
+    all_generative_model_classes = ()
 
     def setUp(self):
         self.model_tester = SeamlessM4Tv2ModelTester(self, input_modality="speech")

From e90c38975fba281aa2e58269977bf732e5684916 Mon Sep 17 00:00:00 2001
From: Joao Gante
Date: Fri, 14 Feb 2025 15:13:24 +0000
Subject: [PATCH 8/8] add copied from

---
 tests/generation/test_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 6c6cfc48996a..cc446ae3d592 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -2756,6 +2756,7 @@ def test_speculative_sampling_target_distribution(self):
 global_rng = random.Random()
 
 
+# Copied from tests.test_modeling_common.ids_tensor
 def ids_tensor(shape, vocab_size, rng=None, name=None):
     # Creates a random int32 tensor of the shape within the vocab size
     if rng is None:
@@ -2772,6 +2773,7 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
     return torch.tensor(data=values, dtype=torch.long, device=torch_device).view(shape).contiguous()
 
 
+# Copied from tests.test_modeling_common.floats_tensor
 def floats_tensor(shape, scale=1.0, rng=None, name=None):
     """Creates a random float32 tensor"""
     if rng is None: