From 6eefb942d1853180a60861d9f6322347e4fceb1e Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Fri, 25 Oct 2024 17:23:54 +0000 Subject: [PATCH 01/16] tmp commit --- src/transformers/generation/utils.py | 8 +- tests/generation/test_utils.py | 449 +++++++++++++++--- tests/models/moshi/test_modeling_moshi.py | 4 +- .../models/musicgen/test_modeling_musicgen.py | 46 -- tests/test_modeling_common.py | 424 ----------------- 5 files changed, 380 insertions(+), 551 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index efe953db051c..705857363415 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -378,10 +378,14 @@ def prepare_inputs_for_generation( # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. + # (we can't check exception 3 while compiling) if past_key_values is not None: model_inputs["past_key_values"] = past_key_values - if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]: # Exception 1 or Exception 3 + if ( + inputs_embeds is not None # Exception 1 + or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1]) # Exception 3 + ): input_ids = input_ids[:, -cache_position.shape[0] :] elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) input_ids = input_ids[:, cache_position] diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 6f2eaf734df1..8afa1c12eda4 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -17,20 +17,26 @@ import copy import gc import inspect +import os import tempfile +import time import unittest import warnings import numpy as np import pytest +from packaging import version from parameterized import parameterized from transformers import AutoConfig, is_torch_available, pipeline, set_seed from transformers.testing_utils import ( is_flaky, require_accelerate, + require_flash_attn, require_optimum_quanto, + require_read_token, require_torch, + require_torch_accelerator, require_torch_gpu, require_torch_multi_accelerator, require_torch_multi_gpu, @@ -136,6 +142,34 @@ def prepare_config_and_inputs_for_generate(self, batch_size=2): return config, filtered_inputs_dict + def _check_similar_generate_outputs(self, output_1, output_2, atol=1e-5, rtol=1e-5): + """ + Checks whether a pair of generate outputs are similar. Two `generate` call outputs are considered similar in + the following siturations: + 1. The sequences are the same + 2. 
The sequences are different, but the scores up until the first mismatch are nearly identical + """ + # scores doesn't include data regarding decoder input tokens + decoder_input_length = output_1.sequences.shape[1] - len(output_1.scores) + output_matches = output_1.sequences == output_2.sequences + has_matching_outputs = output_matches.all() + has_matching_scores = None + if not has_matching_outputs: + for batch_idx in range(output_1.sequences.shape[0]): + batch_matches = output_matches[batch_idx] + if batch_matches.all(): + continue + first_mismatch_idx = batch_matches.int().argmin() # gets the index of the first False + first_mismatch_idx -= decoder_input_length + output_1_first_mismatch_scores = output_1.scores[first_mismatch_idx][batch_idx] + output_2_first_mismatch_scores = output_2.scores[first_mismatch_idx][batch_idx] + has_matching_scores = torch.allclose( + output_1_first_mismatch_scores, output_2_first_mismatch_scores, rtol=atol, atol=rtol + ) + if not has_matching_scores: + break + self.assertTrue(has_matching_outputs or has_matching_scores) + def _get_logits_processor_kwargs(self, do_sample=False, config=None): logits_processor_kwargs = { "bad_words_ids": [[1, 0]], @@ -426,7 +460,6 @@ def test_greedy_generate(self): def test_greedy_generate_dict_outputs(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._greedy_generate( @@ -453,13 +486,12 @@ def test_greedy_generate_dict_outputs(self): # Retrocompatibility check self.assertIsInstance(output_generate, GreedySearchDecoderOnlyOutput) - self._check_outputs(output_generate, main_input, model.config) + self._check_outputs(output_generate, model.config) @pytest.mark.generate def test_greedy_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -486,7 +518,7 @@ def test_greedy_generate_dict_outputs_use_cache(self): output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] ) - self._check_outputs(output_generate, main_input, model.config, use_cache=True) + self._check_outputs(output_generate, model.config, use_cache=True) @pytest.mark.generate def test_sample_generate(self): @@ -505,7 +537,6 @@ def test_sample_generate(self): def test_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._sample_generate( @@ -533,7 +564,7 @@ def test_sample_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, SampleDecoderOnlyOutput) - self._check_outputs(output_generate, main_input, model.config, num_return_sequences=2) + self._check_outputs(output_generate, model.config, num_return_sequences=2) @pytest.mark.generate def test_beam_search_generate(self): @@ -554,7 +585,6 @@ def test_beam_search_generate(self): def test_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = 
self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -582,15 +612,12 @@ def test_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] - ) + self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) @pytest.mark.generate def test_beam_search_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -622,11 +649,7 @@ def test_beam_search_generate_dict_outputs_use_cache(self): ) self._check_outputs( - output_generate, - main_input, - model.config, - use_cache=True, - num_return_sequences=beam_kwargs["num_beams"], + output_generate, model.config, use_cache=True, num_return_sequences=beam_kwargs["num_beams"] ) @require_accelerate @@ -698,7 +721,6 @@ def test_beam_sample_generate(self): def test_beam_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -729,7 +751,7 @@ def test_beam_sample_generate_dict_output(self): self.assertIsInstance(output_generate, BeamSampleDecoderOnlyOutput) self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] + output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"] ) @pytest.mark.generate @@ -788,7 +810,6 @@ def test_group_beam_search_generate(self): def test_group_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_diverse_beam_kwargs() @@ -816,9 +837,7 @@ def test_group_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] - ) + self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) # TODO: @gante check why it is flaky @is_flaky() @@ -921,9 +940,7 @@ def test_constrained_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] - ) + self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) @pytest.mark.generate def test_contrastive_generate(self): @@ -965,7 +982,6 @@ def test_contrastive_generate_dict_outputs_use_cache(self): self.skipTest(reason="Won't fix: old model with different cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = 
inputs_dict[model_class.main_input_name] # NOTE: contrastive search only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -991,7 +1007,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self): output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] ) - self._check_outputs(output_generate, main_input, model.config, use_cache=True) + self._check_outputs(output_generate, model.config, use_cache=True) @pytest.mark.generate def test_contrastive_generate_low_memory(self): @@ -1087,14 +1103,10 @@ def test_beam_search_low_memory(self): @pytest.mark.generate @parameterized.expand([("random",), ("same",)]) - @is_flaky() # Read NOTE (1) below. If there are API issues, all attempts will fail. def test_assisted_decoding_matches_greedy_search(self, assistant_type): # This test ensures that the assisted generation does not introduce output changes over greedy search. - # NOTE (1): The sentence above is true most of the time, there is a tiny difference in the logits due to matmul - # shape differences -- and it may result in a different output. The input shape difference happens in the - # main model, that runs the forward pass with several candidates at once (as opposed to generating one token at - # a time). See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 for more info. - # NOTE (2): It breaks the pattern in the tests above, for multiple reasons: + # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 for more info. + # NOTE: It breaks the pattern in the tests above, for multiple reasons: # - assisted_decoding, contrarily to the other methods, can't be called on its own (e.g. needs to # prepare the assistant encoder outputs in the main generate body); # - assisted_decoding does not support `use_cache = False` @@ -1123,7 +1135,6 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1164,12 +1175,10 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): output_assisted = model.generate(**generation_kwargs, **inputs_dict) # The two outputs must match and their shape must be as expected - - self.assertListEqual(output_greedy.sequences.tolist(), output_assisted.sequences.tolist()) + self._check_similar_generate_outputs(output_greedy, output_assisted) for output in (output_greedy, output_assisted): - self._check_outputs(output, main_input, model.config, use_cache=True) + self._check_outputs(output, model.config, use_cache=True) - @is_flaky() @pytest.mark.generate def test_prompt_lookup_decoding_matches_greedy_search(self): # This test ensures that the prompt lookup generation does not introduce output changes over greedy search. @@ -1198,7 +1207,6 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -1231,10 +1239,9 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): output_prompt_lookup = model.generate(**generation_kwargs, **inputs_dict) # The two outputs must match and their shape must be as expected - - self.assertListEqual(output_greedy.sequences.tolist(), output_prompt_lookup.sequences.tolist()) + self._check_similar_generate_outputs(output_greedy, output_prompt_lookup) for output in (output_greedy, output_prompt_lookup): - self._check_outputs(output, main_input, model.config, use_cache=True) + self._check_outputs(output, model.config, use_cache=True) @pytest.mark.generate def test_dola_decoding_sample(self): @@ -1254,7 +1261,6 @@ def test_dola_decoding_sample(self): # enable cache if the model is not openai-gpt, xlnet, cpm, or xlm config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] # Encoder-decoder models are not supported if config.is_encoder_decoder: @@ -1282,7 +1288,7 @@ def test_dola_decoding_sample(self): "dola_layers": "low", } output_dola = model.generate(**generation_kwargs, **inputs_dict) - self._check_outputs(output_dola, main_input, model.config, use_cache=getattr(config, "use_cache", False)) + self._check_outputs(output_dola, model.config, use_cache=getattr(config, "use_cache", False)) @pytest.mark.generate def test_assisted_decoding_sample(self): @@ -1312,7 +1318,6 @@ def test_assisted_decoding_sample(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1344,7 +1349,7 @@ def test_assisted_decoding_sample(self): } output_assisted = model.generate(**generation_kwargs, **inputs_dict) - self._check_outputs(output_assisted, main_input, config, use_cache=True) + self._check_outputs(output_assisted, config, use_cache=True) @pytest.mark.generate def test_prompt_lookup_decoding_stops_at_eos(self): @@ -1931,12 +1936,13 @@ def test_generate_with_quant_cache(self): @pytest.mark.generate @require_torch_gpu @slow - @is_flaky() # compilation may result in equivalent (!= same) FP ops, causing the argmax in `generate` to be flaky def test_generate_compile_fullgraph(self): """ Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. ⚠️ Runs two sequential generations to ensure the cache doesn't get stuck after the first compiled run! 
⚠️ """ + # TODO (@joao): end-to-end compilation is broken, fix me :( + for model_class in self.all_generative_model_classes: if not model_class._supports_static_cache: self.skipTest("This model doesn't support static cache") @@ -1961,6 +1967,8 @@ def test_generate_compile_fullgraph(self): generation_kwargs = { "do_sample": False, "max_new_tokens": 10, + "return_dict_in_generate": True, + "output_scores": True, } max_cache_len = input_ids.shape[1] + generation_kwargs["max_new_tokens"] @@ -1981,7 +1989,7 @@ def test_generate_compile_fullgraph(self): output_compiled = compiled_generate( model_inputs, generation_config=generation_config, past_key_values=past_key_values ) - self.assertListEqual(output_dynamic.tolist(), output_compiled.tolist()) + self._check_similar_generate_outputs(output_dynamic, output_compiled) @pytest.mark.generate def test_generate_methods_with_num_logits_to_keep(self): @@ -2009,7 +2017,6 @@ def test_generate_methods_with_num_logits_to_keep(self): self.assertEqual(with_all_logits.tolist(), without_all_logits.tolist()) @pytest.mark.generate - @is_flaky() # assisted generation tests are flaky (minor fp ops differences) def test_assisted_decoding_with_num_logits_to_keep(self): for model_class in self.all_generative_model_classes: if "num_logits_to_keep" not in set(inspect.signature(model_class.forward).parameters.keys()): @@ -2030,6 +2037,8 @@ def test_assisted_decoding_with_num_logits_to_keep(self): "max_new_tokens": 10, "do_sample": False, "assistant_model": assistant_model, + "return_dict_in_generate": True, + "output_scores": True, } assistant_model.generation_config.assistant_confidence_threshold = None @@ -2037,7 +2046,8 @@ def test_assisted_decoding_with_num_logits_to_keep(self): with_all_logits = model.generate(**generation_kwargs, **inputs_dict, num_logits_to_keep=0) # By default, num_logits_to_keep is automatically set to 1 if not provided (new behavior) without_all_logits = model.generate(**inputs_dict, **generation_kwargs) - self.assertEqual(with_all_logits.tolist(), without_all_logits.tolist()) + + self._check_similar_generate_outputs(with_all_logits, without_all_logits) @pytest.mark.generate def test_inherits_generation_mixin(self): @@ -2048,9 +2058,11 @@ def test_inherits_generation_mixin(self): for model_class in self.all_generative_model_classes: self.assertTrue("GenerationMixin" in str(model_class.__bases__)) + @pytest.mark.generate @require_torch_sdpa @slow def test_eager_matches_sdpa_generate(self): + """Tests that generate has equivalent outputs with SDPA (default) and eager attention implementations.""" max_new_tokens = 30 for model_class in self.all_generative_model_classes: @@ -2082,6 +2094,7 @@ def test_eager_matches_sdpa_generate(self): "do_sample": False, "return_dict_in_generate": True, "output_scores": True, + "use_cache": True, } model_sdpa = model_class.from_pretrained( @@ -2103,35 +2116,317 @@ def test_eager_matches_sdpa_generate(self): del model_eager gc.collect() - # Eager and SDPA are very similar, but not exactly the same. Because we are using random models, this - # test would be flaky if we only checked the sequences. Two situations in which this test passes: - # 1. The sequences are the same - # 2. 
The sequences are different, but the scores up until the first mismatch are nearly identical - output_matches = res_eager.sequences == res_sdpa.sequences - has_matching_outputs = output_matches.all() - has_matching_scores = None - if not has_matching_outputs: - input_length = main_input.shape[1] - for batch_idx in range(res_eager.sequences.shape[0]): - batch_matches = output_matches[batch_idx] - if batch_matches.all(): - continue - first_mismatch_idx = batch_matches.int().argmin() # gets the index of the first False - first_mismatch_idx -= input_length # scores doesn't include data regarding input tokens - sdpa_first_mismatch_scores = res_sdpa.scores[first_mismatch_idx][batch_idx] - eager_first_mismatch_scores = res_eager.scores[first_mismatch_idx][batch_idx] - has_matching_scores = torch.allclose( - sdpa_first_mismatch_scores, eager_first_mismatch_scores, rtol=1e-3, atol=1e-3 - ) - if not has_matching_scores: - break + self._check_similar_generate_outputs(res_eager, res_sdpa, atol=1e-3, rtol=1e-3) + + @pytest.mark.flash_attn_test + @require_flash_attn + @require_torch_gpu + @slow + def test_flash_attn_2_generate_matches_default(self): + """Tests that generate has equivalent outputs with SDPA (default) and FA2 attention implementations.""" + # TODO (@joao @raushan) -- this test is failing the output checks on most models, investigate. After fixing, + # check whether we still need the overwrites + + max_new_tokens = 30 + + for model_class in self.all_generative_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, original_inputs_dict = self.prepare_config_and_inputs_for_generate() + inputs_dict = {} + for input_name, input_data in original_inputs_dict.items(): + if isinstance(input_data, torch.Tensor) and input_data.dtype in [torch.float32, torch.bfloat16]: + inputs_dict[input_name] = input_data.to(torch.float16) + else: + inputs_dict[input_name] = input_data + main_input = inputs_dict[model_class.main_input_name] + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + main_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + generate_kwargs = { + "max_new_tokens": max_new_tokens, + "do_sample": False, + "return_dict_in_generate": True, + "output_scores": True, + "use_cache": True, + } + + model = model.to(device=torch_device, dtype=torch.float16) + res_sdpa = model.generate(**inputs_dict, **generate_kwargs) + del model + gc.collect() + + model_fa2 = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + res_fa2 = model_fa2.generate(**inputs_dict, **generate_kwargs) + del model_fa2 + gc.collect() + + self._check_similar_generate_outputs(res_fa2, res_sdpa, atol=1e-3, rtol=1e-3) + + @pytest.mark.generate + def test_generate_reuse_cache(self): + """Tests that we can pass a cache out of the `generate`, and use it to continue generation""" + max_new_tokens = 2 + for model_class in self.all_generative_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + inputs_dict.pop("attention_mask", None) # Let's rely on `attention_mask=None` in this test, for simplicity + + if not hasattr(config, "use_cache"): + self.skipTest(reason=f"{model_class.__name__} 
doesn't support caching") + + model = model_class(config).to(torch_device) + + # base generate + generate_kwargs = { + "max_new_tokens": max_new_tokens, + "do_sample": False, + "return_dict_in_generate": True, + "output_scores": True, + "use_cache": True, + } + base_outputs = model.generate(**inputs_dict, **generate_kwargs) + + # generate with cache reuse + generate_kwargs["max_new_tokens"] = 1 + cache_reuse_outputs = model.generate(**inputs_dict, **generate_kwargs) + if not config.is_encoder_decoder: + inputs_dict["input_ids"] = cache_reuse_outputs.sequences + else: + inputs_dict["decoder_input_ids"] = cache_reuse_outputs.sequences + cache_reuse_outputs = model.generate( + **inputs_dict, **generate_kwargs, past_key_values=cache_reuse_outputs.past_key_values + ) + + self._check_similar_generate_outputs(base_outputs, cache_reuse_outputs) + + # @pytest.mark.generate + # def test_inputs_embeds_matches_input_ids_with_generate(self): + # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + # for model_class in self.all_model_classes: + # if model_class.__name__ not in [ + # *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), + # *get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES), + # ]: + # continue + + # model = model_class(config) + # model.to(torch_device) + # model.eval() + + # model_forward_args = inspect.signature(model.forward).parameters + # if any(argument not in model_forward_args for argument in ["inputs_embeds", "position_ids"]): + # self.skipTest(reason="This model doesn't use `inputs_embeds` or `position_ids`.") + # has_inputs_embeds_forwarding = "inputs_embeds" in set( + # inspect.signature(model.prepare_inputs_for_generation).parameters.keys() + # ) + # if not has_inputs_embeds_forwarding: + # self.skipTest(reason="This model doesn't support `inputs_embeds` passed to `generate`.") + # inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + # pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 + + # # VLMs can't generate with embeds and pixels at the same time. 
We expect the user to pass merged + # # embeds already + # if model_class.__name__ in get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES): + # inputs.pop("pixel_values", None) + # inputs.pop("pixel_values_videos", None) + # inputs.pop("pixel_values_images", None) + + # wte = model.get_input_embeddings() + # if not self.is_encoder_decoder: + # input_ids = inputs["input_ids"] + # # some models infer position ids/attn mask differently when input ids + # # by check if pad_token let's make sure no padding is in input ids + # not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 + # input_ids[input_ids == pad_token_id] = not_pad_token_id + # del inputs["input_ids"] + # inputs_embeds = wte(input_ids) + # out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2)[:, -2:] + # out_embeds = model.generate(inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) + # else: + # encoder_input_ids = inputs["input_ids"] + # decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + # encoder_input_ids[encoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) + # decoder_input_ids[decoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) + # del inputs["input_ids"] + # inputs.pop("decoder_input_ids", None) + # inputs_embeds = wte(encoder_input_ids) + # decoder_inputs_embeds = wte(decoder_input_ids) + # out_ids = model.generate( + # input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, **inputs, max_new_tokens=2 + # )[:, -2:] + # out_embeds = model.generate( + # inputs_embeds=inputs_embeds, + # decoder_inputs_embeds=decoder_inputs_embeds, + # **inputs, + # max_new_tokens=2, + # ) + # # NOTE: this test changes the order of FP ops, there may be tiny differences in the output + # number_of_different_tokens = (out_ids != out_embeds).sum() + # max_differences = int(out_ids.shape[0] * out_ids.shape[1] * 0.1) + # self.assertTrue(number_of_different_tokens <= max_differences) # accept up to 10% mismatch + + @pytest.mark.generate + def test_static_cache_matches_dynamic(self): + """ + Tests that generating with static cache give almost same results as with dynamic cache. + This test does not compile the model and check only logits similarity for numerical precision + errors. 
+ """ + if len(self.all_generative_model_classes) == 0: + self.skipTest( + reason="Model architecture has no generative classes, and thus not necessarily supporting 4D masks" + ) + for model_class in self.all_generative_model_classes: + if not model_class._supports_static_cache: + self.skipTest(f"{model_class.__name__} does not support static cache") + + if not model_class._supports_cache_class: + self.skipTest(f"{model_class.__name__} does not support cache class") + + config, inputs = self.model_tester.prepare_config_and_inputs_for_common() + if getattr(config, "sliding_window", 0) is not None and getattr(config, "sliding_window", 0) > 0: + self.skipTest(f"{model_class.__name__} with sliding window attention is not supported by this test") + + model = model_class(config).to(device=torch_device, dtype=torch.float32) + model.eval() + + dynamic_out = model.generate( + **inputs, do_sample=False, max_new_tokens=10, output_logits=True, return_dict_in_generate=True + ) + static_out = model.generate( + **inputs, + do_sample=False, + max_new_tokens=10, + cache_implementation="static", + output_logits=True, + return_dict_in_generate=True, + ) + self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-4)) + + # For now, Let's focus only on GPU for `torch.compile` + @slow + @require_torch_accelerator + @require_read_token + @pytest.mark.generate + def test_torch_compile(self): + if version.parse(torch.__version__) < version.parse("2.3"): + self.skipTest(reason="This test requires torch >= 2.3 to run.") + torch.compiler.reset() + if not hasattr(self, "_torch_compile_test_ckpt"): + self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") + ckpt = self._torch_compile_test_ckpt + revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision + + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + batch_size = 1 + n_iter = 3 + + tokenizer = AutoTokenizer.from_pretrained(ckpt) + if self.is_encoder_decoder: + model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( + torch_device + ) + else: + model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( + torch_device + ) + + model.generation_config.max_new_tokens = 4 + + model.generation_config.cache_implementation = "static" + model.forward = torch.compile(model.forward, mode="reduce-overhead") + + input_text = "Why dogs are cute?" + input_ids = tokenizer([input_text] * batch_size, return_tensors="pt").to(torch_device) + + for i in range(n_iter): + _ = model.generate(**input_ids, do_sample=False) + + @slow + @require_torch_gpu # Testing cuda graphs. + @require_read_token + @pytest.mark.generate + def test_compile_cuda_graph_time(self): + if version.parse(torch.__version__) < version.parse("2.3"): + self.skipTest(reason="This test requires torch >= 2.3 to run.") + + # TODO felix: All models supporting `StaticCache` or `torch.compile` should be tested. + # At the moment, only llama, gemma and gemma2 are tested here! 
+ if not hasattr(self, "_torch_compile_test_ckpt"): + self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") + ckpt = self._torch_compile_test_ckpt + revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision + + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + tokenizer = AutoTokenizer.from_pretrained(ckpt) + if self.is_encoder_decoder: + model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( + torch_device + ) + else: + model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( + torch_device + ) + + cache_implementation = "static" + if model.config.model_type == "gemma2": + cache_implementation = "hybrid" + + new_tokens = 50 + gen_config = GenerationConfig( + max_new_tokens=new_tokens, + min_new_tokens=new_tokens, + use_cache=True, + pad_token_id=tokenizer.pad_token_id, + num_beams=1, + do_sample=False, + eos_token_id=None, # This is required for min_new_tokens to actually have an effect. + ) + model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect. + + model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) + + inp = tokenizer("Why cats are cute?", return_tensors="pt").to(torch_device) + + # First run: the first run warms up each graph, which does things like CuBlas or Triton benchmarking + start = time.perf_counter() + _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) + end = time.perf_counter() + graph_warmup_time = end - start + + # Second run: CUDA Graph recording, and replays it + start = time.perf_counter() + _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) + end = time.perf_counter() + record_time = end - start + + # Finally: we hit the optimized, CUDA Graph replay path + start = time.perf_counter() + _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) + end = time.perf_counter() + opt_time = end - start - self.assertTrue(has_matching_outputs or has_matching_scores) + # For the recording step, we expect only two cuda graphs and this step should be much faster than the first. 
+ self.assertTrue(record_time < 0.15 * graph_warmup_time) + self.assertTrue(opt_time < record_time) - def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): - # we can be sure what is batch size from main input but seq length depends on model type and whether input is text/audio/image - # so we infer actual text seq length from model_tester, same was as it is done in `test_modeling_common.py` tests` - batch_size = main_input.shape[0] + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1): + batch_size = output.sequences.shape[0] seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index dd9302ee2c55..872ecd21a32b 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -591,9 +591,9 @@ def _check_hidden_states_for_generate( [expected_shape] * len(iter_hidden_states), ) - def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1): # Overwrite because the generate method actually alway uses `inputs_embeds` so `use_cache` is always `True` - super()._check_outputs(output, input_ids, config, use_cache=True, num_return_sequences=num_return_sequences) + super()._check_outputs(output, config, use_cache=True, num_return_sequences=num_return_sequences) def _check_hidden_states_for_generate( self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index 346ad60debe2..4c9dc95fb3bf 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -542,52 +542,6 @@ def test_flash_attn_2_generate_padding_right(self): self.assertTrue(torch.allclose(out, out_fa)) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache - def test_flash_attn_2_generate_use_cache(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - 
use_cache=True, - ) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa @slow diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 51d51dfcc282..6e08f2958b29 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -22,7 +22,6 @@ import random import re import tempfile -import time import warnings from collections import defaultdict from contextlib import contextmanager @@ -37,10 +36,7 @@ from transformers import ( AutoModel, AutoModelForCausalLM, - AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, - AutoTokenizer, - GenerationConfig, PretrainedConfig, PreTrainedModel, is_torch_available, @@ -3000,71 +2996,6 @@ def test_inputs_embeds_matches_input_ids(self): )[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) - def test_inputs_embeds_matches_input_ids_with_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - if model_class.__name__ not in [ - *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), - *get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES), - ]: - continue - - model = model_class(config) - model.to(torch_device) - model.eval() - - model_forward_args = inspect.signature(model.forward).parameters - if any(argument not in model_forward_args for argument in ["inputs_embeds", "position_ids"]): - self.skipTest(reason="This model doesn't use `inputs_embeds` or `position_ids`.") - has_inputs_embeds_forwarding = "inputs_embeds" in set( - inspect.signature(model.prepare_inputs_for_generation).parameters.keys() - ) - if not has_inputs_embeds_forwarding: - self.skipTest(reason="This model doesn't support `inputs_embeds` passed to `generate`.") - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 - - # VLMs can't generate with embeds and pixels at the same time. 
We expect the user to pass merged - # embeds already - if model_class.__name__ in get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES): - inputs.pop("pixel_values", None) - inputs.pop("pixel_values_videos", None) - inputs.pop("pixel_values_images", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - # some models infer position ids/attn mask differently when input ids - # by check if pad_token let's make sure no padding is in input ids - not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 - input_ids[input_ids == pad_token_id] = not_pad_token_id - del inputs["input_ids"] - inputs_embeds = wte(input_ids) - out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2)[:, -2:] - out_embeds = model.generate(inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - encoder_input_ids[encoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) - decoder_input_ids[decoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - inputs_embeds = wte(encoder_input_ids) - decoder_inputs_embeds = wte(decoder_input_ids) - out_ids = model.generate( - input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, **inputs, max_new_tokens=2 - )[:, -2:] - out_embeds = model.generate( - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - **inputs, - max_new_tokens=2, - ) - # NOTE: this test changes the order of FP ops, there may be tiny differences in the output - number_of_different_tokens = (out_ids != out_embeds).sum() - max_differences = int(out_ids.shape[0] * out_ids.shape[1] * 0.1) - self.assertTrue(number_of_different_tokens <= max_differences) # accept up to 10% mismatch - @require_non_xpu @require_torch_multi_gpu def test_multi_gpu_data_parallel_forward(self): @@ -3857,102 +3788,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - @is_flaky() - def test_flash_attn_2_generate_left_padding(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - 
).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @is_flaky() - @slow - def test_flash_attn_2_generate_padding_right(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - def test_attn_implementation_composite_models(self): """ Tests if composite models can receive a dict object as attn_implementation, where each key should be @@ -4525,65 +4360,6 @@ def test_sdpa_matches_eager_sliding_window(self): torch.allclose(res_eager[attention_mask == 1], res_sdpa[attention_mask == 1], rtol=1e-4, atol=1e-4) ) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) 
- - # Generate with one batch only to test generation when attention mask will be None - # when real inputs are used, because there is no padding. See issue #32237 for more - dummy_input = dummy_input[:1, ...] - dummy_attention_mask = torch.ones_like(dummy_attention_mask[:1, ...]) - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @mark.flash_attn_test @@ -4640,62 +4416,6 @@ def test_flash_attn_2_can_dispatch_composite_models(self): if not has_fa2: raise ValueError("The FA2 model should have FA2 layers") - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - def test_flash_attn_2_generate_reuse_cache(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - max_new_tokens = 2 - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = dummy_input.shape[1] * 2 + max_new_tokens * 2 + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # run generate once to get filled cache - output = model.generate( - dummy_input, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - return_dict_in_generate=True, - ) - past_key_values = output.past_key_values - - # Try to continue generation from where we left, given that we have more than 1 new token to process - # e.g. this can happen in speculative decoding when feeding candidate tokens back to target model - dummy_input_updated = torch.cat([dummy_input, output.sequences], dim=-1) - _ = model.generate( - dummy_input_updated, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - past_key_values=past_key_values, - ) - @require_flash_attn @require_torch_gpu @require_bitsandbytes @@ -4999,82 +4719,6 @@ def test_custom_4d_attention_mask(self): normalized_1 = F.softmax(out_shared_prefix_last_tokens) torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4) - def test_static_cache_matches_dynamic(self): - """ - Tests that generating with static cache give almost same results as with dynamic cache. - This test does not compile the model and check only logits similarity for numerical precision - errors. 
- """ - if len(self.all_generative_model_classes) == 0: - self.skipTest( - reason="Model architecture has no generative classes, and thus not necessarily supporting 4D masks" - ) - for model_class in self.all_generative_model_classes: - if not model_class._supports_static_cache: - self.skipTest(f"{model_class.__name__} does not support static cache") - - if not model_class._supports_cache_class: - self.skipTest(f"{model_class.__name__} does not support cache class") - - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - if getattr(config, "sliding_window", 0) is not None and getattr(config, "sliding_window", 0) > 0: - self.skipTest(f"{model_class.__name__} with sliding window attention is not supported by this test") - - model = model_class(config).to(device=torch_device, dtype=torch.float32) - model.eval() - - dynamic_out = model.generate( - **inputs, do_sample=False, max_new_tokens=10, output_logits=True, return_dict_in_generate=True - ) - static_out = model.generate( - **inputs, - do_sample=False, - max_new_tokens=10, - cache_implementation="static", - output_logits=True, - return_dict_in_generate=True, - ) - self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-4)) - - # For now, Let's focus only on GPU for `torch.compile` - @slow - @require_torch_accelerator - @require_read_token - def test_torch_compile(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - torch.compiler.reset() - if not hasattr(self, "_torch_compile_test_ckpt"): - self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") - ckpt = self._torch_compile_test_ckpt - revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision - - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - batch_size = 1 - n_iter = 3 - - tokenizer = AutoTokenizer.from_pretrained(ckpt) - if self.is_encoder_decoder: - model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - else: - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - - model.generation_config.max_new_tokens = 4 - - model.generation_config.cache_implementation = "static" - model.forward = torch.compile(model.forward, mode="reduce-overhead") - - input_text = "Why dogs are cute?" - input_ids = tokenizer([input_text] * batch_size, return_tensors="pt").to(torch_device) - - for i in range(n_iter): - _ = model.generate(**input_ids, do_sample=False) - @slow @require_torch_gpu def test_torch_compile_for_training(self): @@ -5118,74 +4762,6 @@ def test_torch_compile_for_training(self): for name, param in model._orig_mod.named_parameters(): torch.testing.assert_close(param.grad.detach().cpu(), params[name], rtol=1e-4, atol=1e-4) - @slow - @require_torch_gpu # Testing cuda graphs. - @require_read_token - def test_compile_cuda_graph_time(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - - # TODO felix: All models supporting `StaticCache` or `torch.compile` should be tested. - # At the moment, only llama, gemma and gemma2 are tested here! 
- if not hasattr(self, "_torch_compile_test_ckpt"): - self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") - ckpt = self._torch_compile_test_ckpt - revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision - - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - tokenizer = AutoTokenizer.from_pretrained(ckpt) - if self.is_encoder_decoder: - model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - else: - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - - cache_implementation = "static" - if model.config.model_type == "gemma2": - cache_implementation = "hybrid" - - new_tokens = 50 - gen_config = GenerationConfig( - max_new_tokens=new_tokens, - min_new_tokens=new_tokens, - use_cache=True, - pad_token_id=tokenizer.pad_token_id, - num_beams=1, - do_sample=False, - eos_token_id=None, # This is required for min_new_tokens to actually have an effect. - ) - model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect. - - model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) - - inp = tokenizer("Why cats are cute?", return_tensors="pt").to(torch_device) - - # First run: the first run warms up each graph, which does things like CuBlas or Triton benchmarking - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - graph_warmup_time = end - start - - # Second run: CUDA Graph recording, and replays it - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - record_time = end - start - - # Finally: we hit the optimized, CUDA Graph replay path - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - opt_time = end - start - - # For the recording step, we expect only two cuda graphs and this step should be much faster than the first. 
- self.assertTrue(record_time < 0.15 * graph_warmup_time) - self.assertTrue(opt_time < record_time) - def test_forward_with_num_logits_to_keep(self): for model_class in self.all_generative_model_classes: if "num_logits_to_keep" not in set(inspect.signature(model_class.forward).parameters.keys()): From 06d2048145adb21f2afa3a3f719599aa336d581a Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 28 Oct 2024 11:18:18 +0000 Subject: [PATCH 02/16] tmp commit --- tests/generation/test_utils.py | 97 +++++++++++----------------------- 1 file changed, 31 insertions(+), 66 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 8afa1c12eda4..eb61d8ca3432 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -2058,16 +2058,21 @@ def test_inherits_generation_mixin(self): for model_class in self.all_generative_model_classes: self.assertTrue("GenerationMixin" in str(model_class.__bases__)) - @pytest.mark.generate - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - """Tests that generate has equivalent outputs with SDPA (default) and eager attention implementations.""" + def _test_attention_implementation(self, attn_implementation): + """ + Compares the output of generate with the eager attention implementation against other implementations. + NOTE: despite the test logic being the same, different implementations actually need diferent decorators, hence + this separate function. + """ max_new_tokens = 30 + support_flag = { + "sdpa": "_supports_sdpa", + "flash_attention_2": "_supports_flash_attn_2", + } for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") + if not getattr(model_class, support_flag[attn_implementation]): + self.skipTest(f"{model_class.__name__} does not support `attn_implementation={attn_implementation}`") config, original_inputs_dict = self.prepare_config_and_inputs_for_generate() inputs_dict = {} @@ -2097,84 +2102,44 @@ def test_eager_matches_sdpa_generate(self): "use_cache": True, } - model_sdpa = model_class.from_pretrained( + model_eager = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True, + attn_implementation="eager", ).to(torch_device) - res_sdpa = model_sdpa.generate(**inputs_dict, **generate_kwargs) - del model_sdpa + res_eager = model_eager.generate(**inputs_dict, **generate_kwargs) + del model_eager gc.collect() - model_eager = model_class.from_pretrained( + model_attn = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True, - attn_implementation="eager", + attn_implementation=attn_implementation, ).to(torch_device) - res_eager = model_eager.generate(**inputs_dict, **generate_kwargs) - del model_eager + res_attn = model_attn.generate(**inputs_dict, **generate_kwargs) + del model_attn gc.collect() - self._check_similar_generate_outputs(res_eager, res_sdpa, atol=1e-3, rtol=1e-3) + self._check_similar_generate_outputs(res_eager, res_attn, atol=1e-3, rtol=1e-3) + + @pytest.mark.generate + @require_torch_sdpa + @slow + def test_eager_matches_sdpa_generate(self): + """Tests that generate has equivalent outputs with SDPA and eager attention implementations.""" + self._test_attention_implementation("sdpa") @pytest.mark.flash_attn_test @require_flash_attn @require_torch_gpu @slow - def test_flash_attn_2_generate_matches_default(self): - """Tests that generate has equivalent outputs with SDPA (default) and FA2 
attention implementations.""" + def test_eager_matches_fa2_generate(self): + """Tests that generate has equivalent outputs with FA2 and eager attention implementations.""" # TODO (@joao @raushan) -- this test is failing the output checks on most models, investigate. After fixing, # check whether we still need the overwrites - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, original_inputs_dict = self.prepare_config_and_inputs_for_generate() - inputs_dict = {} - for input_name, input_data in original_inputs_dict.items(): - if isinstance(input_data, torch.Tensor) and input_data.dtype in [torch.float32, torch.bfloat16]: - inputs_dict[input_name] = input_data.to(torch.float16) - else: - inputs_dict[input_name] = input_data - main_input = inputs_dict[model_class.main_input_name] - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + main_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - generate_kwargs = { - "max_new_tokens": max_new_tokens, - "do_sample": False, - "return_dict_in_generate": True, - "output_scores": True, - "use_cache": True, - } - - model = model.to(device=torch_device, dtype=torch.float16) - res_sdpa = model.generate(**inputs_dict, **generate_kwargs) - del model - gc.collect() - - model_fa2 = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - res_fa2 = model_fa2.generate(**inputs_dict, **generate_kwargs) - del model_fa2 - gc.collect() - - self._check_similar_generate_outputs(res_fa2, res_sdpa, atol=1e-3, rtol=1e-3) + self._test_attention_implementation("flash_attention_2") @pytest.mark.generate def test_generate_reuse_cache(self): From 9b8d9b23ae4e5d19e0cafd78b650703478410311 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 28 Oct 2024 14:46:15 +0000 Subject: [PATCH 03/16] cull overwrites of deleted tests --- src/transformers/generation/utils.py | 2 +- tests/generation/test_utils.py | 342 +++--------------- tests/models/bart/test_modeling_bart.py | 5 - tests/models/bert/test_modeling_bert.py | 5 - .../chameleon/test_modeling_chameleon.py | 40 -- tests/models/gemma/test_modeling_gemma.py | 48 --- tests/models/gemma2/test_modeling_gemma2.py | 1 - tests/models/glm/test_modeling_glm.py | 40 -- tests/models/gptj/test_modeling_gptj.py | 45 +-- tests/models/granite/test_modeling_granite.py | 47 +-- .../granitemoe/test_modeling_granitemoe.py | 45 --- .../models/idefics2/test_modeling_idefics2.py | 33 -- .../models/idefics3/test_modeling_idefics3.py | 33 -- tests/models/jamba/test_modeling_jamba.py | 87 ----- tests/models/jetmoe/test_modeling_jetmoe.py | 80 ---- tests/models/kosmos2/test_modeling_kosmos2.py | 6 - tests/models/llama/test_modeling_llama.py | 41 --- tests/models/mamba2/test_modeling_mamba2.py | 6 - tests/models/mimi/test_modeling_mimi.py | 16 - tests/models/mistral/test_modeling_mistral.py | 80 ---- tests/models/mixtral/test_modeling_mixtral.py | 80 ---- tests/models/mllama/test_modeling_mllama.py | 1 - tests/models/mt5/test_modeling_mt5.py | 3 - .../models/musicgen/test_modeling_musicgen.py | 235 ------------ .../test_modeling_musicgen_melody.py | 143 -------- 
.../models/nemotron/test_modeling_nemotron.py | 2 - .../paligemma/test_modeling_paligemma.py | 4 - tests/models/phi/test_modeling_phi.py | 41 --- tests/models/qwen2/test_modeling_qwen2.py | 80 ---- .../qwen2_moe/test_modeling_qwen2_moe.py | 80 ---- .../models/qwen2_vl/test_modeling_qwen2_vl.py | 4 - .../test_modeling_recurrent_gemma.py | 4 - .../starcoder2/test_modeling_starcoder2.py | 80 ---- tests/models/t5/test_modeling_t5.py | 3 - tests/models/umt5/test_modeling_umt5.py | 3 - tests/models/whisper/test_modeling_whisper.py | 70 ---- tests/models/zamba/test_modeling_zamba.py | 87 ----- tests/test_modeling_common.py | 1 - 38 files changed, 52 insertions(+), 1871 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 705857363415..46ea679aca50 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -418,7 +418,7 @@ def prepare_inputs_for_generation( for model_input_name in ["position_ids", "token_type_ids"]: model_input = kwargs.get(model_input_name) if model_input is not None: - if past_key_values: + if past_key_values is not None: model_input = model_input[:, -input_ids.shape[1] :] model_input = model_input.clone(memory_format=torch.contiguous_format) model_inputs[model_input_name] = model_input diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index eb61d8ca3432..6c10b28e1d39 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -17,15 +17,12 @@ import copy import gc import inspect -import os import tempfile -import time import unittest import warnings import numpy as np import pytest -from packaging import version from parameterized import parameterized from transformers import AutoConfig, is_torch_available, pipeline, set_seed @@ -34,9 +31,7 @@ require_accelerate, require_flash_attn, require_optimum_quanto, - require_read_token, require_torch, - require_torch_accelerator, require_torch_gpu, require_torch_multi_accelerator, require_torch_multi_gpu, @@ -750,9 +745,7 @@ def test_beam_sample_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSampleDecoderOnlyOutput) - self._check_outputs( - output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"] - ) + self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) @pytest.mark.generate def test_generate_without_input_ids(self): @@ -901,7 +894,6 @@ def test_constrained_beam_search_generate(self): def test_constrained_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() @@ -1575,7 +1567,8 @@ def test_past_key_values_format(self): ) @pytest.mark.generate - def test_generate_from_inputs_embeds_decoder_only(self): + def test_generate_from_inputs_embeds(self): + """Tests that we can generate from `inputs_embeds` instead of `input_ids` in LLMs, VLMs, etc""" # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids` # if fails, you should probably update the `prepare_inputs_for_generation` function for model_class in self.all_generative_model_classes: @@ -1854,10 +1847,8 @@ def test_new_cache_format(self, num_beams, do_sample): @pytest.mark.generate def test_generate_with_static_cache(self): """ - Tests if StaticCache works if we set 
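# An illustration (an assumption, not from this patch) of why the `prepare_inputs_for_generation`
# hunk above switches `if past_key_values:` to `if past_key_values is not None:`: a freshly
# created cache object holds no layers yet, so `bool()` falls back to `__len__` and evaluates to
# False even though a cache was explicitly passed in.
from transformers import DynamicCache

cache = DynamicCache()
print(len(cache))          # 0 for an empty cache
print(bool(cache))         # likely False, because bool() falls back to len()
print(cache is not None)   # True -- the condition the patched code relies on instead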
attn_implementation=static when generation. - This doesn't test if generation quality is good, but tests that models with - self._supports_static_cache don't throw an error when generating and return - a StaticCache object at the end. + Tests that generating with static cache give almost same results as with dynamic cache, and the output cache + has the expected shapes """ for model_class in self.all_generative_model_classes: if not model_class._supports_static_cache: @@ -1876,13 +1867,15 @@ def test_generate_with_static_cache(self): model = model_class(config).to(torch_device).eval() generation_kwargs = { - "max_length": None, "max_new_tokens": max_new_tokens, - "cache_implementation": "static", "return_dict_in_generate": True, # Required to return `past_key_values` + "output_scores": True, "use_cache": True, } + static_cache_generation = model.generate(**generation_kwargs, **inputs_dict, cache_implementation="static") + + # Check 1: The cache shapes must match the expected shapes max_cache_len = seq_length + max_new_tokens config = config.text_config if hasattr(config, "text_config") else config head_dim = ( @@ -1894,12 +1887,14 @@ def test_generate_with_static_cache(self): else config.num_key_value_heads ) num_hidden_layers = config.num_hidden_layers - results = model.generate(**generation_kwargs, **inputs_dict) - cache_shape = (batch_size, num_key_value_heads, max_cache_len, head_dim) - self.assertTrue(isinstance(results.past_key_values, StaticCache)) - self.assertTrue(len(results.past_key_values.key_cache) == num_hidden_layers) - self.assertTrue(results.past_key_values.key_cache[0].shape == cache_shape) + self.assertTrue(isinstance(static_cache_generation.past_key_values, StaticCache)) + self.assertTrue(len(static_cache_generation.past_key_values.key_cache) == num_hidden_layers) + self.assertTrue(static_cache_generation.past_key_values.key_cache[0].shape == cache_shape) + + # Check 2: The outputs must be similar to the case with dynamic cache + dynamic_cache_generation = model.generate(**generation_kwargs, **inputs_dict) + self._check_similar_generate_outputs(dynamic_cache_generation, static_cache_generation) @require_optimum_quanto @pytest.mark.generate @@ -1936,23 +1931,29 @@ def test_generate_with_quant_cache(self): @pytest.mark.generate @require_torch_gpu @slow - def test_generate_compile_fullgraph(self): + @parameterized.expand( + [ + ("forward_only", False), # TODO (@joao): a few models failing. After fixed, this should not be "@slow" + ("end_to_end", True), # TODO (@joao): end-to-end compilation is broken with torch 2.5+, explore and fix + ] + ) + def test_generate_compile(self, name, end_to_end): """ - Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. + Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. Tests + end-to-end compilation and forward pass compilation only. ⚠️ Runs two sequential generations to ensure the cache doesn't get stuck after the first compiled run! 
⚠️ """ - # TODO (@joao): end-to-end compilation is broken, fix me :( - for model_class in self.all_generative_model_classes: if not model_class._supports_static_cache: self.skipTest("This model doesn't support static cache") + # TODO (joao) -- fix and enable me :) - if any(model_name in model_class.__name__.lower() for model_name in ["whisper"]): + if end_to_end and any(model_name in model_class.__name__.lower() for model_name in ["whisper"]): self.skipTest("whisper model end-to-end generate compile not yet supported") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() # TODO (joao) -- fix and enable me :) - if config.is_encoder_decoder: + if end_to_end and config.is_encoder_decoder: self.skipTest("Encoder-decoder model end-to-end generate compile not yet supported") model = model_class(config).to(torch_device) @@ -1970,26 +1971,30 @@ def test_generate_compile_fullgraph(self): "return_dict_in_generate": True, "output_scores": True, } + # end-to-end works best with dynamic cache, forward compilation works best with static cache + if not end_to_end: + generation_kwargs["cache_implementation"] = "static" - max_cache_len = input_ids.shape[1] + generation_kwargs["max_new_tokens"] - config = config.get_text_config() - past_key_values = StaticCache( - config, batch_size=half_batch_size, max_cache_len=max_cache_len, device=torch_device - ) + # get eager + dynamic cache results for future comparison + dynamic_outputs = [] + for model_inputs in input_ids_sets: + dynamic_outputs.append(model.generate(model_inputs, **generation_kwargs)) + + # get compiled results + generation_config = copy.deepcopy(model.generation_config) + generation_config.update(**generation_kwargs) + torch.compiler.reset() + if end_to_end: + model.generate = torch.compile(model.generate, fullgraph=True, mode="reduce-overhead") + else: + model.forward = torch.compile(model.forward, fullgraph=True, mode="reduce-overhead") + compuled_outputs = [] for model_inputs in input_ids_sets: - # eager dynamic cache - output_dynamic = model.generate(model_inputs, **generation_kwargs) - - # end-to-end compiled dynamic cache - torch.compiler.reset() - compiled_generate = torch.compile(model.generate, fullgraph=True, mode="reduce-overhead") - generation_config = copy.deepcopy(model.generation_config) - generation_config.update(**generation_kwargs) - output_compiled = compiled_generate( - model_inputs, generation_config=generation_config, past_key_values=past_key_values - ) - self._check_similar_generate_outputs(output_dynamic, output_compiled) + compuled_outputs.append(model.generate(model_inputs, generation_config=generation_config)) + + for dynamic_result, compiled_result in zip(dynamic_outputs, compuled_outputs): + self._check_similar_generate_outputs(dynamic_result, compiled_result) @pytest.mark.generate def test_generate_methods_with_num_logits_to_keep(self): @@ -2141,255 +2146,6 @@ def test_eager_matches_fa2_generate(self): # check whether we still need the overwrites self._test_attention_implementation("flash_attention_2") - @pytest.mark.generate - def test_generate_reuse_cache(self): - """Tests that we can pass a cache out of the `generate`, and use it to continue generation""" - max_new_tokens = 2 - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - inputs_dict.pop("attention_mask", None) # Let's rely on `attention_mask=None` in this test, for simplicity - - if not hasattr(config, "use_cache"): - 
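# A rough sketch (not part of the patch; checkpoint and prompt are placeholders) of the
# forward-only compilation path exercised by the new `test_generate_compile`: compile just
# `model.forward` and pair it with a static cache so tensor shapes stay fixed across decoding steps.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt = "any-decoder-only-checkpoint"  # hypothetical name
tok = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(ckpt).eval()

model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
inputs = tok("Hello", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=8, do_sample=False, cache_implementation="static")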
self.skipTest(reason=f"{model_class.__name__} doesn't support caching") - - model = model_class(config).to(torch_device) - - # base generate - generate_kwargs = { - "max_new_tokens": max_new_tokens, - "do_sample": False, - "return_dict_in_generate": True, - "output_scores": True, - "use_cache": True, - } - base_outputs = model.generate(**inputs_dict, **generate_kwargs) - - # generate with cache reuse - generate_kwargs["max_new_tokens"] = 1 - cache_reuse_outputs = model.generate(**inputs_dict, **generate_kwargs) - if not config.is_encoder_decoder: - inputs_dict["input_ids"] = cache_reuse_outputs.sequences - else: - inputs_dict["decoder_input_ids"] = cache_reuse_outputs.sequences - cache_reuse_outputs = model.generate( - **inputs_dict, **generate_kwargs, past_key_values=cache_reuse_outputs.past_key_values - ) - - self._check_similar_generate_outputs(base_outputs, cache_reuse_outputs) - - # @pytest.mark.generate - # def test_inputs_embeds_matches_input_ids_with_generate(self): - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # for model_class in self.all_model_classes: - # if model_class.__name__ not in [ - # *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), - # *get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES), - # ]: - # continue - - # model = model_class(config) - # model.to(torch_device) - # model.eval() - - # model_forward_args = inspect.signature(model.forward).parameters - # if any(argument not in model_forward_args for argument in ["inputs_embeds", "position_ids"]): - # self.skipTest(reason="This model doesn't use `inputs_embeds` or `position_ids`.") - # has_inputs_embeds_forwarding = "inputs_embeds" in set( - # inspect.signature(model.prepare_inputs_for_generation).parameters.keys() - # ) - # if not has_inputs_embeds_forwarding: - # self.skipTest(reason="This model doesn't support `inputs_embeds` passed to `generate`.") - # inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - # pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 - - # # VLMs can't generate with embeds and pixels at the same time. 
We expect the user to pass merged - # # embeds already - # if model_class.__name__ in get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES): - # inputs.pop("pixel_values", None) - # inputs.pop("pixel_values_videos", None) - # inputs.pop("pixel_values_images", None) - - # wte = model.get_input_embeddings() - # if not self.is_encoder_decoder: - # input_ids = inputs["input_ids"] - # # some models infer position ids/attn mask differently when input ids - # # by check if pad_token let's make sure no padding is in input ids - # not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 - # input_ids[input_ids == pad_token_id] = not_pad_token_id - # del inputs["input_ids"] - # inputs_embeds = wte(input_ids) - # out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2)[:, -2:] - # out_embeds = model.generate(inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) - # else: - # encoder_input_ids = inputs["input_ids"] - # decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - # encoder_input_ids[encoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) - # decoder_input_ids[decoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) - # del inputs["input_ids"] - # inputs.pop("decoder_input_ids", None) - # inputs_embeds = wte(encoder_input_ids) - # decoder_inputs_embeds = wte(decoder_input_ids) - # out_ids = model.generate( - # input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, **inputs, max_new_tokens=2 - # )[:, -2:] - # out_embeds = model.generate( - # inputs_embeds=inputs_embeds, - # decoder_inputs_embeds=decoder_inputs_embeds, - # **inputs, - # max_new_tokens=2, - # ) - # # NOTE: this test changes the order of FP ops, there may be tiny differences in the output - # number_of_different_tokens = (out_ids != out_embeds).sum() - # max_differences = int(out_ids.shape[0] * out_ids.shape[1] * 0.1) - # self.assertTrue(number_of_different_tokens <= max_differences) # accept up to 10% mismatch - - @pytest.mark.generate - def test_static_cache_matches_dynamic(self): - """ - Tests that generating with static cache give almost same results as with dynamic cache. - This test does not compile the model and check only logits similarity for numerical precision - errors. 
- """ - if len(self.all_generative_model_classes) == 0: - self.skipTest( - reason="Model architecture has no generative classes, and thus not necessarily supporting 4D masks" - ) - for model_class in self.all_generative_model_classes: - if not model_class._supports_static_cache: - self.skipTest(f"{model_class.__name__} does not support static cache") - - if not model_class._supports_cache_class: - self.skipTest(f"{model_class.__name__} does not support cache class") - - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - if getattr(config, "sliding_window", 0) is not None and getattr(config, "sliding_window", 0) > 0: - self.skipTest(f"{model_class.__name__} with sliding window attention is not supported by this test") - - model = model_class(config).to(device=torch_device, dtype=torch.float32) - model.eval() - - dynamic_out = model.generate( - **inputs, do_sample=False, max_new_tokens=10, output_logits=True, return_dict_in_generate=True - ) - static_out = model.generate( - **inputs, - do_sample=False, - max_new_tokens=10, - cache_implementation="static", - output_logits=True, - return_dict_in_generate=True, - ) - self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-4)) - - # For now, Let's focus only on GPU for `torch.compile` - @slow - @require_torch_accelerator - @require_read_token - @pytest.mark.generate - def test_torch_compile(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - torch.compiler.reset() - if not hasattr(self, "_torch_compile_test_ckpt"): - self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") - ckpt = self._torch_compile_test_ckpt - revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision - - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - batch_size = 1 - n_iter = 3 - - tokenizer = AutoTokenizer.from_pretrained(ckpt) - if self.is_encoder_decoder: - model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - else: - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - - model.generation_config.max_new_tokens = 4 - - model.generation_config.cache_implementation = "static" - model.forward = torch.compile(model.forward, mode="reduce-overhead") - - input_text = "Why dogs are cute?" - input_ids = tokenizer([input_text] * batch_size, return_tensors="pt").to(torch_device) - - for i in range(n_iter): - _ = model.generate(**input_ids, do_sample=False) - - @slow - @require_torch_gpu # Testing cuda graphs. - @require_read_token - @pytest.mark.generate - def test_compile_cuda_graph_time(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - - # TODO felix: All models supporting `StaticCache` or `torch.compile` should be tested. - # At the moment, only llama, gemma and gemma2 are tested here! 
- if not hasattr(self, "_torch_compile_test_ckpt"): - self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") - ckpt = self._torch_compile_test_ckpt - revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision - - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - tokenizer = AutoTokenizer.from_pretrained(ckpt) - if self.is_encoder_decoder: - model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - else: - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - - cache_implementation = "static" - if model.config.model_type == "gemma2": - cache_implementation = "hybrid" - - new_tokens = 50 - gen_config = GenerationConfig( - max_new_tokens=new_tokens, - min_new_tokens=new_tokens, - use_cache=True, - pad_token_id=tokenizer.pad_token_id, - num_beams=1, - do_sample=False, - eos_token_id=None, # This is required for min_new_tokens to actually have an effect. - ) - model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect. - - model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) - - inp = tokenizer("Why cats are cute?", return_tensors="pt").to(torch_device) - - # First run: the first run warms up each graph, which does things like CuBlas or Triton benchmarking - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - graph_warmup_time = end - start - - # Second run: CUDA Graph recording, and replays it - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - record_time = end - start - - # Finally: we hit the optimized, CUDA Graph replay path - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - opt_time = end - start - - # For the recording step, we expect only two cuda graphs and this step should be much faster than the first. 
- self.assertTrue(record_time < 0.15 * graph_warmup_time) - self.assertTrue(opt_time < record_time) - def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1): batch_size = output.sequences.shape[0] diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py index eda51d21199f..e4d0df141be2 100644 --- a/tests/models/bart/test_modeling_bart.py +++ b/tests/models/bart/test_modeling_bart.py @@ -1532,8 +1532,3 @@ def test_retain_grad_hidden_states_attentions(self): @unittest.skip def test_save_load_fast_init_from_base(self): pass - - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - # generate only works with input ids for bartforcausalLM - pass diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index 8ac1c3d2b409..9e7edc28d0c8 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -509,11 +509,6 @@ def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - # generate only works with input ids for bertforcausalLM - pass - def test_model_as_decoder_with_default_input_mask(self): # This regression test was failing with PyTorch < 1.3 ( diff --git a/tests/models/chameleon/test_modeling_chameleon.py b/tests/models/chameleon/test_modeling_chameleon.py index aad26ef147e8..2a8e7633ba40 100644 --- a/tests/models/chameleon/test_modeling_chameleon.py +++ b/tests/models/chameleon/test_modeling_chameleon.py @@ -16,17 +16,14 @@ import unittest -import pytest import requests from parameterized import parameterized from transformers import ChameleonConfig, is_torch_available, is_vision_available, set_seed from transformers.testing_utils import ( require_bitsandbytes, - require_flash_attn, require_read_token, require_torch, - require_torch_gpu, slow, torch_device, ) @@ -329,43 +326,6 @@ def test_model_rope_scaling(self, scaling_type): # The output should be different for long inputs self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - @require_flash_attn - @require_read_token - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = ChameleonForConditionalGeneration.from_pretrained( - "facebook/chameleon-7b", - load_in_4bit=True, - device_map={"": 0}, - ) - - processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") - texts = ["hi", "Hello this is a very long sentence"] - - processor.tokenizer.padding_side = "right" - - inputs = processor(text=texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = processor.tokenizer.batch_decode(output_native) - - model = ChameleonForConditionalGeneration.from_pretrained( - "facebook/chameleon-7b", - load_in_4bit=True, - attn_implementation="flash_attention_2", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = processor.tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @unittest.skip("Chameleon forces some token ids to be -inf!") def 
test_batching_equivalence(self): pass diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index a888bdcd3bc7..e8483f8c7c7d 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ b/tests/models/gemma/test_modeling_gemma.py @@ -319,9 +319,6 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.6] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google/gemma-2b" - # used in `test_torch_compile_for_training` _torch_compile_train_cls = GemmaForCausalLM if is_torch_available() else None @@ -419,51 +416,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Gemma apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/gemma2/test_modeling_gemma2.py b/tests/models/gemma2/test_modeling_gemma2.py index 94670803daa9..7bca83f96d73 100644 --- a/tests/models/gemma2/test_modeling_gemma2.py +++ b/tests/models/gemma2/test_modeling_gemma2.py @@ -78,7 +78,6 @@ class Gemma2ModelTest(GemmaModelTest, unittest.TestCase): test_pruning = False _is_stateful = True model_split_percents = [0.5, 0.6] - _torch_compile_test_ckpt = "google/gemma-2-9b" def setUp(self): self.model_tester = Gemma2ModelTester(self) diff --git a/tests/models/glm/test_modeling_glm.py b/tests/models/glm/test_modeling_glm.py index 32bce7cbfa61..b92c5db815b7 100644 --- a/tests/models/glm/test_modeling_glm.py +++ b/tests/models/glm/test_modeling_glm.py @@ -28,7 +28,6 @@ require_flash_attn, require_torch, require_torch_accelerator, - require_torch_gpu, require_torch_sdpa, slow, torch_device, @@ -306,10 +305,6 @@ class GlmModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, test_headmasking = False test_pruning = False - # used in `test_torch_compile` - _torch_compile_test_ckpt = "THUDM/glm-4-9b" - _torch_compile_test_revision = "refs/pr/15" - def setUp(self): self.model_tester = GlmModelTester(self) self.config_tester = ConfigTester(self, config_class=GlmConfig, hidden_size=37) @@ -426,41 +421,6 @@ def 
test_custom_4d_attention_mask(self): torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-3) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - """Overwrite the common test as the test is flaky on tiny models.""" - model = GlmForCausalLM.from_pretrained( - "THUDM/glm-4-9b", - device_map={"": 0}, - torch_dtype=torch.bfloat16, - revision="refs/pr/15", - ) - - tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b", revision="refs/pr/15") - tokenizer.padding_side = "right" - - texts = ["hi", "Hello this is a very long sentence"] - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=15, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = GlmForCausalLM.from_pretrained( - "THUDM/glm-4-9b", - device_map={"": 0}, - attn_implementation="flash_attention_2", - torch_dtype=torch.bfloat16, - revision="refs/pr/15", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=15, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa @slow diff --git a/tests/models/gptj/test_modeling_gptj.py b/tests/models/gptj/test_modeling_gptj.py index 6f6fba50dc12..afc741cd502d 100644 --- a/tests/models/gptj/test_modeling_gptj.py +++ b/tests/models/gptj/test_modeling_gptj.py @@ -17,14 +17,9 @@ import datetime import unittest -import pytest - -from transformers import BitsAndBytesConfig, GPTJConfig, is_torch_available +from transformers import GPTJConfig, is_torch_available from transformers.testing_utils import ( - require_bitsandbytes, - require_flash_attn, require_torch, - require_torch_gpu, slow, tooslow, torch_device, @@ -505,44 +500,6 @@ def test_model_from_pretrained(self): model = GPTJModel.from_pretrained(model_name, revision="float16", torch_dtype=torch.float16) self.assertIsNotNone(model) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b") - - texts = ["hi", "Hello this is a very long sentence"] - expected_outputs = [ - "hi<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Q: I have a question about the new version of the game. 
I have a question about the", - "Hello this is a very long sentence.\n\nA:\n\nI think the best way to understand this is to think of it", - ] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - quantization_config = BitsAndBytesConfig(load_in_4bit=True) - - model = GPTJForCausalLM.from_pretrained( - "EleutherAI/gpt-j-6b", - device_map={"": 0}, - attn_implementation="flash_attention_2", - revision="float16", - torch_dtype=torch.float16, - quantization_config=quantization_config, - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(expected_outputs, output_fa_2) - @require_torch class GPTJModelLanguageGenerationTest(unittest.TestCase): diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py index 1bcb6641803c..97b59f5aa506 100644 --- a/tests/models/granite/test_modeling_granite.py +++ b/tests/models/granite/test_modeling_granite.py @@ -17,12 +17,10 @@ import tempfile import unittest -import pytest from parameterized import parameterized -from transformers import AutoTokenizer, GraniteConfig, is_torch_available, set_seed +from transformers import GraniteConfig, is_torch_available, set_seed from transformers.testing_utils import ( - require_bitsandbytes, require_flash_attn, require_read_token, require_torch, @@ -303,9 +301,6 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "ibm/PowerLM-3b" - def setUp(self): self.model_tester = GraniteModelTester(self) self.config_tester = ConfigTester(self, config_class=GraniteConfig, hidden_size=37) @@ -423,46 +418,6 @@ def test_model_rope_scaling(self): with self.assertRaises(AssertionError): torch.testing.assert_close(yarn_sin_long, original_sin_long) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @require_read_token - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = GraniteForCausalLM.from_pretrained( - "ibm/PowerLM-3b", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = AutoTokenizer.from_pretrained("ibm/PowerLM-3b") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = GraniteForCausalLM.from_pretrained( - "ibm/PowerLM-3b", - load_in_4bit=True, - device_map={"": 0}, - attn_implementation="flash_attention_2", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @require_flash_attn @require_torch_gpu @slow diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py index 124ce0c3bb5a..f2f76b9fa75b 100644 --- a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -17,12 +17,10 @@ 
import tempfile import unittest -import pytest from parameterized import parameterized from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed from transformers.testing_utils import ( - require_bitsandbytes, require_flash_attn, require_read_token, require_torch, @@ -302,9 +300,6 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "ibm/PowerMoE-3b" - def setUp(self): self.model_tester = GraniteMoeModelTester(self) self.config_tester = ConfigTester(self, config_class=GraniteMoeConfig, hidden_size=37) @@ -422,46 +417,6 @@ def test_model_rope_scaling(self): with self.assertRaises(AssertionError): torch.testing.assert_close(yarn_sin_long, original_sin_long) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @require_read_token - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = GraniteMoeForCausalLM.from_pretrained( - "ibm-granite/granitemoe-3b", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granitemoe-3b") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = GraniteMoeForCausalLM.from_pretrained( - "ibm-granite/granitemoe-3b", - load_in_4bit=True, - device_map={"": 0}, - attn_implementation="flash_attention_2", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @require_flash_attn @require_torch_gpu @slow diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 854b8b934578..5790d21c5348 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -196,10 +196,6 @@ def test_inputs_embeds(): def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Model does not support padding right") - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -390,10 +386,6 @@ def setUp(self): def test_inputs_embeds(): pass - @unittest.skip(reason="Model does not support padding right") - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -579,31 +571,6 @@ def test_resize_embeddings_untied(self): # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) - def test_inputs_embeds_matches_input_ids_with_generate(self): - # overwrite because IDEFICS needs ids and embeds at the input to be not None - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - 
model.to(torch_device) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 - - wte = model.get_input_embeddings() - - input_ids = inputs["input_ids"] - # some models infer position ids/attn mask differently when input ids - # by check if pad_token let's make sure no padding is in input ids - not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 - input_ids[input_ids == pad_token_id] = not_pad_token_id - del inputs["input_ids"] - inputs_embeds = wte(input_ids) - out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2) - out_embeds = model.generate(input_ids=input_ids, inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) - - self.assertTrue(torch.allclose(out_embeds, out_ids)) - @require_torch class Idefics2ForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index f0366e7b539a..3bb32c869913 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -180,10 +180,6 @@ def test_inputs_embeds(): def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Model does not support padding right") - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -337,10 +333,6 @@ def setUp(self): def test_inputs_embeds(): pass - @unittest.skip(reason="Model does not support padding right") - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -526,31 +518,6 @@ def test_resize_embeddings_untied(self): # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) - def test_inputs_embeds_matches_input_ids_with_generate(self): - # overwrite because IDEFICS needs ids and embeds at the input to be not None - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - model.to(torch_device) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 - - wte = model.get_input_embeddings() - - input_ids = inputs["input_ids"] - # some models infer position ids/attn mask differently when input ids - # by check if pad_token let's make sure no padding is in input ids - not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 - input_ids[input_ids == pad_token_id] = not_pad_token_id - del inputs["input_ids"] - inputs_embeds = wte(input_ids) - out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2) - out_embeds = model.generate(input_ids=input_ids, inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) - - self.assertTrue(torch.allclose(out_embeds, out_ids)) - @require_torch class Idefics3ForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/jamba/test_modeling_jamba.py b/tests/models/jamba/test_modeling_jamba.py index 251f293f7226..ef0b5831587b 100644 --- a/tests/models/jamba/test_modeling_jamba.py +++ 
b/tests/models/jamba/test_modeling_jamba.py @@ -539,93 +539,6 @@ def test_flash_attn_2_fp32_ln(self): # with attention mask _ = model(dummy_input, attention_mask=dummy_attention_mask) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - r""" - Overriding the test_flash_attn_2_generate_padding_right test as the Jamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - r""" - Overriding the test_flash_attn_2_generate_use_cache test as the Jamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Jamba does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/jetmoe/test_modeling_jetmoe.py b/tests/models/jetmoe/test_modeling_jetmoe.py index a04d8bba741a..dc510f0ff040 100644 --- a/tests/models/jetmoe/test_modeling_jetmoe.py +++ b/tests/models/jetmoe/test_modeling_jetmoe.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch JetMoe model.""" import gc -import tempfile import unittest import pytest @@ -377,85 +376,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: JetMoe apparently does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index de6c0b15d661..0f0b595d3d23 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -438,12 +438,6 @@ def check_same_values(layer_1, layer_2): # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) - @unittest.skip( - "KOSMOS-2 doesn't support inputs embeds. The test isn't skipped by checking ipnut args because KOSMOS-2 has `generate()` overwritten" - ) - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @slow def test_model_from_pretrained(self): model_name = "microsoft/kosmos-2-patch14-224" diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 824337d8bdda..375ec1dd3e6f 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -26,7 +26,6 @@ from transformers.generation.configuration_utils import GenerationConfig from transformers.testing_utils import ( backend_empty_cache, - require_bitsandbytes, require_flash_attn, require_read_token, require_torch, @@ -316,9 +315,6 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "meta-llama/Llama-2-7b-hf" - # used in `test_torch_compile_for_training` _torch_compile_train_cls = LlamaForCausalLM if is_torch_available() else None @@ -585,43 +581,6 @@ def _reinitialize_config(base_config, new_kwargs): with self.assertRaises(KeyError): config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}}) # missing "factor" - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @require_read_token - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", load_in_4bit=True, device_map={"": 0}, attn_implementation="flash_attention_2" - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @require_flash_attn 
@require_torch_gpu @slow diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py index f19358a22f4b..201aa0b33c0e 100644 --- a/tests/models/mamba2/test_modeling_mamba2.py +++ b/tests/models/mamba2/test_modeling_mamba2.py @@ -275,12 +275,6 @@ def recursive_check(tuple_object, dict_object): dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - @unittest.skip( - reason="Mamba2 does not support generating with input embeddings (custom cache_position computation)" - ) - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @require_torch @slow diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py index 074dceae1552..0e66b276065c 100644 --- a/tests/models/mimi/test_modeling_mimi.py +++ b/tests/models/mimi/test_modeling_mimi.py @@ -745,22 +745,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): def test_sdpa_can_compile_dynamic(self): pass - # For now, Let's focus only on GPU for `torch.compile` - @slow - @require_torch_gpu - def test_torch_compile(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - n_iter = 3 - for model_class in self.all_model_classes: - model = model_class(config).to(torch_device) - model.forward = torch.compile(model.forward) - for i in range(n_iter): - _ = model(inputs_dict["input_values"].to(torch_device)) - @is_flaky() def test_batching_equivalence(self): super().test_batching_equivalence() diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index f2ee714bcdba..1538735ad78b 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Mistral model.""" import gc -import tempfile import unittest import pytest @@ -416,85 +415,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in 
self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Mistral apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index b9b5faed851f..931bb1f17bec 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -14,7 +14,6 @@ # limitations under the License. """Testing suite for the PyTorch Mixtral model.""" -import tempfile import unittest import pytest @@ -415,85 +414,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] 
+ 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Mixtral apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index 3efa7b778fb7..5174247b895e 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -126,7 +126,6 @@ class MllamaForCausalLMModelTest(ModelTesterMixin, GenerationTesterMixin, unitte all_generative_model_classes = (MllamaForCausalLM,) if is_torch_available() else () test_pruning = False test_head_masking = False - _torch_compile_test_ckpt = "nltpt/Llama-3.2-11B-Vision" def setUp(self): self.model_tester = MllamaText2TextModelTester(self) diff --git a/tests/models/mt5/test_modeling_mt5.py b/tests/models/mt5/test_modeling_mt5.py index 20412da2e1db..1628d3a5893e 100644 --- a/tests/models/mt5/test_modeling_mt5.py +++ b/tests/models/mt5/test_modeling_mt5.py @@ -576,9 +576,6 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # The small MT5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google/mt5-small" - def setUp(self): self.model_tester = MT5ModelTester(self) self.config_tester = ConfigTester(self, config_class=MT5Config, d_model=37) diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index 4c9dc95fb3bf..963cace28d6e 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -450,98 +450,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding - def test_flash_attn_2_generate_left_padding(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = 
model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right - def test_flash_attn_2_generate_padding_right(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa @slow @@ -1539,149 +1447,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding - def test_flash_attn_2_generate_left_padding(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - 
model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right - def test_flash_attn_2_generate_padding_right(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache - def test_flash_attn_2_generate_use_cache(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - 
max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_torch_sdpa def test_sdpa_can_dispatch_composite_models(self): if not self.has_attentions: diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index f3b6be0ac652..957db9f23b0f 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -1437,149 +1437,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding - def test_flash_attn_2_generate_left_padding(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right - def test_flash_attn_2_generate_padding_right(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, 
attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache - def test_flash_attn_2_generate_use_cache(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_torch_sdpa def test_sdpa_can_dispatch_composite_models(self): if not self.has_attentions: diff --git a/tests/models/nemotron/test_modeling_nemotron.py b/tests/models/nemotron/test_modeling_nemotron.py index 13adfe1e5794..37a581a33866 100644 --- a/tests/models/nemotron/test_modeling_nemotron.py +++ b/tests/models/nemotron/test_modeling_nemotron.py @@ -92,8 +92,6 @@ class NemotronModelTest(GemmaModelTest): test_pruning = False fx_compatible = False - # used in `test_torch_compile` - _torch_compile_test_ckpt = "nvidia/nemotron-3-8b-base-4k-hf" # used in `test_torch_compile_for_training` _torch_compile_train_cls = NemotronForCausalLM if is_torch_available() else None diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index cfc2a2c29b1d..d07dd40603cd 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -316,10 +316,6 @@ def test_save_load_low_cpu_mem_usage_no_safetensors(self): def test_generate_from_inputs_embeds_with_static_cache(self): pass - @unittest.skip(reason="TODO (@joao): fix me -- failing to produce similar results") - def test_static_cache_matches_dynamic(self): - pass - @unittest.skip("FlashAttention only support fp16 and bf16 data type") def test_flash_attn_2_fp32_ln(self): pass diff --git a/tests/models/phi/test_modeling_phi.py b/tests/models/phi/test_modeling_phi.py index c17f69a49986..eae6789bef25 100644 --- a/tests/models/phi/test_modeling_phi.py +++ 
b/tests/models/phi/test_modeling_phi.py @@ -17,15 +17,11 @@ import unittest -import pytest from parameterized import parameterized from transformers import PhiConfig, is_torch_available, set_seed from transformers.testing_utils import ( - require_bitsandbytes, - require_flash_attn, require_torch, - require_torch_gpu, slow, torch_device, ) @@ -468,43 +464,6 @@ def test_model_rope_scaling(self): torch.testing.assert_close(ntk_sin_long, original_sin_long) self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @slow - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_flash_attn_2_generate_padding_right with LlamaForCausalLM->PhiForCausalLM,LlamaTokenizer->AutoTokenizer,meta-llama/Llama-2-7b-hf->microsoft/phi-1 - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = PhiForCausalLM.from_pretrained( - "microsoft/phi-1", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = PhiForCausalLM.from_pretrained( - "microsoft/phi-1", load_in_4bit=True, device_map={"": 0}, attn_implementation="flash_attention_2" - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @slow @require_torch diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index 4e57f8e0f002..f51dc2e0a5e2 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Qwen2 model.""" import gc -import tempfile import unittest import pytest @@ -428,85 +427,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def 
test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Qwen2 apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index c545e882faee..abc7b57919b0 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Qwen2MoE model.""" import gc -import tempfile import unittest import pytest @@ -453,85 +452,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, 
"max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Qwen2Moe apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 956243dccebe..c020e8258b93 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -267,10 +267,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self): def test_feed_forward_chunking(self): pass - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @unittest.skip(reason="CPU offload is not yet supported") def test_cpu_offload(self): pass diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py index 542955f9fa45..985115d7707b 100644 --- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py +++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -420,10 +420,6 @@ def _check_hidden_states_for_generate( def test_initialization(self): pass - @unittest.skip(reason="RecurrentGemma does not support generating with input embeddings (missing position_ids)") - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @require_torch_accelerator @slow diff --git a/tests/models/starcoder2/test_modeling_starcoder2.py b/tests/models/starcoder2/test_modeling_starcoder2.py index 32d28143d72f..df743f132c11 100644 --- a/tests/models/starcoder2/test_modeling_starcoder2.py +++ b/tests/models/starcoder2/test_modeling_starcoder2.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Testing suite for the PyTorch Starcoder2 model.""" -import tempfile import unittest import pytest @@ -404,85 +403,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Starcoder2 apparently does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index 68dd5a52b3d6..b03416390766 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -580,9 +580,6 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # The small T5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google-t5/t5-small" - def setUp(self): self.model_tester = T5ModelTester(self) self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37) diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py index ec4c1d019b6d..377668851c58 100644 --- a/tests/models/umt5/test_modeling_umt5.py +++ b/tests/models/umt5/test_modeling_umt5.py @@ -317,9 +317,6 @@ class UMT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin # The small UMT5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google/umt5-small" - def setUp(self): self.model_tester = UMT5ModelTester(self) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index b24c577a16e5..12aedaca8cf9 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1574,59 +1574,6 @@ def test_generate_output_type(self, return_dict_in_generate): ) assert isinstance(pred_ids, expected_output_type) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_reuse_cache(self): - max_new_tokens = 2 - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name][..., :10] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = dummy_input.shape[1] * 2 + max_new_tokens * 2 + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # run generate once to get filled cache - output = model.generate( - dummy_input, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - return_dict_in_generate=True, - ) - past_key_values = output.past_key_values - - # Try to continue generation from where we left, given that we have more than 1 new token to process - # e.g. 
this can happen in speculative decoding when feeding candidate tokens back to target model - _ = model.generate( - dummy_input, - decoder_input_ids=output.sequences, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - past_key_values=past_key_values, - ) - def test_labels_sequence_max_length_correct(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -3961,11 +3908,6 @@ def test_generate_without_input_ids(self): # generate only works with input ids for whisper pass - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - # generate only works with input ids for whisper - pass - @unittest.skip(reason="Decoder can't keep attention grads") def test_retain_grad_hidden_states_attentions(self): return @@ -3974,18 +3916,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_save_load_fast_init_from_base(self): pass - @unittest.skip( - reason="FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" - ) - def test_flash_attn_2_generate_reuse_cache(self): - pass - - @unittest.skip( - "Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" - ) - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip( "Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" ) diff --git a/tests/models/zamba/test_modeling_zamba.py b/tests/models/zamba/test_modeling_zamba.py index c0a8020bedd7..a6dd516f98a4 100644 --- a/tests/models/zamba/test_modeling_zamba.py +++ b/tests/models/zamba/test_modeling_zamba.py @@ -542,93 +542,6 @@ def test_flash_attn_2_fp32_ln(self): # with attention mask _ = model(dummy_input, attention_mask=dummy_attention_mask) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - r""" - Overriding the test_flash_attn_2_generate_padding_right test as the Zamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - r""" - Overriding the test_flash_attn_2_generate_use_cache test as the Zamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - max_new_tokens = 30 - - for model_class in 
self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Zamba does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 6e08f2958b29..e2719d8cf1b6 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -82,7 +82,6 @@ require_deepspeed, require_flash_attn, require_non_xpu, - require_read_token, require_safetensors, require_torch, require_torch_accelerator, From 7feeecac09dbd860c86ed8ad219a34d127569ef0 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 28 Oct 2024 15:04:35 +0000 Subject: [PATCH 04/16] typo --- tests/generation/test_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 6c10b28e1d39..d3cc0b5d1f51 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1989,11 +1989,11 @@ def test_generate_compile(self, name, end_to_end): else: model.forward = torch.compile(model.forward, fullgraph=True, mode="reduce-overhead") - compuled_outputs = [] + compiled_outputs = [] for model_inputs in input_ids_sets: - compuled_outputs.append(model.generate(model_inputs, generation_config=generation_config)) + compiled_outputs.append(model.generate(model_inputs, generation_config=generation_config)) - for dynamic_result, compiled_result in zip(dynamic_outputs, compuled_outputs): + for dynamic_result, compiled_result in zip(dynamic_outputs, compiled_outputs): self._check_similar_generate_outputs(dynamic_result, compiled_result) @pytest.mark.generate From a1959f8b59ac1234e56ab407771cfa2721b60253 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 28 Oct 2024 15:24:48 +0000 Subject: [PATCH 05/16] more specific docstring --- tests/generation/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index d3cc0b5d1f51..fbb2b4fed50a 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -142,7 +142,7 @@ def _check_similar_generate_outputs(self, output_1, output_2, atol=1e-5, rtol=1e Checks whether a pair of generate outputs are similar. Two `generate` call outputs are considered similar in the following siturations: 1. The sequences are the same - 2. The sequences are different, but the scores up until the first mismatch are nearly identical + 2. 
The sequences are different, but the scores up to (and including) the first mismatch are nearly identical """ # scores doesn't include data regarding decoder input tokens decoder_input_length = output_1.sequences.shape[1] - len(output_1.scores) From e84177ed6012b745a8b490065b5111c0874f1a7a Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 28 Oct 2024 15:27:59 +0000 Subject: [PATCH 06/16] make fixup --- tests/models/mimi/test_modeling_mimi.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py index 0e66b276065c..df0007d666a0 100644 --- a/tests/models/mimi/test_modeling_mimi.py +++ b/tests/models/mimi/test_modeling_mimi.py @@ -21,7 +21,6 @@ import numpy as np from datasets import Audio, load_dataset -from packaging import version from parameterized import parameterized from pytest import mark From ad8ebae13b59896465b8d12f02a421de7101a868 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 28 Oct 2024 15:37:59 +0000 Subject: [PATCH 07/16] parameterize at the top? --- tests/generation/test_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index fbb2b4fed50a..88a7616d4592 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1928,15 +1928,15 @@ def test_generate_with_quant_cache(self): with self.assertRaises(ValueError): model.generate(**generation_kwargs, **inputs_dict) - @pytest.mark.generate - @require_torch_gpu - @slow @parameterized.expand( [ ("forward_only", False), # TODO (@joao): a few models failing. After fixed, this should not be "@slow" ("end_to_end", True), # TODO (@joao): end-to-end compilation is broken with torch 2.5+, explore and fix ] ) + @pytest.mark.generate + @require_torch_gpu + @slow def test_generate_compile(self, name, end_to_end): """ Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. 
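In the `forward_only` variant parameterized above, only the model's forward pass is compiled and the generation loop itself stays in eager mode, mirroring the `torch.compile(model.forward, fullgraph=True, mode="reduce-overhead")` call in the test body. A rough standalone sketch of that pattern, where the checkpoint name is a placeholder and a transformers release with static-cache generation is assumed:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "org/some-decoder-only-model"  # placeholder, not a checkpoint used by this test
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).eval()

# Compile only `forward`; `generate` remains eager, so sampling and stopping logic
# cannot introduce graph breaks.
model.forward = torch.compile(model.forward, fullgraph=True, mode="reduce-overhead")

inputs = tokenizer("Hello world", return_tensors="pt")
# A static cache keeps tensor shapes constant across decoding steps, which fullgraph
# compilation of the forward pass relies on.
output = model.generate(**inputs, max_new_tokens=5, do_sample=False, cache_implementation="static")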
Tests From def0940da5d7bbbe2381c12abe128ba37a6b9104 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 28 Oct 2024 15:54:20 +0000 Subject: [PATCH 08/16] correction --- tests/generation/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 88a7616d4592..c30048fa8674 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -2147,14 +2147,14 @@ def test_eager_matches_fa2_generate(self): self._test_attention_implementation("flash_attention_2") def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1): - batch_size = output.sequences.shape[0] + num_sequences_in_output = output.sequences.shape[0] + batch_size = num_sequences_in_output / num_return_sequences seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) seq_length = getattr(self.model_tester, "text_seq_length", seq_length) config = config.text_config if hasattr(config, "text_config") else config - num_sequences_in_output = batch_size * num_return_sequences gen_len = ( output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length From a03703738090ca84aa273b9e20369246f3fb3b89 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 09:45:00 +0000 Subject: [PATCH 09/16] more deletions :D --- tests/generation/test_utils.py | 2 +- tests/models/idefics/test_modeling_idefics.py | 7 --- .../models/idefics2/test_modeling_idefics2.py | 45 ---------------- .../models/idefics3/test_modeling_idefics3.py | 45 ---------------- tests/models/mamba2/test_modeling_mamba2.py | 2 +- tests/models/moshi/test_modeling_moshi.py | 53 ------------------- 6 files changed, 2 insertions(+), 152 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 02d7c94a168f..810d22c47cb5 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1546,7 +1546,7 @@ def test_past_key_values_format(self): @pytest.mark.generate @parameterized.expand([(1,), (2,)]) def test_generate_from_inputs_embeds(self, num_beams): - """Tests that we can generate from `inputs_embeds` instead of `input_ids` in LLMs, VLMs, etc""" + """Tests that we can generate from `inputs_embeds` instead of `input_ids` in LLMs, VLMs, etc""" # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids` # if fails, you should probably update the `prepare_inputs_for_generation` function for model_class in self.all_generative_model_classes: diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index c2f0ef8ccd01..d19d10932bfc 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -770,13 +770,6 @@ def test_contrastive_generate_low_memory(self): def test_custom_4d_attention_mask(self): pass - @unittest.skip( - reason="IDEFICS has specific requirements for working with inputs embeds like passing also the ids and pixels" - ) - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds_decoder_only(self, num_beams): - pass - @unittest.skip(reason="IDEFICS cannot compile due to dynamic control flow when checking inputs") def test_generate_compile_fullgraph(self): pass diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 5790d21c5348..f33a9572ebe8 
100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -20,7 +20,6 @@ import unittest from io import BytesIO -import pytest import requests from transformers import ( @@ -412,50 +411,6 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): def test_flash_attn_2_fp32_ln(self): pass - @pytest.mark.generate - def test_generate_from_inputs_embeds_decoder_only(self): - # overwrite because IDEFICS needs ids and embeds at the input to be not None - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - - # Ignore: - # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, - # which would cause a mismatch), - config.pad_token_id = config.eos_token_id = -1 - config.is_decoder = True - model = model_class(config).to(torch_device).eval() - input_ids = inputs_dict.pop("input_ids") - - # Traditional way of generating text - outputs_from_ids = model.generate( - input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True - ) - self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) - inputs_embeds = model.get_input_embeddings()(input_ids) - outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) - - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the - # same, but the logits will almost surely be different) - random_embeds = torch.rand_like(inputs_embeds) - outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - for i in range(len(outputs_from_rand_embeds.scores)): - self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - # We need to override as we need to prepare such that the image token is the last token def test_resize_tokens_embeddings(self): (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index 3bb32c869913..5dc352d22fe0 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -19,7 +19,6 @@ import unittest from io import BytesIO -import pytest import requests from transformers import ( @@ -359,50 +358,6 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): def test_flash_attn_2_fp32_ln(self): pass - @pytest.mark.generate - def test_generate_from_inputs_embeds_decoder_only(self): - # overwrite because IDEFICS needs ids and embeds at the input to be not None - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - - # Ignore: - # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, - # which would cause a mismatch), - config.pad_token_id = config.eos_token_id = -1 - config.is_decoder = True - model = model_class(config).to(torch_device).eval() - input_ids = inputs_dict.pop("input_ids") - - # Traditional 
way of generating text - outputs_from_ids = model.generate( - input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True - ) - self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) - inputs_embeds = model.get_input_embeddings()(input_ids) - outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) - - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the - # same, but the logits will almost surely be different) - random_embeds = torch.rand_like(inputs_embeds) - outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - for i in range(len(outputs_from_rand_embeds.scores)): - self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - # We need to override as we need to prepare such that the image token is the last token def test_resize_tokens_embeddings(self): (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py index cb859ef5327b..6cb88192bc19 100644 --- a/tests/models/mamba2/test_modeling_mamba2.py +++ b/tests/models/mamba2/test_modeling_mamba2.py @@ -205,7 +205,7 @@ def test_generate_without_input_ids(self): @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds_decoder_only(self, num_beams): + def test_generate_from_inputs_embeds(self, num_beams): pass @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index 7bf95ea873d9..082ef425e7c8 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -655,59 +655,6 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) - @pytest.mark.generate - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds_decoder_only(self, num_beams): - for model_class in self.all_generative_model_classes: - config, input_ids, _, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).to(torch_device).eval() - generation_kwargs = { - "return_dict_in_generate": True, - "output_scores": True, - "num_beams": num_beams, - "do_sample": False, - } - - # Traditional way of generating text - outputs_from_ids = model.generate(input_ids, max_new_tokens=5, **generation_kwargs, **inputs_dict) - self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) - inputs_embeds = model.get_input_embeddings()(input_ids) - outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - **generation_kwargs, - **inputs_dict, - ) - - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the - # same, 
but the logits will almost surely be different) - random_embeds = torch.rand_like(inputs_embeds) - outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - **generation_kwargs, - **inputs_dict, - ) - for i in range(len(outputs_from_rand_embeds.scores)): - self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - - # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same - outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, - max_new_tokens=5, - **generation_kwargs, - **inputs_dict, - ) - self.assertListEqual( - outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), - outputs_from_embeds_wo_ids.sequences.tolist(), - ) - @unittest.skip(reason="Continuing from past key values is not straightforward as we're dealing with 3 inputs") def test_generate_continue_from_past_key_values(self): pass From 02a32fb7a27becd407029c332f439efad63c6c76 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 09:56:59 +0000 Subject: [PATCH 10/16] tmp commit --- tests/generation/test_utils.py | 30 ++++++++------------- tests/models/mamba2/test_modeling_mamba2.py | 4 +-- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 810d22c47cb5..cdcf19a53492 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1544,20 +1544,16 @@ def test_past_key_values_format(self): ) @pytest.mark.generate - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds(self, num_beams): + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): """Tests that we can generate from `inputs_embeds` instead of `input_ids` in LLMs, VLMs, etc""" # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids` # if fails, you should probably update the `prepare_inputs_for_generation` function for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - # Ignore: - # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, - # which would cause a mismatch), - config.pad_token_id = config.eos_token_id = -1 - # b) embedding scaling, the scaling factor applied after embeding from input_ids (requires knowledge of the - # variable that holds the scaling factor, which is model-dependent) + # Ignore embedding scaling, the scaling factor applied after embeding from input_ids (requires knowledge + # of the variable that holds the scaling factor, which is model-dependent) if hasattr(config, "scale_embedding"): config.scale_embedding = False @@ -1578,19 +1574,18 @@ def test_generate_from_inputs_embeds(self, num_beams): "output_scores": True, "num_beams": num_beams, "do_sample": False, + "max_new_tokens": 5, + "min_new_tokens": 5, # generate exactly 5 tokens } # Traditional way of generating text - outputs_from_ids = model.generate(input_ids, max_new_tokens=5, **generation_kwargs) + outputs_from_ids = model.generate(input_ids, **generation_kwargs, **inputs_dict) self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) inputs_embeds = model.get_input_embeddings()(input_ids) outputs_from_embeds = 
model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - **generation_kwargs, + input_ids, inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict ) self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) @@ -1598,17 +1593,14 @@ def test_generate_from_inputs_embeds(self, num_beams): # same, but the logits will almost surely be different) random_embeds = torch.rand_like(inputs_embeds) outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - **generation_kwargs, + input_ids, inputs_embeds=random_embeds, **generation_kwargs, **inputs_dict ) for i in range(len(outputs_from_rand_embeds.scores)): self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, max_new_tokens=5, **generation_kwargs + inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict ) self.assertListEqual( outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), @@ -1917,7 +1909,7 @@ def test_generate_with_quant_cache(self): @pytest.mark.generate @require_torch_gpu @slow - def test_generate_compile(self, name, end_to_end): + def test_generate_compile(self, _, end_to_end): """ Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. Tests end-to-end compilation and forward pass compilation only. diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py index 6cb88192bc19..9b3a9563b58d 100644 --- a/tests/models/mamba2/test_modeling_mamba2.py +++ b/tests/models/mamba2/test_modeling_mamba2.py @@ -204,8 +204,8 @@ def test_generate_without_input_ids(self): pass @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds(self, num_beams): + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): pass @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") From 08a59b247ea1e52dcc9eda3ae0d5a36b7388b568 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 11:38:49 +0000 Subject: [PATCH 11/16] for VLMs too --- .../modeling_llava_next_video.py | 3 +- .../modular_llava_next_video.py | 3 +- .../modeling_llava_onevision.py | 3 +- .../video_llava/modeling_video_llava.py | 3 +- tests/generation/test_utils.py | 60 +++++++++++++------ .../models/idefics2/test_modeling_idefics2.py | 33 ++++++++++ tests/models/moshi/test_modeling_moshi.py | 2 +- 7 files changed, 83 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 44b372535d70..95618e0b6e12 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -911,7 +911,8 @@ def forward( if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, " + "and must specify 
either one" ) legacy_processing = False diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index e9974e920493..c6f5d708abbc 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -424,7 +424,8 @@ def forward( if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, " + "and must specify either one" ) legacy_processing = False diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 946688bfcf07..bb6a8a10f0b4 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -657,7 +657,8 @@ def forward( if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values/pixel_values_videos and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, " + "and must specify either one" ) if inputs_embeds is None: diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 30f82e45056c..1f02cf583966 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -534,7 +534,8 @@ def forward( if (pixel_values_images is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values_images`/`pixel_values_videos` and `inputs_embeds` at the same " + "time, and must specify either one" ) legacy_processing = False diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index cdcf19a53492..89db3d562257 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1552,22 +1552,44 @@ def test_generate_from_inputs_embeds(self, _, num_beams): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - # Ignore embedding scaling, the scaling factor applied after embeding from input_ids (requires knowledge - # of the variable that holds the scaling factor, which is model-dependent) - if hasattr(config, "scale_embedding"): - config.scale_embedding = False - # This test is for decoder-only models (encoder-decoder models have native input embeddings support in the # decoder) if config.is_encoder_decoder: continue + config.is_decoder = True # Skip models without explicit support - config.is_decoder = True model = model_class(config).to(torch_device).eval() if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters.keys(): continue + # There are a few exception patterns in this test: + # 1 - Some models can't generate without `input_ids`, when `inputs_embeds` are passed + requires_inputs_ids = any( + 
model_name in model_class.__name__.lower() for model_name in ["idefics", "qwen2vl"] + ) + # 2 - Complex `inputs_embeds` computation, i.e. the correct computation of inputs embeds is more complex + # than calling the embedding layer with `input_ids`. Subcases of this exception: + # 2.A - Ignore `scale_embedding`, if the model supports it (it is controlled by a model-dependent flag) + if hasattr(config, "scale_embedding"): + config.scale_embedding = False + # 2.B - Some VLMs assume `inputs_embeds` and `pixel_values` are mutually exclusive AND fall in the + # exception above (complex `inputs_embeds` computation). Popping `pixel_values` allow us to run the + # checks without adding test complexity. Ditto for `pixel_values_videos` and `pixel_values_images` + pixel_values_is_mutually_exclusive = any( + model_name in model_class.__name__.lower() + for model_name in ["llava", "idefics2", "idefics3", "mllama", "paligemma"] + ) + if pixel_values_is_mutually_exclusive: + inputs_dict.pop("pixel_values", None) + inputs_dict.pop("pixel_values_videos", None) + inputs_dict.pop("pixel_values_images", None) + # 2.C - No easy fix, let's skip the check that compares the outputs from `input_ids` and `inputs_embeds` + has_complex_embeds_computation = any( + model_name in model_class.__name__.lower() for model_name in ["moshi"] + ) + + # Traditional way of generating text input_ids = inputs_dict.pop("input_ids") generation_kwargs = { "return_dict_in_generate": True, @@ -1577,19 +1599,19 @@ def test_generate_from_inputs_embeds(self, _, num_beams): "max_new_tokens": 5, "min_new_tokens": 5, # generate exactly 5 tokens } - - # Traditional way of generating text outputs_from_ids = model.generate(input_ids, **generation_kwargs, **inputs_dict) self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) + # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output). + # The output of the two calls should be the same. 
inputs_embeds = model.get_input_embeddings()(input_ids) outputs_from_embeds = model.generate( input_ids, inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict ) - self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) + if not has_complex_embeds_computation: + self._check_similar_generate_outputs(outputs_from_ids, outputs_from_embeds) - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the + # If we pass different inputs_embeds, we should get different outputs (the output text may be the # same, but the logits will almost surely be different) random_embeds = torch.rand_like(inputs_embeds) outputs_from_rand_embeds = model.generate( @@ -1598,14 +1620,14 @@ def test_generate_from_inputs_embeds(self, _, num_beams): for i in range(len(outputs_from_rand_embeds.scores)): self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same - outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict - ) - self.assertListEqual( - outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), - outputs_from_embeds_wo_ids.sequences.tolist(), - ) + # input_ids is not a required input on most models -- if we don't pass it, the newly generated tokens will + # be the same + if not requires_inputs_ids: + outputs_from_embeds_wo_ids = model.generate( + inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict + ) + outputs_from_embeds.sequences = outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :] + self._check_similar_generate_outputs(outputs_from_embeds_wo_ids, outputs_from_embeds) @pytest.mark.generate def test_generate_from_inputs_embeds_with_static_cache(self): diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index f33a9572ebe8..042fecf4bd25 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -195,6 +195,10 @@ def test_inputs_embeds(): def test_inputs_embeds_matches_input_ids(self): pass + @unittest.skip(reason="Model does not support padding right") + def test_flash_attn_2_generate_padding_right(self): + pass + @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -385,6 +389,10 @@ def setUp(self): def test_inputs_embeds(): pass + @unittest.skip(reason="Model does not support padding right") + def test_flash_attn_2_generate_padding_right(self): + pass + @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -526,6 +534,31 @@ def test_resize_embeddings_untied(self): # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) + def test_inputs_embeds_matches_input_ids_with_generate(self): + # overwrite because IDEFICS needs ids and embeds at the input to be not None + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 + + wte = 
model.get_input_embeddings() + + input_ids = inputs["input_ids"] + # some models infer position ids/attn mask differently when input ids + # by check if pad_token let's make sure no padding is in input ids + not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 + input_ids[input_ids == pad_token_id] = not_pad_token_id + del inputs["input_ids"] + inputs_embeds = wte(input_ids) + out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2) + out_embeds = model.generate(input_ids=input_ids, inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) + + self.assertTrue(torch.allclose(out_embeds, out_ids)) + @require_torch class Idefics2ForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index 082ef425e7c8..4d138193a191 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -560,7 +560,7 @@ def _get_input_ids_and_config(self, batch_size=2): return config, input_ids, attention_mask, inputs_dict def prepare_config_and_inputs_for_generate(self, batch_size=2): - config, filtered_inputs_dict = super().prepare_config_and_inputs_for_generate() + config, filtered_inputs_dict = super().prepare_config_and_inputs_for_generate(batch_size=batch_size) # Make sure we only return `input_ids`. # Note that audio_codes will still be generated internally, so the ability to test audio codes is still there. From 9b17c4bad7cac89979701b4c1890f2f740c3297f Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 12:24:50 +0000 Subject: [PATCH 12/16] fix _check_outputs --- tests/generation/test_utils.py | 60 ++++++++++++++++------- tests/models/moshi/test_modeling_moshi.py | 6 ++- 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 89db3d562257..af858c53027f 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -607,7 +607,12 @@ def test_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) + self._check_outputs( + output_generate, + model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], + ) @pytest.mark.generate def test_beam_search_generate_dict_outputs_use_cache(self): @@ -644,7 +649,11 @@ def test_beam_search_generate_dict_outputs_use_cache(self): ) self._check_outputs( - output_generate, model.config, use_cache=True, num_return_sequences=beam_kwargs["num_beams"] + output_generate, + model.config, + use_cache=True, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], ) @require_accelerate @@ -722,7 +731,12 @@ def test_beam_sample_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSampleDecoderOnlyOutput) - self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) + self._check_outputs( + output_generate, + model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], + ) @pytest.mark.generate def test_generate_without_input_ids(self): @@ -807,7 +821,12 @@ def test_group_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, 
BeamSearchDecoderOnlyOutput) - self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) + self._check_outputs( + output_generate, + model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], + ) # TODO: @gante check why it is flaky @is_flaky() @@ -909,7 +928,12 @@ def test_constrained_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) + self._check_outputs( + output_generate, + model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], + ) @pytest.mark.generate def test_contrastive_generate(self): @@ -2140,9 +2164,9 @@ def test_eager_matches_fa2_generate(self): # check whether we still need the overwrites self._test_attention_implementation("flash_attention_2") - def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1): - num_sequences_in_output = output.sequences.shape[0] - batch_size = num_sequences_in_output / num_return_sequences + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): + input_batch_size = int(output.sequences.shape[0] / num_return_sequences) + internal_batch_size = input_batch_size * num_beams seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) @@ -2159,19 +2183,21 @@ def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1 seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config) # unprocessed logits - self._check_logits(num_sequences_in_output, output.logits, config=config) + self._check_logits(internal_batch_size, output.logits, config=config) # Attentions if self.has_attentions: if config.is_encoder_decoder: # encoder - self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + self._check_encoder_attention_for_generate( + output.encoder_attentions, input_batch_size, config, seq_length + ) # decoder self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_attentions, min_length=1, max_length=output.sequences.shape[-1], @@ -2183,7 +2209,7 @@ def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1 attentions = output.attentions if not use_cache else output.attentions[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, attentions=attentions, min_length=min_length, max_length=output.sequences.shape[-1], @@ -2195,12 +2221,12 @@ def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1 if config.is_encoder_decoder: # encoder self._check_encoder_hidden_states_for_generate( - output.encoder_hidden_states, batch_size, config, seq_length + output.encoder_hidden_states, input_batch_size, config, seq_length ) # decoder self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_hidden_states, min_length=1, max_length=output.sequences.shape[-1], @@ -2212,7 +2238,7 @@ def _check_outputs(self, output, 
config, use_cache=False, num_return_sequences=1 hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, hidden_states, min_length=min_length, max_length=output.sequences.shape[-1], @@ -2243,7 +2269,7 @@ def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1 past_key_values = output.past_key_values past_sequence_length = output.sequences.shape[-1] - 1 self._check_past_key_values_for_generate( - num_sequences_in_output, + internal_batch_size, past_key_values, seq_length=past_sequence_length, config=config, diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index 4d138193a191..7d4b855c10d8 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -591,9 +591,11 @@ def _check_hidden_states_for_generate( [expected_shape] * len(iter_hidden_states), ) - def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1): + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): # Overwrite because the generate method actually alway uses `inputs_embeds` so `use_cache` is always `True` - super()._check_outputs(output, config, use_cache=True, num_return_sequences=num_return_sequences) + super()._check_outputs( + output, config, use_cache=True, num_return_sequences=num_return_sequences, num_beams=num_beams + ) def _check_hidden_states_for_generate( self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 From 4a24d7e1bccd8306d6035c6ccbbcee8acfc0e9bc Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 12:29:09 +0000 Subject: [PATCH 13/16] test nit --- tests/generation/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index af858c53027f..a042875235d9 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -2166,7 +2166,7 @@ def test_eager_matches_fa2_generate(self): def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): input_batch_size = int(output.sequences.shape[0] / num_return_sequences) - internal_batch_size = input_batch_size * num_beams + internal_batch_size = input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) From e50df32a583fbd5a6e3cea179a173e079e8514a6 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 12:36:18 +0000 Subject: [PATCH 14/16] make fixup --- tests/generation/test_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index a042875235d9..96348b1c51db 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -2166,7 +2166,9 @@ def test_eager_matches_fa2_generate(self): def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): input_batch_size = int(output.sequences.shape[0] / num_return_sequences) - internal_batch_size = input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences + internal_batch_size = ( + input_batch_size * num_beams if num_beams > 1 
else input_batch_size * num_return_sequences + ) seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) From 81443f5b7e170928bf6930e3427cb7da8f4e56bf Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 13:30:25 +0000 Subject: [PATCH 15/16] fix another flaky --- tests/generation/test_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 96348b1c51db..bc8d140da595 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -2048,6 +2048,9 @@ def test_assisted_decoding_with_num_logits_to_keep(self): self.skipTest(reason="Stateful models don't support assisted generation") config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) + # NOTE: assisted generation only works with cache on at the moment. + if not hasattr(config, "use_cache"): + self.skipTest(reason=f"{model_class.__name__} doesn't support caching") config.use_cache = True config.is_decoder = True @@ -2064,7 +2067,6 @@ def test_assisted_decoding_with_num_logits_to_keep(self): "output_scores": True, } - assistant_model.generation_config.assistant_confidence_threshold = None # Setting num_logits_to_keep at 0 keeps all logits (old behavior) with_all_logits = model.generate(**generation_kwargs, **inputs_dict, num_logits_to_keep=0) # By default, num_logits_to_keep is automatically set to 1 if not provided (new behavior) From 4fd197fdf4f716568be279b341ab324d0bedd1dd Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 14:17:25 +0000 Subject: [PATCH 16/16] test_generate_from_inputs_embeds -- handle missing attention mask --- src/transformers/generation/utils.py | 23 ++++++++++++------- .../models/musicgen/modeling_musicgen.py | 4 ++-- .../modeling_musicgen_melody.py | 4 ++-- tests/generation/test_utils.py | 5 +++- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 46ea679aca50..6e6d5b8bdce7 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -572,27 +572,34 @@ def _maybe_initialize_input_ids_for_generation( def _prepare_attention_mask_for_generation( self, - inputs: torch.Tensor, - pad_token_id: Optional[torch.Tensor], - eos_token_id: Optional[torch.Tensor], + inputs_tensor: torch.Tensor, + generation_config: GenerationConfig, + model_kwargs: Dict[str, Any], ) -> torch.LongTensor: + pad_token_id = generation_config._pad_token_tensor + eos_token_id = generation_config._eos_token_tensor + + # `input_ids` may be present in the model kwargs, instead of being the main input (e.g. 
multimodal model) + if "input_ids" in model_kwargs and model_kwargs["input_ids"].shape[1] > 0: + inputs_tensor = model_kwargs["input_ids"] + # No information for attention mask inference -> return default attention mask - default_attention_mask = torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device) + default_attention_mask = torch.ones(inputs_tensor.shape[:2], dtype=torch.long, device=inputs_tensor.device) if pad_token_id is None: return default_attention_mask - is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [torch.int, torch.long] + is_input_ids = len(inputs_tensor.shape) == 2 and inputs_tensor.dtype in [torch.int, torch.long] if not is_input_ids: return default_attention_mask is_pad_token_in_inputs = (pad_token_id is not None) and ( - isin_mps_friendly(elements=inputs, test_elements=pad_token_id).any() + isin_mps_friendly(elements=inputs_tensor, test_elements=pad_token_id).any() ) is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ~( isin_mps_friendly(elements=eos_token_id, test_elements=pad_token_id).any() ) can_infer_attention_mask = is_pad_token_in_inputs * is_pad_token_not_equal_to_eos_token_id - attention_mask_from_padding = inputs.ne(pad_token_id).long() + attention_mask_from_padding = inputs_tensor.ne(pad_token_id).long() attention_mask = ( attention_mask_from_padding * can_infer_attention_mask + default_attention_mask * ~can_infer_attention_mask @@ -2024,7 +2031,7 @@ def generate( if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor + inputs_tensor, generation_config, model_kwargs ) elif kwargs_has_attention_mask: # TODO (joao): generalize this check with other types of inputs diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index c18e1d1c9d86..109ddfb626d2 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -1562,7 +1562,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - input_ids, generation_config._pad_token_tensor, generation_config._eos_token_tensor + input_ids, generation_config, model_kwargs ) # 5. Prepare `max_length` depending on other stopping criteria. 
@@ -2578,7 +2578,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor + inputs_tensor, generation_config, model_kwargs ) if "encoder_outputs" not in model_kwargs: diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index d2f339afc414..61f2ce414e1d 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -1484,7 +1484,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - input_ids, generation_config._pad_token_tensor, generation_config._eos_token_tensor + input_ids, generation_config, model_kwargs ) # 5. Prepare `max_length` depending on other stopping criteria. @@ -2425,7 +2425,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor + inputs_tensor, generation_config, model_kwargs ) if "encoder_hidden_states" not in model_kwargs: diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index bc8d140da595..545b696d6737 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1612,6 +1612,9 @@ def test_generate_from_inputs_embeds(self, _, num_beams): has_complex_embeds_computation = any( model_name in model_class.__name__.lower() for model_name in ["moshi"] ) + # 3 - `inputs_dict` doesn't contain `attention_mask`. When `attention_mask` is not passed to generate, + # we infer it from `input_ids`. The last test case will fail if there is a pad token in the original input. + missing_attention_mask = "attention_mask" not in inputs_dict # Traditional way of generating text input_ids = inputs_dict.pop("input_ids") @@ -1646,7 +1649,7 @@ def test_generate_from_inputs_embeds(self, _, num_beams): # input_ids is not a required input on most models -- if we don't pass it, the newly generated tokens will # be the same - if not requires_inputs_ids: + if not (requires_inputs_ids or missing_attention_mask): outputs_from_embeds_wo_ids = model.generate( inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict )
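
The reworked `test_generate_from_inputs_embeds` above boils down to the usage pattern below. This is a minimal standalone sketch, not part of the patch itself, assuming a tiny decoder-only checkpoint such as `hf-internal-testing/tiny-random-gpt2` is available; any small causal LM whose input embeddings are a plain lookup table behaves the same way.

# Illustrative sketch of the behaviour the test checks (assumed checkpoint, not from this patch)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "hf-internal-testing/tiny-random-gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).eval()

inputs = tokenizer("Hello world", return_tensors="pt")
input_ids, attention_mask = inputs.input_ids, inputs.attention_mask

# Traditional call, from token ids
out_from_ids = model.generate(
    input_ids, attention_mask=attention_mask, max_new_tokens=5, do_sample=False
)

# Same call, but from embeddings; passing input_ids as well keeps the prompt in the returned sequences
inputs_embeds = model.get_input_embeddings()(input_ids)
out_from_embeds = model.generate(
    input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=5, do_sample=False
)

# When the embeddings are a plain lookup of input_ids, greedy decoding should produce the same sequences
print(torch.equal(out_from_ids, out_from_embeds))

When only `inputs_embeds` are passed, `generate` has no token ids left to infer an attention mask from, which is why the test skips the ids-free comparison when `attention_mask` is missing from `inputs_dict`, and why the final patch lets `_prepare_attention_mask_for_generation` fall back to `model_kwargs["input_ids"]` when it is present.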
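
As a quick sanity check on the batch-size arithmetic introduced in `_check_outputs` (patches 12-14), using assumed numbers: with an input batch of 2, `num_return_sequences=2` and `num_beams=3`, `output.sequences` has 2 * 2 = 4 rows, so `input_batch_size = 4 / 2 = 2`, while the per-step scores, attentions and hidden states are produced for `internal_batch_size = 2 * 3 = 6` beam hypotheses. With `num_beams=1` (sampling), the internal batch falls back to `2 * 2 = 4`, i.e. one row per returned sequence.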