From 64873abdfb7ffd3c798ae5298c53393d4086fd4a Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 15 Oct 2024 12:52:17 +0200 Subject: [PATCH 01/12] blip2 tests --- .../models/blip_2/modeling_blip_2.py | 37 ++- tests/generation/test_utils.py | 81 +++--- tests/models/blip_2/test_modeling_blip_2.py | 236 +++++++++++++++++- 3 files changed, 286 insertions(+), 68 deletions(-) diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 4b0ed4f71d9c..1cdcdff7fc6d 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -1774,8 +1774,12 @@ def forward( return_dict=return_dict, labels=labels, ) - loss = outputs.loss if return_dict else outputs[0] - logits = outputs.logits if return_dict else outputs[1] + if labels is not None: + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + else: + loss = None + logits = outputs.logits if return_dict else outputs[0] if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) @@ -2243,8 +2247,12 @@ def forward( return_dict=return_dict, labels=labels, ) - loss = outputs.loss if return_dict else outputs[0] - logits = outputs.logits if return_dict else outputs[1] + if labels is not None: + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + else: + loss = None + logits = outputs.logits if return_dict else outputs[0] if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) @@ -2341,24 +2349,11 @@ def generate( ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even 
though under the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - bos_tokens = ( - torch.LongTensor([[self.config.text_config.bos_token_id]]) - .repeat(batch_size, 1) - .to(image_embeds.device) - ) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 58259821cf7a..594d5a57ea26 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -92,6 +92,7 @@ class GenerationTesterMixin: + input_name = "input_ids" model_tester = None all_generative_model_classes = () max_new_tokens = 3 @@ -406,7 +407,7 @@ def _contrastive_generate( def test_greedy_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() output_generate = self._greedy_generate(model=model, inputs_dict=inputs_dict) @@ -420,7 +421,7 @@ def test_greedy_generate(self): def test_greedy_generate_dict_outputs(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() output_generate = self._greedy_generate( @@ -434,7 +435,7 @@ def test_greedy_generate_dict_outputs(self): use_cache=False, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: 
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) # Retrocompatibility check @@ -451,10 +452,10 @@ def test_greedy_generate_dict_outputs(self): def test_greedy_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] - if not hasattr(config, "use_cache"): - self.skipTest(reason=f"{model_class.__name__} doesn't support caching") + # if not hasattr(config, "use_cache") or not hasattr(config.get_text_config(), "use_cache"): + # self.skipTest(reason=f"{model_class.__name__} doesn't support caching") if any(model_name in model_class.__name__.lower() for model_name in ["rwkv"]): self.skipTest(reason="Won't fix: model with non-standard dictionary output shapes") @@ -471,7 +472,7 @@ def test_greedy_generate_dict_outputs_use_cache(self): use_cache=True, # Enable cache ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -482,12 +483,12 @@ def test_greedy_generate_dict_outputs_use_cache(self): def test_sample_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() output_generate = self._sample_generate(model=model, inputs_dict=inputs_dict, num_return_sequences=1) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 
1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -496,7 +497,7 @@ def test_sample_generate(self): def test_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() output_generate = self._sample_generate( @@ -511,7 +512,7 @@ def test_sample_generate_dict_output(self): use_cache=False, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) # Retrocompatibility check @@ -528,14 +529,14 @@ def test_sample_generate_dict_output(self): def test_beam_search_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() output_generate = self._beam_search_generate(model=model, inputs_dict=inputs_dict, beam_kwargs=beam_kwargs) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -544,7 +545,7 @@ def test_beam_search_generate(self): def test_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = 
model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -559,7 +560,7 @@ def test_beam_search_generate_dict_output(self): return_dict_in_generate=True, use_cache=False, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -578,7 +579,7 @@ def test_beam_search_generate_dict_output(self): def test_beam_search_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -602,7 +603,7 @@ def test_beam_search_generate_dict_outputs_use_cache(self): use_cache=True, # Enable cache ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -643,7 +644,7 @@ def test_model_parallel_beam_search(self): def test_beam_sample_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -653,7 +654,7 @@ def test_beam_sample_generate(self): beam_kwargs=beam_kwargs, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: 
self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -685,7 +686,7 @@ def test_beam_sample_generate(self): def test_beam_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -702,7 +703,7 @@ def test_beam_sample_generate_dict_output(self): use_cache=False, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -742,7 +743,7 @@ def test_generate_without_input_ids(self): def test_group_beam_search_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() # check `generate()` and `group_beam_search()` are equal @@ -752,7 +753,7 @@ def test_group_beam_search_generate(self): inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -765,7 +766,7 @@ def test_group_beam_search_generate(self): inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: 
self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -774,7 +775,7 @@ def test_group_beam_search_generate(self): def test_group_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_diverse_beam_kwargs() @@ -789,7 +790,7 @@ def test_group_beam_search_generate_dict_output(self): return_dict_in_generate=True, use_cache=False, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -810,7 +811,7 @@ def test_group_beam_search_generate_dict_output(self): def test_constrained_beam_search_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() @@ -831,7 +832,7 @@ def test_constrained_beam_search_generate(self): beam_kwargs=beam_kwargs, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -855,7 +856,7 @@ def test_constrained_beam_search_generate(self): beam_kwargs=beam_kwargs, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == 
self.max_new_tokens + main_input.shape[-1]) @@ -867,7 +868,7 @@ def test_constrained_beam_search_generate(self): def test_constrained_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() @@ -893,7 +894,7 @@ def test_constrained_beam_search_generate_dict_output(self): use_cache=False, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -919,7 +920,7 @@ def test_contrastive_generate(self): self.skipTest(reason="Won't fix: old model with different cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] # NOTE: contrastive search only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -933,7 +934,7 @@ def test_contrastive_generate(self): inputs_dict=inputs_dict, use_cache=True, # Enable cache ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -949,7 +950,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self): self.skipTest(reason="Won't fix: old model with different cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] # NOTE: contrastive search only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -968,7 +969,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self): use_cache=True, # Enable cache ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -1104,7 +1105,7 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1178,7 +1179,7 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1231,7 +1232,7 @@ def test_dola_decoding_sample(self): # enable cache if the model is not openai-gpt, xlnet, cpm, or xlm config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] # Encoder-decoder models are not supported if config.is_encoder_decoder: @@ -1288,7 +1289,7 @@ def test_assisted_decoding_sample(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] # NOTE: assisted generation only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -1834,7 +1835,7 @@ def test_generate_with_static_cache(self): self.skipTest(reason="This model does not support the static cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] if config.is_encoder_decoder: self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache") diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index cee5d710a85f..e1c175668e5b 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -19,6 +19,7 @@ import unittest import numpy as np +import pytest import requests from transformers import CONFIG_MAPPING, Blip2Config, Blip2QFormerConfig, Blip2VisionConfig @@ -390,7 +391,14 @@ def get_config(self): # this model tester uses a decoder-only language model (OPT) class Blip2ForConditionalGenerationDecoderOnlyModelTester: def __init__( - self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 + self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + text_kwargs=None, + is_training=True, + num_query_tokens=10, + image_token_index=4, ): if vision_kwargs is None: vision_kwargs = {} @@ -407,11 +415,21 @@ def __init__( self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests self.is_training = is_training self.num_query_tokens = num_query_tokens + self.image_token_index = image_token_index def prepare_config_and_inputs(self): _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_tokens = ( + torch.ones((input_ids.shape[0], self.num_query_tokens), device=torch_device, dtype=input_ids.dtype) + * self.image_token_index + ) + input_ids[input_ids == self.image_token_index] = 
self.text_model_tester.pad_token_id + input_ids = torch.cat([vision_tokens, input_ids], dim=-1) + vision_attention_mask = torch.ones_like(vision_tokens) + attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1) + config = self.get_config() return config, input_ids, attention_mask, pixel_values @@ -422,6 +440,7 @@ def get_config(self): qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), num_query_tokens=self.num_query_tokens, + image_token_index=self.image_token_index, ) def create_and_check_for_conditional_generation(self, config, input_ids, attention_mask, pixel_values): @@ -442,7 +461,6 @@ def prepare_config_and_inputs_for_common(self): "pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask, - "labels": input_ids, } return config, inputs_dict @@ -450,6 +468,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else () fx_compatible = False test_head_masking = False test_pruning = False @@ -521,6 +540,203 @@ def test_model_from_pretrained(self): model = Blip2ForConditionalGeneration.from_pretrained(model_name) self.assertIsNotNone(model) + # overwrite because BLIP internally calls LM.generate() with embeds thus it cannot operate in no cache format + def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + use_cache = True # force this to be True in case False is passed + + batch_size = main_input.shape[0] + seq_length = main_input.shape[-1] + config = config.text_config if hasattr(config, "text_config") else config + num_sequences_in_output = batch_size * num_return_sequences + + gen_len = ( + output.sequences.shape[-1] - 1 if 
config.is_encoder_decoder else output.sequences.shape[-1] - seq_length + ) + + # in some models we subsample the sequence length in inner layers + if hasattr(self.model_tester, "get_subsampled_output_lengths"): + seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) + + # scores + self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + + # unprocessed logits + self._check_logits(num_sequences_in_output, output.logits, config=config) + + # Attentions + if self.has_attentions: + if config.is_encoder_decoder: + # encoder + self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + # decoder + self._check_attentions_for_generate( + num_sequences_in_output, + output.decoder_attentions, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + attentions = output.attentions if not use_cache else output.attentions[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_attentions_for_generate( + num_sequences_in_output, + attentions=attentions, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Hidden States + if config.is_encoder_decoder: + # encoder + self._check_encoder_hidden_states_for_generate( + output.encoder_hidden_states, batch_size, config, seq_length + ) + + # decoder + self._check_hidden_states_for_generate( + num_sequences_in_output, + output.decoder_hidden_states, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_hidden_states_for_generate( + num_sequences_in_output, 
+ hidden_states, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Past Key Value States -- a few notes here: + # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1" + # 2. We ignore models that have unique cache structures (e.g. mamba) or are in need of refactoring to match the + # standard cache format (e.g. gptbigcode) + models_without_standard_cache = ( + "ctrl", + "fsmt", + "gptbigcode", + "mega", + "reformer", + "jamba", + "mamba", + "xlnet", + "zamba", + ) + has_standard_cache = not any( + model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache + ) + if has_standard_cache: + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + num_sequences_in_output, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) + elif use_cache is False: + self.assertTrue(output.past_key_values is None) + + # overwrite because BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present + @pytest.mark.generate + def test_left_padding_compatibility(self): + # NOTE: left-padding results in small numerical differences. This is expected. 
+ # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + # First, filter out models that don't support left padding + # - The model must have generative capabilities + if len(self.all_generative_model_classes) == 0: + self.skipTest(reason="No generative architecture available for this model.") + + # - The model must support padding + if not self.has_attentions: + self.skipTest(reason="This model doesn't support padding.") + + # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) + decoder_only_classes = [] + for model_class in self.all_generative_model_classes: + config, _ = self.prepare_config_and_inputs_for_generate() + if config.is_encoder_decoder: + continue + else: + decoder_only_classes.append(model_class) + if len(decoder_only_classes) == 0: + self.skipTest(reason="No decoder-only architecture available for this model.") + + # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't + # added support for it yet. We skip these models for now. + has_encoder_attributes = any( + attr_name + for attr_name in config.to_dict().keys() + if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" + ) + if has_encoder_attributes: + self.skipTest( + reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." 
+ ) + + # Then, test left-padding + def _prepare_model_kwargs(input_ids, attention_mask, signature): + model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in decoder_only_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict.get("attention_mask") + pixel_values = inputs_dict["pixel_values"] + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + model = model_class(config).to(torch_device).eval() + signature = inspect.signature(model.forward).parameters.keys() + + # no cache as some models require special cache classes to be init outside forward + model.generation_config.use_cache = False + + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) + next_logits_wo_padding = model(**model_kwargs, pixel_values=pixel_values).logits[:, -1, :] + + # With left-padding (length 32) + # can hardcode pad_token to be 0 as we'll do attn masking anyway + pad_token_id = ( + config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 + ) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) + next_logits_with_padding = model(**model_kwargs, 
pixel_values=pixel_values).logits[:, -1, :] + + # They should result in very similar logits + self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) + + @unittest.skip("BLIP2 cannot generate on;y from input ids, and requires pixel values in all cases to be present") + def test_generate_from_inputs_embeds_decoder_only(self): + pass + # this class is based on `T5ModelTester` found in tests/models/t5/test_modeling_t5.py class Blip2TextModelTester: @@ -692,7 +908,6 @@ def prepare_config_and_inputs_for_common(self): "attention_mask": attention_mask, "decoder_input_ids": decoder_input_ids, "decoder_attention_mask": decoder_attention_mask, - "labels": labels, } return config, inputs_dict @@ -718,9 +933,16 @@ class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixi # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "VisualQuestionAnsweringPipelineTests": + if pipeline_test_case_name == "VisualQuestionAnsweringPipelineTests": # Get `RuntimeError: "LayerNormKernelImpl" not implemented for 'Half'`. 
return True @@ -811,7 +1033,7 @@ def test_get_text_features(self): def test_get_image_features(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - keys_to_pop = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"] + keys_to_pop = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"] for key in keys_to_pop: inputs_dict.pop(key) @@ -831,7 +1053,7 @@ def test_get_image_features(self): def test_get_qformer_features(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - keys_to_pop = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"] + keys_to_pop = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"] for key in keys_to_pop: inputs_dict.pop(key) From 23d2e15f469cca0f1027bc3300019bdaaa115f85 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 15 Oct 2024 13:39:11 +0200 Subject: [PATCH 02/12] instructblips --- .../instructblip/modeling_instructblip.py | 25 +- .../modeling_instructblipvideo.py | 25 +- .../test_modeling_instructblip.py | 227 ++++++++++++++++- .../test_modeling_instructblipvideo.py | 228 +++++++++++++++++- 4 files changed, 460 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index de4e84b82f83..b99dda4b834f 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -1628,27 +1628,10 @@ def generate( ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even though under 
the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - # the InstructBLIP authors used inconsistent tokenizer/model files during training, - # with the tokenizer's bos token being set to which has ID=2, - # whereas the model's text config has bos token id = 0 - bos_token_id = ( - 2 - if self.config.text_config.architectures[0] == "LLaMAForCausalLM" - else self.config.text_config.bos_token_id - ) - bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index a300268ed713..6c3eaf1ccce8 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -1663,27 +1663,10 @@ def generate( ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even though under the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - # the InstructBLIP authors used inconsistent tokenizer/model files during training, - # with the tokenizer's bos token being set to which has ID=2, 
- # whereas the model's text config has bos token id = 0 - bos_token_id = ( - 2 - if self.config.text_config.architectures[0] == "LLaMAForCausalLM" - else self.config.text_config.bos_token_id - ) - bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 8292567334bf..c3406e7682b7 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -19,6 +19,7 @@ import unittest import numpy as np +import pytest import requests from transformers import ( @@ -319,7 +320,7 @@ def __init__( hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, - max_position_embeddings=20, + max_position_embeddings=100, eos_token_id=2, pad_token_id=1, bos_token_id=0, @@ -383,7 +384,14 @@ def get_config(self): # this model tester uses a decoder-only language model (OPT) class InstructBlipForConditionalGenerationDecoderOnlyModelTester: def __init__( - self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 + self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + text_kwargs=None, + is_training=True, + num_query_tokens=10, + image_token_index=4, ): if vision_kwargs is None: vision_kwargs = {} @@ -400,6 +408,7 @@ def __init__( self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests self.is_training = is_training self.num_query_tokens = num_query_tokens + self.image_token_index = image_token_index def prepare_config_and_inputs(self): _, pixel_values 
= self.vision_model_tester.prepare_config_and_inputs() @@ -407,6 +416,14 @@ def prepare_config_and_inputs(self): _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() config = self.get_config() + vision_tokens = ( + torch.ones((input_ids.shape[0], self.num_query_tokens), device=torch_device, dtype=input_ids.dtype) + * self.image_token_index + ) + input_ids[input_ids == self.image_token_index] = self.text_model_tester.pad_token_id + input_ids = torch.cat([vision_tokens, input_ids], dim=-1) + vision_attention_mask = torch.ones_like(vision_tokens) + attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1) return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values @@ -416,6 +433,7 @@ def get_config(self): qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), num_query_tokens=self.num_query_tokens, + image_token_index=self.image_token_index, ) def create_and_check_for_conditional_generation( @@ -454,6 +472,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else () fx_compatible = False test_head_masking = False test_pruning = False @@ -529,6 +548,210 @@ def test_model_from_pretrained(self): model = InstructBlipForConditionalGeneration.from_pretrained(model_name) self.assertIsNotNone(model) + # overwrite because InstructBLIP internally calls LM.generate() with embeds thus it cannot operate in no cache format + def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + use_cache = True # force this to be True in case False is passed + + batch_size = main_input.shape[0] + seq_length 
= main_input.shape[-1] + config = config.text_config if hasattr(config, "text_config") else config + num_sequences_in_output = batch_size * num_return_sequences + + gen_len = ( + output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length + ) + + # in some models we subsample the sequence length in inner layers + if hasattr(self.model_tester, "get_subsampled_output_lengths"): + seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) + + # scores + self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + + # unprocessed logits + self._check_logits(num_sequences_in_output, output.logits, config=config) + + # Attentions + if self.has_attentions: + if config.is_encoder_decoder: + # encoder + self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + # decoder + self._check_attentions_for_generate( + num_sequences_in_output, + output.decoder_attentions, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + attentions = output.attentions if not use_cache else output.attentions[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_attentions_for_generate( + num_sequences_in_output, + attentions=attentions, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Hidden States + if config.is_encoder_decoder: + # encoder + self._check_encoder_hidden_states_for_generate( + output.encoder_hidden_states, batch_size, config, seq_length + ) + + # decoder + self._check_hidden_states_for_generate( + num_sequences_in_output, + output.decoder_hidden_states, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + 
hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_hidden_states_for_generate( + num_sequences_in_output, + hidden_states, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Past Key Value States -- a few notes here: + # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1" + # 2. We ignore models that have unique cache structures (e.g. mamba) or are in need of refatoring to match the + # standard cache format (e.g.gptbigcode ) + models_without_standard_cache = ( + "ctrl", + "fsmt", + "gptbigcode", + "mega", + "reformer", + "jamba", + "mamba", + "xlnet", + "zamba", + ) + has_standard_cache = not any( + model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache + ) + if has_standard_cache: + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + num_sequences_in_output, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) + elif use_cache is False: + self.assertTrue(output.past_key_values is None) + + # overwrite because InstructBLIP cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present + @pytest.mark.generate + def test_left_padding_compatibility(self): + # NOTE: left-padding results in small numerical differences. This is expected. 
+ # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + # First, filter out models that don't support left padding + # - The model must have generative capabilities + if len(self.all_generative_model_classes) == 0: + self.skipTest(reason="No generative architecture available for this model.") + + # - The model must support padding + if not self.has_attentions: + self.skipTest(reason="This model doesn't support padding.") + + # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) + decoder_only_classes = [] + for model_class in self.all_generative_model_classes: + config, _ = self.prepare_config_and_inputs_for_generate() + if config.is_encoder_decoder: + continue + else: + decoder_only_classes.append(model_class) + if len(decoder_only_classes) == 0: + self.skipTest(reason="No decoder-only architecture available for this model.") + + # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't + # added support for it yet. We skip these models for now. + has_encoder_attributes = any( + attr_name + for attr_name in config.to_dict().keys() + if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" + ) + if has_encoder_attributes: + self.skipTest( + reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." 
+ ) + + # Then, test left-padding + def _prepare_model_kwargs(input_ids, attention_mask, signature): + model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in decoder_only_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict.get("attention_mask") + pixel_values = inputs_dict["pixel_values"] + qformer_input_ids = inputs_dict["qformer_input_ids"] + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + model = model_class(config).to(torch_device).eval() + signature = inspect.signature(model.forward).parameters.keys() + + # no cache as some models require special cache classes to be init outside forward + model.generation_config.use_cache = False + + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) + next_logits_wo_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # With left-padding (length 32) + # can hardcode pad_token to be 0 as we'll do attn masking anyway + pad_token_id = ( + config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 + ) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + model_kwargs = _prepare_model_kwargs(padded_input_ids, 
padded_attention_mask, signature) + next_logits_with_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # They should result in very similar logits + self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) + + @unittest.skip( + "InstructBLIP cannot generate on;y from input ids, and requires pixel values in all cases to be present" + ) + def test_generate_from_inputs_embeds_decoder_only(self): + pass + # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 8a9326c22ac1..08f2c861b630 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -19,6 +19,7 @@ import unittest import numpy as np +import pytest from huggingface_hub import hf_hub_download from transformers import ( @@ -397,7 +398,14 @@ def get_config(self): # this model tester uses a decoder-only language model (OPT) class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester: def __init__( - self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 + self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + text_kwargs=None, + is_training=True, + num_query_tokens=10, + video_token_index=4, ): if vision_kwargs is None: vision_kwargs = {} @@ -414,6 +422,7 @@ def __init__( self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests self.is_training = is_training self.num_query_tokens = num_query_tokens + self.video_token_index = video_token_index def prepare_config_and_inputs(self): _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() @@ -423,6 +432,17 @@ def prepare_config_and_inputs(self): _, c, h, w = pixel_values.shape 
pixel_values = pixel_values.reshape(-1, frames, c, h, w) + vision_tokens = ( + torch.ones( + (input_ids.shape[0], self.num_query_tokens * frames), device=torch_device, dtype=input_ids.dtype + ) + * self.video_token_index + ) + input_ids[input_ids == self.video_token_index] = self.text_model_tester.pad_token_id + input_ids = torch.cat([vision_tokens, input_ids], dim=-1) + vision_attention_mask = torch.ones_like(vision_tokens) + attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1) + config = self.get_config() return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values @@ -433,6 +453,7 @@ def get_config(self): qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), num_query_tokens=self.num_query_tokens, + video_token_index=self.video_token_index, ) def create_and_check_for_conditional_generation( @@ -475,6 +496,7 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest( ModelTesterMixin, GenerationTesterMixin, unittest.TestCase ): all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else () fx_compatible = False test_head_masking = False test_pruning = False @@ -550,6 +572,210 @@ def test_model_from_pretrained(self): model = InstructBlipVideoForConditionalGeneration.from_pretrained(model_name) self.assertIsNotNone(model) + # overwrite because InstructBLIPVideo internally calls LM.generate() with embeds thus it cannot operate in no cache format + def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + use_cache = True # force this to be True in case False is passed + + batch_size = main_input.shape[0] + seq_length = main_input.shape[-1] + config = config.text_config if hasattr(config, "text_config") else config + num_sequences_in_output = batch_size * 
num_return_sequences + + gen_len = ( + output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length + ) + + # in some models we subsample the sequence length in inner layers + if hasattr(self.model_tester, "get_subsampled_output_lengths"): + seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) + + # scores + self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + + # unprocessed logits + self._check_logits(num_sequences_in_output, output.logits, config=config) + + # Attentions + if self.has_attentions: + if config.is_encoder_decoder: + # encoder + self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + # decoder + self._check_attentions_for_generate( + num_sequences_in_output, + output.decoder_attentions, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + attentions = output.attentions if not use_cache else output.attentions[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_attentions_for_generate( + num_sequences_in_output, + attentions=attentions, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Hidden States + if config.is_encoder_decoder: + # encoder + self._check_encoder_hidden_states_for_generate( + output.encoder_hidden_states, batch_size, config, seq_length + ) + + # decoder + self._check_hidden_states_for_generate( + num_sequences_in_output, + output.decoder_hidden_states, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] + min_length = seq_length if not use_cache else seq_length 
+ 1 + self._check_hidden_states_for_generate( + num_sequences_in_output, + hidden_states, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Past Key Value States -- a few notes here: + # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1" + # 2. We ignore models that have unique cache structures (e.g. mamba) or are in need of refatoring to match the + # standard cache format (e.g.gptbigcode ) + models_without_standard_cache = ( + "ctrl", + "fsmt", + "gptbigcode", + "mega", + "reformer", + "jamba", + "mamba", + "xlnet", + "zamba", + ) + has_standard_cache = not any( + model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache + ) + if has_standard_cache: + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + num_sequences_in_output, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) + elif use_cache is False: + self.assertTrue(output.past_key_values is None) + + # overwrite because InstructBLIPVideo cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present + @pytest.mark.generate + def test_left_padding_compatibility(self): + # NOTE: left-padding results in small numerical differences. This is expected. 
+ # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + # First, filter out models that don't support left padding + # - The model must have generative capabilities + if len(self.all_generative_model_classes) == 0: + self.skipTest(reason="No generative architecture available for this model.") + + # - The model must support padding + if not self.has_attentions: + self.skipTest(reason="This model doesn't support padding.") + + # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) + decoder_only_classes = [] + for model_class in self.all_generative_model_classes: + config, _ = self.prepare_config_and_inputs_for_generate() + if config.is_encoder_decoder: + continue + else: + decoder_only_classes.append(model_class) + if len(decoder_only_classes) == 0: + self.skipTest(reason="No decoder-only architecture available for this model.") + + # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't + # added support for it yet. We skip these models for now. + has_encoder_attributes = any( + attr_name + for attr_name in config.to_dict().keys() + if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" + ) + if has_encoder_attributes: + self.skipTest( + reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." 
+ ) + + # Then, test left-padding + def _prepare_model_kwargs(input_ids, attention_mask, signature): + model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in decoder_only_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict.get("attention_mask") + pixel_values = inputs_dict["pixel_values"] + qformer_input_ids = inputs_dict["qformer_input_ids"] + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + model = model_class(config).to(torch_device).eval() + signature = inspect.signature(model.forward).parameters.keys() + + # no cache as some models require special cache classes to be init outside forward + model.generation_config.use_cache = False + + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) + next_logits_wo_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # With left-padding (length 32) + # can hardcode pad_token to be 0 as we'll do attn masking anyway + pad_token_id = ( + config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 + ) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + model_kwargs = _prepare_model_kwargs(padded_input_ids, 
padded_attention_mask, signature) + next_logits_with_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # They should result in very similar logits + self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) + + @unittest.skip( + "InstructBLIPVideo cannot generate on;y from input ids, and requires pixel values in all cases to be present" + ) + def test_generate_from_inputs_embeds_decoder_only(self): + pass + # We will verify our results on an image of cute cats def prepare_video(): From 8cf7507ff0760f596a97404e7ef9aaaec5c5d0e6 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 15 Oct 2024 13:44:03 +0200 Subject: [PATCH 03/12] copies --- .../modular_instructblipvideo.py | 25 +++---------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 2128f25df662..63c6c486854c 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -468,27 +468,10 @@ def generate( ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even though under the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - # the InstructBLIP authors used inconsistent tokenizer/model files during training, - # with the tokenizer's bos token being set to which has ID=2, - # whereas the model's text config has bos token id = 0 - 
bos_token_id = ( - 2 - if self.config.text_config.architectures[0] == "LLaMAForCausalLM" - else self.config.text_config.bos_token_id - ) - bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs From 8dcb4fbc570b3bc422e33965a7fb92b4e6839e9c Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 15 Oct 2024 14:15:50 +0200 Subject: [PATCH 04/12] fix slow tests --- tests/models/blip_2/test_modeling_blip_2.py | 9 +++++---- tests/models/instructblip/test_modeling_instructblip.py | 4 ++-- .../instructblipvideo/test_modeling_instructblipvideo.py | 2 +- tests/models/seamless_m4t/test_modeling_seamless_m4t.py | 1 + .../seamless_m4t_v2/test_modeling_seamless_m4t_v2.py | 1 + .../speech_to_text/test_modeling_speech_to_text.py | 1 + tests/models/whisper/test_modeling_whisper.py | 2 ++ 7 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index e1c175668e5b..4fa8b31cfc11 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -1652,6 +1652,7 @@ def test_inference_opt(self): generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() # Test output + print(predictions[0].tolist(), generated_text) self.assertEqual(predictions[0].tolist(), [2, 102, 693, 2828, 15, 5, 4105, 19, 10, 2335, 50118]) self.assertEqual("a woman sitting on the beach with a dog", generated_text) @@ -1666,9 +1667,9 @@ def test_inference_opt(self): # Test output self.assertEqual( predictions[0].tolist(), - [2, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], + [2, 45641, 35, 61, 343, 16, 42, 116, 31652, 
35, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], ) - self.assertEqual(generated_text, "it's not a city, it's a beach") + self.assertEqual(generated_text, "Question: which city is this? Answer: it's not a city, it's a beach") def test_inference_interpolate_pos_encoding(self): processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") @@ -1777,9 +1778,9 @@ def test_inference_opt_multi_accelerator(self): # Test output self.assertEqual( predictions[0].tolist(), - [2, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], + [2, 45641, 35, 61, 343, 16, 42, 116, 31652, 35, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], ) - self.assertEqual(generated_text, "it's not a city, it's a beach") + self.assertEqual(generated_text, "Question: which city is this? Answer: it's not a city, it's a beach") @require_torch_multi_accelerator def test_inference_t5_multi_accelerator(self): diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index c3406e7682b7..2cce1e1b1a9d 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -792,12 +792,12 @@ def test_inference_vicuna_7b(self): outputs = model.generate(**inputs, max_new_tokens=30) generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() - expected_outputs = [2, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 372, 338, 19500, 1623, 263, 19587, 4272] # fmt: off + expected_outputs = [2, 1724, 338, 22910, 1048, 445, 1967, 29973, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 19500, 373, 263, 19587, 4272, 11952, 29889] # fmt: off self.assertEqual(outputs[0].tolist(), expected_outputs) self.assertEqual( generated_text, - "The unusual aspect of this image is that a man is ironing 
clothes on the back of a yellow SUV while it is driving down a busy city", + "What is unusual about this image? The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while driving on a busy city street.", ) def test_inference_flant5_xl(self): diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 08f2c861b630..e066c890e76f 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -807,7 +807,7 @@ def test_inference_vicuna_7b(self): generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() self.assertEqual( generated_text, - "a baby girl wearing glasses is reading a book on the bed 1080p", + "Explain what is happening in this short video. a baby girl wearing glasses is reading a book on the bed 1080p", ) def test_expansion_in_processing(self): diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py index cb09d44421f4..b028e3acbf00 100644 --- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py @@ -592,6 +592,7 @@ class SeamlessM4TModelWithTextInputTest( test_resize_embeddings = True test_headmasking = False test_torchscript = False + input_name = "input_features" all_model_classes = ( ( diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py index 451fff0b35fb..1eefbbf31f77 100644 --- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py +++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py @@ -600,6 +600,7 @@ class SeamlessM4Tv2ModelWithTextInputTest(ModelTesterMixin, GenerationTesterMixi test_resize_embeddings = True test_headmasking = False test_torchscript = False 
+ input_name = "input_features" all_model_classes = ( ( diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py index 50446d4628af..b5e7eaba6310 100644 --- a/tests/models/speech_to_text/test_modeling_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py @@ -281,6 +281,7 @@ class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest fx_compatible = True test_pruning = False test_missing_keys = False + input_name = "input_features" def setUp(self): self.model_tester = Speech2TextModelTester(self) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index b24c577a16e5..c04a9875eb33 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -394,6 +394,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi # Needs higher percentages after model tester's vocab_size is changed to 200 (PR #21222) # `0.5` is for `test_disk_offload` (which also works for `test_model_parallelism`) model_split_percents = [0.5, 0.8, 0.9] + input_name = "input_features" # TODO: Fix the failed tests def is_pipeline_test_to_skip( @@ -3930,6 +3931,7 @@ class WhisperStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, test_pruning = False is_encoder_decoder = False test_missing_keys = False + input_name = "input_features" def setUp(self): self.model_tester = WhisperStandaloneDecoderModelTester(self, is_training=False) From 8cfabfeaa7c96e3511ae0599c06b3593fcf313a1 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 15 Oct 2024 14:50:42 +0200 Subject: [PATCH 05/12] fix --- tests/generation/test_utils.py | 34 +++++++++---------- .../test_modeling_seamless_m4t.py | 1 - .../test_modeling_seamless_m4t_v2.py | 1 - tests/models/whisper/test_modeling_whisper.py | 1 - 4 files changed, 17 insertions(+), 20 deletions(-) diff --git 
a/tests/generation/test_utils.py b/tests/generation/test_utils.py index b27d219225e9..aa0881775004 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -436,7 +436,7 @@ def test_greedy_generate_dict_outputs(self): use_cache=False, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) # Retrocompatibility check @@ -473,7 +473,7 @@ def test_greedy_generate_dict_outputs_use_cache(self): use_cache=True, # Enable cache ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -489,7 +489,7 @@ def test_sample_generate(self): model = model_class(config).to(torch_device).eval() output_generate = self._sample_generate(model=model, inputs_dict=inputs_dict, num_return_sequences=1) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -513,7 +513,7 @@ def test_sample_generate_dict_output(self): use_cache=False, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) # Retrocompatibility check @@ -537,7 +537,7 @@ def test_beam_search_generate(self): beam_kwargs = self._get_beam_kwargs() output_generate = self._beam_search_generate(model=model, inputs_dict=inputs_dict, beam_kwargs=beam_kwargs) - if model.config.get_text_config().is_encoder_decoder: + 
if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -561,7 +561,7 @@ def test_beam_search_generate_dict_output(self): return_dict_in_generate=True, use_cache=False, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -604,7 +604,7 @@ def test_beam_search_generate_dict_outputs_use_cache(self): use_cache=True, # Enable cache ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -655,7 +655,7 @@ def test_beam_sample_generate(self): beam_kwargs=beam_kwargs, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -704,7 +704,7 @@ def test_beam_sample_generate_dict_output(self): use_cache=False, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -754,7 +754,7 @@ def test_group_beam_search_generate(self): inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: 
self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -767,7 +767,7 @@ def test_group_beam_search_generate(self): inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -791,7 +791,7 @@ def test_group_beam_search_generate_dict_output(self): return_dict_in_generate=True, use_cache=False, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -833,7 +833,7 @@ def test_constrained_beam_search_generate(self): beam_kwargs=beam_kwargs, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -857,7 +857,7 @@ def test_constrained_beam_search_generate(self): beam_kwargs=beam_kwargs, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -895,7 +895,7 @@ def test_constrained_beam_search_generate_dict_output(self): use_cache=False, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -935,7 +935,7 @@ def 
test_contrastive_generate(self): inputs_dict=inputs_dict, use_cache=True, # Enable cache ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -970,7 +970,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self): use_cache=True, # Enable cache ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py index b028e3acbf00..cb09d44421f4 100644 --- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py @@ -592,7 +592,6 @@ class SeamlessM4TModelWithTextInputTest( test_resize_embeddings = True test_headmasking = False test_torchscript = False - input_name = "input_features" all_model_classes = ( ( diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py index 1eefbbf31f77..451fff0b35fb 100644 --- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py +++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py @@ -600,7 +600,6 @@ class SeamlessM4Tv2ModelWithTextInputTest(ModelTesterMixin, GenerationTesterMixi test_resize_embeddings = True test_headmasking = False test_torchscript = False - input_name = "input_features" all_model_classes = ( ( diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index c04a9875eb33..3229714b7efe 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ 
b/tests/models/whisper/test_modeling_whisper.py @@ -3931,7 +3931,6 @@ class WhisperStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, test_pruning = False is_encoder_decoder = False test_missing_keys = False - input_name = "input_features" def setUp(self): self.model_tester = WhisperStandaloneDecoderModelTester(self, is_training=False) From f0aff4f02ea2d741a733d6385eb2d79a4dc8d5f8 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 15 Oct 2024 15:29:30 +0200 Subject: [PATCH 06/12] uncomment this --- tests/generation/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index aa0881775004..34746f49c093 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -455,8 +455,8 @@ def test_greedy_generate_dict_outputs_use_cache(self): config, inputs_dict = self.prepare_config_and_inputs_for_generate() main_input = inputs_dict[self.input_name] - # if not hasattr(config, "use_cache") or not hasattr(config.get_text_config(), "use_cache"): - # self.skipTest(reason=f"{model_class.__name__} doesn't support caching") + if not hasattr(config, "use_cache"): + self.skipTest(reason=f"{model_class.__name__} doesn't support caching") if any(model_name in model_class.__name__.lower() for model_name in ["rwkv"]): self.skipTest(reason="Won't fix: model with non-standard dictionary output shapes") From c39c5ed95de252fd043b093e6c657f6bbc3f5b0d Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 21 Oct 2024 12:34:27 +0200 Subject: [PATCH 07/12] clean up after rebase --- .../models/blip_2/modeling_blip_2.py | 22 ++++------ src/transformers/models/glm/modeling_glm.py | 41 +++---------------- tests/models/blip_2/test_modeling_blip_2.py | 4 +- .../test_modeling_instructblip.py | 2 +- .../test_modeling_instructblipvideo.py | 2 +- .../test_modeling_speech_to_text.py | 1 - tests/models/whisper/test_modeling_whisper.py | 1 - 7 files changed, 17 insertions(+), 56 
deletions(-) diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 1cdcdff7fc6d..8b947a106667 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -1771,15 +1771,12 @@ def forward( decoder_attention_mask=decoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, # toggle for easier access to loss/logits below labels=labels, ) - if labels is not None: - loss = outputs.loss if return_dict else outputs[0] - logits = outputs.logits if return_dict else outputs[1] - else: - loss = None - logits = outputs.logits if return_dict else outputs[0] + loss = outputs.loss + logits = outputs.logits + outputs = outputs.to_tuple() if not return_dict else outputs if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) @@ -2244,15 +2241,12 @@ def forward( decoder_attention_mask=decoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, # toggle for easier access to loss/logits below labels=labels, ) - if labels is not None: - loss = outputs.loss if return_dict else outputs[0] - logits = outputs.logits if return_dict else outputs[1] - else: - loss = None - logits = outputs.logits if return_dict else outputs[0] + loss = outputs.loss + logits = outputs.logits + outputs = outputs.to_tuple() if not return_dict else outputs if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 9815dbc78992..a458c02a6fed 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -25,7 +25,6 @@ import torch import torch.nn as nn import torch.utils.checkpoint -from torch.nn import 
BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache @@ -921,6 +920,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1071,18 +1071,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] @@ -1186,27 +1175,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == 
"multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1289,8 +1259,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 4fa8b31cfc11..804bcf6f3f56 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -650,7 +650,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return elif use_cache is False: self.assertTrue(output.past_key_values is None) - # overwrite because BLIP2 cannot generate on;y from input ids, and requires pixel values in all cases to be present + # overwrite because BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present @pytest.mark.generate def test_left_padding_compatibility(self): # NOTE: left-padding results in small numerical differences. This is expected. 
@@ -733,7 +733,7 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature): # They should result in very similar logits self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) - @unittest.skip("BLIP2 cannot generate on;y from input ids, and requires pixel values in all cases to be present") + @unittest.skip("BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present") def test_generate_from_inputs_embeds_decoder_only(self): pass diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 2cce1e1b1a9d..7f268d3cc3f8 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -747,7 +747,7 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature): self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) @unittest.skip( - "InstructBLIP cannot generate on;y from input ids, and requires pixel values in all cases to be present" + "InstructBLIP cannot generate only from input ids, and requires pixel values in all cases to be present" ) def test_generate_from_inputs_embeds_decoder_only(self): pass diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index e066c890e76f..fd3188ce77dd 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -771,7 +771,7 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature): self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) @unittest.skip( - "InstructBLIPVideo cannot generate on;y from input ids, and requires pixel values in all cases to be present" + "InstructBLIPVideo cannot generate only from input ids, and requires pixel values 
in all cases to be present" ) def test_generate_from_inputs_embeds_decoder_only(self): pass diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py index 31ef16a44da9..253cda7e49cb 100644 --- a/tests/models/speech_to_text/test_modeling_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py @@ -281,7 +281,6 @@ class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest fx_compatible = True test_pruning = False test_missing_keys = False - input_name = "input_features" def setUp(self): self.model_tester = Speech2TextModelTester(self) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 3229714b7efe..b24c577a16e5 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -394,7 +394,6 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi # Needs higher percentages after model tester's vocab_size is changed to 200 (PR #21222) # `0.5` is for `test_disk_offload` (which also works for `test_model_parallelism`) model_split_percents = [0.5, 0.8, 0.9] - input_name = "input_features" # TODO: Fix the failed tests def is_pipeline_test_to_skip( From 37d25b1ecf47d4b92d2df357bd96988de7367cb2 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 21 Oct 2024 12:40:57 +0200 Subject: [PATCH 08/12] should be model main input --- tests/generation/test_utils.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index ae59da97ec41..3c702c580f5f 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -424,7 +424,7 @@ def test_greedy_generate(self): def test_greedy_generate_dict_outputs(self): for model_class in self.all_generative_model_classes: config, inputs_dict = 
self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._greedy_generate( @@ -457,7 +457,7 @@ def test_greedy_generate_dict_outputs(self): def test_greedy_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -503,7 +503,7 @@ def test_sample_generate(self): def test_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._sample_generate( @@ -552,7 +552,7 @@ def test_beam_search_generate(self): def test_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -588,7 +588,7 @@ def test_beam_search_generate_dict_output(self): def test_beam_search_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -696,7 +696,7 @@ 
def test_beam_sample_generate(self): def test_beam_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -786,7 +786,7 @@ def test_group_beam_search_generate(self): def test_group_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_diverse_beam_kwargs() @@ -880,7 +880,7 @@ def test_constrained_beam_search_generate(self): def test_constrained_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() @@ -963,7 +963,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self): self.skipTest(reason="Won't fix: old model with different cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] # NOTE: contrastive search only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -1121,7 +1121,7 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1196,7 +1196,7 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1252,7 +1252,7 @@ def test_dola_decoding_sample(self): # enable cache if the model is not openai-gpt, xlnet, cpm, or xlm config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] # Encoder-decoder models are not supported if config.is_encoder_decoder: @@ -1310,7 +1310,7 @@ def test_assisted_decoding_sample(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -1857,7 +1857,7 @@ def test_generate_with_static_cache(self): self.skipTest(reason="This model does not support the static cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] if config.is_encoder_decoder: self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache") From ce467a01d0a2c0a2214184f0795e2358f4fdc727 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 21 Oct 2024 12:51:59 +0200 Subject: [PATCH 09/12] fix overwritten tests --- tests/models/blip_2/test_modeling_blip_2.py | 8 ++++++-- tests/models/instructblip/test_modeling_instructblip.py | 8 ++++++-- .../instructblipvideo/test_modeling_instructblipvideo.py | 8 ++++++-- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 804bcf6f3f56..ffe979ca6e55 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -412,7 +412,7 @@ def __init__( self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs) self.text_model_tester = Blip2TextModelDecoderOnlyTester(parent, **text_kwargs) self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests + self.seq_length = self.text_model_tester.seq_length + num_query_tokens # need seq_length for common tests self.is_training = is_training self.num_query_tokens = num_query_tokens self.image_token_index = image_token_index @@ -545,7 +545,11 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return use_cache = True # force this to be True in case False is passed batch_size = main_input.shape[0] - seq_length = main_input.shape[-1] + + seq_length = getattr(self.model_tester, 
"seq_length", None) + seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) + seq_length = getattr(self.model_tester, "text_seq_length", seq_length) + config = config.text_config if hasattr(config, "text_config") else config num_sequences_in_output = batch_size * num_return_sequences diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 7f268d3cc3f8..ac2738c67fd4 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -405,7 +405,7 @@ def __init__( self.qformer_model_tester = InstructBlipQFormerModelTester(parent, **qformer_kwargs) self.text_model_tester = InstructBlipTextModelDecoderOnlyTester(parent, **text_kwargs) self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests + self.seq_length = self.text_model_tester.seq_length + num_query_tokens # need seq_length for common tests self.is_training = is_training self.num_query_tokens = num_query_tokens self.image_token_index = image_token_index @@ -553,7 +553,11 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return use_cache = True # force this to be True in case False is passed batch_size = main_input.shape[0] - seq_length = main_input.shape[-1] + + seq_length = getattr(self.model_tester, "seq_length", None) + seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) + seq_length = getattr(self.model_tester, "text_seq_length", seq_length) + config = config.text_config if hasattr(config, "text_config") else config num_sequences_in_output = batch_size * num_return_sequences diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index fd3188ce77dd..917e40914147 100644 --- 
a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -419,7 +419,7 @@ def __init__( self.qformer_model_tester = InstructBlipVideoQFormerModelTester(parent, **qformer_kwargs) self.text_model_tester = InstructBlipVideoTextModelDecoderOnlyTester(parent, **text_kwargs) self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests + self.seq_length = self.text_model_tester.seq_length + num_query_tokens # need seq_length for common tests self.is_training = is_training self.num_query_tokens = num_query_tokens self.video_token_index = video_token_index @@ -577,7 +577,11 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return use_cache = True # force this to be True in case False is passed batch_size = main_input.shape[0] - seq_length = main_input.shape[-1] + + seq_length = getattr(self.model_tester, "seq_length", None) + seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) + seq_length = getattr(self.model_tester, "text_seq_length", seq_length) + config = config.text_config if hasattr(config, "text_config") else config num_sequences_in_output = batch_size * num_return_sequences From 95f6b763d67dabfd55d6a337c81c21ffbe959217 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 21 Oct 2024 13:03:24 +0200 Subject: [PATCH 10/12] oops len should be multiple of frame number --- .../instructblipvideo/test_modeling_instructblipvideo.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 917e40914147..3526de56d015 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -419,7 
+419,9 @@ def __init__( self.qformer_model_tester = InstructBlipVideoQFormerModelTester(parent, **qformer_kwargs) self.text_model_tester = InstructBlipVideoTextModelDecoderOnlyTester(parent, **text_kwargs) self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length + num_query_tokens # need seq_length for common tests + self.frames = self.vision_model_tester.frames + # need seq_length for common tests + self.seq_length = self.text_model_tester.seq_length + (num_query_tokens * self.frames) self.is_training = is_training self.num_query_tokens = num_query_tokens self.video_token_index = video_token_index @@ -428,13 +430,12 @@ def prepare_config_and_inputs(self): _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() _, _, _, qformer_input_ids, qformer_attention_mask = self.qformer_model_tester.prepare_config_and_inputs() _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - frames = self.vision_model_tester.frames _, c, h, w = pixel_values.shape - pixel_values = pixel_values.reshape(-1, frames, c, h, w) + pixel_values = pixel_values.reshape(-1, self.frames, c, h, w) vision_tokens = ( torch.ones( - (input_ids.shape[0], self.num_query_tokens * frames), device=torch_device, dtype=input_ids.dtype + (input_ids.shape[0], self.num_query_tokens * self.frames), device=torch_device, dtype=input_ids.dtype ) * self.video_token_index ) From 63756d6b14e95a53899056ceccde65ba339545ac Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 25 Oct 2024 10:44:18 +0200 Subject: [PATCH 11/12] style --- .../models/instructblipvideo/test_modeling_instructblipvideo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 23c769945395..3ac00cd42411 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ 
b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -843,6 +843,7 @@ def test_sdpa_can_dispatch_composite_models(self): ): raise ValueError("The SDPA model should have SDPA attention layers") + # We will verify our results on an image of cute cats def prepare_video(): video_file = hf_hub_download( From 3fec5100741708b6e48b5e224d155ae269ebbc39 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 1 Nov 2024 08:28:37 +0100 Subject: [PATCH 12/12] fix some tests --- tests/models/blip_2/test_modeling_blip_2.py | 70 ++++++++----------- .../test_modeling_instructblip.py | 70 ++++++++----------- .../test_modeling_instructblipvideo.py | 70 ++++++++----------- 3 files changed, 84 insertions(+), 126 deletions(-) diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index b42bc4dfb6ca..ffd3e47e9374 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -21,6 +21,7 @@ import numpy as np import pytest import requests +from parameterized import parameterized from transformers import CONFIG_MAPPING, Blip2Config, Blip2QFormerConfig, Blip2VisionConfig from transformers.testing_utils import ( @@ -603,17 +604,19 @@ def test_model_from_pretrained(self): self.assertIsNotNone(model) # overwrite because BLIP internally calls LM.generate() with embeds thus it cannot operate in no cache format - def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): use_cache = True # force this to be True in case False is passed - batch_size = main_input.shape[0] + input_batch_size = int(output.sequences.shape[0] / num_return_sequences) + internal_batch_size = ( + input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences + ) seq_length = getattr(self.model_tester, "seq_length", None) seq_length = 
getattr(self.model_tester, "encoder_seq_length", seq_length) seq_length = getattr(self.model_tester, "text_seq_length", seq_length) config = config.text_config if hasattr(config, "text_config") else config - num_sequences_in_output = batch_size * num_return_sequences gen_len = ( output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length @@ -624,19 +627,21 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config) # unprocessed logits - self._check_logits(num_sequences_in_output, output.logits, config=config) + self._check_logits(internal_batch_size, output.logits, config=config) # Attentions if self.has_attentions: if config.is_encoder_decoder: # encoder - self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + self._check_encoder_attention_for_generate( + output.encoder_attentions, input_batch_size, config, seq_length + ) # decoder self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_attentions, min_length=1, max_length=output.sequences.shape[-1], @@ -648,7 +653,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return attentions = output.attentions if not use_cache else output.attentions[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, attentions=attentions, min_length=min_length, max_length=output.sequences.shape[-1], @@ -660,12 +665,12 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return if config.is_encoder_decoder: # encoder self._check_encoder_hidden_states_for_generate( - 
output.encoder_hidden_states, batch_size, config, seq_length + output.encoder_hidden_states, input_batch_size, config, seq_length ) # decoder self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_hidden_states, min_length=1, max_length=output.sequences.shape[-1], @@ -677,7 +682,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, hidden_states, min_length=min_length, max_length=output.sequences.shape[-1], @@ -685,36 +690,16 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return use_cache=use_cache, ) - # Past Key Value States -- a few notes here: - # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1" - # 2. We ignore models that have unique cache structures (e.g. 
mamba) or are in need of refatoring to match the - # standard cache format (e.g.gptbigcode ) - models_without_standard_cache = ( - "ctrl", - "fsmt", - "gptbigcode", - "mega", - "reformer", - "jamba", - "mamba", - "xlnet", - "zamba", - ) - has_standard_cache = not any( - model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache - ) - if has_standard_cache: - if use_cache: - past_key_values = output.past_key_values - past_sequence_length = output.sequences.shape[-1] - 1 - self._check_past_key_values_for_generate( - num_sequences_in_output, - past_key_values, - seq_length=past_sequence_length, - config=config, - ) - elif use_cache is False: - self.assertTrue(output.past_key_values is None) + # Past Key Value States + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + internal_batch_size, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) # overwrite because BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present @pytest.mark.generate @@ -800,7 +785,8 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature): self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) @unittest.skip("BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present") - def test_generate_from_inputs_embeds_decoder_only(self): + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): pass diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 44793195309f..618d20347860 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -21,6 +21,7 @@ import numpy as np import pytest import requests +from 
parameterized import parameterized from transformers import ( CONFIG_MAPPING, @@ -552,17 +553,19 @@ def test_model_from_pretrained(self): self.assertIsNotNone(model) # overwrite because InstructBLIP internally calls LM.generate() with embeds thus it cannot operate in no cache format - def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): use_cache = True # force this to be True in case False is passed - batch_size = main_input.shape[0] + input_batch_size = int(output.sequences.shape[0] / num_return_sequences) + internal_batch_size = ( + input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences + ) seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) seq_length = getattr(self.model_tester, "text_seq_length", seq_length) config = config.text_config if hasattr(config, "text_config") else config - num_sequences_in_output = batch_size * num_return_sequences gen_len = ( output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length @@ -573,19 +576,21 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config) # unprocessed logits - self._check_logits(num_sequences_in_output, output.logits, config=config) + self._check_logits(internal_batch_size, output.logits, config=config) # Attentions if self.has_attentions: if config.is_encoder_decoder: # encoder - self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + self._check_encoder_attention_for_generate( + 
output.encoder_attentions, input_batch_size, config, seq_length + ) # decoder self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_attentions, min_length=1, max_length=output.sequences.shape[-1], @@ -597,7 +602,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return attentions = output.attentions if not use_cache else output.attentions[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, attentions=attentions, min_length=min_length, max_length=output.sequences.shape[-1], @@ -609,12 +614,12 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return if config.is_encoder_decoder: # encoder self._check_encoder_hidden_states_for_generate( - output.encoder_hidden_states, batch_size, config, seq_length + output.encoder_hidden_states, input_batch_size, config, seq_length ) # decoder self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_hidden_states, min_length=1, max_length=output.sequences.shape[-1], @@ -626,7 +631,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, hidden_states, min_length=min_length, max_length=output.sequences.shape[-1], @@ -634,36 +639,16 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return use_cache=use_cache, ) - # Past Key Value States -- a few notes here: - # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1" - # 2. We ignore models that have unique cache structures (e.g. 
mamba) or are in need of refatoring to match the - # standard cache format (e.g.gptbigcode ) - models_without_standard_cache = ( - "ctrl", - "fsmt", - "gptbigcode", - "mega", - "reformer", - "jamba", - "mamba", - "xlnet", - "zamba", - ) - has_standard_cache = not any( - model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache - ) - if has_standard_cache: - if use_cache: - past_key_values = output.past_key_values - past_sequence_length = output.sequences.shape[-1] - 1 - self._check_past_key_values_for_generate( - num_sequences_in_output, - past_key_values, - seq_length=past_sequence_length, - config=config, - ) - elif use_cache is False: - self.assertTrue(output.past_key_values is None) + # Past Key Value States + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + internal_batch_size, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) # overwrite because InstructBLIP cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present @pytest.mark.generate @@ -756,7 +741,8 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature): @unittest.skip( "InstructBLIP cannot generate only from input ids, and requires pixel values in all cases to be present" ) - def test_generate_from_inputs_embeds_decoder_only(self): + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): pass @require_torch_sdpa diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 3ac00cd42411..60ebf206a06e 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -21,6 +21,7 @@ import numpy as np import pytest from 
huggingface_hub import hf_hub_download +from parameterized import parameterized from transformers import ( CONFIG_MAPPING, @@ -576,17 +577,19 @@ def test_model_from_pretrained(self): self.assertIsNotNone(model) # overwrite because InstructBLIPVideo internally calls LM.generate() with embeds thus it cannot operate in no cache format - def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): use_cache = True # force this to be True in case False is passed - batch_size = main_input.shape[0] + input_batch_size = int(output.sequences.shape[0] / num_return_sequences) + internal_batch_size = ( + input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences + ) seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) seq_length = getattr(self.model_tester, "text_seq_length", seq_length) config = config.text_config if hasattr(config, "text_config") else config - num_sequences_in_output = batch_size * num_return_sequences gen_len = ( output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length @@ -597,19 +600,21 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config) # unprocessed logits - self._check_logits(num_sequences_in_output, output.logits, config=config) + self._check_logits(internal_batch_size, output.logits, config=config) # Attentions if self.has_attentions: if config.is_encoder_decoder: # encoder - self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) 
+ self._check_encoder_attention_for_generate( + output.encoder_attentions, input_batch_size, config, seq_length + ) # decoder self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_attentions, min_length=1, max_length=output.sequences.shape[-1], @@ -621,7 +626,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return attentions = output.attentions if not use_cache else output.attentions[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, attentions=attentions, min_length=min_length, max_length=output.sequences.shape[-1], @@ -633,12 +638,12 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return if config.is_encoder_decoder: # encoder self._check_encoder_hidden_states_for_generate( - output.encoder_hidden_states, batch_size, config, seq_length + output.encoder_hidden_states, input_batch_size, config, seq_length ) # decoder self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_hidden_states, min_length=1, max_length=output.sequences.shape[-1], @@ -650,7 +655,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, hidden_states, min_length=min_length, max_length=output.sequences.shape[-1], @@ -658,36 +663,16 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return use_cache=use_cache, ) - # Past Key Value States -- a few notes here: - # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1" - # 2. We ignore models that have unique cache structures (e.g. 
mamba) or are in need of refatoring to match the - # standard cache format (e.g.gptbigcode ) - models_without_standard_cache = ( - "ctrl", - "fsmt", - "gptbigcode", - "mega", - "reformer", - "jamba", - "mamba", - "xlnet", - "zamba", - ) - has_standard_cache = not any( - model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache - ) - if has_standard_cache: - if use_cache: - past_key_values = output.past_key_values - past_sequence_length = output.sequences.shape[-1] - 1 - self._check_past_key_values_for_generate( - num_sequences_in_output, - past_key_values, - seq_length=past_sequence_length, - config=config, - ) - elif use_cache is False: - self.assertTrue(output.past_key_values is None) + # Past Key Value States + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + internal_batch_size, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) # overwrite because InstructBLIPVideo cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present @pytest.mark.generate @@ -780,7 +765,8 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature): @unittest.skip( "InstructBLIPVideo cannot generate only from input ids, and requires pixel values in all cases to be present" ) - def test_generate_from_inputs_embeds_decoder_only(self): + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): pass @require_torch_sdpa