From 64873abdfb7ffd3c798ae5298c53393d4086fd4a Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 15 Oct 2024 12:52:17 +0200 Subject: [PATCH 01/12] blip2 tests --- .../models/blip_2/modeling_blip_2.py | 37 ++- tests/generation/test_utils.py | 81 +++--- tests/models/blip_2/test_modeling_blip_2.py | 236 +++++++++++++++++- 3 files changed, 286 insertions(+), 68 deletions(-) diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 4b0ed4f71d9c..1cdcdff7fc6d 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -1774,8 +1774,12 @@ def forward( return_dict=return_dict, labels=labels, ) - loss = outputs.loss if return_dict else outputs[0] - logits = outputs.logits if return_dict else outputs[1] + if labels is not None: + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + else: + loss = None + logits = outputs.logits if return_dict else outputs[0] if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) @@ -2243,8 +2247,12 @@ def forward( return_dict=return_dict, labels=labels, ) - loss = outputs.loss if return_dict else outputs[0] - logits = outputs.logits if return_dict else outputs[1] + if labels is not None: + loss = outputs.loss if return_dict else outputs[0] + logits = outputs.logits if return_dict else outputs[1] + else: + loss = None + logits = outputs.logits if return_dict else outputs[0] if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) @@ -2341,24 +2349,11 @@ def generate( ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even 
though under the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - bos_tokens = ( - torch.LongTensor([[self.config.text_config.bos_token_id]]) - .repeat(batch_size, 1) - .to(image_embeds.device) - ) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 58259821cf7a..594d5a57ea26 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -92,6 +92,7 @@ class GenerationTesterMixin: + input_name = "input_ids" model_tester = None all_generative_model_classes = () max_new_tokens = 3 @@ -406,7 +407,7 @@ def _contrastive_generate( def test_greedy_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() output_generate = self._greedy_generate(model=model, inputs_dict=inputs_dict) @@ -420,7 +421,7 @@ def test_greedy_generate(self): def test_greedy_generate_dict_outputs(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() output_generate = self._greedy_generate( @@ -434,7 +435,7 @@ def test_greedy_generate_dict_outputs(self): use_cache=False, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: 
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) # Retrocompatibility check @@ -451,10 +452,10 @@ def test_greedy_generate_dict_outputs(self): def test_greedy_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] - if not hasattr(config, "use_cache"): - self.skipTest(reason=f"{model_class.__name__} doesn't support caching") + # if not hasattr(config, "use_cache") or not hasattr(config.get_text_config(), "use_cache"): + # self.skipTest(reason=f"{model_class.__name__} doesn't support caching") if any(model_name in model_class.__name__.lower() for model_name in ["rwkv"]): self.skipTest(reason="Won't fix: model with non-standard dictionary output shapes") @@ -471,7 +472,7 @@ def test_greedy_generate_dict_outputs_use_cache(self): use_cache=True, # Enable cache ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -482,12 +483,12 @@ def test_greedy_generate_dict_outputs_use_cache(self): def test_sample_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() output_generate = self._sample_generate(model=model, inputs_dict=inputs_dict, num_return_sequences=1) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 
1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -496,7 +497,7 @@ def test_sample_generate(self): def test_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() output_generate = self._sample_generate( @@ -511,7 +512,7 @@ def test_sample_generate_dict_output(self): use_cache=False, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) # Retrocompatibility check @@ -528,14 +529,14 @@ def test_sample_generate_dict_output(self): def test_beam_search_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() output_generate = self._beam_search_generate(model=model, inputs_dict=inputs_dict, beam_kwargs=beam_kwargs) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -544,7 +545,7 @@ def test_beam_search_generate(self): def test_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = 
model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -559,7 +560,7 @@ def test_beam_search_generate_dict_output(self): return_dict_in_generate=True, use_cache=False, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -578,7 +579,7 @@ def test_beam_search_generate_dict_output(self): def test_beam_search_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -602,7 +603,7 @@ def test_beam_search_generate_dict_outputs_use_cache(self): use_cache=True, # Enable cache ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -643,7 +644,7 @@ def test_model_parallel_beam_search(self): def test_beam_sample_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -653,7 +654,7 @@ def test_beam_sample_generate(self): beam_kwargs=beam_kwargs, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: 
self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -685,7 +686,7 @@ def test_beam_sample_generate(self): def test_beam_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -702,7 +703,7 @@ def test_beam_sample_generate_dict_output(self): use_cache=False, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -742,7 +743,7 @@ def test_generate_without_input_ids(self): def test_group_beam_search_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() # check `generate()` and `group_beam_search()` are equal @@ -752,7 +753,7 @@ def test_group_beam_search_generate(self): inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -765,7 +766,7 @@ def test_group_beam_search_generate(self): inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: 
self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -774,7 +775,7 @@ def test_group_beam_search_generate(self): def test_group_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_diverse_beam_kwargs() @@ -789,7 +790,7 @@ def test_group_beam_search_generate_dict_output(self): return_dict_in_generate=True, use_cache=False, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -810,7 +811,7 @@ def test_group_beam_search_generate_dict_output(self): def test_constrained_beam_search_generate(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() @@ -831,7 +832,7 @@ def test_constrained_beam_search_generate(self): beam_kwargs=beam_kwargs, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -855,7 +856,7 @@ def test_constrained_beam_search_generate(self): beam_kwargs=beam_kwargs, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == 
self.max_new_tokens + main_input.shape[-1]) @@ -867,7 +868,7 @@ def test_constrained_beam_search_generate(self): def test_constrained_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] model = model_class(config).to(torch_device).eval() @@ -893,7 +894,7 @@ def test_constrained_beam_search_generate_dict_output(self): use_cache=False, ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -919,7 +920,7 @@ def test_contrastive_generate(self): self.skipTest(reason="Won't fix: old model with different cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] # NOTE: contrastive search only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -933,7 +934,7 @@ def test_contrastive_generate(self): inputs_dict=inputs_dict, use_cache=True, # Enable cache ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -949,7 +950,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self): self.skipTest(reason="Won't fix: old model with different cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] # NOTE: contrastive search only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -968,7 +969,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self): use_cache=True, # Enable cache ) - if model.config.is_encoder_decoder: + if model.config.get_text_config().is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -1104,7 +1105,7 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1178,7 +1179,7 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1231,7 +1232,7 @@ def test_dola_decoding_sample(self): # enable cache if the model is not openai-gpt, xlnet, cpm, or xlm config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] # Encoder-decoder models are not supported if config.is_encoder_decoder: @@ -1288,7 +1289,7 @@ def test_assisted_decoding_sample(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] # NOTE: assisted generation only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -1834,7 +1835,7 @@ def test_generate_with_static_cache(self): self.skipTest(reason="This model does not support the static cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] + main_input = inputs_dict[self.input_name] if config.is_encoder_decoder: self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache") diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index cee5d710a85f..e1c175668e5b 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -19,6 +19,7 @@ import unittest import numpy as np +import pytest import requests from transformers import CONFIG_MAPPING, Blip2Config, Blip2QFormerConfig, Blip2VisionConfig @@ -390,7 +391,14 @@ def get_config(self): # this model tester uses a decoder-only language model (OPT) class Blip2ForConditionalGenerationDecoderOnlyModelTester: def __init__( - self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 + self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + text_kwargs=None, + is_training=True, + num_query_tokens=10, + image_token_index=4, ): if vision_kwargs is None: vision_kwargs = {} @@ -407,11 +415,21 @@ def __init__( self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests self.is_training = is_training self.num_query_tokens = num_query_tokens + self.image_token_index = image_token_index def prepare_config_and_inputs(self): _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() + vision_tokens = ( + torch.ones((input_ids.shape[0], self.num_query_tokens), device=torch_device, dtype=input_ids.dtype) + * self.image_token_index + ) + input_ids[input_ids == self.image_token_index] = 
self.text_model_tester.pad_token_id + input_ids = torch.cat([vision_tokens, input_ids], dim=-1) + vision_attention_mask = torch.ones_like(vision_tokens) + attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1) + config = self.get_config() return config, input_ids, attention_mask, pixel_values @@ -422,6 +440,7 @@ def get_config(self): qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), num_query_tokens=self.num_query_tokens, + image_token_index=self.image_token_index, ) def create_and_check_for_conditional_generation(self, config, input_ids, attention_mask, pixel_values): @@ -442,7 +461,6 @@ def prepare_config_and_inputs_for_common(self): "pixel_values": pixel_values, "input_ids": input_ids, "attention_mask": attention_mask, - "labels": input_ids, } return config, inputs_dict @@ -450,6 +468,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (Blip2ForConditionalGeneration,) if is_torch_available() else () fx_compatible = False test_head_masking = False test_pruning = False @@ -521,6 +540,203 @@ def test_model_from_pretrained(self): model = Blip2ForConditionalGeneration.from_pretrained(model_name) self.assertIsNotNone(model) + # overwrite because BLIP internally calls LM.generate() with embeds thus it cannot operate in no cache format + def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + use_cache = True # force this to be True in case False is passed + + batch_size = main_input.shape[0] + seq_length = main_input.shape[-1] + config = config.text_config if hasattr(config, "text_config") else config + num_sequences_in_output = batch_size * num_return_sequences + + gen_len = ( + output.sequences.shape[-1] - 1 if 
config.is_encoder_decoder else output.sequences.shape[-1] - seq_length + ) + + # in some models we subsample the sequence length in inner layers + if hasattr(self.model_tester, "get_subsampled_output_lengths"): + seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) + + # scores + self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + + # unprocessed logits + self._check_logits(num_sequences_in_output, output.logits, config=config) + + # Attentions + if self.has_attentions: + if config.is_encoder_decoder: + # encoder + self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + # decoder + self._check_attentions_for_generate( + num_sequences_in_output, + output.decoder_attentions, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + attentions = output.attentions if not use_cache else output.attentions[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_attentions_for_generate( + num_sequences_in_output, + attentions=attentions, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Hidden States + if config.is_encoder_decoder: + # encoder + self._check_encoder_hidden_states_for_generate( + output.encoder_hidden_states, batch_size, config, seq_length + ) + + # decoder + self._check_hidden_states_for_generate( + num_sequences_in_output, + output.decoder_hidden_states, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_hidden_states_for_generate( + num_sequences_in_output, 
+ hidden_states, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Past Key Value States -- a few notes here: + # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1" + # 2. We ignore models that have unique cache structures (e.g. mamba) or are in need of refactoring to match the + # standard cache format (e.g. gptbigcode) + models_without_standard_cache = ( + "ctrl", + "fsmt", + "gptbigcode", + "mega", + "reformer", + "jamba", + "mamba", + "xlnet", + "zamba", + ) + has_standard_cache = not any( + model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache + ) + if has_standard_cache: + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + num_sequences_in_output, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) + elif use_cache is False: + self.assertTrue(output.past_key_values is None) + + # overwrite because BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present + @pytest.mark.generate + def test_left_padding_compatibility(self): + # NOTE: left-padding results in small numerical differences. This is expected. 
+ # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + # First, filter out models that don't support left padding + # - The model must have generative capabilities + if len(self.all_generative_model_classes) == 0: + self.skipTest(reason="No generative architecture available for this model.") + + # - The model must support padding + if not self.has_attentions: + self.skipTest(reason="This model doesn't support padding.") + + # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) + decoder_only_classes = [] + for model_class in self.all_generative_model_classes: + config, _ = self.prepare_config_and_inputs_for_generate() + if config.is_encoder_decoder: + continue + else: + decoder_only_classes.append(model_class) + if len(decoder_only_classes) == 0: + self.skipTest(reason="No decoder-only architecture available for this model.") + + # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't + # added support for it yet. We skip these models for now. + has_encoder_attributes = any( + attr_name + for attr_name in config.to_dict().keys() + if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" + ) + if has_encoder_attributes: + self.skipTest( + reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." 
+ ) + + # Then, test left-padding + def _prepare_model_kwargs(input_ids, attention_mask, signature): + model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in decoder_only_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict.get("attention_mask") + pixel_values = inputs_dict["pixel_values"] + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + model = model_class(config).to(torch_device).eval() + signature = inspect.signature(model.forward).parameters.keys() + + # no cache as some models require special cache classes to be init outside forward + model.generation_config.use_cache = False + + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) + next_logits_wo_padding = model(**model_kwargs, pixel_values=pixel_values).logits[:, -1, :] + + # With left-padding (length 32) + # can hardcode pad_token to be 0 as we'll do attn masking anyway + pad_token_id = ( + config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 + ) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) + next_logits_with_padding = model(**model_kwargs, 
pixel_values=pixel_values).logits[:, -1, :] + + # They should result in very similar logits + self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) + + @unittest.skip("BLIP2 cannot generate on;y from input ids, and requires pixel values in all cases to be present") + def test_generate_from_inputs_embeds_decoder_only(self): + pass + # this class is based on `T5ModelTester` found in tests/models/t5/test_modeling_t5.py class Blip2TextModelTester: @@ -692,7 +908,6 @@ def prepare_config_and_inputs_for_common(self): "attention_mask": attention_mask, "decoder_input_ids": decoder_input_ids, "decoder_attention_mask": decoder_attention_mask, - "labels": labels, } return config, inputs_dict @@ -718,9 +933,16 @@ class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixi # TODO: Fix the failed tests def is_pipeline_test_to_skip( - self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + self, + pipeline_test_case_name, + config_class, + model_architecture, + tokenizer_name, + image_processor_name, + feature_extractor_name, + processor_name, ): - if pipeline_test_casse_name == "VisualQuestionAnsweringPipelineTests": + if pipeline_test_case_name == "VisualQuestionAnsweringPipelineTests": # Get `RuntimeError: "LayerNormKernelImpl" not implemented for 'Half'`. 
return True @@ -811,7 +1033,7 @@ def test_get_text_features(self): def test_get_image_features(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - keys_to_pop = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"] + keys_to_pop = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"] for key in keys_to_pop: inputs_dict.pop(key) @@ -831,7 +1053,7 @@ def test_get_image_features(self): def test_get_qformer_features(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - keys_to_pop = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"] + keys_to_pop = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask"] for key in keys_to_pop: inputs_dict.pop(key) From 23d2e15f469cca0f1027bc3300019bdaaa115f85 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 15 Oct 2024 13:39:11 +0200 Subject: [PATCH 02/12] instructblips --- .../instructblip/modeling_instructblip.py | 25 +- .../modeling_instructblipvideo.py | 25 +- .../test_modeling_instructblip.py | 227 ++++++++++++++++- .../test_modeling_instructblipvideo.py | 228 +++++++++++++++++- 4 files changed, 460 insertions(+), 45 deletions(-) diff --git a/src/transformers/models/instructblip/modeling_instructblip.py b/src/transformers/models/instructblip/modeling_instructblip.py index de4e84b82f83..b99dda4b834f 100644 --- a/src/transformers/models/instructblip/modeling_instructblip.py +++ b/src/transformers/models/instructblip/modeling_instructblip.py @@ -1628,27 +1628,10 @@ def generate( ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even though under 
the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - # the InstructBLIP authors used inconsistent tokenizer/model files during training, - # with the tokenizer's bos token being set to which has ID=2, - # whereas the model's text config has bos token id = 0 - bos_token_id = ( - 2 - if self.config.text_config.architectures[0] == "LLaMAForCausalLM" - else self.config.text_config.bos_token_id - ) - bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs diff --git a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py index a300268ed713..6c3eaf1ccce8 100644 --- a/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modeling_instructblipvideo.py @@ -1663,27 +1663,10 @@ def generate( ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even though under the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - # the InstructBLIP authors used inconsistent tokenizer/model files during training, - # with the tokenizer's bos token being set to which has ID=2, 
- # whereas the model's text config has bos token id = 0 - bos_token_id = ( - 2 - if self.config.text_config.architectures[0] == "LLaMAForCausalLM" - else self.config.text_config.bos_token_id - ) - bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 8292567334bf..c3406e7682b7 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -19,6 +19,7 @@ import unittest import numpy as np +import pytest import requests from transformers import ( @@ -319,7 +320,7 @@ def __init__( hidden_act="gelu", hidden_dropout_prob=0.1, attention_probs_dropout_prob=0.1, - max_position_embeddings=20, + max_position_embeddings=100, eos_token_id=2, pad_token_id=1, bos_token_id=0, @@ -383,7 +384,14 @@ def get_config(self): # this model tester uses a decoder-only language model (OPT) class InstructBlipForConditionalGenerationDecoderOnlyModelTester: def __init__( - self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 + self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + text_kwargs=None, + is_training=True, + num_query_tokens=10, + image_token_index=4, ): if vision_kwargs is None: vision_kwargs = {} @@ -400,6 +408,7 @@ def __init__( self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests self.is_training = is_training self.num_query_tokens = num_query_tokens + self.image_token_index = image_token_index def prepare_config_and_inputs(self): _, pixel_values 
= self.vision_model_tester.prepare_config_and_inputs() @@ -407,6 +416,14 @@ def prepare_config_and_inputs(self): _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() config = self.get_config() + vision_tokens = ( + torch.ones((input_ids.shape[0], self.num_query_tokens), device=torch_device, dtype=input_ids.dtype) + * self.image_token_index + ) + input_ids[input_ids == self.image_token_index] = self.text_model_tester.pad_token_id + input_ids = torch.cat([vision_tokens, input_ids], dim=-1) + vision_attention_mask = torch.ones_like(vision_tokens) + attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1) return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values @@ -416,6 +433,7 @@ def get_config(self): qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), num_query_tokens=self.num_query_tokens, + image_token_index=self.image_token_index, ) def create_and_check_for_conditional_generation( @@ -454,6 +472,7 @@ def prepare_config_and_inputs_for_common(self): @require_torch class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else () fx_compatible = False test_head_masking = False test_pruning = False @@ -529,6 +548,210 @@ def test_model_from_pretrained(self): model = InstructBlipForConditionalGeneration.from_pretrained(model_name) self.assertIsNotNone(model) + # overwrite because InstructBLIP internally calls LM.generate() with embeds thus it cannot operate in no cache format + def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + use_cache = True # force this to be True in case False is passed + + batch_size = main_input.shape[0] + seq_length 
= main_input.shape[-1] + config = config.text_config if hasattr(config, "text_config") else config + num_sequences_in_output = batch_size * num_return_sequences + + gen_len = ( + output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length + ) + + # in some models we subsample the sequence length in inner layers + if hasattr(self.model_tester, "get_subsampled_output_lengths"): + seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) + + # scores + self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + + # unprocessed logits + self._check_logits(num_sequences_in_output, output.logits, config=config) + + # Attentions + if self.has_attentions: + if config.is_encoder_decoder: + # encoder + self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + # decoder + self._check_attentions_for_generate( + num_sequences_in_output, + output.decoder_attentions, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + attentions = output.attentions if not use_cache else output.attentions[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_attentions_for_generate( + num_sequences_in_output, + attentions=attentions, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Hidden States + if config.is_encoder_decoder: + # encoder + self._check_encoder_hidden_states_for_generate( + output.encoder_hidden_states, batch_size, config, seq_length + ) + + # decoder + self._check_hidden_states_for_generate( + num_sequences_in_output, + output.decoder_hidden_states, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + 
hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_hidden_states_for_generate( + num_sequences_in_output, + hidden_states, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Past Key Value States -- a few notes here: + # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1" + # 2. We ignore models that have unique cache structures (e.g. mamba) or are in need of refatoring to match the + # standard cache format (e.g.gptbigcode ) + models_without_standard_cache = ( + "ctrl", + "fsmt", + "gptbigcode", + "mega", + "reformer", + "jamba", + "mamba", + "xlnet", + "zamba", + ) + has_standard_cache = not any( + model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache + ) + if has_standard_cache: + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + num_sequences_in_output, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) + elif use_cache is False: + self.assertTrue(output.past_key_values is None) + + # overwrite because InstructBLIP cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present + @pytest.mark.generate + def test_left_padding_compatibility(self): + # NOTE: left-padding results in small numerical differences. This is expected. 
+ # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + # First, filter out models that don't support left padding + # - The model must have generative capabilities + if len(self.all_generative_model_classes) == 0: + self.skipTest(reason="No generative architecture available for this model.") + + # - The model must support padding + if not self.has_attentions: + self.skipTest(reason="This model doesn't support padding.") + + # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) + decoder_only_classes = [] + for model_class in self.all_generative_model_classes: + config, _ = self.prepare_config_and_inputs_for_generate() + if config.is_encoder_decoder: + continue + else: + decoder_only_classes.append(model_class) + if len(decoder_only_classes) == 0: + self.skipTest(reason="No decoder-only architecture available for this model.") + + # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't + # added support for it yet. We skip these models for now. + has_encoder_attributes = any( + attr_name + for attr_name in config.to_dict().keys() + if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" + ) + if has_encoder_attributes: + self.skipTest( + reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." 
+ ) + + # Then, test left-padding + def _prepare_model_kwargs(input_ids, attention_mask, signature): + model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in decoder_only_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict.get("attention_mask") + pixel_values = inputs_dict["pixel_values"] + qformer_input_ids = inputs_dict["qformer_input_ids"] + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + model = model_class(config).to(torch_device).eval() + signature = inspect.signature(model.forward).parameters.keys() + + # no cache as some models require special cache classes to be init outside forward + model.generation_config.use_cache = False + + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) + next_logits_wo_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # With left-padding (length 32) + # can hardcode pad_token to be 0 as we'll do attn masking anyway + pad_token_id = ( + config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 + ) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + model_kwargs = _prepare_model_kwargs(padded_input_ids, 
padded_attention_mask, signature) + next_logits_with_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # They should result in very similar logits + self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) + + @unittest.skip( + "InstructBLIP cannot generate on;y from input ids, and requires pixel values in all cases to be present" + ) + def test_generate_from_inputs_embeds_decoder_only(self): + pass + # We will verify our results on an image of cute cats def prepare_img(): diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 8a9326c22ac1..08f2c861b630 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -19,6 +19,7 @@ import unittest import numpy as np +import pytest from huggingface_hub import hf_hub_download from transformers import ( @@ -397,7 +398,14 @@ def get_config(self): # this model tester uses a decoder-only language model (OPT) class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester: def __init__( - self, parent, vision_kwargs=None, qformer_kwargs=None, text_kwargs=None, is_training=True, num_query_tokens=10 + self, + parent, + vision_kwargs=None, + qformer_kwargs=None, + text_kwargs=None, + is_training=True, + num_query_tokens=10, + video_token_index=4, ): if vision_kwargs is None: vision_kwargs = {} @@ -414,6 +422,7 @@ def __init__( self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests self.is_training = is_training self.num_query_tokens = num_query_tokens + self.video_token_index = video_token_index def prepare_config_and_inputs(self): _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() @@ -423,6 +432,17 @@ def prepare_config_and_inputs(self): _, c, h, w = pixel_values.shape 
pixel_values = pixel_values.reshape(-1, frames, c, h, w) + vision_tokens = ( + torch.ones( + (input_ids.shape[0], self.num_query_tokens * frames), device=torch_device, dtype=input_ids.dtype + ) + * self.video_token_index + ) + input_ids[input_ids == self.video_token_index] = self.text_model_tester.pad_token_id + input_ids = torch.cat([vision_tokens, input_ids], dim=-1) + vision_attention_mask = torch.ones_like(vision_tokens) + attention_mask = torch.cat([vision_attention_mask, attention_mask], dim=-1) + config = self.get_config() return config, input_ids, attention_mask, qformer_input_ids, qformer_attention_mask, pixel_values @@ -433,6 +453,7 @@ def get_config(self): qformer_config=self.qformer_model_tester.get_config(), text_config=self.text_model_tester.get_config(), num_query_tokens=self.num_query_tokens, + video_token_index=self.video_token_index, ) def create_and_check_for_conditional_generation( @@ -475,6 +496,7 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest( ModelTesterMixin, GenerationTesterMixin, unittest.TestCase ): all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else () fx_compatible = False test_head_masking = False test_pruning = False @@ -550,6 +572,210 @@ def test_model_from_pretrained(self): model = InstructBlipVideoForConditionalGeneration.from_pretrained(model_name) self.assertIsNotNone(model) + # overwrite because InstructBLIPVideo internally calls LM.generate() with embeds thus it cannot operate in no cache format + def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + use_cache = True # force this to be True in case False is passed + + batch_size = main_input.shape[0] + seq_length = main_input.shape[-1] + config = config.text_config if hasattr(config, "text_config") else config + num_sequences_in_output = batch_size * 
num_return_sequences + + gen_len = ( + output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length + ) + + # in some models we subsample the sequence length in inner layers + if hasattr(self.model_tester, "get_subsampled_output_lengths"): + seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) + + # scores + self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + + # unprocessed logits + self._check_logits(num_sequences_in_output, output.logits, config=config) + + # Attentions + if self.has_attentions: + if config.is_encoder_decoder: + # encoder + self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + # decoder + self._check_attentions_for_generate( + num_sequences_in_output, + output.decoder_attentions, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + attentions = output.attentions if not use_cache else output.attentions[1:] + min_length = seq_length if not use_cache else seq_length + 1 + self._check_attentions_for_generate( + num_sequences_in_output, + attentions=attentions, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Hidden States + if config.is_encoder_decoder: + # encoder + self._check_encoder_hidden_states_for_generate( + output.encoder_hidden_states, batch_size, config, seq_length + ) + + # decoder + self._check_hidden_states_for_generate( + num_sequences_in_output, + output.decoder_hidden_states, + min_length=1, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + else: + # if use_cache first input is equal to no use_cache, so skip here + hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] + min_length = seq_length if not use_cache else seq_length 
+ 1 + self._check_hidden_states_for_generate( + num_sequences_in_output, + hidden_states, + min_length=min_length, + max_length=output.sequences.shape[-1], + config=config, + use_cache=use_cache, + ) + + # Past Key Value States -- a few notes here: + # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1" + # 2. We ignore models that have unique cache structures (e.g. mamba) or are in need of refatoring to match the + # standard cache format (e.g.gptbigcode ) + models_without_standard_cache = ( + "ctrl", + "fsmt", + "gptbigcode", + "mega", + "reformer", + "jamba", + "mamba", + "xlnet", + "zamba", + ) + has_standard_cache = not any( + model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache + ) + if has_standard_cache: + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + num_sequences_in_output, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) + elif use_cache is False: + self.assertTrue(output.past_key_values is None) + + # overwrite because InstructBLIPVideo cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present + @pytest.mark.generate + def test_left_padding_compatibility(self): + # NOTE: left-padding results in small numerical differences. This is expected. 
+ # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + # First, filter out models that don't support left padding + # - The model must have generative capabilities + if len(self.all_generative_model_classes) == 0: + self.skipTest(reason="No generative architecture available for this model.") + + # - The model must support padding + if not self.has_attentions: + self.skipTest(reason="This model doesn't support padding.") + + # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) + decoder_only_classes = [] + for model_class in self.all_generative_model_classes: + config, _ = self.prepare_config_and_inputs_for_generate() + if config.is_encoder_decoder: + continue + else: + decoder_only_classes.append(model_class) + if len(decoder_only_classes) == 0: + self.skipTest(reason="No decoder-only architecture available for this model.") + + # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't + # added support for it yet. We skip these models for now. + has_encoder_attributes = any( + attr_name + for attr_name in config.to_dict().keys() + if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" + ) + if has_encoder_attributes: + self.skipTest( + reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." 
+ ) + + # Then, test left-padding + def _prepare_model_kwargs(input_ids, attention_mask, signature): + model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} + if "position_ids" in signature: + position_ids = torch.cumsum(attention_mask, dim=-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + model_kwargs["position_ids"] = position_ids + if "cache_position" in signature: + cache_position = torch.arange(input_ids.shape[-1], device=torch_device) + model_kwargs["cache_position"] = cache_position + return model_kwargs + + for model_class in decoder_only_classes: + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict.get("attention_mask") + pixel_values = inputs_dict["pixel_values"] + qformer_input_ids = inputs_dict["qformer_input_ids"] + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + model = model_class(config).to(torch_device).eval() + signature = inspect.signature(model.forward).parameters.keys() + + # no cache as some models require special cache classes to be init outside forward + model.generation_config.use_cache = False + + # Without padding + model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) + next_logits_wo_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # With left-padding (length 32) + # can hardcode pad_token to be 0 as we'll do attn masking anyway + pad_token_id = ( + config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 + ) + pad_size = (input_ids.shape[0], 32) + padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id + padded_input_ids = torch.cat((padding, input_ids), dim=1) + padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) + model_kwargs = _prepare_model_kwargs(padded_input_ids, 
padded_attention_mask, signature) + next_logits_with_padding = model( + **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids + ).logits[:, -1, :] + + # They should result in very similar logits + self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) + + @unittest.skip( + "InstructBLIPVideo cannot generate on;y from input ids, and requires pixel values in all cases to be present" + ) + def test_generate_from_inputs_embeds_decoder_only(self): + pass + # We will verify our results on an image of cute cats def prepare_video(): From 8cf7507ff0760f596a97404e7ef9aaaec5c5d0e6 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 15 Oct 2024 13:44:03 +0200 Subject: [PATCH 03/12] copies --- .../modular_instructblipvideo.py | 25 +++---------------- 1 file changed, 4 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 2128f25df662..63c6c486854c 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -468,27 +468,10 @@ def generate( ) generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] - outputs = self.language_model.generate( - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - **generate_kwargs, - ) - - # this is a temporary workaround to be consistent with other generation models and - # have BOS as the first token, even though under the hood we are calling LM with embeds + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} if not self.language_model.config.is_encoder_decoder: - # the InstructBLIP authors used inconsistent tokenizer/model files during training, - # with the tokenizer's bos token being set to which has ID=2, - # whereas the model's text config has bos token id = 0 - 
bos_token_id = ( - 2 - if self.config.text_config.architectures[0] == "LLaMAForCausalLM" - else self.config.text_config.bos_token_id - ) - bos_tokens = torch.LongTensor([[bos_token_id]]).repeat(batch_size, 1).to(image_embeds.device) - if not isinstance(outputs, torch.Tensor): - outputs.sequences = torch.cat([bos_tokens, outputs.sequences], dim=-1) - else: - outputs = torch.cat([bos_tokens, outputs], dim=-1) + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) return outputs From 8dcb4fbc570b3bc422e33965a7fb92b4e6839e9c Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 15 Oct 2024 14:15:50 +0200 Subject: [PATCH 04/12] fix slow tests --- tests/models/blip_2/test_modeling_blip_2.py | 9 +++++---- tests/models/instructblip/test_modeling_instructblip.py | 4 ++-- .../instructblipvideo/test_modeling_instructblipvideo.py | 2 +- tests/models/seamless_m4t/test_modeling_seamless_m4t.py | 1 + .../seamless_m4t_v2/test_modeling_seamless_m4t_v2.py | 1 + .../speech_to_text/test_modeling_speech_to_text.py | 1 + tests/models/whisper/test_modeling_whisper.py | 2 ++ 7 files changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index e1c175668e5b..4fa8b31cfc11 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -1652,6 +1652,7 @@ def test_inference_opt(self): generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() # Test output + print(predictions[0].tolist(), generated_text) self.assertEqual(predictions[0].tolist(), [2, 102, 693, 2828, 15, 5, 4105, 19, 10, 2335, 50118]) self.assertEqual("a woman sitting on the beach with a dog", generated_text) @@ -1666,9 +1667,9 @@ def test_inference_opt(self): # Test output self.assertEqual( predictions[0].tolist(), - [2, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], + [2, 45641, 35, 61, 343, 16, 42, 116, 31652, 
35, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], ) - self.assertEqual(generated_text, "it's not a city, it's a beach") + self.assertEqual(generated_text, "Question: which city is this? Answer: it's not a city, it's a beach") def test_inference_interpolate_pos_encoding(self): processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") @@ -1777,9 +1778,9 @@ def test_inference_opt_multi_accelerator(self): # Test output self.assertEqual( predictions[0].tolist(), - [2, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], + [2, 45641, 35, 61, 343, 16, 42, 116, 31652, 35, 24, 18, 45, 10, 343, 6, 24, 18, 10, 4105, 50118], ) - self.assertEqual(generated_text, "it's not a city, it's a beach") + self.assertEqual(generated_text, "Question: which city is this? Answer: it's not a city, it's a beach") @require_torch_multi_accelerator def test_inference_t5_multi_accelerator(self): diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index c3406e7682b7..2cce1e1b1a9d 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -792,12 +792,12 @@ def test_inference_vicuna_7b(self): outputs = model.generate(**inputs, max_new_tokens=30) generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() - expected_outputs = [2, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 372, 338, 19500, 1623, 263, 19587, 4272] # fmt: off + expected_outputs = [2, 1724, 338, 22910, 1048, 445, 1967, 29973, 450, 22910, 9565, 310, 445, 1967, 338, 393, 263, 767, 338, 13977, 292, 22095, 373, 278, 1250, 310, 263, 13328, 20134, 29963, 1550, 19500, 373, 263, 19587, 4272, 11952, 29889] # fmt: off self.assertEqual(outputs[0].tolist(), expected_outputs) self.assertEqual( generated_text, - "The unusual aspect of this image is that a man is ironing 
clothes on the back of a yellow SUV while it is driving down a busy city", + "What is unusual about this image? The unusual aspect of this image is that a man is ironing clothes on the back of a yellow SUV while driving on a busy city street.", ) def test_inference_flant5_xl(self): diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 08f2c861b630..e066c890e76f 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -807,7 +807,7 @@ def test_inference_vicuna_7b(self): generated_text = processor.batch_decode(outputs, skip_special_tokens=True)[0].strip() self.assertEqual( generated_text, - "a baby girl wearing glasses is reading a book on the bed 1080p", + "Explain what is happening in this short video. a baby girl wearing glasses is reading a book on the bed 1080p", ) def test_expansion_in_processing(self): diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py index cb09d44421f4..b028e3acbf00 100644 --- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py @@ -592,6 +592,7 @@ class SeamlessM4TModelWithTextInputTest( test_resize_embeddings = True test_headmasking = False test_torchscript = False + input_name = "input_features" all_model_classes = ( ( diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py index 451fff0b35fb..1eefbbf31f77 100644 --- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py +++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py @@ -600,6 +600,7 @@ class SeamlessM4Tv2ModelWithTextInputTest(ModelTesterMixin, GenerationTesterMixi test_resize_embeddings = True test_headmasking = False test_torchscript = False 
+ input_name = "input_features" all_model_classes = ( ( diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py index 50446d4628af..b5e7eaba6310 100644 --- a/tests/models/speech_to_text/test_modeling_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py @@ -281,6 +281,7 @@ class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest fx_compatible = True test_pruning = False test_missing_keys = False + input_name = "input_features" def setUp(self): self.model_tester = Speech2TextModelTester(self) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index b24c577a16e5..c04a9875eb33 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -394,6 +394,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi # Needs higher percentages after model tester's vocab_size is changed to 200 (PR #21222) # `0.5` is for `test_disk_offload` (which also works for `test_model_parallelism`) model_split_percents = [0.5, 0.8, 0.9] + input_name = "input_features" # TODO: Fix the failed tests def is_pipeline_test_to_skip( @@ -3930,6 +3931,7 @@ class WhisperStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, test_pruning = False is_encoder_decoder = False test_missing_keys = False + input_name = "input_features" def setUp(self): self.model_tester = WhisperStandaloneDecoderModelTester(self, is_training=False) From 8cfabfeaa7c96e3511ae0599c06b3593fcf313a1 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 15 Oct 2024 14:50:42 +0200 Subject: [PATCH 05/12] fix --- tests/generation/test_utils.py | 34 +++++++++---------- .../test_modeling_seamless_m4t.py | 1 - .../test_modeling_seamless_m4t_v2.py | 1 - tests/models/whisper/test_modeling_whisper.py | 1 - 4 files changed, 17 insertions(+), 20 deletions(-) diff --git 
a/tests/generation/test_utils.py b/tests/generation/test_utils.py index b27d219225e9..aa0881775004 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -436,7 +436,7 @@ def test_greedy_generate_dict_outputs(self): use_cache=False, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) # Retrocompatibility check @@ -473,7 +473,7 @@ def test_greedy_generate_dict_outputs_use_cache(self): use_cache=True, # Enable cache ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -489,7 +489,7 @@ def test_sample_generate(self): model = model_class(config).to(torch_device).eval() output_generate = self._sample_generate(model=model, inputs_dict=inputs_dict, num_return_sequences=1) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -513,7 +513,7 @@ def test_sample_generate_dict_output(self): use_cache=False, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateEncoderDecoderOutput) # Retrocompatibility check @@ -537,7 +537,7 @@ def test_beam_search_generate(self): beam_kwargs = self._get_beam_kwargs() output_generate = self._beam_search_generate(model=model, inputs_dict=inputs_dict, beam_kwargs=beam_kwargs) - if model.config.get_text_config().is_encoder_decoder: + 
if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -561,7 +561,7 @@ def test_beam_search_generate_dict_output(self): return_dict_in_generate=True, use_cache=False, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -604,7 +604,7 @@ def test_beam_search_generate_dict_outputs_use_cache(self): use_cache=True, # Enable cache ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -655,7 +655,7 @@ def test_beam_sample_generate(self): beam_kwargs=beam_kwargs, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -704,7 +704,7 @@ def test_beam_sample_generate_dict_output(self): use_cache=False, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -754,7 +754,7 @@ def test_group_beam_search_generate(self): inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: 
self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -767,7 +767,7 @@ def test_group_beam_search_generate(self): inputs_dict=inputs_dict, beam_kwargs=beam_kwargs, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -791,7 +791,7 @@ def test_group_beam_search_generate_dict_output(self): return_dict_in_generate=True, use_cache=False, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -833,7 +833,7 @@ def test_constrained_beam_search_generate(self): beam_kwargs=beam_kwargs, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -857,7 +857,7 @@ def test_constrained_beam_search_generate(self): beam_kwargs=beam_kwargs, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -895,7 +895,7 @@ def test_constrained_beam_search_generate_dict_output(self): use_cache=False, ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) self.assertIsInstance(output_generate, GenerateBeamEncoderDecoderOutput) # Retrocompatibility check @@ -935,7 +935,7 @@ def 
test_contrastive_generate(self): inputs_dict=inputs_dict, use_cache=True, # Enable cache ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + main_input.shape[-1]) @@ -970,7 +970,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self): use_cache=True, # Enable cache ) - if model.config.get_text_config().is_encoder_decoder: + if model.config.is_encoder_decoder: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + 1) else: self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + main_input.shape[-1]) diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py index b028e3acbf00..cb09d44421f4 100644 --- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py @@ -592,7 +592,6 @@ class SeamlessM4TModelWithTextInputTest( test_resize_embeddings = True test_headmasking = False test_torchscript = False - input_name = "input_features" all_model_classes = ( ( diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py index 1eefbbf31f77..451fff0b35fb 100644 --- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py +++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py @@ -600,7 +600,6 @@ class SeamlessM4Tv2ModelWithTextInputTest(ModelTesterMixin, GenerationTesterMixi test_resize_embeddings = True test_headmasking = False test_torchscript = False - input_name = "input_features" all_model_classes = ( ( diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index c04a9875eb33..3229714b7efe 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ 
b/tests/models/whisper/test_modeling_whisper.py @@ -3931,7 +3931,6 @@ class WhisperStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, test_pruning = False is_encoder_decoder = False test_missing_keys = False - input_name = "input_features" def setUp(self): self.model_tester = WhisperStandaloneDecoderModelTester(self, is_training=False) From f0aff4f02ea2d741a733d6385eb2d79a4dc8d5f8 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 15 Oct 2024 15:29:30 +0200 Subject: [PATCH 06/12] uncomment this --- tests/generation/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index aa0881775004..34746f49c093 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -455,8 +455,8 @@ def test_greedy_generate_dict_outputs_use_cache(self): config, inputs_dict = self.prepare_config_and_inputs_for_generate() main_input = inputs_dict[self.input_name] - # if not hasattr(config, "use_cache") or not hasattr(config.get_text_config(), "use_cache"): - # self.skipTest(reason=f"{model_class.__name__} doesn't support caching") + if not hasattr(config, "use_cache"): + self.skipTest(reason=f"{model_class.__name__} doesn't support caching") if any(model_name in model_class.__name__.lower() for model_name in ["rwkv"]): self.skipTest(reason="Won't fix: model with non-standard dictionary output shapes") From c39c5ed95de252fd043b093e6c657f6bbc3f5b0d Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 21 Oct 2024 12:34:27 +0200 Subject: [PATCH 07/12] clean up after rebase --- .../models/blip_2/modeling_blip_2.py | 22 ++++------ src/transformers/models/glm/modeling_glm.py | 41 +++---------------- tests/models/blip_2/test_modeling_blip_2.py | 4 +- .../test_modeling_instructblip.py | 2 +- .../test_modeling_instructblipvideo.py | 2 +- .../test_modeling_speech_to_text.py | 1 - tests/models/whisper/test_modeling_whisper.py | 1 - 7 files changed, 17 insertions(+), 56 
deletions(-) diff --git a/src/transformers/models/blip_2/modeling_blip_2.py b/src/transformers/models/blip_2/modeling_blip_2.py index 1cdcdff7fc6d..8b947a106667 100644 --- a/src/transformers/models/blip_2/modeling_blip_2.py +++ b/src/transformers/models/blip_2/modeling_blip_2.py @@ -1771,15 +1771,12 @@ def forward( decoder_attention_mask=decoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, # toggle for easier access to loss/logits below labels=labels, ) - if labels is not None: - loss = outputs.loss if return_dict else outputs[0] - logits = outputs.logits if return_dict else outputs[1] - else: - loss = None - logits = outputs.logits if return_dict else outputs[0] + loss = outputs.loss + logits = outputs.logits + outputs = outputs.to_tuple() if not return_dict else outputs if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) @@ -2244,15 +2241,12 @@ def forward( decoder_attention_mask=decoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, - return_dict=return_dict, + return_dict=True, # toggle for easier access to loss/logits below labels=labels, ) - if labels is not None: - loss = outputs.loss if return_dict else outputs[0] - logits = outputs.logits if return_dict else outputs[1] - else: - loss = None - logits = outputs.logits if return_dict else outputs[0] + loss = outputs.loss + logits = outputs.logits + outputs = outputs.to_tuple() if not return_dict else outputs if not return_dict: output = (logits, vision_outputs, query_outputs, outputs) diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 9815dbc78992..a458c02a6fed 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -25,7 +25,6 @@ import torch import torch.nn as nn import torch.utils.checkpoint -from torch.nn import 
BCEWithLogitsLoss, CrossEntropyLoss, MSELoss from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache, StaticCache @@ -921,6 +920,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position( device: torch.device, cache_position: torch.Tensor, batch_size: int, + **kwargs, ): """ Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape @@ -1071,18 +1071,7 @@ def forward( loss = None if labels is not None: - # Upcast to float if we need to compute the loss to avoid potential precision issues - logits = logits.float() - # Shift so that tokens < n predict n - shift_logits = logits[..., :-1, :].contiguous() - shift_labels = labels[..., 1:].contiguous() - # Flatten the tokens - loss_fct = CrossEntropyLoss() - shift_logits = shift_logits.view(-1, self.config.vocab_size) - shift_labels = shift_labels.view(-1) - # Enable model parallelism - shift_labels = shift_labels.to(shift_logits.device) - loss = loss_fct(shift_logits, shift_labels) + loss = self.loss_function(logits, labels, self.vocab_size) if not return_dict: output = (logits,) + outputs[1:] @@ -1186,27 +1175,8 @@ def forward( loss = None if labels is not None: - labels = labels.to(logits.device) - if self.config.problem_type is None: - if self.num_labels == 1: - self.config.problem_type = "regression" - elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): - self.config.problem_type = "single_label_classification" - else: - self.config.problem_type = "multi_label_classification" - - if self.config.problem_type == "regression": - loss_fct = MSELoss() - if self.num_labels == 1: - loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) - else: - loss = loss_fct(pooled_logits, labels) - elif self.config.problem_type == "single_label_classification": - loss_fct = CrossEntropyLoss() - loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) - elif self.config.problem_type == 
"multi_label_classification": - loss_fct = BCEWithLogitsLoss() - loss = loss_fct(pooled_logits, labels) + loss = self.loss_function(logits=logits, labels=labels, pooled_logits=pooled_logits, config=self.config) + if not return_dict: output = (pooled_logits,) + transformer_outputs[1:] return ((loss,) + output) if loss is not None else output @@ -1289,8 +1259,7 @@ def forward( loss = None if labels is not None: - loss_fct = CrossEntropyLoss() - loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + loss = self.loss_function(logits, labels, self.config) if not return_dict: output = (logits,) + outputs[2:] diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 4fa8b31cfc11..804bcf6f3f56 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -650,7 +650,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return elif use_cache is False: self.assertTrue(output.past_key_values is None) - # overwrite because BLIP2 cannot generate on;y from input ids, and requires pixel values in all cases to be present + # overwrite because BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present @pytest.mark.generate def test_left_padding_compatibility(self): # NOTE: left-padding results in small numerical differences. This is expected. 
@@ -733,7 +733,7 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature): # They should result in very similar logits self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) - @unittest.skip("BLIP2 cannot generate on;y from input ids, and requires pixel values in all cases to be present") + @unittest.skip("BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present") def test_generate_from_inputs_embeds_decoder_only(self): pass diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 2cce1e1b1a9d..7f268d3cc3f8 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -747,7 +747,7 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature): self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) @unittest.skip( - "InstructBLIP cannot generate on;y from input ids, and requires pixel values in all cases to be present" + "InstructBLIP cannot generate only from input ids, and requires pixel values in all cases to be present" ) def test_generate_from_inputs_embeds_decoder_only(self): pass diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index e066c890e76f..fd3188ce77dd 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -771,7 +771,7 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature): self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) @unittest.skip( - "InstructBLIPVideo cannot generate on;y from input ids, and requires pixel values in all cases to be present" + "InstructBLIPVideo cannot generate only from input ids, and requires pixel values 
in all cases to be present" ) def test_generate_from_inputs_embeds_decoder_only(self): pass diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py index 31ef16a44da9..253cda7e49cb 100644 --- a/tests/models/speech_to_text/test_modeling_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py @@ -281,7 +281,6 @@ class Speech2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTest fx_compatible = True test_pruning = False test_missing_keys = False - input_name = "input_features" def setUp(self): self.model_tester = Speech2TextModelTester(self) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 3229714b7efe..b24c577a16e5 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -394,7 +394,6 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi # Needs higher percentages after model tester's vocab_size is changed to 200 (PR #21222) # `0.5` is for `test_disk_offload` (which also works for `test_model_parallelism`) model_split_percents = [0.5, 0.8, 0.9] - input_name = "input_features" # TODO: Fix the failed tests def is_pipeline_test_to_skip( From 37d25b1ecf47d4b92d2df357bd96988de7367cb2 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 21 Oct 2024 12:40:57 +0200 Subject: [PATCH 08/12] should be model main input --- tests/generation/test_utils.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index ae59da97ec41..3c702c580f5f 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -424,7 +424,7 @@ def test_greedy_generate(self): def test_greedy_generate_dict_outputs(self): for model_class in self.all_generative_model_classes: config, inputs_dict = 
self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._greedy_generate( @@ -457,7 +457,7 @@ def test_greedy_generate_dict_outputs(self): def test_greedy_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -503,7 +503,7 @@ def test_sample_generate(self): def test_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._sample_generate( @@ -552,7 +552,7 @@ def test_beam_search_generate(self): def test_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -588,7 +588,7 @@ def test_beam_search_generate_dict_output(self): def test_beam_search_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -696,7 +696,7 @@ 
def test_beam_sample_generate(self): def test_beam_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -786,7 +786,7 @@ def test_group_beam_search_generate(self): def test_group_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_diverse_beam_kwargs() @@ -880,7 +880,7 @@ def test_constrained_beam_search_generate(self): def test_constrained_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() @@ -963,7 +963,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self): self.skipTest(reason="Won't fix: old model with different cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] # NOTE: contrastive search only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -1121,7 +1121,7 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1196,7 +1196,7 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1252,7 +1252,7 @@ def test_dola_decoding_sample(self): # enable cache if the model is not openai-gpt, xlnet, cpm, or xlm config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] # Encoder-decoder models are not supported if config.is_encoder_decoder: @@ -1310,7 +1310,7 @@ def test_assisted_decoding_sample(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -1857,7 +1857,7 @@ def test_generate_with_static_cache(self): self.skipTest(reason="This model does not support the static cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[self.input_name] + main_input = inputs_dict[model_class.main_input_name] if config.is_encoder_decoder: self.skipTest(reason="This model is encoder-decoder and has Encoder-Decoder Cache") From ce467a01d0a2c0a2214184f0795e2358f4fdc727 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 21 Oct 2024 12:51:59 +0200 Subject: [PATCH 09/12] fix overwritten tests --- tests/models/blip_2/test_modeling_blip_2.py | 8 ++++++-- tests/models/instructblip/test_modeling_instructblip.py | 8 ++++++-- .../instructblipvideo/test_modeling_instructblipvideo.py | 8 ++++++-- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 804bcf6f3f56..ffe979ca6e55 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -412,7 +412,7 @@ def __init__( self.qformer_model_tester = Blip2QFormerModelTester(parent, **qformer_kwargs) self.text_model_tester = Blip2TextModelDecoderOnlyTester(parent, **text_kwargs) self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests + self.seq_length = self.text_model_tester.seq_length + num_query_tokens # need seq_length for common tests self.is_training = is_training self.num_query_tokens = num_query_tokens self.image_token_index = image_token_index @@ -545,7 +545,11 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return use_cache = True # force this to be True in case False is passed batch_size = main_input.shape[0] - seq_length = main_input.shape[-1] + + seq_length = getattr(self.model_tester, 
"seq_length", None) + seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) + seq_length = getattr(self.model_tester, "text_seq_length", seq_length) + config = config.text_config if hasattr(config, "text_config") else config num_sequences_in_output = batch_size * num_return_sequences diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 7f268d3cc3f8..ac2738c67fd4 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -405,7 +405,7 @@ def __init__( self.qformer_model_tester = InstructBlipQFormerModelTester(parent, **qformer_kwargs) self.text_model_tester = InstructBlipTextModelDecoderOnlyTester(parent, **text_kwargs) self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests + self.seq_length = self.text_model_tester.seq_length + num_query_tokens # need seq_length for common tests self.is_training = is_training self.num_query_tokens = num_query_tokens self.image_token_index = image_token_index @@ -553,7 +553,11 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return use_cache = True # force this to be True in case False is passed batch_size = main_input.shape[0] - seq_length = main_input.shape[-1] + + seq_length = getattr(self.model_tester, "seq_length", None) + seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) + seq_length = getattr(self.model_tester, "text_seq_length", seq_length) + config = config.text_config if hasattr(config, "text_config") else config num_sequences_in_output = batch_size * num_return_sequences diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index fd3188ce77dd..917e40914147 100644 --- 
a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -419,7 +419,7 @@ def __init__( self.qformer_model_tester = InstructBlipVideoQFormerModelTester(parent, **qformer_kwargs) self.text_model_tester = InstructBlipVideoTextModelDecoderOnlyTester(parent, **text_kwargs) self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length # need seq_length for common tests + self.seq_length = self.text_model_tester.seq_length + num_query_tokens # need seq_length for common tests self.is_training = is_training self.num_query_tokens = num_query_tokens self.video_token_index = video_token_index @@ -577,7 +577,11 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return use_cache = True # force this to be True in case False is passed batch_size = main_input.shape[0] - seq_length = main_input.shape[-1] + + seq_length = getattr(self.model_tester, "seq_length", None) + seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) + seq_length = getattr(self.model_tester, "text_seq_length", seq_length) + config = config.text_config if hasattr(config, "text_config") else config num_sequences_in_output = batch_size * num_return_sequences From 95f6b763d67dabfd55d6a337c81c21ffbe959217 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 21 Oct 2024 13:03:24 +0200 Subject: [PATCH 10/12] oops len should be multiple of frame number --- .../instructblipvideo/test_modeling_instructblipvideo.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 917e40914147..3526de56d015 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -419,7 
+419,9 @@ def __init__( self.qformer_model_tester = InstructBlipVideoQFormerModelTester(parent, **qformer_kwargs) self.text_model_tester = InstructBlipVideoTextModelDecoderOnlyTester(parent, **text_kwargs) self.batch_size = self.text_model_tester.batch_size # need bs for batching_equivalence test - self.seq_length = self.text_model_tester.seq_length + num_query_tokens # need seq_length for common tests + self.frames = self.vision_model_tester.frames + # need seq_length for common tests + self.seq_length = self.text_model_tester.seq_length + (num_query_tokens * self.frames) self.is_training = is_training self.num_query_tokens = num_query_tokens self.video_token_index = video_token_index @@ -428,13 +430,12 @@ def prepare_config_and_inputs(self): _, pixel_values = self.vision_model_tester.prepare_config_and_inputs() _, _, _, qformer_input_ids, qformer_attention_mask = self.qformer_model_tester.prepare_config_and_inputs() _, input_ids, attention_mask = self.text_model_tester.prepare_config_and_inputs() - frames = self.vision_model_tester.frames _, c, h, w = pixel_values.shape - pixel_values = pixel_values.reshape(-1, frames, c, h, w) + pixel_values = pixel_values.reshape(-1, self.frames, c, h, w) vision_tokens = ( torch.ones( - (input_ids.shape[0], self.num_query_tokens * frames), device=torch_device, dtype=input_ids.dtype + (input_ids.shape[0], self.num_query_tokens * self.frames), device=torch_device, dtype=input_ids.dtype ) * self.video_token_index ) From 63756d6b14e95a53899056ceccde65ba339545ac Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 25 Oct 2024 10:44:18 +0200 Subject: [PATCH 11/12] style --- .../models/instructblipvideo/test_modeling_instructblipvideo.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 23c769945395..3ac00cd42411 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ 
b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -843,6 +843,7 @@ def test_sdpa_can_dispatch_composite_models(self): ): raise ValueError("The SDPA model should have SDPA attention layers") + # We will verify our results on an image of cute cats def prepare_video(): video_file = hf_hub_download( From 3fec5100741708b6e48b5e224d155ae269ebbc39 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 1 Nov 2024 08:28:37 +0100 Subject: [PATCH 12/12] fix some tests --- tests/models/blip_2/test_modeling_blip_2.py | 70 ++++++++----------- .../test_modeling_instructblip.py | 70 ++++++++----------- .../test_modeling_instructblipvideo.py | 70 ++++++++----------- 3 files changed, 84 insertions(+), 126 deletions(-) diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index b42bc4dfb6ca..ffd3e47e9374 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -21,6 +21,7 @@ import numpy as np import pytest import requests +from parameterized import parameterized from transformers import CONFIG_MAPPING, Blip2Config, Blip2QFormerConfig, Blip2VisionConfig from transformers.testing_utils import ( @@ -603,17 +604,19 @@ def test_model_from_pretrained(self): self.assertIsNotNone(model) # overwrite because BLIP internally calls LM.generate() with embeds thus it cannot operate in no cache format - def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): use_cache = True # force this to be True in case False is passed - batch_size = main_input.shape[0] + input_batch_size = int(output.sequences.shape[0] / num_return_sequences) + internal_batch_size = ( + input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences + ) seq_length = getattr(self.model_tester, "seq_length", None) seq_length = 
getattr(self.model_tester, "encoder_seq_length", seq_length) seq_length = getattr(self.model_tester, "text_seq_length", seq_length) config = config.text_config if hasattr(config, "text_config") else config - num_sequences_in_output = batch_size * num_return_sequences gen_len = ( output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length @@ -624,19 +627,21 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config) # unprocessed logits - self._check_logits(num_sequences_in_output, output.logits, config=config) + self._check_logits(internal_batch_size, output.logits, config=config) # Attentions if self.has_attentions: if config.is_encoder_decoder: # encoder - self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + self._check_encoder_attention_for_generate( + output.encoder_attentions, input_batch_size, config, seq_length + ) # decoder self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_attentions, min_length=1, max_length=output.sequences.shape[-1], @@ -648,7 +653,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return attentions = output.attentions if not use_cache else output.attentions[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, attentions=attentions, min_length=min_length, max_length=output.sequences.shape[-1], @@ -660,12 +665,12 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return if config.is_encoder_decoder: # encoder self._check_encoder_hidden_states_for_generate( - 
output.encoder_hidden_states, batch_size, config, seq_length + output.encoder_hidden_states, input_batch_size, config, seq_length ) # decoder self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_hidden_states, min_length=1, max_length=output.sequences.shape[-1], @@ -677,7 +682,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, hidden_states, min_length=min_length, max_length=output.sequences.shape[-1], @@ -685,36 +690,16 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return use_cache=use_cache, ) - # Past Key Value States -- a few notes here: - # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1" - # 2. We ignore models that have unique cache structures (e.g. 
mamba) or are in need of refatoring to match the - # standard cache format (e.g.gptbigcode ) - models_without_standard_cache = ( - "ctrl", - "fsmt", - "gptbigcode", - "mega", - "reformer", - "jamba", - "mamba", - "xlnet", - "zamba", - ) - has_standard_cache = not any( - model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache - ) - if has_standard_cache: - if use_cache: - past_key_values = output.past_key_values - past_sequence_length = output.sequences.shape[-1] - 1 - self._check_past_key_values_for_generate( - num_sequences_in_output, - past_key_values, - seq_length=past_sequence_length, - config=config, - ) - elif use_cache is False: - self.assertTrue(output.past_key_values is None) + # Past Key Value States + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + internal_batch_size, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) # overwrite because BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present @pytest.mark.generate @@ -800,7 +785,8 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature): self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5)) @unittest.skip("BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present") - def test_generate_from_inputs_embeds_decoder_only(self): + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): pass diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 44793195309f..618d20347860 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -21,6 +21,7 @@ import numpy as np import pytest import requests +from 
parameterized import parameterized from transformers import ( CONFIG_MAPPING, @@ -552,17 +553,19 @@ def test_model_from_pretrained(self): self.assertIsNotNone(model) # overwrite because InstructBLIP internally calls LM.generate() with embeds thus it cannot operate in no cache format - def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): use_cache = True # force this to be True in case False is passed - batch_size = main_input.shape[0] + input_batch_size = int(output.sequences.shape[0] / num_return_sequences) + internal_batch_size = ( + input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences + ) seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) seq_length = getattr(self.model_tester, "text_seq_length", seq_length) config = config.text_config if hasattr(config, "text_config") else config - num_sequences_in_output = batch_size * num_return_sequences gen_len = ( output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length @@ -573,19 +576,21 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config) # unprocessed logits - self._check_logits(num_sequences_in_output, output.logits, config=config) + self._check_logits(internal_batch_size, output.logits, config=config) # Attentions if self.has_attentions: if config.is_encoder_decoder: # encoder - self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + self._check_encoder_attention_for_generate( + 
output.encoder_attentions, input_batch_size, config, seq_length + ) # decoder self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_attentions, min_length=1, max_length=output.sequences.shape[-1], @@ -597,7 +602,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return attentions = output.attentions if not use_cache else output.attentions[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, attentions=attentions, min_length=min_length, max_length=output.sequences.shape[-1], @@ -609,12 +614,12 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return if config.is_encoder_decoder: # encoder self._check_encoder_hidden_states_for_generate( - output.encoder_hidden_states, batch_size, config, seq_length + output.encoder_hidden_states, input_batch_size, config, seq_length ) # decoder self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_hidden_states, min_length=1, max_length=output.sequences.shape[-1], @@ -626,7 +631,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, hidden_states, min_length=min_length, max_length=output.sequences.shape[-1], @@ -634,36 +639,16 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return use_cache=use_cache, ) - # Past Key Value States -- a few notes here: - # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1" - # 2. We ignore models that have unique cache structures (e.g. 
mamba) or are in need of refatoring to match the - # standard cache format (e.g.gptbigcode ) - models_without_standard_cache = ( - "ctrl", - "fsmt", - "gptbigcode", - "mega", - "reformer", - "jamba", - "mamba", - "xlnet", - "zamba", - ) - has_standard_cache = not any( - model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache - ) - if has_standard_cache: - if use_cache: - past_key_values = output.past_key_values - past_sequence_length = output.sequences.shape[-1] - 1 - self._check_past_key_values_for_generate( - num_sequences_in_output, - past_key_values, - seq_length=past_sequence_length, - config=config, - ) - elif use_cache is False: - self.assertTrue(output.past_key_values is None) + # Past Key Value States + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + internal_batch_size, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) # overwrite because InstructBLIP cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present @pytest.mark.generate @@ -756,7 +741,8 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature): @unittest.skip( "InstructBLIP cannot generate only from input ids, and requires pixel values in all cases to be present" ) - def test_generate_from_inputs_embeds_decoder_only(self): + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): pass @require_torch_sdpa diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 3ac00cd42411..60ebf206a06e 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -21,6 +21,7 @@ import numpy as np import pytest from 
huggingface_hub import hf_hub_download +from parameterized import parameterized from transformers import ( CONFIG_MAPPING, @@ -576,17 +577,19 @@ def test_model_from_pretrained(self): self.assertIsNotNone(model) # overwrite because InstructBLIPVideo internally calls LM.generate() with embeds thus it cannot operate in no cache format - def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): use_cache = True # force this to be True in case False is passed - batch_size = main_input.shape[0] + input_batch_size = int(output.sequences.shape[0] / num_return_sequences) + internal_batch_size = ( + input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences + ) seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) seq_length = getattr(self.model_tester, "text_seq_length", seq_length) config = config.text_config if hasattr(config, "text_config") else config - num_sequences_in_output = batch_size * num_return_sequences gen_len = ( output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length @@ -597,19 +600,21 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config) # unprocessed logits - self._check_logits(num_sequences_in_output, output.logits, config=config) + self._check_logits(internal_batch_size, output.logits, config=config) # Attentions if self.has_attentions: if config.is_encoder_decoder: # encoder - self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) 
+ self._check_encoder_attention_for_generate( + output.encoder_attentions, input_batch_size, config, seq_length + ) # decoder self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_attentions, min_length=1, max_length=output.sequences.shape[-1], @@ -621,7 +626,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return attentions = output.attentions if not use_cache else output.attentions[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, attentions=attentions, min_length=min_length, max_length=output.sequences.shape[-1], @@ -633,12 +638,12 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return if config.is_encoder_decoder: # encoder self._check_encoder_hidden_states_for_generate( - output.encoder_hidden_states, batch_size, config, seq_length + output.encoder_hidden_states, input_batch_size, config, seq_length ) # decoder self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_hidden_states, min_length=1, max_length=output.sequences.shape[-1], @@ -650,7 +655,7 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, hidden_states, min_length=min_length, max_length=output.sequences.shape[-1], @@ -658,36 +663,16 @@ def _check_outputs(self, output, main_input, config, use_cache=False, num_return use_cache=use_cache, ) - # Past Key Value States -- a few notes here: - # 1. Its inner sequence length is with respect to the inputs of the latest forward pass, hence the "-1" - # 2. We ignore models that have unique cache structures (e.g. 
mamba) or are in need of refatoring to match the - # standard cache format (e.g.gptbigcode ) - models_without_standard_cache = ( - "ctrl", - "fsmt", - "gptbigcode", - "mega", - "reformer", - "jamba", - "mamba", - "xlnet", - "zamba", - ) - has_standard_cache = not any( - model_name in config.__class__.__name__.lower() for model_name in models_without_standard_cache - ) - if has_standard_cache: - if use_cache: - past_key_values = output.past_key_values - past_sequence_length = output.sequences.shape[-1] - 1 - self._check_past_key_values_for_generate( - num_sequences_in_output, - past_key_values, - seq_length=past_sequence_length, - config=config, - ) - elif use_cache is False: - self.assertTrue(output.past_key_values is None) + # Past Key Value States + if use_cache: + past_key_values = output.past_key_values + past_sequence_length = output.sequences.shape[-1] - 1 + self._check_past_key_values_for_generate( + internal_batch_size, + past_key_values, + seq_length=past_sequence_length, + config=config, + ) # overwrite because InstructBLIPVideo cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present @pytest.mark.generate @@ -780,7 +765,8 @@ def _prepare_model_kwargs(input_ids, attention_mask, signature): @unittest.skip( "InstructBLIPVideo cannot generate only from input ids, and requires pixel values in all cases to be present" ) - def test_generate_from_inputs_embeds_decoder_only(self): + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): pass @require_torch_sdpa