From 6eefb942d1853180a60861d9f6322347e4fceb1e Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Fri, 25 Oct 2024 17:23:54 +0000 Subject: [PATCH 01/16] tmp commit --- src/transformers/generation/utils.py | 8 +- tests/generation/test_utils.py | 449 +++++++++++++++--- tests/models/moshi/test_modeling_moshi.py | 4 +- .../models/musicgen/test_modeling_musicgen.py | 46 -- tests/test_modeling_common.py | 424 ----------------- 5 files changed, 380 insertions(+), 551 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index efe953db051c..705857363415 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -378,10 +378,14 @@ def prepare_inputs_for_generation( # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens # Exception 1: when passing input_embeds, input_ids may be missing entries # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here - # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. + # (we can't check exception 3 while compiling) if past_key_values is not None: model_inputs["past_key_values"] = past_key_values - if inputs_embeds is not None or cache_position[-1] >= input_ids.shape[1]: # Exception 1 or Exception 3 + if ( + inputs_embeds is not None # Exception 1 + or (is_torchdynamo_compiling() or cache_position[-1] >= input_ids.shape[1]) # Exception 3 + ): input_ids = input_ids[:, -cache_position.shape[0] :] elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) input_ids = input_ids[:, cache_position] diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 6f2eaf734df1..8afa1c12eda4 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -17,20 +17,26 @@ import copy import gc import inspect +import os import tempfile +import time import unittest import warnings import numpy as np import pytest +from packaging import version from parameterized import parameterized from transformers import AutoConfig, is_torch_available, pipeline, set_seed from transformers.testing_utils import ( is_flaky, require_accelerate, + require_flash_attn, require_optimum_quanto, + require_read_token, require_torch, + require_torch_accelerator, require_torch_gpu, require_torch_multi_accelerator, require_torch_multi_gpu, @@ -136,6 +142,34 @@ def prepare_config_and_inputs_for_generate(self, batch_size=2): return config, filtered_inputs_dict + def _check_similar_generate_outputs(self, output_1, output_2, atol=1e-5, rtol=1e-5): + """ + Checks whether a pair of generate outputs are similar. Two `generate` call outputs are considered similar in + the following siturations: + 1. The sequences are the same + 2. 
The sequences are different, but the scores up until the first mismatch are nearly identical + """ + # scores doesn't include data regarding decoder input tokens + decoder_input_length = output_1.sequences.shape[1] - len(output_1.scores) + output_matches = output_1.sequences == output_2.sequences + has_matching_outputs = output_matches.all() + has_matching_scores = None + if not has_matching_outputs: + for batch_idx in range(output_1.sequences.shape[0]): + batch_matches = output_matches[batch_idx] + if batch_matches.all(): + continue + first_mismatch_idx = batch_matches.int().argmin() # gets the index of the first False + first_mismatch_idx -= decoder_input_length + output_1_first_mismatch_scores = output_1.scores[first_mismatch_idx][batch_idx] + output_2_first_mismatch_scores = output_2.scores[first_mismatch_idx][batch_idx] + has_matching_scores = torch.allclose( + output_1_first_mismatch_scores, output_2_first_mismatch_scores, rtol=atol, atol=rtol + ) + if not has_matching_scores: + break + self.assertTrue(has_matching_outputs or has_matching_scores) + def _get_logits_processor_kwargs(self, do_sample=False, config=None): logits_processor_kwargs = { "bad_words_ids": [[1, 0]], @@ -426,7 +460,6 @@ def test_greedy_generate(self): def test_greedy_generate_dict_outputs(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._greedy_generate( @@ -453,13 +486,12 @@ def test_greedy_generate_dict_outputs(self): # Retrocompatibility check self.assertIsInstance(output_generate, GreedySearchDecoderOnlyOutput) - self._check_outputs(output_generate, main_input, model.config) + self._check_outputs(output_generate, model.config) @pytest.mark.generate def test_greedy_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -486,7 +518,7 @@ def test_greedy_generate_dict_outputs_use_cache(self): output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] ) - self._check_outputs(output_generate, main_input, model.config, use_cache=True) + self._check_outputs(output_generate, model.config, use_cache=True) @pytest.mark.generate def test_sample_generate(self): @@ -505,7 +537,6 @@ def test_sample_generate(self): def test_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() output_generate = self._sample_generate( @@ -533,7 +564,7 @@ def test_sample_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, SampleDecoderOnlyOutput) - self._check_outputs(output_generate, main_input, model.config, num_return_sequences=2) + self._check_outputs(output_generate, model.config, num_return_sequences=2) @pytest.mark.generate def test_beam_search_generate(self): @@ -554,7 +585,6 @@ def test_beam_search_generate(self): def test_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = 
self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -582,15 +612,12 @@ def test_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] - ) + self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) @pytest.mark.generate def test_beam_search_generate_dict_outputs_use_cache(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] if not hasattr(config, "use_cache"): self.skipTest(reason=f"{model_class.__name__} doesn't support caching") @@ -622,11 +649,7 @@ def test_beam_search_generate_dict_outputs_use_cache(self): ) self._check_outputs( - output_generate, - main_input, - model.config, - use_cache=True, - num_return_sequences=beam_kwargs["num_beams"], + output_generate, model.config, use_cache=True, num_return_sequences=beam_kwargs["num_beams"] ) @require_accelerate @@ -698,7 +721,6 @@ def test_beam_sample_generate(self): def test_beam_sample_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_beam_kwargs() @@ -729,7 +751,7 @@ def test_beam_sample_generate_dict_output(self): self.assertIsInstance(output_generate, BeamSampleDecoderOnlyOutput) self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] + output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"] ) @pytest.mark.generate @@ -788,7 +810,6 @@ def test_group_beam_search_generate(self): def test_group_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() beam_kwargs = self._get_diverse_beam_kwargs() @@ -816,9 +837,7 @@ def test_group_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] - ) + self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) # TODO: @gante check why it is flaky @is_flaky() @@ -921,9 +940,7 @@ def test_constrained_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - self._check_outputs( - output_generate, main_input, model.config, num_return_sequences=beam_kwargs["num_beams"] - ) + self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) @pytest.mark.generate def test_contrastive_generate(self): @@ -965,7 +982,6 @@ def test_contrastive_generate_dict_outputs_use_cache(self): self.skipTest(reason="Won't fix: old model with different cache format") config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = 
inputs_dict[model_class.main_input_name] # NOTE: contrastive search only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -991,7 +1007,7 @@ def test_contrastive_generate_dict_outputs_use_cache(self): output_generate.sequences.shape[-1] == self.max_new_tokens + inputs_dict["input_ids"].shape[-1] ) - self._check_outputs(output_generate, main_input, model.config, use_cache=True) + self._check_outputs(output_generate, model.config, use_cache=True) @pytest.mark.generate def test_contrastive_generate_low_memory(self): @@ -1087,14 +1103,10 @@ def test_beam_search_low_memory(self): @pytest.mark.generate @parameterized.expand([("random",), ("same",)]) - @is_flaky() # Read NOTE (1) below. If there are API issues, all attempts will fail. def test_assisted_decoding_matches_greedy_search(self, assistant_type): # This test ensures that the assisted generation does not introduce output changes over greedy search. - # NOTE (1): The sentence above is true most of the time, there is a tiny difference in the logits due to matmul - # shape differences -- and it may result in a different output. The input shape difference happens in the - # main model, that runs the forward pass with several candidates at once (as opposed to generating one token at - # a time). See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 for more info. - # NOTE (2): It breaks the pattern in the tests above, for multiple reasons: + # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 for more info. + # NOTE: It breaks the pattern in the tests above, for multiple reasons: # - assisted_decoding, contrarily to the other methods, can't be called on its own (e.g. needs to # prepare the assistant encoder outputs in the main generate body); # - assisted_decoding does not support `use_cache = False` @@ -1123,7 +1135,6 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1164,12 +1175,10 @@ def test_assisted_decoding_matches_greedy_search(self, assistant_type): output_assisted = model.generate(**generation_kwargs, **inputs_dict) # The two outputs must match and their shape must be as expected - - self.assertListEqual(output_greedy.sequences.tolist(), output_assisted.sequences.tolist()) + self._check_similar_generate_outputs(output_greedy, output_assisted) for output in (output_greedy, output_assisted): - self._check_outputs(output, main_input, model.config, use_cache=True) + self._check_outputs(output, model.config, use_cache=True) - @is_flaky() @pytest.mark.generate def test_prompt_lookup_decoding_matches_greedy_search(self): # This test ensures that the prompt lookup generation does not introduce output changes over greedy search. @@ -1198,7 +1207,6 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. 
if not hasattr(config, "use_cache"): @@ -1231,10 +1239,9 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): output_prompt_lookup = model.generate(**generation_kwargs, **inputs_dict) # The two outputs must match and their shape must be as expected - - self.assertListEqual(output_greedy.sequences.tolist(), output_prompt_lookup.sequences.tolist()) + self._check_similar_generate_outputs(output_greedy, output_prompt_lookup) for output in (output_greedy, output_prompt_lookup): - self._check_outputs(output, main_input, model.config, use_cache=True) + self._check_outputs(output, model.config, use_cache=True) @pytest.mark.generate def test_dola_decoding_sample(self): @@ -1254,7 +1261,6 @@ def test_dola_decoding_sample(self): # enable cache if the model is not openai-gpt, xlnet, cpm, or xlm config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] # Encoder-decoder models are not supported if config.is_encoder_decoder: @@ -1282,7 +1288,7 @@ def test_dola_decoding_sample(self): "dola_layers": "low", } output_dola = model.generate(**generation_kwargs, **inputs_dict) - self._check_outputs(output_dola, main_input, model.config, use_cache=getattr(config, "use_cache", False)) + self._check_outputs(output_dola, model.config, use_cache=getattr(config, "use_cache", False)) @pytest.mark.generate def test_assisted_decoding_sample(self): @@ -1312,7 +1318,6 @@ def test_assisted_decoding_sample(self): # enable cache config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) - main_input = inputs_dict[model_class.main_input_name] # NOTE: assisted generation only works with cache on at the moment. if not hasattr(config, "use_cache"): @@ -1344,7 +1349,7 @@ def test_assisted_decoding_sample(self): } output_assisted = model.generate(**generation_kwargs, **inputs_dict) - self._check_outputs(output_assisted, main_input, config, use_cache=True) + self._check_outputs(output_assisted, config, use_cache=True) @pytest.mark.generate def test_prompt_lookup_decoding_stops_at_eos(self): @@ -1931,12 +1936,13 @@ def test_generate_with_quant_cache(self): @pytest.mark.generate @require_torch_gpu @slow - @is_flaky() # compilation may result in equivalent (!= same) FP ops, causing the argmax in `generate` to be flaky def test_generate_compile_fullgraph(self): """ Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. ⚠️ Runs two sequential generations to ensure the cache doesn't get stuck after the first compiled run! 
⚠️ """ + # TODO (@joao): end-to-end compilation is broken, fix me :( + for model_class in self.all_generative_model_classes: if not model_class._supports_static_cache: self.skipTest("This model doesn't support static cache") @@ -1961,6 +1967,8 @@ def test_generate_compile_fullgraph(self): generation_kwargs = { "do_sample": False, "max_new_tokens": 10, + "return_dict_in_generate": True, + "output_scores": True, } max_cache_len = input_ids.shape[1] + generation_kwargs["max_new_tokens"] @@ -1981,7 +1989,7 @@ def test_generate_compile_fullgraph(self): output_compiled = compiled_generate( model_inputs, generation_config=generation_config, past_key_values=past_key_values ) - self.assertListEqual(output_dynamic.tolist(), output_compiled.tolist()) + self._check_similar_generate_outputs(output_dynamic, output_compiled) @pytest.mark.generate def test_generate_methods_with_num_logits_to_keep(self): @@ -2009,7 +2017,6 @@ def test_generate_methods_with_num_logits_to_keep(self): self.assertEqual(with_all_logits.tolist(), without_all_logits.tolist()) @pytest.mark.generate - @is_flaky() # assisted generation tests are flaky (minor fp ops differences) def test_assisted_decoding_with_num_logits_to_keep(self): for model_class in self.all_generative_model_classes: if "num_logits_to_keep" not in set(inspect.signature(model_class.forward).parameters.keys()): @@ -2030,6 +2037,8 @@ def test_assisted_decoding_with_num_logits_to_keep(self): "max_new_tokens": 10, "do_sample": False, "assistant_model": assistant_model, + "return_dict_in_generate": True, + "output_scores": True, } assistant_model.generation_config.assistant_confidence_threshold = None @@ -2037,7 +2046,8 @@ def test_assisted_decoding_with_num_logits_to_keep(self): with_all_logits = model.generate(**generation_kwargs, **inputs_dict, num_logits_to_keep=0) # By default, num_logits_to_keep is automatically set to 1 if not provided (new behavior) without_all_logits = model.generate(**inputs_dict, **generation_kwargs) - self.assertEqual(with_all_logits.tolist(), without_all_logits.tolist()) + + self._check_similar_generate_outputs(with_all_logits, without_all_logits) @pytest.mark.generate def test_inherits_generation_mixin(self): @@ -2048,9 +2058,11 @@ def test_inherits_generation_mixin(self): for model_class in self.all_generative_model_classes: self.assertTrue("GenerationMixin" in str(model_class.__bases__)) + @pytest.mark.generate @require_torch_sdpa @slow def test_eager_matches_sdpa_generate(self): + """Tests that generate has equivalent outputs with SDPA (default) and eager attention implementations.""" max_new_tokens = 30 for model_class in self.all_generative_model_classes: @@ -2082,6 +2094,7 @@ def test_eager_matches_sdpa_generate(self): "do_sample": False, "return_dict_in_generate": True, "output_scores": True, + "use_cache": True, } model_sdpa = model_class.from_pretrained( @@ -2103,35 +2116,317 @@ def test_eager_matches_sdpa_generate(self): del model_eager gc.collect() - # Eager and SDPA are very similar, but not exactly the same. Because we are using random models, this - # test would be flaky if we only checked the sequences. Two situations in which this test passes: - # 1. The sequences are the same - # 2. 
The sequences are different, but the scores up until the first mismatch are nearly identical - output_matches = res_eager.sequences == res_sdpa.sequences - has_matching_outputs = output_matches.all() - has_matching_scores = None - if not has_matching_outputs: - input_length = main_input.shape[1] - for batch_idx in range(res_eager.sequences.shape[0]): - batch_matches = output_matches[batch_idx] - if batch_matches.all(): - continue - first_mismatch_idx = batch_matches.int().argmin() # gets the index of the first False - first_mismatch_idx -= input_length # scores doesn't include data regarding input tokens - sdpa_first_mismatch_scores = res_sdpa.scores[first_mismatch_idx][batch_idx] - eager_first_mismatch_scores = res_eager.scores[first_mismatch_idx][batch_idx] - has_matching_scores = torch.allclose( - sdpa_first_mismatch_scores, eager_first_mismatch_scores, rtol=1e-3, atol=1e-3 - ) - if not has_matching_scores: - break + self._check_similar_generate_outputs(res_eager, res_sdpa, atol=1e-3, rtol=1e-3) + + @pytest.mark.flash_attn_test + @require_flash_attn + @require_torch_gpu + @slow + def test_flash_attn_2_generate_matches_default(self): + """Tests that generate has equivalent outputs with SDPA (default) and FA2 attention implementations.""" + # TODO (@joao @raushan) -- this test is failing the output checks on most models, investigate. After fixing, + # check whether we still need the overwrites + + max_new_tokens = 30 + + for model_class in self.all_generative_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, original_inputs_dict = self.prepare_config_and_inputs_for_generate() + inputs_dict = {} + for input_name, input_data in original_inputs_dict.items(): + if isinstance(input_data, torch.Tensor) and input_data.dtype in [torch.float32, torch.bfloat16]: + inputs_dict[input_name] = input_data.to(torch.float16) + else: + inputs_dict[input_name] = input_data + main_input = inputs_dict[model_class.main_input_name] + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + main_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + generate_kwargs = { + "max_new_tokens": max_new_tokens, + "do_sample": False, + "return_dict_in_generate": True, + "output_scores": True, + "use_cache": True, + } + + model = model.to(device=torch_device, dtype=torch.float16) + res_sdpa = model.generate(**inputs_dict, **generate_kwargs) + del model + gc.collect() + + model_fa2 = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + res_fa2 = model_fa2.generate(**inputs_dict, **generate_kwargs) + del model_fa2 + gc.collect() + + self._check_similar_generate_outputs(res_fa2, res_sdpa, atol=1e-3, rtol=1e-3) + + @pytest.mark.generate + def test_generate_reuse_cache(self): + """Tests that we can pass a cache out of the `generate`, and use it to continue generation""" + max_new_tokens = 2 + for model_class in self.all_generative_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + inputs_dict.pop("attention_mask", None) # Let's rely on `attention_mask=None` in this test, for simplicity + + if not hasattr(config, "use_cache"): + self.skipTest(reason=f"{model_class.__name__} 
doesn't support caching") + + model = model_class(config).to(torch_device) + + # base generate + generate_kwargs = { + "max_new_tokens": max_new_tokens, + "do_sample": False, + "return_dict_in_generate": True, + "output_scores": True, + "use_cache": True, + } + base_outputs = model.generate(**inputs_dict, **generate_kwargs) + + # generate with cache reuse + generate_kwargs["max_new_tokens"] = 1 + cache_reuse_outputs = model.generate(**inputs_dict, **generate_kwargs) + if not config.is_encoder_decoder: + inputs_dict["input_ids"] = cache_reuse_outputs.sequences + else: + inputs_dict["decoder_input_ids"] = cache_reuse_outputs.sequences + cache_reuse_outputs = model.generate( + **inputs_dict, **generate_kwargs, past_key_values=cache_reuse_outputs.past_key_values + ) + + self._check_similar_generate_outputs(base_outputs, cache_reuse_outputs) + + # @pytest.mark.generate + # def test_inputs_embeds_matches_input_ids_with_generate(self): + # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + # for model_class in self.all_model_classes: + # if model_class.__name__ not in [ + # *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), + # *get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES), + # ]: + # continue + + # model = model_class(config) + # model.to(torch_device) + # model.eval() + + # model_forward_args = inspect.signature(model.forward).parameters + # if any(argument not in model_forward_args for argument in ["inputs_embeds", "position_ids"]): + # self.skipTest(reason="This model doesn't use `inputs_embeds` or `position_ids`.") + # has_inputs_embeds_forwarding = "inputs_embeds" in set( + # inspect.signature(model.prepare_inputs_for_generation).parameters.keys() + # ) + # if not has_inputs_embeds_forwarding: + # self.skipTest(reason="This model doesn't support `inputs_embeds` passed to `generate`.") + # inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + # pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 + + # # VLMs can't generate with embeds and pixels at the same time. 
We expect the user to pass merged + # # embeds already + # if model_class.__name__ in get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES): + # inputs.pop("pixel_values", None) + # inputs.pop("pixel_values_videos", None) + # inputs.pop("pixel_values_images", None) + + # wte = model.get_input_embeddings() + # if not self.is_encoder_decoder: + # input_ids = inputs["input_ids"] + # # some models infer position ids/attn mask differently when input ids + # # by check if pad_token let's make sure no padding is in input ids + # not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 + # input_ids[input_ids == pad_token_id] = not_pad_token_id + # del inputs["input_ids"] + # inputs_embeds = wte(input_ids) + # out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2)[:, -2:] + # out_embeds = model.generate(inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) + # else: + # encoder_input_ids = inputs["input_ids"] + # decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) + # encoder_input_ids[encoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) + # decoder_input_ids[decoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) + # del inputs["input_ids"] + # inputs.pop("decoder_input_ids", None) + # inputs_embeds = wte(encoder_input_ids) + # decoder_inputs_embeds = wte(decoder_input_ids) + # out_ids = model.generate( + # input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, **inputs, max_new_tokens=2 + # )[:, -2:] + # out_embeds = model.generate( + # inputs_embeds=inputs_embeds, + # decoder_inputs_embeds=decoder_inputs_embeds, + # **inputs, + # max_new_tokens=2, + # ) + # # NOTE: this test changes the order of FP ops, there may be tiny differences in the output + # number_of_different_tokens = (out_ids != out_embeds).sum() + # max_differences = int(out_ids.shape[0] * out_ids.shape[1] * 0.1) + # self.assertTrue(number_of_different_tokens <= max_differences) # accept up to 10% mismatch + + @pytest.mark.generate + def test_static_cache_matches_dynamic(self): + """ + Tests that generating with static cache give almost same results as with dynamic cache. + This test does not compile the model and check only logits similarity for numerical precision + errors. 
+ """ + if len(self.all_generative_model_classes) == 0: + self.skipTest( + reason="Model architecture has no generative classes, and thus not necessarily supporting 4D masks" + ) + for model_class in self.all_generative_model_classes: + if not model_class._supports_static_cache: + self.skipTest(f"{model_class.__name__} does not support static cache") + + if not model_class._supports_cache_class: + self.skipTest(f"{model_class.__name__} does not support cache class") + + config, inputs = self.model_tester.prepare_config_and_inputs_for_common() + if getattr(config, "sliding_window", 0) is not None and getattr(config, "sliding_window", 0) > 0: + self.skipTest(f"{model_class.__name__} with sliding window attention is not supported by this test") + + model = model_class(config).to(device=torch_device, dtype=torch.float32) + model.eval() + + dynamic_out = model.generate( + **inputs, do_sample=False, max_new_tokens=10, output_logits=True, return_dict_in_generate=True + ) + static_out = model.generate( + **inputs, + do_sample=False, + max_new_tokens=10, + cache_implementation="static", + output_logits=True, + return_dict_in_generate=True, + ) + self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-4)) + + # For now, Let's focus only on GPU for `torch.compile` + @slow + @require_torch_accelerator + @require_read_token + @pytest.mark.generate + def test_torch_compile(self): + if version.parse(torch.__version__) < version.parse("2.3"): + self.skipTest(reason="This test requires torch >= 2.3 to run.") + torch.compiler.reset() + if not hasattr(self, "_torch_compile_test_ckpt"): + self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") + ckpt = self._torch_compile_test_ckpt + revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision + + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + batch_size = 1 + n_iter = 3 + + tokenizer = AutoTokenizer.from_pretrained(ckpt) + if self.is_encoder_decoder: + model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( + torch_device + ) + else: + model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( + torch_device + ) + + model.generation_config.max_new_tokens = 4 + + model.generation_config.cache_implementation = "static" + model.forward = torch.compile(model.forward, mode="reduce-overhead") + + input_text = "Why dogs are cute?" + input_ids = tokenizer([input_text] * batch_size, return_tensors="pt").to(torch_device) + + for i in range(n_iter): + _ = model.generate(**input_ids, do_sample=False) + + @slow + @require_torch_gpu # Testing cuda graphs. + @require_read_token + @pytest.mark.generate + def test_compile_cuda_graph_time(self): + if version.parse(torch.__version__) < version.parse("2.3"): + self.skipTest(reason="This test requires torch >= 2.3 to run.") + + # TODO felix: All models supporting `StaticCache` or `torch.compile` should be tested. + # At the moment, only llama, gemma and gemma2 are tested here! 
+ if not hasattr(self, "_torch_compile_test_ckpt"): + self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") + ckpt = self._torch_compile_test_ckpt + revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision + + os.environ["TOKENIZERS_PARALLELISM"] = "false" + + tokenizer = AutoTokenizer.from_pretrained(ckpt) + if self.is_encoder_decoder: + model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( + torch_device + ) + else: + model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( + torch_device + ) + + cache_implementation = "static" + if model.config.model_type == "gemma2": + cache_implementation = "hybrid" + + new_tokens = 50 + gen_config = GenerationConfig( + max_new_tokens=new_tokens, + min_new_tokens=new_tokens, + use_cache=True, + pad_token_id=tokenizer.pad_token_id, + num_beams=1, + do_sample=False, + eos_token_id=None, # This is required for min_new_tokens to actually have an effect. + ) + model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect. + + model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) + + inp = tokenizer("Why cats are cute?", return_tensors="pt").to(torch_device) + + # First run: the first run warms up each graph, which does things like CuBlas or Triton benchmarking + start = time.perf_counter() + _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) + end = time.perf_counter() + graph_warmup_time = end - start + + # Second run: CUDA Graph recording, and replays it + start = time.perf_counter() + _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) + end = time.perf_counter() + record_time = end - start + + # Finally: we hit the optimized, CUDA Graph replay path + start = time.perf_counter() + _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) + end = time.perf_counter() + opt_time = end - start - self.assertTrue(has_matching_outputs or has_matching_scores) + # For the recording step, we expect only two cuda graphs and this step should be much faster than the first. 
+ self.assertTrue(record_time < 0.15 * graph_warmup_time) + self.assertTrue(opt_time < record_time) - def _check_outputs(self, output, main_input, config, use_cache=False, num_return_sequences=1): - # we can be sure what is batch size from main input but seq length depends on model type and whether input is text/audio/image - # so we infer actual text seq length from model_tester, same was as it is done in `test_modeling_common.py` tests` - batch_size = main_input.shape[0] + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1): + batch_size = output.sequences.shape[0] seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index dd9302ee2c55..872ecd21a32b 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -591,9 +591,9 @@ def _check_hidden_states_for_generate( [expected_shape] * len(iter_hidden_states), ) - def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1): + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1): # Overwrite because the generate method actually alway uses `inputs_embeds` so `use_cache` is always `True` - super()._check_outputs(output, input_ids, config, use_cache=True, num_return_sequences=num_return_sequences) + super()._check_outputs(output, config, use_cache=True, num_return_sequences=num_return_sequences) def _check_hidden_states_for_generate( self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index 346ad60debe2..4c9dc95fb3bf 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -542,52 +542,6 @@ def test_flash_attn_2_generate_padding_right(self): self.assertTrue(torch.allclose(out, out_fa)) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache - def test_flash_attn_2_generate_use_cache(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - 
use_cache=True, - ) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa @slow diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 51d51dfcc282..6e08f2958b29 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -22,7 +22,6 @@ import random import re import tempfile -import time import warnings from collections import defaultdict from contextlib import contextmanager @@ -37,10 +36,7 @@ from transformers import ( AutoModel, AutoModelForCausalLM, - AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, - AutoTokenizer, - GenerationConfig, PretrainedConfig, PreTrainedModel, is_torch_available, @@ -3000,71 +2996,6 @@ def test_inputs_embeds_matches_input_ids(self): )[0] self.assertTrue(torch.allclose(out_embeds, out_ids)) - def test_inputs_embeds_matches_input_ids_with_generate(self): - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - if model_class.__name__ not in [ - *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), - *get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES), - ]: - continue - - model = model_class(config) - model.to(torch_device) - model.eval() - - model_forward_args = inspect.signature(model.forward).parameters - if any(argument not in model_forward_args for argument in ["inputs_embeds", "position_ids"]): - self.skipTest(reason="This model doesn't use `inputs_embeds` or `position_ids`.") - has_inputs_embeds_forwarding = "inputs_embeds" in set( - inspect.signature(model.prepare_inputs_for_generation).parameters.keys() - ) - if not has_inputs_embeds_forwarding: - self.skipTest(reason="This model doesn't support `inputs_embeds` passed to `generate`.") - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 - - # VLMs can't generate with embeds and pixels at the same time. 
We expect the user to pass merged - # embeds already - if model_class.__name__ in get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES): - inputs.pop("pixel_values", None) - inputs.pop("pixel_values_videos", None) - inputs.pop("pixel_values_images", None) - - wte = model.get_input_embeddings() - if not self.is_encoder_decoder: - input_ids = inputs["input_ids"] - # some models infer position ids/attn mask differently when input ids - # by check if pad_token let's make sure no padding is in input ids - not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 - input_ids[input_ids == pad_token_id] = not_pad_token_id - del inputs["input_ids"] - inputs_embeds = wte(input_ids) - out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2)[:, -2:] - out_embeds = model.generate(inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) - else: - encoder_input_ids = inputs["input_ids"] - decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - encoder_input_ids[encoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) - decoder_input_ids[decoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) - del inputs["input_ids"] - inputs.pop("decoder_input_ids", None) - inputs_embeds = wte(encoder_input_ids) - decoder_inputs_embeds = wte(decoder_input_ids) - out_ids = model.generate( - input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, **inputs, max_new_tokens=2 - )[:, -2:] - out_embeds = model.generate( - inputs_embeds=inputs_embeds, - decoder_inputs_embeds=decoder_inputs_embeds, - **inputs, - max_new_tokens=2, - ) - # NOTE: this test changes the order of FP ops, there may be tiny differences in the output - number_of_different_tokens = (out_ids != out_embeds).sum() - max_differences = int(out_ids.shape[0] * out_ids.shape[1] * 0.1) - self.assertTrue(number_of_different_tokens <= max_differences) # accept up to 10% mismatch - @require_non_xpu @require_torch_multi_gpu def test_multi_gpu_data_parallel_forward(self): @@ -3857,102 +3788,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - @is_flaky() - def test_flash_attn_2_generate_left_padding(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - 
).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @is_flaky() - @slow - def test_flash_attn_2_generate_padding_right(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - def test_attn_implementation_composite_models(self): """ Tests if composite models can receive a dict object as attn_implementation, where each key should be @@ -4525,65 +4360,6 @@ def test_sdpa_matches_eager_sliding_window(self): torch.allclose(res_eager[attention_mask == 1], res_sdpa[attention_mask == 1], rtol=1e-4, atol=1e-4) ) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) 
- - # Generate with one batch only to test generation when attention mask will be None - # when real inputs are used, because there is no padding. See issue #32237 for more - dummy_input = dummy_input[:1, ...] - dummy_attention_mask = torch.ones_like(dummy_attention_mask[:1, ...]) - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @mark.flash_attn_test @@ -4640,62 +4416,6 @@ def test_flash_attn_2_can_dispatch_composite_models(self): if not has_fa2: raise ValueError("The FA2 model should have FA2 layers") - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - def test_flash_attn_2_generate_reuse_cache(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - max_new_tokens = 2 - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = dummy_input.shape[1] * 2 + max_new_tokens * 2 + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # run generate once to get filled cache - output = model.generate( - dummy_input, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - return_dict_in_generate=True, - ) - past_key_values = output.past_key_values - - # Try to continue generation from where we left, given that we have more than 1 new token to process - # e.g. this can happen in speculative decoding when feeding candidate tokens back to target model - dummy_input_updated = torch.cat([dummy_input, output.sequences], dim=-1) - _ = model.generate( - dummy_input_updated, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - past_key_values=past_key_values, - ) - @require_flash_attn @require_torch_gpu @require_bitsandbytes @@ -4999,82 +4719,6 @@ def test_custom_4d_attention_mask(self): normalized_1 = F.softmax(out_shared_prefix_last_tokens) torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4) - def test_static_cache_matches_dynamic(self): - """ - Tests that generating with static cache give almost same results as with dynamic cache. - This test does not compile the model and check only logits similarity for numerical precision - errors. 
- """ - if len(self.all_generative_model_classes) == 0: - self.skipTest( - reason="Model architecture has no generative classes, and thus not necessarily supporting 4D masks" - ) - for model_class in self.all_generative_model_classes: - if not model_class._supports_static_cache: - self.skipTest(f"{model_class.__name__} does not support static cache") - - if not model_class._supports_cache_class: - self.skipTest(f"{model_class.__name__} does not support cache class") - - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - if getattr(config, "sliding_window", 0) is not None and getattr(config, "sliding_window", 0) > 0: - self.skipTest(f"{model_class.__name__} with sliding window attention is not supported by this test") - - model = model_class(config).to(device=torch_device, dtype=torch.float32) - model.eval() - - dynamic_out = model.generate( - **inputs, do_sample=False, max_new_tokens=10, output_logits=True, return_dict_in_generate=True - ) - static_out = model.generate( - **inputs, - do_sample=False, - max_new_tokens=10, - cache_implementation="static", - output_logits=True, - return_dict_in_generate=True, - ) - self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-4)) - - # For now, Let's focus only on GPU for `torch.compile` - @slow - @require_torch_accelerator - @require_read_token - def test_torch_compile(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - torch.compiler.reset() - if not hasattr(self, "_torch_compile_test_ckpt"): - self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") - ckpt = self._torch_compile_test_ckpt - revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision - - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - batch_size = 1 - n_iter = 3 - - tokenizer = AutoTokenizer.from_pretrained(ckpt) - if self.is_encoder_decoder: - model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - else: - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - - model.generation_config.max_new_tokens = 4 - - model.generation_config.cache_implementation = "static" - model.forward = torch.compile(model.forward, mode="reduce-overhead") - - input_text = "Why dogs are cute?" - input_ids = tokenizer([input_text] * batch_size, return_tensors="pt").to(torch_device) - - for i in range(n_iter): - _ = model.generate(**input_ids, do_sample=False) - @slow @require_torch_gpu def test_torch_compile_for_training(self): @@ -5118,74 +4762,6 @@ def test_torch_compile_for_training(self): for name, param in model._orig_mod.named_parameters(): torch.testing.assert_close(param.grad.detach().cpu(), params[name], rtol=1e-4, atol=1e-4) - @slow - @require_torch_gpu # Testing cuda graphs. - @require_read_token - def test_compile_cuda_graph_time(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - - # TODO felix: All models supporting `StaticCache` or `torch.compile` should be tested. - # At the moment, only llama, gemma and gemma2 are tested here! 
- if not hasattr(self, "_torch_compile_test_ckpt"): - self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") - ckpt = self._torch_compile_test_ckpt - revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision - - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - tokenizer = AutoTokenizer.from_pretrained(ckpt) - if self.is_encoder_decoder: - model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - else: - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - - cache_implementation = "static" - if model.config.model_type == "gemma2": - cache_implementation = "hybrid" - - new_tokens = 50 - gen_config = GenerationConfig( - max_new_tokens=new_tokens, - min_new_tokens=new_tokens, - use_cache=True, - pad_token_id=tokenizer.pad_token_id, - num_beams=1, - do_sample=False, - eos_token_id=None, # This is required for min_new_tokens to actually have an effect. - ) - model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect. - - model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) - - inp = tokenizer("Why cats are cute?", return_tensors="pt").to(torch_device) - - # First run: the first run warms up each graph, which does things like CuBlas or Triton benchmarking - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - graph_warmup_time = end - start - - # Second run: CUDA Graph recording, and replays it - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - record_time = end - start - - # Finally: we hit the optimized, CUDA Graph replay path - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - opt_time = end - start - - # For the recording step, we expect only two cuda graphs and this step should be much faster than the first. 
- self.assertTrue(record_time < 0.15 * graph_warmup_time) - self.assertTrue(opt_time < record_time) - def test_forward_with_num_logits_to_keep(self): for model_class in self.all_generative_model_classes: if "num_logits_to_keep" not in set(inspect.signature(model_class.forward).parameters.keys()): From 06d2048145adb21f2afa3a3f719599aa336d581a Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 28 Oct 2024 11:18:18 +0000 Subject: [PATCH 02/16] tmp commit --- tests/generation/test_utils.py | 97 +++++++++++----------------------- 1 file changed, 31 insertions(+), 66 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 8afa1c12eda4..eb61d8ca3432 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -2058,16 +2058,21 @@ def test_inherits_generation_mixin(self): for model_class in self.all_generative_model_classes: self.assertTrue("GenerationMixin" in str(model_class.__bases__)) - @pytest.mark.generate - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_generate(self): - """Tests that generate has equivalent outputs with SDPA (default) and eager attention implementations.""" + def _test_attention_implementation(self, attn_implementation): + """ + Compares the output of generate with the eager attention implementation against other implementations. + NOTE: despite the test logic being the same, different implementations actually need diferent decorators, hence + this separate function. + """ max_new_tokens = 30 + support_flag = { + "sdpa": "_supports_sdpa", + "flash_attention_2": "_supports_flash_attn_2", + } for model_class in self.all_generative_model_classes: - if not model_class._supports_sdpa: - self.skipTest(f"{model_class.__name__} does not support SDPA") + if not getattr(model_class, support_flag[attn_implementation]): + self.skipTest(f"{model_class.__name__} does not support `attn_implementation={attn_implementation}`") config, original_inputs_dict = self.prepare_config_and_inputs_for_generate() inputs_dict = {} @@ -2097,84 +2102,44 @@ def test_eager_matches_sdpa_generate(self): "use_cache": True, } - model_sdpa = model_class.from_pretrained( + model_eager = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True, + attn_implementation="eager", ).to(torch_device) - res_sdpa = model_sdpa.generate(**inputs_dict, **generate_kwargs) - del model_sdpa + res_eager = model_eager.generate(**inputs_dict, **generate_kwargs) + del model_eager gc.collect() - model_eager = model_class.from_pretrained( + model_attn = model_class.from_pretrained( tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True, - attn_implementation="eager", + attn_implementation=attn_implementation, ).to(torch_device) - res_eager = model_eager.generate(**inputs_dict, **generate_kwargs) - del model_eager + res_attn = model_attn.generate(**inputs_dict, **generate_kwargs) + del model_attn gc.collect() - self._check_similar_generate_outputs(res_eager, res_sdpa, atol=1e-3, rtol=1e-3) + self._check_similar_generate_outputs(res_eager, res_attn, atol=1e-3, rtol=1e-3) + + @pytest.mark.generate + @require_torch_sdpa + @slow + def test_eager_matches_sdpa_generate(self): + """Tests that generate has equivalent outputs with SDPA and eager attention implementations.""" + self._test_attention_implementation("sdpa") @pytest.mark.flash_attn_test @require_flash_attn @require_torch_gpu @slow - def test_flash_attn_2_generate_matches_default(self): - """Tests that generate has equivalent outputs with SDPA (default) and FA2 
attention implementations.""" + def test_eager_matches_fa2_generate(self): + """Tests that generate has equivalent outputs with FA2 and eager attention implementations.""" # TODO (@joao @raushan) -- this test is failing the output checks on most models, investigate. After fixing, # check whether we still need the overwrites - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, original_inputs_dict = self.prepare_config_and_inputs_for_generate() - inputs_dict = {} - for input_name, input_data in original_inputs_dict.items(): - if isinstance(input_data, torch.Tensor) and input_data.dtype in [torch.float32, torch.bfloat16]: - inputs_dict[input_name] = input_data.to(torch.float16) - else: - inputs_dict[input_name] = input_data - main_input = inputs_dict[model_class.main_input_name] - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + main_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - generate_kwargs = { - "max_new_tokens": max_new_tokens, - "do_sample": False, - "return_dict_in_generate": True, - "output_scores": True, - "use_cache": True, - } - - model = model.to(device=torch_device, dtype=torch.float16) - res_sdpa = model.generate(**inputs_dict, **generate_kwargs) - del model - gc.collect() - - model_fa2 = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - res_fa2 = model_fa2.generate(**inputs_dict, **generate_kwargs) - del model_fa2 - gc.collect() - - self._check_similar_generate_outputs(res_fa2, res_sdpa, atol=1e-3, rtol=1e-3) + self._test_attention_implementation("flash_attention_2") @pytest.mark.generate def test_generate_reuse_cache(self): From 9b8d9b23ae4e5d19e0cafd78b650703478410311 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 28 Oct 2024 14:46:15 +0000 Subject: [PATCH 03/16] cull overwrites of deleted tests --- src/transformers/generation/utils.py | 2 +- tests/generation/test_utils.py | 342 +++--------------- tests/models/bart/test_modeling_bart.py | 5 - tests/models/bert/test_modeling_bert.py | 5 - .../chameleon/test_modeling_chameleon.py | 40 -- tests/models/gemma/test_modeling_gemma.py | 48 --- tests/models/gemma2/test_modeling_gemma2.py | 1 - tests/models/glm/test_modeling_glm.py | 40 -- tests/models/gptj/test_modeling_gptj.py | 45 +-- tests/models/granite/test_modeling_granite.py | 47 +-- .../granitemoe/test_modeling_granitemoe.py | 45 --- .../models/idefics2/test_modeling_idefics2.py | 33 -- .../models/idefics3/test_modeling_idefics3.py | 33 -- tests/models/jamba/test_modeling_jamba.py | 87 ----- tests/models/jetmoe/test_modeling_jetmoe.py | 80 ---- tests/models/kosmos2/test_modeling_kosmos2.py | 6 - tests/models/llama/test_modeling_llama.py | 41 --- tests/models/mamba2/test_modeling_mamba2.py | 6 - tests/models/mimi/test_modeling_mimi.py | 16 - tests/models/mistral/test_modeling_mistral.py | 80 ---- tests/models/mixtral/test_modeling_mixtral.py | 80 ---- tests/models/mllama/test_modeling_mllama.py | 1 - tests/models/mt5/test_modeling_mt5.py | 3 - .../models/musicgen/test_modeling_musicgen.py | 235 ------------ .../test_modeling_musicgen_melody.py | 143 -------- 
.../models/nemotron/test_modeling_nemotron.py | 2 - .../paligemma/test_modeling_paligemma.py | 4 - tests/models/phi/test_modeling_phi.py | 41 --- tests/models/qwen2/test_modeling_qwen2.py | 80 ---- .../qwen2_moe/test_modeling_qwen2_moe.py | 80 ---- .../models/qwen2_vl/test_modeling_qwen2_vl.py | 4 - .../test_modeling_recurrent_gemma.py | 4 - .../starcoder2/test_modeling_starcoder2.py | 80 ---- tests/models/t5/test_modeling_t5.py | 3 - tests/models/umt5/test_modeling_umt5.py | 3 - tests/models/whisper/test_modeling_whisper.py | 70 ---- tests/models/zamba/test_modeling_zamba.py | 87 ----- tests/test_modeling_common.py | 1 - 38 files changed, 52 insertions(+), 1871 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 705857363415..46ea679aca50 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -418,7 +418,7 @@ def prepare_inputs_for_generation( for model_input_name in ["position_ids", "token_type_ids"]: model_input = kwargs.get(model_input_name) if model_input is not None: - if past_key_values: + if past_key_values is not None: model_input = model_input[:, -input_ids.shape[1] :] model_input = model_input.clone(memory_format=torch.contiguous_format) model_inputs[model_input_name] = model_input diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index eb61d8ca3432..6c10b28e1d39 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -17,15 +17,12 @@ import copy import gc import inspect -import os import tempfile -import time import unittest import warnings import numpy as np import pytest -from packaging import version from parameterized import parameterized from transformers import AutoConfig, is_torch_available, pipeline, set_seed @@ -34,9 +31,7 @@ require_accelerate, require_flash_attn, require_optimum_quanto, - require_read_token, require_torch, - require_torch_accelerator, require_torch_gpu, require_torch_multi_accelerator, require_torch_multi_gpu, @@ -750,9 +745,7 @@ def test_beam_sample_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSampleDecoderOnlyOutput) - self._check_outputs( - output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"] - ) + self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) @pytest.mark.generate def test_generate_without_input_ids(self): @@ -901,7 +894,6 @@ def test_constrained_beam_search_generate(self): def test_constrained_beam_search_generate_dict_output(self): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - main_input = inputs_dict[model_class.main_input_name] model = model_class(config).to(torch_device).eval() @@ -1575,7 +1567,8 @@ def test_past_key_values_format(self): ) @pytest.mark.generate - def test_generate_from_inputs_embeds_decoder_only(self): + def test_generate_from_inputs_embeds(self): + """Tests that we can generate from `inputs_embeds` instead of `input_ids` in LLMs, VLMs, etc""" # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids` # if fails, you should probably update the `prepare_inputs_for_generation` function for model_class in self.all_generative_model_classes: @@ -1854,10 +1847,8 @@ def test_new_cache_format(self, num_beams, do_sample): @pytest.mark.generate def test_generate_with_static_cache(self): """ - Tests if StaticCache works if we set 
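# An illustration (an assumption, not from this patch) of why the `prepare_inputs_for_generation`
# hunk above switches `if past_key_values:` to `if past_key_values is not None:`: a freshly
# created cache object holds no layers yet, so `bool()` falls back to `__len__` and evaluates to
# False even though a cache was explicitly passed in.
from transformers import DynamicCache

cache = DynamicCache()
print(len(cache))          # 0 for an empty cache
print(bool(cache))         # likely False, because bool() falls back to len()
print(cache is not None)   # True -- the condition the patched code relies on instead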
attn_implementation=static when generation. - This doesn't test if generation quality is good, but tests that models with - self._supports_static_cache don't throw an error when generating and return - a StaticCache object at the end. + Tests that generating with static cache give almost same results as with dynamic cache, and the output cache + has the expected shapes """ for model_class in self.all_generative_model_classes: if not model_class._supports_static_cache: @@ -1876,13 +1867,15 @@ def test_generate_with_static_cache(self): model = model_class(config).to(torch_device).eval() generation_kwargs = { - "max_length": None, "max_new_tokens": max_new_tokens, - "cache_implementation": "static", "return_dict_in_generate": True, # Required to return `past_key_values` + "output_scores": True, "use_cache": True, } + static_cache_generation = model.generate(**generation_kwargs, **inputs_dict, cache_implementation="static") + + # Check 1: The cache shapes must match the expected shapes max_cache_len = seq_length + max_new_tokens config = config.text_config if hasattr(config, "text_config") else config head_dim = ( @@ -1894,12 +1887,14 @@ def test_generate_with_static_cache(self): else config.num_key_value_heads ) num_hidden_layers = config.num_hidden_layers - results = model.generate(**generation_kwargs, **inputs_dict) - cache_shape = (batch_size, num_key_value_heads, max_cache_len, head_dim) - self.assertTrue(isinstance(results.past_key_values, StaticCache)) - self.assertTrue(len(results.past_key_values.key_cache) == num_hidden_layers) - self.assertTrue(results.past_key_values.key_cache[0].shape == cache_shape) + self.assertTrue(isinstance(static_cache_generation.past_key_values, StaticCache)) + self.assertTrue(len(static_cache_generation.past_key_values.key_cache) == num_hidden_layers) + self.assertTrue(static_cache_generation.past_key_values.key_cache[0].shape == cache_shape) + + # Check 2: The outputs must be similar to the case with dynamic cache + dynamic_cache_generation = model.generate(**generation_kwargs, **inputs_dict) + self._check_similar_generate_outputs(dynamic_cache_generation, static_cache_generation) @require_optimum_quanto @pytest.mark.generate @@ -1936,23 +1931,29 @@ def test_generate_with_quant_cache(self): @pytest.mark.generate @require_torch_gpu @slow - def test_generate_compile_fullgraph(self): + @parameterized.expand( + [ + ("forward_only", False), # TODO (@joao): a few models failing. After fixed, this should not be "@slow" + ("end_to_end", True), # TODO (@joao): end-to-end compilation is broken with torch 2.5+, explore and fix + ] + ) + def test_generate_compile(self, name, end_to_end): """ - Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. + Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. Tests + end-to-end compilation and forward pass compilation only. ⚠️ Runs two sequential generations to ensure the cache doesn't get stuck after the first compiled run! 
⚠️ """ - # TODO (@joao): end-to-end compilation is broken, fix me :( - for model_class in self.all_generative_model_classes: if not model_class._supports_static_cache: self.skipTest("This model doesn't support static cache") + # TODO (joao) -- fix and enable me :) - if any(model_name in model_class.__name__.lower() for model_name in ["whisper"]): + if end_to_end and any(model_name in model_class.__name__.lower() for model_name in ["whisper"]): self.skipTest("whisper model end-to-end generate compile not yet supported") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() # TODO (joao) -- fix and enable me :) - if config.is_encoder_decoder: + if end_to_end and config.is_encoder_decoder: self.skipTest("Encoder-decoder model end-to-end generate compile not yet supported") model = model_class(config).to(torch_device) @@ -1970,26 +1971,30 @@ def test_generate_compile_fullgraph(self): "return_dict_in_generate": True, "output_scores": True, } + # end-to-end works best with dynamic cache, forward compilation works best with static cache + if not end_to_end: + generation_kwargs["cache_implementation"] = "static" - max_cache_len = input_ids.shape[1] + generation_kwargs["max_new_tokens"] - config = config.get_text_config() - past_key_values = StaticCache( - config, batch_size=half_batch_size, max_cache_len=max_cache_len, device=torch_device - ) + # get eager + dynamic cache results for future comparison + dynamic_outputs = [] + for model_inputs in input_ids_sets: + dynamic_outputs.append(model.generate(model_inputs, **generation_kwargs)) + + # get compiled results + generation_config = copy.deepcopy(model.generation_config) + generation_config.update(**generation_kwargs) + torch.compiler.reset() + if end_to_end: + model.generate = torch.compile(model.generate, fullgraph=True, mode="reduce-overhead") + else: + model.forward = torch.compile(model.forward, fullgraph=True, mode="reduce-overhead") + compuled_outputs = [] for model_inputs in input_ids_sets: - # eager dynamic cache - output_dynamic = model.generate(model_inputs, **generation_kwargs) - - # end-to-end compiled dynamic cache - torch.compiler.reset() - compiled_generate = torch.compile(model.generate, fullgraph=True, mode="reduce-overhead") - generation_config = copy.deepcopy(model.generation_config) - generation_config.update(**generation_kwargs) - output_compiled = compiled_generate( - model_inputs, generation_config=generation_config, past_key_values=past_key_values - ) - self._check_similar_generate_outputs(output_dynamic, output_compiled) + compuled_outputs.append(model.generate(model_inputs, generation_config=generation_config)) + + for dynamic_result, compiled_result in zip(dynamic_outputs, compuled_outputs): + self._check_similar_generate_outputs(dynamic_result, compiled_result) @pytest.mark.generate def test_generate_methods_with_num_logits_to_keep(self): @@ -2141,255 +2146,6 @@ def test_eager_matches_fa2_generate(self): # check whether we still need the overwrites self._test_attention_implementation("flash_attention_2") - @pytest.mark.generate - def test_generate_reuse_cache(self): - """Tests that we can pass a cache out of the `generate`, and use it to continue generation""" - max_new_tokens = 2 - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - inputs_dict.pop("attention_mask", None) # Let's rely on `attention_mask=None` in this test, for simplicity - - if not hasattr(config, "use_cache"): - 
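# A rough sketch (not part of the patch; checkpoint and prompt are placeholders) of the
# forward-only compilation path exercised by the new `test_generate_compile`: compile just
# `model.forward` and pair it with a static cache so tensor shapes stay fixed across decoding steps.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

ckpt = "any-decoder-only-checkpoint"  # hypothetical name
tok = AutoTokenizer.from_pretrained(ckpt)
model = AutoModelForCausalLM.from_pretrained(ckpt).eval()

model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
inputs = tok("Hello", return_tensors="pt")
out = model.generate(**inputs, max_new_tokens=8, do_sample=False, cache_implementation="static")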
self.skipTest(reason=f"{model_class.__name__} doesn't support caching") - - model = model_class(config).to(torch_device) - - # base generate - generate_kwargs = { - "max_new_tokens": max_new_tokens, - "do_sample": False, - "return_dict_in_generate": True, - "output_scores": True, - "use_cache": True, - } - base_outputs = model.generate(**inputs_dict, **generate_kwargs) - - # generate with cache reuse - generate_kwargs["max_new_tokens"] = 1 - cache_reuse_outputs = model.generate(**inputs_dict, **generate_kwargs) - if not config.is_encoder_decoder: - inputs_dict["input_ids"] = cache_reuse_outputs.sequences - else: - inputs_dict["decoder_input_ids"] = cache_reuse_outputs.sequences - cache_reuse_outputs = model.generate( - **inputs_dict, **generate_kwargs, past_key_values=cache_reuse_outputs.past_key_values - ) - - self._check_similar_generate_outputs(base_outputs, cache_reuse_outputs) - - # @pytest.mark.generate - # def test_inputs_embeds_matches_input_ids_with_generate(self): - # config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - # for model_class in self.all_model_classes: - # if model_class.__name__ not in [ - # *get_values(MODEL_FOR_CAUSAL_LM_MAPPING_NAMES), - # *get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES), - # ]: - # continue - - # model = model_class(config) - # model.to(torch_device) - # model.eval() - - # model_forward_args = inspect.signature(model.forward).parameters - # if any(argument not in model_forward_args for argument in ["inputs_embeds", "position_ids"]): - # self.skipTest(reason="This model doesn't use `inputs_embeds` or `position_ids`.") - # has_inputs_embeds_forwarding = "inputs_embeds" in set( - # inspect.signature(model.prepare_inputs_for_generation).parameters.keys() - # ) - # if not has_inputs_embeds_forwarding: - # self.skipTest(reason="This model doesn't support `inputs_embeds` passed to `generate`.") - # inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - # pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 - - # # VLMs can't generate with embeds and pixels at the same time. 
We expect the user to pass merged - # # embeds already - # if model_class.__name__ in get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES): - # inputs.pop("pixel_values", None) - # inputs.pop("pixel_values_videos", None) - # inputs.pop("pixel_values_images", None) - - # wte = model.get_input_embeddings() - # if not self.is_encoder_decoder: - # input_ids = inputs["input_ids"] - # # some models infer position ids/attn mask differently when input ids - # # by check if pad_token let's make sure no padding is in input ids - # not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 - # input_ids[input_ids == pad_token_id] = not_pad_token_id - # del inputs["input_ids"] - # inputs_embeds = wte(input_ids) - # out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2)[:, -2:] - # out_embeds = model.generate(inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) - # else: - # encoder_input_ids = inputs["input_ids"] - # decoder_input_ids = inputs.get("decoder_input_ids", encoder_input_ids) - # encoder_input_ids[encoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) - # decoder_input_ids[decoder_input_ids == pad_token_id] = max(0, pad_token_id + 1) - # del inputs["input_ids"] - # inputs.pop("decoder_input_ids", None) - # inputs_embeds = wte(encoder_input_ids) - # decoder_inputs_embeds = wte(decoder_input_ids) - # out_ids = model.generate( - # input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, **inputs, max_new_tokens=2 - # )[:, -2:] - # out_embeds = model.generate( - # inputs_embeds=inputs_embeds, - # decoder_inputs_embeds=decoder_inputs_embeds, - # **inputs, - # max_new_tokens=2, - # ) - # # NOTE: this test changes the order of FP ops, there may be tiny differences in the output - # number_of_different_tokens = (out_ids != out_embeds).sum() - # max_differences = int(out_ids.shape[0] * out_ids.shape[1] * 0.1) - # self.assertTrue(number_of_different_tokens <= max_differences) # accept up to 10% mismatch - - @pytest.mark.generate - def test_static_cache_matches_dynamic(self): - """ - Tests that generating with static cache give almost same results as with dynamic cache. - This test does not compile the model and check only logits similarity for numerical precision - errors. 
- """ - if len(self.all_generative_model_classes) == 0: - self.skipTest( - reason="Model architecture has no generative classes, and thus not necessarily supporting 4D masks" - ) - for model_class in self.all_generative_model_classes: - if not model_class._supports_static_cache: - self.skipTest(f"{model_class.__name__} does not support static cache") - - if not model_class._supports_cache_class: - self.skipTest(f"{model_class.__name__} does not support cache class") - - config, inputs = self.model_tester.prepare_config_and_inputs_for_common() - if getattr(config, "sliding_window", 0) is not None and getattr(config, "sliding_window", 0) > 0: - self.skipTest(f"{model_class.__name__} with sliding window attention is not supported by this test") - - model = model_class(config).to(device=torch_device, dtype=torch.float32) - model.eval() - - dynamic_out = model.generate( - **inputs, do_sample=False, max_new_tokens=10, output_logits=True, return_dict_in_generate=True - ) - static_out = model.generate( - **inputs, - do_sample=False, - max_new_tokens=10, - cache_implementation="static", - output_logits=True, - return_dict_in_generate=True, - ) - self.assertTrue(torch.allclose(dynamic_out.logits[0], static_out.logits[0], rtol=1e-3, atol=1e-4)) - - # For now, Let's focus only on GPU for `torch.compile` - @slow - @require_torch_accelerator - @require_read_token - @pytest.mark.generate - def test_torch_compile(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - torch.compiler.reset() - if not hasattr(self, "_torch_compile_test_ckpt"): - self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") - ckpt = self._torch_compile_test_ckpt - revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision - - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - batch_size = 1 - n_iter = 3 - - tokenizer = AutoTokenizer.from_pretrained(ckpt) - if self.is_encoder_decoder: - model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - else: - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - - model.generation_config.max_new_tokens = 4 - - model.generation_config.cache_implementation = "static" - model.forward = torch.compile(model.forward, mode="reduce-overhead") - - input_text = "Why dogs are cute?" - input_ids = tokenizer([input_text] * batch_size, return_tensors="pt").to(torch_device) - - for i in range(n_iter): - _ = model.generate(**input_ids, do_sample=False) - - @slow - @require_torch_gpu # Testing cuda graphs. - @require_read_token - @pytest.mark.generate - def test_compile_cuda_graph_time(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - - # TODO felix: All models supporting `StaticCache` or `torch.compile` should be tested. - # At the moment, only llama, gemma and gemma2 are tested here! 
- if not hasattr(self, "_torch_compile_test_ckpt"): - self.skipTest(f"{self.__class__.__name__} doesn't have the attribute `_torch_compile_test_ckpt`.") - ckpt = self._torch_compile_test_ckpt - revision = "main" if not hasattr(self, "_torch_compile_test_revision") else self._torch_compile_test_revision - - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - tokenizer = AutoTokenizer.from_pretrained(ckpt) - if self.is_encoder_decoder: - model = AutoModelForSeq2SeqLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - else: - model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16, revision=revision).to( - torch_device - ) - - cache_implementation = "static" - if model.config.model_type == "gemma2": - cache_implementation = "hybrid" - - new_tokens = 50 - gen_config = GenerationConfig( - max_new_tokens=new_tokens, - min_new_tokens=new_tokens, - use_cache=True, - pad_token_id=tokenizer.pad_token_id, - num_beams=1, - do_sample=False, - eos_token_id=None, # This is required for min_new_tokens to actually have an effect. - ) - model.generation_config.eos_token_id = None # greedy_search falls back on this eos_token_id that we need to set to None as well for min_new_tokens to have an effect. - - model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) - - inp = tokenizer("Why cats are cute?", return_tensors="pt").to(torch_device) - - # First run: the first run warms up each graph, which does things like CuBlas or Triton benchmarking - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - graph_warmup_time = end - start - - # Second run: CUDA Graph recording, and replays it - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - record_time = end - start - - # Finally: we hit the optimized, CUDA Graph replay path - start = time.perf_counter() - _ = model.generate(**inp, generation_config=gen_config, cache_implementation=cache_implementation) - end = time.perf_counter() - opt_time = end - start - - # For the recording step, we expect only two cuda graphs and this step should be much faster than the first. 
- self.assertTrue(record_time < 0.15 * graph_warmup_time) - self.assertTrue(opt_time < record_time) - def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1): batch_size = output.sequences.shape[0] diff --git a/tests/models/bart/test_modeling_bart.py b/tests/models/bart/test_modeling_bart.py index eda51d21199f..e4d0df141be2 100644 --- a/tests/models/bart/test_modeling_bart.py +++ b/tests/models/bart/test_modeling_bart.py @@ -1532,8 +1532,3 @@ def test_retain_grad_hidden_states_attentions(self): @unittest.skip def test_save_load_fast_init_from_base(self): pass - - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - # generate only works with input ids for bartforcausalLM - pass diff --git a/tests/models/bert/test_modeling_bert.py b/tests/models/bert/test_modeling_bert.py index 8ac1c3d2b409..9e7edc28d0c8 100644 --- a/tests/models/bert/test_modeling_bert.py +++ b/tests/models/bert/test_modeling_bert.py @@ -509,11 +509,6 @@ def test_model_as_decoder(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_model_as_decoder(*config_and_inputs) - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - # generate only works with input ids for bertforcausalLM - pass - def test_model_as_decoder_with_default_input_mask(self): # This regression test was failing with PyTorch < 1.3 ( diff --git a/tests/models/chameleon/test_modeling_chameleon.py b/tests/models/chameleon/test_modeling_chameleon.py index aad26ef147e8..2a8e7633ba40 100644 --- a/tests/models/chameleon/test_modeling_chameleon.py +++ b/tests/models/chameleon/test_modeling_chameleon.py @@ -16,17 +16,14 @@ import unittest -import pytest import requests from parameterized import parameterized from transformers import ChameleonConfig, is_torch_available, is_vision_available, set_seed from transformers.testing_utils import ( require_bitsandbytes, - require_flash_attn, require_read_token, require_torch, - require_torch_gpu, slow, torch_device, ) @@ -329,43 +326,6 @@ def test_model_rope_scaling(self, scaling_type): # The output should be different for long inputs self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5)) - @require_flash_attn - @require_read_token - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = ChameleonForConditionalGeneration.from_pretrained( - "facebook/chameleon-7b", - load_in_4bit=True, - device_map={"": 0}, - ) - - processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b") - texts = ["hi", "Hello this is a very long sentence"] - - processor.tokenizer.padding_side = "right" - - inputs = processor(text=texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = processor.tokenizer.batch_decode(output_native) - - model = ChameleonForConditionalGeneration.from_pretrained( - "facebook/chameleon-7b", - load_in_4bit=True, - attn_implementation="flash_attention_2", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = processor.tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @unittest.skip("Chameleon forces some token ids to be -inf!") def 
test_batching_equivalence(self): pass diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index a888bdcd3bc7..e8483f8c7c7d 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ b/tests/models/gemma/test_modeling_gemma.py @@ -319,9 +319,6 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.6] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google/gemma-2b" - # used in `test_torch_compile_for_training` _torch_compile_train_cls = GemmaForCausalLM if is_torch_available() else None @@ -419,51 +416,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Gemma apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/gemma2/test_modeling_gemma2.py b/tests/models/gemma2/test_modeling_gemma2.py index 94670803daa9..7bca83f96d73 100644 --- a/tests/models/gemma2/test_modeling_gemma2.py +++ b/tests/models/gemma2/test_modeling_gemma2.py @@ -78,7 +78,6 @@ class Gemma2ModelTest(GemmaModelTest, unittest.TestCase): test_pruning = False _is_stateful = True model_split_percents = [0.5, 0.6] - _torch_compile_test_ckpt = "google/gemma-2-9b" def setUp(self): self.model_tester = Gemma2ModelTester(self) diff --git a/tests/models/glm/test_modeling_glm.py b/tests/models/glm/test_modeling_glm.py index 32bce7cbfa61..b92c5db815b7 100644 --- a/tests/models/glm/test_modeling_glm.py +++ b/tests/models/glm/test_modeling_glm.py @@ -28,7 +28,6 @@ require_flash_attn, require_torch, require_torch_accelerator, - require_torch_gpu, require_torch_sdpa, slow, torch_device, @@ -306,10 +305,6 @@ class GlmModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, test_headmasking = False test_pruning = False - # used in `test_torch_compile` - _torch_compile_test_ckpt = "THUDM/glm-4-9b" - _torch_compile_test_revision = "refs/pr/15" - def setUp(self): self.model_tester = GlmModelTester(self) self.config_tester = ConfigTester(self, config_class=GlmConfig, hidden_size=37) @@ -426,41 +421,6 @@ def 
test_custom_4d_attention_mask(self): torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-3) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - """Overwrite the common test as the test is flaky on tiny models.""" - model = GlmForCausalLM.from_pretrained( - "THUDM/glm-4-9b", - device_map={"": 0}, - torch_dtype=torch.bfloat16, - revision="refs/pr/15", - ) - - tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b", revision="refs/pr/15") - tokenizer.padding_side = "right" - - texts = ["hi", "Hello this is a very long sentence"] - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=15, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = GlmForCausalLM.from_pretrained( - "THUDM/glm-4-9b", - device_map={"": 0}, - attn_implementation="flash_attention_2", - torch_dtype=torch.bfloat16, - revision="refs/pr/15", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=15, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa @slow diff --git a/tests/models/gptj/test_modeling_gptj.py b/tests/models/gptj/test_modeling_gptj.py index 6f6fba50dc12..afc741cd502d 100644 --- a/tests/models/gptj/test_modeling_gptj.py +++ b/tests/models/gptj/test_modeling_gptj.py @@ -17,14 +17,9 @@ import datetime import unittest -import pytest - -from transformers import BitsAndBytesConfig, GPTJConfig, is_torch_available +from transformers import GPTJConfig, is_torch_available from transformers.testing_utils import ( - require_bitsandbytes, - require_flash_attn, require_torch, - require_torch_gpu, slow, tooslow, torch_device, @@ -505,44 +500,6 @@ def test_model_from_pretrained(self): model = GPTJModel.from_pretrained(model_name, revision="float16", torch_dtype=torch.float16) self.assertIsNotNone(model) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6b") - - texts = ["hi", "Hello this is a very long sentence"] - expected_outputs = [ - "hi<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>Q: I have a question about the new version of the game. 
I have a question about the", - "Hello this is a very long sentence.\n\nA:\n\nI think the best way to understand this is to think of it", - ] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - quantization_config = BitsAndBytesConfig(load_in_4bit=True) - - model = GPTJForCausalLM.from_pretrained( - "EleutherAI/gpt-j-6b", - device_map={"": 0}, - attn_implementation="flash_attention_2", - revision="float16", - torch_dtype=torch.float16, - quantization_config=quantization_config, - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(expected_outputs, output_fa_2) - @require_torch class GPTJModelLanguageGenerationTest(unittest.TestCase): diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py index 1bcb6641803c..97b59f5aa506 100644 --- a/tests/models/granite/test_modeling_granite.py +++ b/tests/models/granite/test_modeling_granite.py @@ -17,12 +17,10 @@ import tempfile import unittest -import pytest from parameterized import parameterized -from transformers import AutoTokenizer, GraniteConfig, is_torch_available, set_seed +from transformers import GraniteConfig, is_torch_available, set_seed from transformers.testing_utils import ( - require_bitsandbytes, require_flash_attn, require_read_token, require_torch, @@ -303,9 +301,6 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "ibm/PowerLM-3b" - def setUp(self): self.model_tester = GraniteModelTester(self) self.config_tester = ConfigTester(self, config_class=GraniteConfig, hidden_size=37) @@ -423,46 +418,6 @@ def test_model_rope_scaling(self): with self.assertRaises(AssertionError): torch.testing.assert_close(yarn_sin_long, original_sin_long) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @require_read_token - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = GraniteForCausalLM.from_pretrained( - "ibm/PowerLM-3b", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = AutoTokenizer.from_pretrained("ibm/PowerLM-3b") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = GraniteForCausalLM.from_pretrained( - "ibm/PowerLM-3b", - load_in_4bit=True, - device_map={"": 0}, - attn_implementation="flash_attention_2", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @require_flash_attn @require_torch_gpu @slow diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py index 124ce0c3bb5a..f2f76b9fa75b 100644 --- a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -17,12 +17,10 @@ 
import tempfile import unittest -import pytest from parameterized import parameterized from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed from transformers.testing_utils import ( - require_bitsandbytes, require_flash_attn, require_read_token, require_torch, @@ -302,9 +300,6 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "ibm/PowerMoE-3b" - def setUp(self): self.model_tester = GraniteMoeModelTester(self) self.config_tester = ConfigTester(self, config_class=GraniteMoeConfig, hidden_size=37) @@ -422,46 +417,6 @@ def test_model_rope_scaling(self): with self.assertRaises(AssertionError): torch.testing.assert_close(yarn_sin_long, original_sin_long) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @require_read_token - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = GraniteMoeForCausalLM.from_pretrained( - "ibm-granite/granitemoe-3b", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granitemoe-3b") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = GraniteMoeForCausalLM.from_pretrained( - "ibm-granite/granitemoe-3b", - load_in_4bit=True, - device_map={"": 0}, - attn_implementation="flash_attention_2", - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @require_flash_attn @require_torch_gpu @slow diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 854b8b934578..5790d21c5348 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -196,10 +196,6 @@ def test_inputs_embeds(): def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Model does not support padding right") - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -390,10 +386,6 @@ def setUp(self): def test_inputs_embeds(): pass - @unittest.skip(reason="Model does not support padding right") - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -579,31 +571,6 @@ def test_resize_embeddings_untied(self): # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) - def test_inputs_embeds_matches_input_ids_with_generate(self): - # overwrite because IDEFICS needs ids and embeds at the input to be not None - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - 
model.to(torch_device) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 - - wte = model.get_input_embeddings() - - input_ids = inputs["input_ids"] - # some models infer position ids/attn mask differently when input ids - # by check if pad_token let's make sure no padding is in input ids - not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 - input_ids[input_ids == pad_token_id] = not_pad_token_id - del inputs["input_ids"] - inputs_embeds = wte(input_ids) - out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2) - out_embeds = model.generate(input_ids=input_ids, inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) - - self.assertTrue(torch.allclose(out_embeds, out_ids)) - @require_torch class Idefics2ForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index f0366e7b539a..3bb32c869913 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -180,10 +180,6 @@ def test_inputs_embeds(): def test_inputs_embeds_matches_input_ids(self): pass - @unittest.skip(reason="Model does not support padding right") - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -337,10 +333,6 @@ def setUp(self): def test_inputs_embeds(): pass - @unittest.skip(reason="Model does not support padding right") - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -526,31 +518,6 @@ def test_resize_embeddings_untied(self): # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) - def test_inputs_embeds_matches_input_ids_with_generate(self): - # overwrite because IDEFICS needs ids and embeds at the input to be not None - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - model = model_class(config) - model.to(torch_device) - model.eval() - - inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) - pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 - - wte = model.get_input_embeddings() - - input_ids = inputs["input_ids"] - # some models infer position ids/attn mask differently when input ids - # by check if pad_token let's make sure no padding is in input ids - not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 - input_ids[input_ids == pad_token_id] = not_pad_token_id - del inputs["input_ids"] - inputs_embeds = wte(input_ids) - out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2) - out_embeds = model.generate(input_ids=input_ids, inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) - - self.assertTrue(torch.allclose(out_embeds, out_ids)) - @require_torch class Idefics3ForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/jamba/test_modeling_jamba.py b/tests/models/jamba/test_modeling_jamba.py index 251f293f7226..ef0b5831587b 100644 --- a/tests/models/jamba/test_modeling_jamba.py +++ 
b/tests/models/jamba/test_modeling_jamba.py @@ -539,93 +539,6 @@ def test_flash_attn_2_fp32_ln(self): # with attention mask _ = model(dummy_input, attention_mask=dummy_attention_mask) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - r""" - Overriding the test_flash_attn_2_generate_padding_right test as the Jamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - r""" - Overriding the test_flash_attn_2_generate_use_cache test as the Jamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Jamba does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/jetmoe/test_modeling_jetmoe.py b/tests/models/jetmoe/test_modeling_jetmoe.py index a04d8bba741a..dc510f0ff040 100644 --- a/tests/models/jetmoe/test_modeling_jetmoe.py +++ b/tests/models/jetmoe/test_modeling_jetmoe.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch JetMoe model.""" import gc -import tempfile import unittest import pytest @@ -377,85 +376,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: JetMoe apparently does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index de6c0b15d661..0f0b595d3d23 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -438,12 +438,6 @@ def check_same_values(layer_1, layer_2): # self.assertTrue(model.transformer.wte.weight.shape, model.lm_head.weight.shape) # self.assertTrue(check_same_values(model.transformer.wte, model.lm_head)) - @unittest.skip( - "KOSMOS-2 doesn't support inputs embeds. The test isn't skipped by checking ipnut args because KOSMOS-2 has `generate()` overwritten" - ) - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @slow def test_model_from_pretrained(self): model_name = "microsoft/kosmos-2-patch14-224" diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 824337d8bdda..375ec1dd3e6f 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -26,7 +26,6 @@ from transformers.generation.configuration_utils import GenerationConfig from transformers.testing_utils import ( backend_empty_cache, - require_bitsandbytes, require_flash_attn, require_read_token, require_torch, @@ -316,9 +315,6 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi # This is because we are hitting edge cases with the causal_mask buffer model_split_percents = [0.5, 0.7, 0.8] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "meta-llama/Llama-2-7b-hf" - # used in `test_torch_compile_for_training` _torch_compile_train_cls = LlamaForCausalLM if is_torch_available() else None @@ -585,43 +581,6 @@ def _reinitialize_config(base_config, new_kwargs): with self.assertRaises(KeyError): config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}}) # missing "factor" - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @require_read_token - @slow - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = LlamaForCausalLM.from_pretrained( - "meta-llama/Llama-2-7b-hf", load_in_4bit=True, device_map={"": 0}, attn_implementation="flash_attention_2" - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @require_flash_attn 
@require_torch_gpu @slow diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py index f19358a22f4b..201aa0b33c0e 100644 --- a/tests/models/mamba2/test_modeling_mamba2.py +++ b/tests/models/mamba2/test_modeling_mamba2.py @@ -275,12 +275,6 @@ def recursive_check(tuple_object, dict_object): dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) check_equivalence(model, tuple_inputs, dict_inputs, {"output_hidden_states": True}) - @unittest.skip( - reason="Mamba2 does not support generating with input embeddings (custom cache_position computation)" - ) - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @require_torch @slow diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py index 074dceae1552..0e66b276065c 100644 --- a/tests/models/mimi/test_modeling_mimi.py +++ b/tests/models/mimi/test_modeling_mimi.py @@ -745,22 +745,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): def test_sdpa_can_compile_dynamic(self): pass - # For now, Let's focus only on GPU for `torch.compile` - @slow - @require_torch_gpu - def test_torch_compile(self): - if version.parse(torch.__version__) < version.parse("2.3"): - self.skipTest(reason="This test requires torch >= 2.3 to run.") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - n_iter = 3 - for model_class in self.all_model_classes: - model = model_class(config).to(torch_device) - model.forward = torch.compile(model.forward) - for i in range(n_iter): - _ = model(inputs_dict["input_values"].to(torch_device)) - @is_flaky() def test_batching_equivalence(self): super().test_batching_equivalence() diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index f2ee714bcdba..1538735ad78b 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Mistral model.""" import gc -import tempfile import unittest import pytest @@ -416,85 +415,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in 
self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Mistral apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index b9b5faed851f..931bb1f17bec 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -14,7 +14,6 @@ # limitations under the License. """Testing suite for the PyTorch Mixtral model.""" -import tempfile import unittest import pytest @@ -415,85 +414,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] 
+ 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Mixtral apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index 3efa7b778fb7..5174247b895e 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -126,7 +126,6 @@ class MllamaForCausalLMModelTest(ModelTesterMixin, GenerationTesterMixin, unitte all_generative_model_classes = (MllamaForCausalLM,) if is_torch_available() else () test_pruning = False test_head_masking = False - _torch_compile_test_ckpt = "nltpt/Llama-3.2-11B-Vision" def setUp(self): self.model_tester = MllamaText2TextModelTester(self) diff --git a/tests/models/mt5/test_modeling_mt5.py b/tests/models/mt5/test_modeling_mt5.py index 20412da2e1db..1628d3a5893e 100644 --- a/tests/models/mt5/test_modeling_mt5.py +++ b/tests/models/mt5/test_modeling_mt5.py @@ -576,9 +576,6 @@ class MT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # The small MT5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google/mt5-small" - def setUp(self): self.model_tester = MT5ModelTester(self) self.config_tester = ConfigTester(self, config_class=MT5Config, d_model=37) diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index 4c9dc95fb3bf..963cace28d6e 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -450,98 +450,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding - def test_flash_attn_2_generate_left_padding(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = 
model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right - def test_flash_attn_2_generate_padding_right(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa @slow @@ -1539,149 +1447,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding - def test_flash_attn_2_generate_left_padding(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - 
model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right - def test_flash_attn_2_generate_padding_right(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache - def test_flash_attn_2_generate_use_cache(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - 
max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_torch_sdpa def test_sdpa_can_dispatch_composite_models(self): if not self.has_attentions: diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index f3b6be0ac652..957db9f23b0f 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -1437,149 +1437,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding - def test_flash_attn_2_generate_left_padding(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - - # make sure we do left padding - dummy_attention_mask[:, :-1] = 0 - dummy_attention_mask[:, -1:] = 1 - - out = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right - def test_flash_attn_2_generate_padding_right(self): - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = inputs_dict[model.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - dummy_attention_mask = inputs_dict.get("attention_mask") - if dummy_attention_mask is None: - dummy_attention_mask = torch.ones_like(dummy_input) - # make sure we do right padding - dummy_attention_mask[:, :-1] = 1 - dummy_attention_mask[:, -1:] = 0 - - out = model.generate( - dummy_input, 
attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - out_fa = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False - ) - - self.assertTrue(torch.allclose(out, out_fa)) - - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - # Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache - def test_flash_attn_2_generate_use_cache(self): - max_new_tokens = 30 - - # Ignore copy - for model_class in self.greedy_sample_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None}, - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_torch_sdpa def test_sdpa_can_dispatch_composite_models(self): if not self.has_attentions: diff --git a/tests/models/nemotron/test_modeling_nemotron.py b/tests/models/nemotron/test_modeling_nemotron.py index 13adfe1e5794..37a581a33866 100644 --- a/tests/models/nemotron/test_modeling_nemotron.py +++ b/tests/models/nemotron/test_modeling_nemotron.py @@ -92,8 +92,6 @@ class NemotronModelTest(GemmaModelTest): test_pruning = False fx_compatible = False - # used in `test_torch_compile` - _torch_compile_test_ckpt = "nvidia/nemotron-3-8b-base-4k-hf" # used in `test_torch_compile_for_training` _torch_compile_train_cls = NemotronForCausalLM if is_torch_available() else None diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index cfc2a2c29b1d..d07dd40603cd 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -316,10 +316,6 @@ def test_save_load_low_cpu_mem_usage_no_safetensors(self): def test_generate_from_inputs_embeds_with_static_cache(self): pass - @unittest.skip(reason="TODO (@joao): fix me -- failing to produce similar results") - def test_static_cache_matches_dynamic(self): - pass - @unittest.skip("FlashAttention only support fp16 and bf16 data type") def test_flash_attn_2_fp32_ln(self): pass diff --git a/tests/models/phi/test_modeling_phi.py b/tests/models/phi/test_modeling_phi.py index c17f69a49986..eae6789bef25 100644 --- a/tests/models/phi/test_modeling_phi.py +++ 
b/tests/models/phi/test_modeling_phi.py @@ -17,15 +17,11 @@ import unittest -import pytest from parameterized import parameterized from transformers import PhiConfig, is_torch_available, set_seed from transformers.testing_utils import ( - require_bitsandbytes, - require_flash_attn, require_torch, - require_torch_gpu, slow, torch_device, ) @@ -468,43 +464,6 @@ def test_model_rope_scaling(self): torch.testing.assert_close(ntk_sin_long, original_sin_long) self.assertTrue((ntk_scaling_rope.inv_freq <= original_rope.inv_freq).all()) - @require_flash_attn - @require_torch_gpu - @require_bitsandbytes - @pytest.mark.flash_attn_test - @slow - # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_flash_attn_2_generate_padding_right with LlamaForCausalLM->PhiForCausalLM,LlamaTokenizer->AutoTokenizer,meta-llama/Llama-2-7b-hf->microsoft/phi-1 - def test_flash_attn_2_generate_padding_right(self): - """ - Overwritting the common test as the test is flaky on tiny models - """ - model = PhiForCausalLM.from_pretrained( - "microsoft/phi-1", - load_in_4bit=True, - device_map={"": 0}, - ) - - tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1") - - texts = ["hi", "Hello this is a very long sentence"] - - tokenizer.padding_side = "right" - tokenizer.pad_token = tokenizer.eos_token - - inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0) - - output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_native = tokenizer.batch_decode(output_native) - - model = PhiForCausalLM.from_pretrained( - "microsoft/phi-1", load_in_4bit=True, device_map={"": 0}, attn_implementation="flash_attention_2" - ) - - output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False) - output_fa_2 = tokenizer.batch_decode(output_fa_2) - - self.assertListEqual(output_native, output_fa_2) - @slow @require_torch diff --git a/tests/models/qwen2/test_modeling_qwen2.py b/tests/models/qwen2/test_modeling_qwen2.py index 4e57f8e0f002..f51dc2e0a5e2 100644 --- a/tests/models/qwen2/test_modeling_qwen2.py +++ b/tests/models/qwen2/test_modeling_qwen2.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Qwen2 model.""" import gc -import tempfile import unittest import pytest @@ -428,85 +427,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def 
test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Qwen2 apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py index c545e882faee..abc7b57919b0 100644 --- a/tests/models/qwen2_moe/test_modeling_qwen2_moe.py +++ b/tests/models/qwen2_moe/test_modeling_qwen2_moe.py @@ -15,7 +15,6 @@ """Testing suite for the PyTorch Qwen2MoE model.""" import gc -import tempfile import unittest import pytest @@ -453,85 +452,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, 
"max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Qwen2Moe apparently does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 956243dccebe..c020e8258b93 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -267,10 +267,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self): def test_feed_forward_chunking(self): pass - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @unittest.skip(reason="CPU offload is not yet supported") def test_cpu_offload(self): pass diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py index 542955f9fa45..985115d7707b 100644 --- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py +++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -420,10 +420,6 @@ def _check_hidden_states_for_generate( def test_initialization(self): pass - @unittest.skip(reason="RecurrentGemma does not support generating with input embeddings (missing position_ids)") - def test_inputs_embeds_matches_input_ids_with_generate(self): - pass - @require_torch_accelerator @slow diff --git a/tests/models/starcoder2/test_modeling_starcoder2.py b/tests/models/starcoder2/test_modeling_starcoder2.py index 32d28143d72f..df743f132c11 100644 --- a/tests/models/starcoder2/test_modeling_starcoder2.py +++ b/tests/models/starcoder2/test_modeling_starcoder2.py @@ -14,7 +14,6 @@ # limitations under the License. 
"""Testing suite for the PyTorch Starcoder2 model.""" -import tempfile import unittest import pytest @@ -404,85 +403,6 @@ def test_save_load_fast_init_from_base(self): def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - import torch - - max_new_tokens = 30 - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Starcoder2 apparently does not support right padding + use_cache with FA2. 
- dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py index 68dd5a52b3d6..b03416390766 100644 --- a/tests/models/t5/test_modeling_t5.py +++ b/tests/models/t5/test_modeling_t5.py @@ -580,9 +580,6 @@ class T5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, # The small T5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google-t5/t5-small" - def setUp(self): self.model_tester = T5ModelTester(self) self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37) diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py index ec4c1d019b6d..377668851c58 100644 --- a/tests/models/umt5/test_modeling_umt5.py +++ b/tests/models/umt5/test_modeling_umt5.py @@ -317,9 +317,6 @@ class UMT5ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin # The small UMT5 model needs higher percentages for CPU/MP tests model_split_percents = [0.5, 0.8, 0.9] - # used in `test_torch_compile` - _torch_compile_test_ckpt = "google/umt5-small" - def setUp(self): self.model_tester = UMT5ModelTester(self) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index b24c577a16e5..12aedaca8cf9 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1574,59 +1574,6 @@ def test_generate_output_type(self, return_dict_in_generate): ) assert isinstance(pred_ids, expected_output_type) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_reuse_cache(self): - max_new_tokens = 2 - for model_class in self.all_generative_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name][..., :10] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = dummy_input.shape[1] * 2 + max_new_tokens * 2 + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # run generate once to get filled cache - output = model.generate( - dummy_input, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - return_dict_in_generate=True, - ) - past_key_values = output.past_key_values - - # Try to continue generation from where we left, given that we have more than 1 new token to process - # e.g. 
this can happen in speculative decoding when feeding candidate tokens back to target model - _ = model.generate( - dummy_input, - decoder_input_ids=output.sequences, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - past_key_values=past_key_values, - ) - def test_labels_sequence_max_length_correct(self): config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -3961,11 +3908,6 @@ def test_generate_without_input_ids(self): # generate only works with input ids for whisper pass - @unittest.skip(reason="Generate needs input ids") - def test_inputs_embeds_matches_input_ids_with_generate(self): - # generate only works with input ids for whisper - pass - @unittest.skip(reason="Decoder can't keep attention grads") def test_retain_grad_hidden_states_attentions(self): return @@ -3974,18 +3916,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_save_load_fast_init_from_base(self): pass - @unittest.skip( - reason="FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" - ) - def test_flash_attn_2_generate_reuse_cache(self): - pass - - @unittest.skip( - "Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" - ) - def test_flash_attn_2_generate_padding_right(self): - pass - @unittest.skip( "Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" ) diff --git a/tests/models/zamba/test_modeling_zamba.py b/tests/models/zamba/test_modeling_zamba.py index c0a8020bedd7..a6dd516f98a4 100644 --- a/tests/models/zamba/test_modeling_zamba.py +++ b/tests/models/zamba/test_modeling_zamba.py @@ -542,93 +542,6 @@ def test_flash_attn_2_fp32_ln(self): # with attention mask _ = model(dummy_input, attention_mask=dummy_attention_mask) - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - r""" - Overriding the test_flash_attn_2_generate_padding_right test as the Zamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_use_cache(self): - r""" - Overriding the test_flash_attn_2_generate_use_cache test as the Zamba model, like Mixtral, doesn't support - right padding + use cache with FA2 - """ - import torch - - max_new_tokens = 30 - - for model_class in 
self.all_generative_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - - dummy_input = inputs_dict[model_class.main_input_name] - if dummy_input.dtype in [torch.float32, torch.bfloat16]: - dummy_input = dummy_input.to(torch.float16) - - # make sure that all models have enough positions for generation - if hasattr(config, "max_position_embeddings"): - config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - - dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) - # NOTE: Zamba does not support right padding + use_cache with FA2. - dummy_attention_mask[:, -1] = 1 - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - # Just test that a large cache works as expected - _ = model.generate( - dummy_input, - attention_mask=dummy_attention_mask, - max_new_tokens=max_new_tokens, - do_sample=False, - use_cache=True, - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 6e08f2958b29..e2719d8cf1b6 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -82,7 +82,6 @@ require_deepspeed, require_flash_attn, require_non_xpu, - require_read_token, require_safetensors, require_torch, require_torch_accelerator, From 7feeecac09dbd860c86ed8ad219a34d127569ef0 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 28 Oct 2024 15:04:35 +0000 Subject: [PATCH 04/16] typo --- tests/generation/test_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 6c10b28e1d39..d3cc0b5d1f51 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1989,11 +1989,11 @@ def test_generate_compile(self, name, end_to_end): else: model.forward = torch.compile(model.forward, fullgraph=True, mode="reduce-overhead") - compuled_outputs = [] + compiled_outputs = [] for model_inputs in input_ids_sets: - compuled_outputs.append(model.generate(model_inputs, generation_config=generation_config)) + compiled_outputs.append(model.generate(model_inputs, generation_config=generation_config)) - for dynamic_result, compiled_result in zip(dynamic_outputs, compuled_outputs): + for dynamic_result, compiled_result in zip(dynamic_outputs, compiled_outputs): self._check_similar_generate_outputs(dynamic_result, compiled_result) @pytest.mark.generate From a1959f8b59ac1234e56ab407771cfa2721b60253 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 28 Oct 2024 15:24:48 +0000 Subject: [PATCH 05/16] more specific docstring --- tests/generation/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index d3cc0b5d1f51..fbb2b4fed50a 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -142,7 +142,7 @@ def _check_similar_generate_outputs(self, output_1, output_2, atol=1e-5, rtol=1e Checks whether a pair of generate outputs are similar. Two `generate` call outputs are considered similar in the following siturations: 1. The sequences are the same - 2. The sequences are different, but the scores up until the first mismatch are nearly identical + 2. 
The sequences are different, but the scores up to (and including) the first mismatch are nearly identical """ # scores doesn't include data regarding decoder input tokens decoder_input_length = output_1.sequences.shape[1] - len(output_1.scores) From e84177ed6012b745a8b490065b5111c0874f1a7a Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 28 Oct 2024 15:27:59 +0000 Subject: [PATCH 06/16] make fixup --- tests/models/mimi/test_modeling_mimi.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py index 0e66b276065c..df0007d666a0 100644 --- a/tests/models/mimi/test_modeling_mimi.py +++ b/tests/models/mimi/test_modeling_mimi.py @@ -21,7 +21,6 @@ import numpy as np from datasets import Audio, load_dataset -from packaging import version from parameterized import parameterized from pytest import mark From ad8ebae13b59896465b8d12f02a421de7101a868 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 28 Oct 2024 15:37:59 +0000 Subject: [PATCH 07/16] parameterize at the top? --- tests/generation/test_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index fbb2b4fed50a..88a7616d4592 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1928,15 +1928,15 @@ def test_generate_with_quant_cache(self): with self.assertRaises(ValueError): model.generate(**generation_kwargs, **inputs_dict) - @pytest.mark.generate - @require_torch_gpu - @slow @parameterized.expand( [ ("forward_only", False), # TODO (@joao): a few models failing. After fixed, this should not be "@slow" ("end_to_end", True), # TODO (@joao): end-to-end compilation is broken with torch 2.5+, explore and fix ] ) + @pytest.mark.generate + @require_torch_gpu + @slow def test_generate_compile(self, name, end_to_end): """ Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. 
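In the `forward_only` variant parameterized above, only the model's forward pass is compiled and the generation loop itself stays in eager mode, mirroring the `torch.compile(model.forward, fullgraph=True, mode="reduce-overhead")` call in the test body. A rough standalone sketch of that pattern, where the checkpoint name is a placeholder and a transformers release with static-cache generation is assumed:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "org/some-decoder-only-model"  # placeholder, not a checkpoint used by this test
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).eval()

# Compile only `forward`; `generate` remains eager, so sampling and stopping logic
# cannot introduce graph breaks.
model.forward = torch.compile(model.forward, fullgraph=True, mode="reduce-overhead")

inputs = tokenizer("Hello world", return_tensors="pt")
# A static cache keeps tensor shapes constant across decoding steps, which fullgraph
# compilation of the forward pass relies on.
output = model.generate(**inputs, max_new_tokens=5, do_sample=False, cache_implementation="static")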
Tests From def0940da5d7bbbe2381c12abe128ba37a6b9104 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Mon, 28 Oct 2024 15:54:20 +0000 Subject: [PATCH 08/16] correction --- tests/generation/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 88a7616d4592..c30048fa8674 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -2147,14 +2147,14 @@ def test_eager_matches_fa2_generate(self): self._test_attention_implementation("flash_attention_2") def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1): - batch_size = output.sequences.shape[0] + num_sequences_in_output = output.sequences.shape[0] + batch_size = num_sequences_in_output / num_return_sequences seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) seq_length = getattr(self.model_tester, "text_seq_length", seq_length) config = config.text_config if hasattr(config, "text_config") else config - num_sequences_in_output = batch_size * num_return_sequences gen_len = ( output.sequences.shape[-1] - 1 if config.is_encoder_decoder else output.sequences.shape[-1] - seq_length From a03703738090ca84aa273b9e20369246f3fb3b89 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 09:45:00 +0000 Subject: [PATCH 09/16] more deletions :D --- tests/generation/test_utils.py | 2 +- tests/models/idefics/test_modeling_idefics.py | 7 --- .../models/idefics2/test_modeling_idefics2.py | 45 ---------------- .../models/idefics3/test_modeling_idefics3.py | 45 ---------------- tests/models/mamba2/test_modeling_mamba2.py | 2 +- tests/models/moshi/test_modeling_moshi.py | 53 ------------------- 6 files changed, 2 insertions(+), 152 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 02d7c94a168f..810d22c47cb5 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1546,7 +1546,7 @@ def test_past_key_values_format(self): @pytest.mark.generate @parameterized.expand([(1,), (2,)]) def test_generate_from_inputs_embeds(self, num_beams): - """Tests that we can generate from `inputs_embeds` instead of `input_ids` in LLMs, VLMs, etc""" + """Tests that we can generate from `inputs_embeds` instead of `input_ids` in LLMs, VLMs, etc""" # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids` # if fails, you should probably update the `prepare_inputs_for_generation` function for model_class in self.all_generative_model_classes: diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index c2f0ef8ccd01..d19d10932bfc 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -770,13 +770,6 @@ def test_contrastive_generate_low_memory(self): def test_custom_4d_attention_mask(self): pass - @unittest.skip( - reason="IDEFICS has specific requirements for working with inputs embeds like passing also the ids and pixels" - ) - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds_decoder_only(self, num_beams): - pass - @unittest.skip(reason="IDEFICS cannot compile due to dynamic control flow when checking inputs") def test_generate_compile_fullgraph(self): pass diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 5790d21c5348..f33a9572ebe8 
100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -20,7 +20,6 @@ import unittest from io import BytesIO -import pytest import requests from transformers import ( @@ -412,50 +411,6 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): def test_flash_attn_2_fp32_ln(self): pass - @pytest.mark.generate - def test_generate_from_inputs_embeds_decoder_only(self): - # overwrite because IDEFICS needs ids and embeds at the input to be not None - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - - # Ignore: - # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, - # which would cause a mismatch), - config.pad_token_id = config.eos_token_id = -1 - config.is_decoder = True - model = model_class(config).to(torch_device).eval() - input_ids = inputs_dict.pop("input_ids") - - # Traditional way of generating text - outputs_from_ids = model.generate( - input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True - ) - self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) - inputs_embeds = model.get_input_embeddings()(input_ids) - outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) - - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the - # same, but the logits will almost surely be different) - random_embeds = torch.rand_like(inputs_embeds) - outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - for i in range(len(outputs_from_rand_embeds.scores)): - self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - # We need to override as we need to prepare such that the image token is the last token def test_resize_tokens_embeddings(self): (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index 3bb32c869913..5dc352d22fe0 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -19,7 +19,6 @@ import unittest from io import BytesIO -import pytest import requests from transformers import ( @@ -359,50 +358,6 @@ def test_prompt_lookup_decoding_matches_greedy_search(self): def test_flash_attn_2_fp32_ln(self): pass - @pytest.mark.generate - def test_generate_from_inputs_embeds_decoder_only(self): - # overwrite because IDEFICS needs ids and embeds at the input to be not None - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - - # Ignore: - # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, - # which would cause a mismatch), - config.pad_token_id = config.eos_token_id = -1 - config.is_decoder = True - model = model_class(config).to(torch_device).eval() - input_ids = inputs_dict.pop("input_ids") - - # Traditional 
way of generating text - outputs_from_ids = model.generate( - input_ids, max_new_tokens=5, return_dict_in_generate=True, output_scores=True - ) - self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) - inputs_embeds = model.get_input_embeddings()(input_ids) - outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) - - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the - # same, but the logits will almost surely be different) - random_embeds = torch.rand_like(inputs_embeds) - outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - return_dict_in_generate=True, - output_scores=True, - ) - for i in range(len(outputs_from_rand_embeds.scores)): - self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - # We need to override as we need to prepare such that the image token is the last token def test_resize_tokens_embeddings(self): (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py index cb859ef5327b..6cb88192bc19 100644 --- a/tests/models/mamba2/test_modeling_mamba2.py +++ b/tests/models/mamba2/test_modeling_mamba2.py @@ -205,7 +205,7 @@ def test_generate_without_input_ids(self): @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds_decoder_only(self, num_beams): + def test_generate_from_inputs_embeds(self, num_beams): pass @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index 7bf95ea873d9..082ef425e7c8 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -655,59 +655,6 @@ def test_initialization(self): msg=f"Parameter {name} of model {model_class} seems not properly initialized", ) - @pytest.mark.generate - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds_decoder_only(self, num_beams): - for model_class in self.all_generative_model_classes: - config, input_ids, _, inputs_dict = self._get_input_ids_and_config() - - model = model_class(config).to(torch_device).eval() - generation_kwargs = { - "return_dict_in_generate": True, - "output_scores": True, - "num_beams": num_beams, - "do_sample": False, - } - - # Traditional way of generating text - outputs_from_ids = model.generate(input_ids, max_new_tokens=5, **generation_kwargs, **inputs_dict) - self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) - inputs_embeds = model.get_input_embeddings()(input_ids) - outputs_from_embeds = model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - **generation_kwargs, - **inputs_dict, - ) - - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the - # same, 
but the logits will almost surely be different) - random_embeds = torch.rand_like(inputs_embeds) - outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - **generation_kwargs, - **inputs_dict, - ) - for i in range(len(outputs_from_rand_embeds.scores)): - self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - - # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same - outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, - max_new_tokens=5, - **generation_kwargs, - **inputs_dict, - ) - self.assertListEqual( - outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), - outputs_from_embeds_wo_ids.sequences.tolist(), - ) - @unittest.skip(reason="Continuing from past key values is not straightforward as we're dealing with 3 inputs") def test_generate_continue_from_past_key_values(self): pass From 02a32fb7a27becd407029c332f439efad63c6c76 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 09:56:59 +0000 Subject: [PATCH 10/16] tmp commit --- tests/generation/test_utils.py | 30 ++++++++------------- tests/models/mamba2/test_modeling_mamba2.py | 4 +-- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 810d22c47cb5..cdcf19a53492 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1544,20 +1544,16 @@ def test_past_key_values_format(self): ) @pytest.mark.generate - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds(self, num_beams): + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): """Tests that we can generate from `inputs_embeds` instead of `input_ids` in LLMs, VLMs, etc""" # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids` # if fails, you should probably update the `prepare_inputs_for_generation` function for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - # Ignore: - # a) eos (to always output 20 tokens) and pad (so we don't try to infer the attn mask from the input_ids, - # which would cause a mismatch), - config.pad_token_id = config.eos_token_id = -1 - # b) embedding scaling, the scaling factor applied after embeding from input_ids (requires knowledge of the - # variable that holds the scaling factor, which is model-dependent) + # Ignore embedding scaling, the scaling factor applied after embeding from input_ids (requires knowledge + # of the variable that holds the scaling factor, which is model-dependent) if hasattr(config, "scale_embedding"): config.scale_embedding = False @@ -1578,19 +1574,18 @@ def test_generate_from_inputs_embeds(self, num_beams): "output_scores": True, "num_beams": num_beams, "do_sample": False, + "max_new_tokens": 5, + "min_new_tokens": 5, # generate exactly 5 tokens } # Traditional way of generating text - outputs_from_ids = model.generate(input_ids, max_new_tokens=5, **generation_kwargs) + outputs_from_ids = model.generate(input_ids, **generation_kwargs, **inputs_dict) self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) inputs_embeds = model.get_input_embeddings()(input_ids) outputs_from_embeds = 
model.generate( - input_ids, - inputs_embeds=inputs_embeds, - max_new_tokens=5, - **generation_kwargs, + input_ids, inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict ) self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) @@ -1598,17 +1593,14 @@ def test_generate_from_inputs_embeds(self, num_beams): # same, but the logits will almost surely be different) random_embeds = torch.rand_like(inputs_embeds) outputs_from_rand_embeds = model.generate( - input_ids, - inputs_embeds=random_embeds, - max_new_tokens=5, - **generation_kwargs, + input_ids, inputs_embeds=random_embeds, **generation_kwargs, **inputs_dict ) for i in range(len(outputs_from_rand_embeds.scores)): self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, max_new_tokens=5, **generation_kwargs + inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict ) self.assertListEqual( outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), @@ -1917,7 +1909,7 @@ def test_generate_with_quant_cache(self): @pytest.mark.generate @require_torch_gpu @slow - def test_generate_compile(self, name, end_to_end): + def test_generate_compile(self, _, end_to_end): """ Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. Tests end-to-end compilation and forward pass compilation only. diff --git a/tests/models/mamba2/test_modeling_mamba2.py b/tests/models/mamba2/test_modeling_mamba2.py index 6cb88192bc19..9b3a9563b58d 100644 --- a/tests/models/mamba2/test_modeling_mamba2.py +++ b/tests/models/mamba2/test_modeling_mamba2.py @@ -204,8 +204,8 @@ def test_generate_without_input_ids(self): pass @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") - @parameterized.expand([(1,), (2,)]) - def test_generate_from_inputs_embeds(self, num_beams): + @parameterized.expand([("greedy", 1), ("beam search", 2)]) + def test_generate_from_inputs_embeds(self, _, num_beams): pass @unittest.skip(reason="To fix, Mamba 2 cache slicing test case is an edge case") From 08a59b247ea1e52dcc9eda3ae0d5a36b7388b568 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 11:38:49 +0000 Subject: [PATCH 11/16] for VLMs too --- .../modeling_llava_next_video.py | 3 +- .../modular_llava_next_video.py | 3 +- .../modeling_llava_onevision.py | 3 +- .../video_llava/modeling_video_llava.py | 3 +- tests/generation/test_utils.py | 60 +++++++++++++------ .../models/idefics2/test_modeling_idefics2.py | 33 ++++++++++ tests/models/moshi/test_modeling_moshi.py | 2 +- 7 files changed, 83 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 44b372535d70..95618e0b6e12 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -911,7 +911,8 @@ def forward( if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, " + "and must specify 
either one" ) legacy_processing = False diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index e9974e920493..c6f5d708abbc 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -424,7 +424,8 @@ def forward( if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, " + "and must specify either one" ) legacy_processing = False diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 946688bfcf07..bb6a8a10f0b4 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -657,7 +657,8 @@ def forward( if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values/pixel_values_videos and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values`/`pixel_values_videos` and `inputs_embeds` at the same time, " + "and must specify either one" ) if inputs_embeds is None: diff --git a/src/transformers/models/video_llava/modeling_video_llava.py b/src/transformers/models/video_llava/modeling_video_llava.py index 30f82e45056c..1f02cf583966 100644 --- a/src/transformers/models/video_llava/modeling_video_llava.py +++ b/src/transformers/models/video_llava/modeling_video_llava.py @@ -534,7 +534,8 @@ def forward( if (pixel_values_images is not None or pixel_values_videos is not None) and inputs_embeds is not None: raise ValueError( - "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + "You cannot specify both `pixel_values_images`/`pixel_values_videos` and `inputs_embeds` at the same " + "time, and must specify either one" ) legacy_processing = False diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index cdcf19a53492..89db3d562257 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1552,22 +1552,44 @@ def test_generate_from_inputs_embeds(self, _, num_beams): for model_class in self.all_generative_model_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - # Ignore embedding scaling, the scaling factor applied after embeding from input_ids (requires knowledge - # of the variable that holds the scaling factor, which is model-dependent) - if hasattr(config, "scale_embedding"): - config.scale_embedding = False - # This test is for decoder-only models (encoder-decoder models have native input embeddings support in the # decoder) if config.is_encoder_decoder: continue + config.is_decoder = True # Skip models without explicit support - config.is_decoder = True model = model_class(config).to(torch_device).eval() if "inputs_embeds" not in inspect.signature(model.prepare_inputs_for_generation).parameters.keys(): continue + # There are a few exception patterns in this test: + # 1 - Some models can't generate without `input_ids`, when `inputs_embeds` are passed + requires_inputs_ids = any( + 
model_name in model_class.__name__.lower() for model_name in ["idefics", "qwen2vl"] + ) + # 2 - Complex `inputs_embeds` computation, i.e. the correct computation of inputs embeds is more complex + # than calling the embedding layer with `input_ids`. Subcases of this exception: + # 2.A - Ignore `scale_embedding`, if the model supports it (it is controlled by a model-dependent flag) + if hasattr(config, "scale_embedding"): + config.scale_embedding = False + # 2.B - Some VLMs assume `inputs_embeds` and `pixel_values` are mutually exclusive AND fall in the + # exception above (complex `inputs_embeds` computation). Popping `pixel_values` allow us to run the + # checks without adding test complexity. Ditto for `pixel_values_videos` and `pixel_values_images` + pixel_values_is_mutually_exclusive = any( + model_name in model_class.__name__.lower() + for model_name in ["llava", "idefics2", "idefics3", "mllama", "paligemma"] + ) + if pixel_values_is_mutually_exclusive: + inputs_dict.pop("pixel_values", None) + inputs_dict.pop("pixel_values_videos", None) + inputs_dict.pop("pixel_values_images", None) + # 2.C - No easy fix, let's skip the check that compares the outputs from `input_ids` and `inputs_embeds` + has_complex_embeds_computation = any( + model_name in model_class.__name__.lower() for model_name in ["moshi"] + ) + + # Traditional way of generating text input_ids = inputs_dict.pop("input_ids") generation_kwargs = { "return_dict_in_generate": True, @@ -1577,19 +1599,19 @@ def test_generate_from_inputs_embeds(self, _, num_beams): "max_new_tokens": 5, "min_new_tokens": 5, # generate exactly 5 tokens } - - # Traditional way of generating text outputs_from_ids = model.generate(input_ids, **generation_kwargs, **inputs_dict) self.assertEqual(outputs_from_ids.sequences.shape, (input_ids.shape[0], input_ids.shape[1] + 5)) - # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output) + # Same thing, but from input embeddings (`input_ids` is passed so the prompt is present in the output). + # The output of the two calls should be the same. 
inputs_embeds = model.get_input_embeddings()(input_ids) outputs_from_embeds = model.generate( input_ids, inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict ) - self.assertListEqual(outputs_from_ids.sequences.tolist(), outputs_from_embeds.sequences.tolist()) + if not has_complex_embeds_computation: + self._check_similar_generate_outputs(outputs_from_ids, outputs_from_embeds) - # But if we pass different inputs_embeds, we should get different outputs (the output text may be the + # If we pass different inputs_embeds, we should get different outputs (the output text may be the # same, but the logits will almost surely be different) random_embeds = torch.rand_like(inputs_embeds) outputs_from_rand_embeds = model.generate( @@ -1598,14 +1620,14 @@ def test_generate_from_inputs_embeds(self, _, num_beams): for i in range(len(outputs_from_rand_embeds.scores)): self.assertFalse(torch.allclose(outputs_from_embeds.scores[i], outputs_from_rand_embeds.scores[i])) - # input_ids is not a required input -- if we don't pass it, the newly generated tokens will be the same - outputs_from_embeds_wo_ids = model.generate( - inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict - ) - self.assertListEqual( - outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :].tolist(), - outputs_from_embeds_wo_ids.sequences.tolist(), - ) + # input_ids is not a required input on most models -- if we don't pass it, the newly generated tokens will + # be the same + if not requires_inputs_ids: + outputs_from_embeds_wo_ids = model.generate( + inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict + ) + outputs_from_embeds.sequences = outputs_from_embeds.sequences[:, inputs_embeds.shape[1] :] + self._check_similar_generate_outputs(outputs_from_embeds_wo_ids, outputs_from_embeds) @pytest.mark.generate def test_generate_from_inputs_embeds_with_static_cache(self): diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index f33a9572ebe8..042fecf4bd25 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -195,6 +195,10 @@ def test_inputs_embeds(): def test_inputs_embeds_matches_input_ids(self): pass + @unittest.skip(reason="Model does not support padding right") + def test_flash_attn_2_generate_padding_right(self): + pass + @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -385,6 +389,10 @@ def setUp(self): def test_inputs_embeds(): pass + @unittest.skip(reason="Model does not support padding right") + def test_flash_attn_2_generate_padding_right(self): + pass + @unittest.skip(reason="Model does not support padding right") def test_flash_attn_2_inference_padding_right(self): pass @@ -526,6 +534,31 @@ def test_resize_embeddings_untied(self): # Check that the model can still do a forward pass successfully (every parameter should be resized) model(**self._prepare_for_class(inputs_dict, model_class)) + def test_inputs_embeds_matches_input_ids_with_generate(self): + # overwrite because IDEFICS needs ids and embeds at the input to be not None + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = copy.deepcopy(self._prepare_for_class(inputs_dict, model_class)) + pad_token_id = config.pad_token_id if config.pad_token_id is not None else 1 + + wte = 
model.get_input_embeddings() + + input_ids = inputs["input_ids"] + # some models infer position ids/attn mask differently when input ids + # by check if pad_token let's make sure no padding is in input ids + not_pad_token_id = pad_token_id + 1 if max(0, pad_token_id - 1) == 0 else pad_token_id - 1 + input_ids[input_ids == pad_token_id] = not_pad_token_id + del inputs["input_ids"] + inputs_embeds = wte(input_ids) + out_ids = model.generate(input_ids=input_ids, **inputs, max_new_tokens=2) + out_embeds = model.generate(input_ids=input_ids, inputs_embeds=inputs_embeds, **inputs, max_new_tokens=2) + + self.assertTrue(torch.allclose(out_embeds, out_ids)) + @require_torch class Idefics2ForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index 082ef425e7c8..4d138193a191 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -560,7 +560,7 @@ def _get_input_ids_and_config(self, batch_size=2): return config, input_ids, attention_mask, inputs_dict def prepare_config_and_inputs_for_generate(self, batch_size=2): - config, filtered_inputs_dict = super().prepare_config_and_inputs_for_generate() + config, filtered_inputs_dict = super().prepare_config_and_inputs_for_generate(batch_size=batch_size) # Make sure we only return `input_ids`. # Note that audio_codes will still be generated internally, so the ability to test audio codes is still there. From 9b17c4bad7cac89979701b4c1890f2f740c3297f Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 12:24:50 +0000 Subject: [PATCH 12/16] fix _check_outputs --- tests/generation/test_utils.py | 60 ++++++++++++++++------- tests/models/moshi/test_modeling_moshi.py | 6 ++- 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 89db3d562257..af858c53027f 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -607,7 +607,12 @@ def test_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) + self._check_outputs( + output_generate, + model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], + ) @pytest.mark.generate def test_beam_search_generate_dict_outputs_use_cache(self): @@ -644,7 +649,11 @@ def test_beam_search_generate_dict_outputs_use_cache(self): ) self._check_outputs( - output_generate, model.config, use_cache=True, num_return_sequences=beam_kwargs["num_beams"] + output_generate, + model.config, + use_cache=True, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], ) @require_accelerate @@ -722,7 +731,12 @@ def test_beam_sample_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSampleDecoderOnlyOutput) - self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) + self._check_outputs( + output_generate, + model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], + ) @pytest.mark.generate def test_generate_without_input_ids(self): @@ -807,7 +821,12 @@ def test_group_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, 
BeamSearchDecoderOnlyOutput) - self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) + self._check_outputs( + output_generate, + model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], + ) # TODO: @gante check why it is flaky @is_flaky() @@ -909,7 +928,12 @@ def test_constrained_beam_search_generate_dict_output(self): # Retrocompatibility check self.assertIsInstance(output_generate, BeamSearchDecoderOnlyOutput) - self._check_outputs(output_generate, model.config, num_return_sequences=beam_kwargs["num_beams"]) + self._check_outputs( + output_generate, + model.config, + num_return_sequences=beam_kwargs["num_return_sequences"], + num_beams=beam_kwargs["num_beams"], + ) @pytest.mark.generate def test_contrastive_generate(self): @@ -2140,9 +2164,9 @@ def test_eager_matches_fa2_generate(self): # check whether we still need the overwrites self._test_attention_implementation("flash_attention_2") - def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1): - num_sequences_in_output = output.sequences.shape[0] - batch_size = num_sequences_in_output / num_return_sequences + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): + input_batch_size = int(output.sequences.shape[0] / num_return_sequences) + internal_batch_size = input_batch_size * num_beams seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) @@ -2159,19 +2183,21 @@ def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1 seq_length = self.model_tester.get_subsampled_output_lengths(seq_length) # scores - self._check_scores(num_sequences_in_output, output.scores, length=gen_len, config=config) + self._check_scores(internal_batch_size, output.scores, length=gen_len, config=config) # unprocessed logits - self._check_logits(num_sequences_in_output, output.logits, config=config) + self._check_logits(internal_batch_size, output.logits, config=config) # Attentions if self.has_attentions: if config.is_encoder_decoder: # encoder - self._check_encoder_attention_for_generate(output.encoder_attentions, batch_size, config, seq_length) + self._check_encoder_attention_for_generate( + output.encoder_attentions, input_batch_size, config, seq_length + ) # decoder self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_attentions, min_length=1, max_length=output.sequences.shape[-1], @@ -2183,7 +2209,7 @@ def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1 attentions = output.attentions if not use_cache else output.attentions[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_attentions_for_generate( - num_sequences_in_output, + internal_batch_size, attentions=attentions, min_length=min_length, max_length=output.sequences.shape[-1], @@ -2195,12 +2221,12 @@ def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1 if config.is_encoder_decoder: # encoder self._check_encoder_hidden_states_for_generate( - output.encoder_hidden_states, batch_size, config, seq_length + output.encoder_hidden_states, input_batch_size, config, seq_length ) # decoder self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, output.decoder_hidden_states, min_length=1, max_length=output.sequences.shape[-1], @@ -2212,7 +2238,7 @@ def _check_outputs(self, output, 
config, use_cache=False, num_return_sequences=1 hidden_states = output.hidden_states if not use_cache else output.hidden_states[1:] min_length = seq_length if not use_cache else seq_length + 1 self._check_hidden_states_for_generate( - num_sequences_in_output, + internal_batch_size, hidden_states, min_length=min_length, max_length=output.sequences.shape[-1], @@ -2243,7 +2269,7 @@ def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1 past_key_values = output.past_key_values past_sequence_length = output.sequences.shape[-1] - 1 self._check_past_key_values_for_generate( - num_sequences_in_output, + internal_batch_size, past_key_values, seq_length=past_sequence_length, config=config, diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index 4d138193a191..7d4b855c10d8 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -591,9 +591,11 @@ def _check_hidden_states_for_generate( [expected_shape] * len(iter_hidden_states), ) - def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1): + def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): # Overwrite because the generate method actually alway uses `inputs_embeds` so `use_cache` is always `True` - super()._check_outputs(output, config, use_cache=True, num_return_sequences=num_return_sequences) + super()._check_outputs( + output, config, use_cache=True, num_return_sequences=num_return_sequences, num_beams=num_beams + ) def _check_hidden_states_for_generate( self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 From 4a24d7e1bccd8306d6035c6ccbbcee8acfc0e9bc Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 12:29:09 +0000 Subject: [PATCH 13/16] test nit --- tests/generation/test_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index af858c53027f..a042875235d9 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -2166,7 +2166,7 @@ def test_eager_matches_fa2_generate(self): def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): input_batch_size = int(output.sequences.shape[0] / num_return_sequences) - internal_batch_size = input_batch_size * num_beams + internal_batch_size = input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) From e50df32a583fbd5a6e3cea179a173e079e8514a6 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 12:36:18 +0000 Subject: [PATCH 14/16] make fixup --- tests/generation/test_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index a042875235d9..96348b1c51db 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -2166,7 +2166,9 @@ def test_eager_matches_fa2_generate(self): def _check_outputs(self, output, config, use_cache=False, num_return_sequences=1, num_beams=1): input_batch_size = int(output.sequences.shape[0] / num_return_sequences) - internal_batch_size = input_batch_size * num_beams if num_beams > 1 else input_batch_size * num_return_sequences + internal_batch_size = ( + input_batch_size * num_beams if num_beams > 1 
else input_batch_size * num_return_sequences + ) seq_length = getattr(self.model_tester, "seq_length", None) seq_length = getattr(self.model_tester, "encoder_seq_length", seq_length) From 81443f5b7e170928bf6930e3427cb7da8f4e56bf Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 13:30:25 +0000 Subject: [PATCH 15/16] fix another flaky --- tests/generation/test_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 96348b1c51db..bc8d140da595 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -2048,6 +2048,9 @@ def test_assisted_decoding_with_num_logits_to_keep(self): self.skipTest(reason="Stateful models don't support assisted generation") config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1) + # NOTE: assisted generation only works with cache on at the moment. + if not hasattr(config, "use_cache"): + self.skipTest(reason=f"{model_class.__name__} doesn't support caching") config.use_cache = True config.is_decoder = True @@ -2064,7 +2067,6 @@ def test_assisted_decoding_with_num_logits_to_keep(self): "output_scores": True, } - assistant_model.generation_config.assistant_confidence_threshold = None # Setting num_logits_to_keep at 0 keeps all logits (old behavior) with_all_logits = model.generate(**generation_kwargs, **inputs_dict, num_logits_to_keep=0) # By default, num_logits_to_keep is automatically set to 1 if not provided (new behavior) From 4fd197fdf4f716568be279b341ab324d0bedd1dd Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 29 Oct 2024 14:17:25 +0000 Subject: [PATCH 16/16] test_generate_from_inputs_embeds -- handle missing attention mask --- src/transformers/generation/utils.py | 23 ++++++++++++------- .../models/musicgen/modeling_musicgen.py | 4 ++-- .../modeling_musicgen_melody.py | 4 ++-- tests/generation/test_utils.py | 5 +++- 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 46ea679aca50..6e6d5b8bdce7 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -572,27 +572,34 @@ def _maybe_initialize_input_ids_for_generation( def _prepare_attention_mask_for_generation( self, - inputs: torch.Tensor, - pad_token_id: Optional[torch.Tensor], - eos_token_id: Optional[torch.Tensor], + inputs_tensor: torch.Tensor, + generation_config: GenerationConfig, + model_kwargs: Dict[str, Any], ) -> torch.LongTensor: + pad_token_id = generation_config._pad_token_tensor + eos_token_id = generation_config._eos_token_tensor + + # `input_ids` may be present in the model kwargs, instead of being the main input (e.g. 
multimodal model) + if "input_ids" in model_kwargs and model_kwargs["input_ids"].shape[1] > 0: + inputs_tensor = model_kwargs["input_ids"] + # No information for attention mask inference -> return default attention mask - default_attention_mask = torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device) + default_attention_mask = torch.ones(inputs_tensor.shape[:2], dtype=torch.long, device=inputs_tensor.device) if pad_token_id is None: return default_attention_mask - is_input_ids = len(inputs.shape) == 2 and inputs.dtype in [torch.int, torch.long] + is_input_ids = len(inputs_tensor.shape) == 2 and inputs_tensor.dtype in [torch.int, torch.long] if not is_input_ids: return default_attention_mask is_pad_token_in_inputs = (pad_token_id is not None) and ( - isin_mps_friendly(elements=inputs, test_elements=pad_token_id).any() + isin_mps_friendly(elements=inputs_tensor, test_elements=pad_token_id).any() ) is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ~( isin_mps_friendly(elements=eos_token_id, test_elements=pad_token_id).any() ) can_infer_attention_mask = is_pad_token_in_inputs * is_pad_token_not_equal_to_eos_token_id - attention_mask_from_padding = inputs.ne(pad_token_id).long() + attention_mask_from_padding = inputs_tensor.ne(pad_token_id).long() attention_mask = ( attention_mask_from_padding * can_infer_attention_mask + default_attention_mask * ~can_infer_attention_mask @@ -2024,7 +2031,7 @@ def generate( if not kwargs_has_attention_mask and requires_attention_mask and accepts_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor + inputs_tensor, generation_config, model_kwargs ) elif kwargs_has_attention_mask: # TODO (joao): generalize this check with other types of inputs diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index c18e1d1c9d86..109ddfb626d2 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -1562,7 +1562,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - input_ids, generation_config._pad_token_tensor, generation_config._eos_token_tensor + input_ids, generation_config, model_kwargs ) # 5. Prepare `max_length` depending on other stopping criteria. 
@@ -2578,7 +2578,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor + inputs_tensor, generation_config, model_kwargs ) if "encoder_outputs" not in model_kwargs: diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index d2f339afc414..61f2ce414e1d 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -1484,7 +1484,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - input_ids, generation_config._pad_token_tensor, generation_config._eos_token_tensor + input_ids, generation_config, model_kwargs ) # 5. Prepare `max_length` depending on other stopping criteria. @@ -2425,7 +2425,7 @@ def generate( if model_kwargs.get("attention_mask", None) is None and requires_attention_mask: model_kwargs["attention_mask"] = self._prepare_attention_mask_for_generation( - inputs_tensor, generation_config._pad_token_tensor, generation_config._eos_token_tensor + inputs_tensor, generation_config, model_kwargs ) if "encoder_hidden_states" not in model_kwargs: diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index bc8d140da595..545b696d6737 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1612,6 +1612,9 @@ def test_generate_from_inputs_embeds(self, _, num_beams): has_complex_embeds_computation = any( model_name in model_class.__name__.lower() for model_name in ["moshi"] ) + # 3 - `inputs_dict` doesn't contain `attention_mask`. When `attention_mask` is not passed to generate, + # we infer it from `input_ids`. The last test case will fail if there is a pad token in the original input. + missing_attention_mask = "attention_mask" not in inputs_dict # Traditional way of generating text input_ids = inputs_dict.pop("input_ids") @@ -1646,7 +1649,7 @@ def test_generate_from_inputs_embeds(self, _, num_beams): # input_ids is not a required input on most models -- if we don't pass it, the newly generated tokens will # be the same - if not requires_inputs_ids: + if not (requires_inputs_ids or missing_attention_mask): outputs_from_embeds_wo_ids = model.generate( inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict )
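
The reworked `test_generate_from_inputs_embeds` above boils down to the usage pattern below. This is a minimal standalone sketch, not part of the patch itself, assuming a tiny decoder-only checkpoint such as `hf-internal-testing/tiny-random-gpt2` is available; any small causal LM whose input embeddings are a plain lookup table behaves the same way.

# Illustrative sketch of the behaviour the test checks (assumed checkpoint, not from this patch)
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "hf-internal-testing/tiny-random-gpt2"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint).eval()

inputs = tokenizer("Hello world", return_tensors="pt")
input_ids, attention_mask = inputs.input_ids, inputs.attention_mask

# Traditional call, from token ids
out_from_ids = model.generate(
    input_ids, attention_mask=attention_mask, max_new_tokens=5, do_sample=False
)

# Same call, but from embeddings; passing input_ids as well keeps the prompt in the returned sequences
inputs_embeds = model.get_input_embeddings()(input_ids)
out_from_embeds = model.generate(
    input_ids, inputs_embeds=inputs_embeds, attention_mask=attention_mask, max_new_tokens=5, do_sample=False
)

# When the embeddings are a plain lookup of input_ids, greedy decoding should produce the same sequences
print(torch.equal(out_from_ids, out_from_embeds))

When only `inputs_embeds` are passed, `generate` has no token ids left to infer an attention mask from, which is why the test skips the ids-free comparison when `attention_mask` is missing from `inputs_dict`, and why the final patch lets `_prepare_attention_mask_for_generation` fall back to `model_kwargs["input_ids"]` when it is present.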
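
As a quick sanity check on the batch-size arithmetic introduced in `_check_outputs` (patches 12-14), using assumed numbers: with an input batch of 2, `num_return_sequences=2` and `num_beams=3`, `output.sequences` has 2 * 2 = 4 rows, so `input_batch_size = 4 / 2 = 2`, while the per-step scores, attentions and hidden states are produced for `internal_batch_size = 2 * 3 = 6` beam hypotheses. With `num_beams=1` (sampling), the internal batch falls back to `2 * 2 = 4`, i.e. one row per returned sequence.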