From da120271246683f4de4dfad102837126902e067f Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Thu, 18 Sep 2025 14:24:55 +0000 Subject: [PATCH 1/3] update test (and overwrites) --- tests/generation/test_utils.py | 110 ++++++++++++------ tests/models/bamba/test_modeling_bamba.py | 83 +------------ tests/models/blip_2/test_modeling_blip_2.py | 84 ------------- .../falcon_h1/test_modeling_falcon_h1.py | 84 +------------ tests/models/idefics/test_modeling_idefics.py | 89 +++++--------- .../models/imagegpt/test_modeling_imagegpt.py | 4 - .../test_modeling_instructblip.py | 89 -------------- .../test_modeling_instructblipvideo.py | 89 -------------- tests/models/kosmos2/test_modeling_kosmos2.py | 65 +++-------- .../kosmos2_5/test_modeling_kosmos2_5.py | 67 +++-------- .../test_modeling_kyutai_speech_to_text.py | 84 +------------ tests/models/mllama/test_modeling_mllama.py | 19 +++ tests/models/moshi/test_modeling_moshi.py | 72 ++++-------- .../qwen2_audio/test_modeling_qwen2_audio.py | 1 + tests/models/voxtral/test_modeling_voxtral.py | 2 +- tests/models/zamba/test_modeling_zamba.py | 45 ------- tests/models/zamba2/test_modeling_zamba2.py | 45 ------- 17 files changed, 188 insertions(+), 844 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 680002d4600b..af60e3fe02da 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -23,6 +23,7 @@ import unittest import warnings from pathlib import Path +from typing import Optional import numpy as np import pytest @@ -927,32 +928,44 @@ def test_prompt_lookup_decoding_stops_at_eos(self): self.assertTrue(output_prompt_lookup.shape[-1] == 10) @pytest.mark.generate - def test_left_padding_compatibility(self): - # NOTE: left-padding results in small numerical differences. This is expected. - # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + def test_left_padding_compatibility( + self, unpadded_custom_inputs: Optional[dict] = None, padded_custom_inputs: Optional[dict] = None + ): + """ + Tests that adding left-padding yields the same logits as the original input. Exposes arguments for custom + inputs for overwrites, to prevent full rewrites of the test when all we need is model-specific input handling. - # First, filter out models that don't support left padding - # - The model must have generative capabilities - if len(self.all_generative_model_classes) == 0: - self.skipTest(reason="No generative architecture available for this model.") + ! If you overwrite this test, make sure to document why you need to overwrite it ! - # - The model must support padding + NOTE: left-padding results in small numerical differences. This is expected. + See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 + + Args: + unpadded_custom_inputs (`dict`, *optional*): + Used in test overwrites. Custom inputs to add/overwrite over the default test inputs. + padded_custom_inputs (`dict`, *optional*): + Used in test overwrites. Custom inputs to add/overwrite over the padded test input handcrafted in this + test. Commonly used e.g. with multimodal cross attention masks. + """ + + # First, filter out models that don't support left padding + # 1. The model must support padding if not self.has_attentions: self.skipTest(reason="This model doesn't support padding.") - - # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) + # 2. [encoder-decoder] The model must be a decoder-only architecture. 
Encoder-based architectures can use + # right-padding in their (encoder) inputs. Encoder-decoder may use left-padding on their decoder inputs + # [TODO: lift this restriction? technically, we can test padding the decoder inputs.] decoder_only_classes = [] for model_class in self.all_generative_model_classes: config, _ = self.prepare_config_and_inputs_for_generate() - if config.get_text_config(decoder=True).is_encoder_decoder: + if config.is_encoder_decoder: continue else: decoder_only_classes.append(model_class) if len(decoder_only_classes) == 0: self.skipTest(reason="No decoder-only architecture available for this model.") - - # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't - # added support for it yet. We skip these models for now. + # 3. [old models] Decoder-only architectures derived from encoder-decoder models could support it in theory, + # but we haven't added support for it yet. We skip these models for now. has_encoder_attributes = any( attr_name for attr_name in config.to_dict() @@ -963,48 +976,73 @@ def test_left_padding_compatibility(self): reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." ) - # Then, test left-padding - def _prepare_model_kwargs(input_ids, attention_mask, signature): - model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} + # Now we can start testing + unpadded_custom_inputs = unpadded_custom_inputs or {} + padded_custom_inputs = padded_custom_inputs or {} + + def _prepare_model_kwargs(model_inputs, signature): + model_kwargs = {"input_ids": model_inputs["input_ids"], "attention_mask": model_inputs["attention_mask"]} if "position_ids" in signature: - position_ids = torch.cumsum(attention_mask, dim=-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) + position_ids = torch.cumsum(model_inputs["attention_mask"], dim=-1) - 1 + position_ids.masked_fill_(model_inputs["attention_mask"] == 0, 1) model_kwargs["position_ids"] = position_ids if "cache_position" in signature: - cache_position = torch.arange(input_ids.shape[1], device=torch_device) + cache_position = torch.arange(model_inputs["input_ids"].shape[1], device=torch_device) model_kwargs["cache_position"] = cache_position + # forward all other inputs, if they are in the signature + model_kwargs.update({k: v for k, v in model_inputs.items() if k not in model_kwargs and k in signature}) return model_kwargs for model_class in decoder_only_classes: config, inputs_dict = self.prepare_config_and_inputs_for_generate() - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict.get("attention_mask") - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) - model = model_class(config).to(torch_device).eval() signature = inspect.signature(model.forward).parameters.keys() - # no cache as some models require special cache classes to be init outside forward + # No cache to simplify the test (some models need careful init) model.generation_config.use_cache = False + inputs_dict.update(unpadded_custom_inputs) + # special case: an inexistent `attention_mask` is a full mask + inputs_dict["attention_mask"] = inputs_dict.get("attention_mask", None) + if inputs_dict["attention_mask"] is None: + inputs_dict["attention_mask"] = torch.ones_like(inputs_dict["input_ids"]) - # Without padding - model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) - next_logits_wo_padding = model(**model_kwargs).logits[:, -1, :] + # Get output logits from inputs 
without padding + model_kwargs_wo_padding = _prepare_model_kwargs(inputs_dict, signature) + next_logits_wo_padding = model(**model_kwargs_wo_padding).logits[:, -1, :] - # With left-padding (length 32) - # can hardcode pad_token to be 0 as we'll do attn masking anyway - pad_token_id = ( - config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 - ) + # Prepare padding on common inputs (pad length 32) + input_ids = inputs_dict["input_ids"] + attention_mask = inputs_dict["attention_mask"] + token_type_ids = inputs_dict.get("token_type_ids", None) + pad_token_id = getattr(config.get_text_config(decoder=True), "pad_token_id", None) or 0 pad_size = (input_ids.shape[0], 32, *input_ids.shape[2:]) padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id padded_input_ids = torch.cat((padding, input_ids), dim=1) padded_attention_mask = torch.cat( (torch.zeros(pad_size[:2], dtype=input_ids.dtype, device=torch_device), attention_mask), dim=1 ) - model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) - next_logits_with_padding = model(**model_kwargs).logits[:, -1, :] + if token_type_ids is not None: + padded_token_type_ids = torch.cat( + ( + # Assumption: take the first token type id as the padding token type id + torch.ones(pad_size[:2], dtype=input_ids.dtype, device=torch_device) * token_type_ids[0, 0], + token_type_ids, + ), + dim=1, + ) + else: + padded_token_type_ids = None + + # Get output logits from inputs with left-padding (pad length 32) + padded_inputs_dict = copy.deepcopy(inputs_dict) + padded_inputs_dict["input_ids"] = padded_input_ids + padded_inputs_dict["attention_mask"] = padded_attention_mask + if padded_token_type_ids is not None: + padded_inputs_dict["token_type_ids"] = padded_token_type_ids + padded_inputs_dict.update(padded_custom_inputs) + + model_kwargs_with_padding = _prepare_model_kwargs(padded_inputs_dict, signature) + next_logits_with_padding = model(**model_kwargs_with_padding).logits[:, -1, :] # They should result in very similar logits torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5) diff --git a/tests/models/bamba/test_modeling_bamba.py b/tests/models/bamba/test_modeling_bamba.py index c2e7c435dbfa..8c73255f7cd0 100644 --- a/tests/models/bamba/test_modeling_bamba.py +++ b/tests/models/bamba/test_modeling_bamba.py @@ -438,88 +438,11 @@ def test_batching_equivalence(self): super().test_batching_equivalence() self.model_tester.use_input_mask = orig - # essentially the same test in test_utils, just adjustment for rtol for this model @pytest.mark.generate def test_left_padding_compatibility(self): - # NOTE: left-padding results in small numerical differences. This is expected. 
- # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 - - # First, filter out models that don't support left padding - # - The model must have generative capabilities - if len(self.all_generative_model_classes) == 0: - self.skipTest(reason="No generative architecture available for this model.") - - # - The model must support padding - if not self.has_attentions: - self.skipTest(reason="This model doesn't support padding.") - - # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) - decoder_only_classes = [] - for model_class in self.all_generative_model_classes: - config, _ = self.prepare_config_and_inputs_for_generate() - if config.is_encoder_decoder: - continue - else: - decoder_only_classes.append(model_class) - if len(decoder_only_classes) == 0: - self.skipTest(reason="No decoder-only architecture available for this model.") - - # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't - # added support for it yet. We skip these models for now. - has_encoder_attributes = any( - attr_name - for attr_name in config.to_dict() - if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" - ) - if has_encoder_attributes: - self.skipTest( - reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." - ) - - # Then, test left-padding - def _prepare_model_kwargs(input_ids, attention_mask, signature): - model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} - if "position_ids" in signature: - position_ids = torch.cumsum(attention_mask, dim=-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - if "cache_position" in signature: - cache_position = torch.arange(input_ids.shape[-1], device=torch_device) - model_kwargs["cache_position"] = cache_position - return model_kwargs - - for model_class in decoder_only_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - input_ids = inputs_dict["input_ids"] - - # - for left padding we absolutely need to use an all ones - # attention mask, so we do not use the one in inputs_dict - attention_mask = torch.ones_like(input_ids) - - model = model_class(config).to(torch_device).eval() - signature = inspect.signature(model.forward).parameters.keys() - - # no cache as some models require special cache classes to be init outside forward - model.generation_config.use_cache = False - - # Without padding - model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) - next_logits_wo_padding = model(**model_kwargs).logits[:, -1, :] - - # With left-padding (length 32) - # can hardcode pad_token to be 0 as we'll do attn masking anyway - pad_token_id = ( - config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 - ) - pad_size = (input_ids.shape[0], 32) - padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id - padded_input_ids = torch.cat((padding, input_ids), dim=1) - padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) - model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) - next_logits_with_padding = model(**model_kwargs).logits[:, -1, :] - - # They should result in very similar logits - torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5) + # TODO: document why a random 
attention mask causes this test to fail, but a full mask doesn't + unpadded_custom_inputs = {"attention_mask": None} + super().test_left_padding_compatibility(unpadded_custom_inputs=unpadded_custom_inputs) @unittest.skip( "Bamba requires additionally specifying position_ids, seq_idx, and FlashAttentionKwargs for padding-free training." diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 0b3ab74d519c..5667b1a3fe19 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -19,7 +19,6 @@ import unittest import numpy as np -import pytest import requests from parameterized import parameterized @@ -597,89 +596,6 @@ def _check_generate_outputs(self, output, config, use_cache=False, num_return_se output, config, use_cache=use_cache, num_return_sequences=num_return_sequences, num_beams=num_beams ) - # overwrite because BLIP2 cannot generate only from input ids, and requires pixel values in all cases to be present - @pytest.mark.generate - def test_left_padding_compatibility(self): - # NOTE: left-padding results in small numerical differences. This is expected. - # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 - - # First, filter out models that don't support left padding - # - The model must have generative capabilities - if len(self.all_generative_model_classes) == 0: - self.skipTest(reason="No generative architecture available for this model.") - - # - The model must support padding - if not self.has_attentions: - self.skipTest(reason="This model doesn't support padding.") - - # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) - decoder_only_classes = [] - for model_class in self.all_generative_model_classes: - config, _ = self.prepare_config_and_inputs_for_generate() - if config.is_encoder_decoder: - continue - else: - decoder_only_classes.append(model_class) - if len(decoder_only_classes) == 0: - self.skipTest(reason="No decoder-only architecture available for this model.") - - # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't - # added support for it yet. We skip these models for now. - has_encoder_attributes = any( - attr_name - for attr_name in config.to_dict() - if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" - ) - if has_encoder_attributes: - self.skipTest( - reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." 
- ) - - # Then, test left-padding - def _prepare_model_kwargs(input_ids, attention_mask, signature): - model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} - if "position_ids" in signature: - position_ids = torch.cumsum(attention_mask, dim=-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - if "cache_position" in signature: - cache_position = torch.arange(input_ids.shape[-1], device=torch_device) - model_kwargs["cache_position"] = cache_position - return model_kwargs - - for model_class in decoder_only_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict.get("attention_mask") - pixel_values = inputs_dict["pixel_values"] - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) - - model = model_class(config).to(torch_device).eval() - signature = inspect.signature(model.forward).parameters.keys() - - # no cache as some models require special cache classes to be init outside forward - model.generation_config.use_cache = False - - # Without padding - model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) - next_logits_wo_padding = model(**model_kwargs, pixel_values=pixel_values).logits[:, -1, :] - - # With left-padding (length 32) - # can hardcode pad_token to be 0 as we'll do attn masking anyway - pad_token_id = ( - config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 - ) - pad_size = (input_ids.shape[0], 32) - padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id - padded_input_ids = torch.cat((padding, input_ids), dim=1) - padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) - model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) - next_logits_with_padding = model(**model_kwargs, pixel_values=pixel_values).logits[:, -1, :] - - # They should result in very similar logits - torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5) - # this class is based on `T5ModelTester` found in tests/models/t5/test_modeling_t5.py class Blip2TextModelTester: diff --git a/tests/models/falcon_h1/test_modeling_falcon_h1.py b/tests/models/falcon_h1/test_modeling_falcon_h1.py index cc78f7bf7c1d..04ad9e2e1195 100644 --- a/tests/models/falcon_h1/test_modeling_falcon_h1.py +++ b/tests/models/falcon_h1/test_modeling_falcon_h1.py @@ -14,7 +14,6 @@ # limitations under the License. """Testing suite for the PyTorch FalconH1 model.""" -import inspect import unittest import pytest @@ -413,88 +412,11 @@ def test_batching_equivalence(self): super().test_batching_equivalence() self.model_tester.use_input_mask = orig - # essentially the same test in test_utils, just adjustment for rtol for this model @pytest.mark.generate def test_left_padding_compatibility(self): - # NOTE: left-padding results in small numerical differences. This is expected. 
- # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 - - # First, filter out models that don't support left padding - # - The model must have generative capabilities - if len(self.all_generative_model_classes) == 0: - self.skipTest(reason="No generative architecture available for this model.") - - # - The model must support padding - if not self.has_attentions: - self.skipTest(reason="This model doesn't support padding.") - - # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) - decoder_only_classes = [] - for model_class in self.all_generative_model_classes: - config, _ = self.prepare_config_and_inputs_for_generate() - if config.is_encoder_decoder: - continue - else: - decoder_only_classes.append(model_class) - if len(decoder_only_classes) == 0: - self.skipTest(reason="No decoder-only architecture available for this model.") - - # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't - # added support for it yet. We skip these models for now. - has_encoder_attributes = any( - attr_name - for attr_name in config.to_dict() - if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" - ) - if has_encoder_attributes: - self.skipTest( - reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." - ) - - # Then, test left-padding - def _prepare_model_kwargs(input_ids, attention_mask, signature): - model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} - if "position_ids" in signature: - position_ids = torch.cumsum(attention_mask, dim=-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - if "cache_position" in signature: - cache_position = torch.arange(input_ids.shape[-1], device=torch_device) - model_kwargs["cache_position"] = cache_position - return model_kwargs - - for model_class in decoder_only_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - input_ids = inputs_dict["input_ids"] - - # - for left padding we absolutely need to use an all ones - # attention mask, so we do not use the one in inputs_dict - attention_mask = torch.ones_like(input_ids) - - model = model_class(config).to(torch_device).eval() - signature = inspect.signature(model.forward).parameters.keys() - - # no cache as some models require special cache classes to be init outside forward - model.generation_config.use_cache = False - - # Without padding - model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) - next_logits_wo_padding = model(**model_kwargs).logits[:, -1, :] - - # With left-padding (length 32) - # can hardcode pad_token to be 0 as we'll do attn masking anyway - pad_token_id = ( - config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 - ) - pad_size = (input_ids.shape[0], 32) - padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id - padded_input_ids = torch.cat((padding, input_ids), dim=1) - padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) - model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) - next_logits_with_padding = model(**model_kwargs).logits[:, -1, :] - - # They should result in very similar logits - torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5) + # TODO: document why a random 
attention mask causes this test to fail, but a full mask doesn't + unpadded_custom_inputs = {"attention_mask": None} + super().test_left_padding_compatibility(unpadded_custom_inputs=unpadded_custom_inputs) @slow diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 2cf220fd6dfd..472521c01068 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -13,7 +13,6 @@ # limitations under the License. """Testing suite for the PyTorch Idefics model.""" -import inspect import unittest from functools import cached_property @@ -327,7 +326,6 @@ class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMi test_pruning = False test_headmasking = False test_torchscript = False - has_attentions = False # only supports SDOA and thus no attention probs returned def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) @@ -594,6 +592,33 @@ def test_generate_from_random_inputs_embeds( ): pass + @pytest.mark.generate + def test_left_padding_compatibility(self): + # Overwrite -- Idefics needs to prepare `image_attention_mask`, and it must be padded accordingly + _, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + image_attention_mask = inputs_dict["image_attention_mask"] + + pad_size_img = (input_ids.shape[0], 32, image_attention_mask.shape[-1]) + extra_img_mask = torch.zeros(pad_size_img, dtype=image_attention_mask.dtype, device=torch_device) + padded_image_attention_mask = torch.cat([extra_img_mask, image_attention_mask], dim=1) + + # `image_attention_mask` is randomly generated in `prepare_config_and_inputs_for_generate`, and it must match + # its padded version for the test to be valid -- we need to pass both + unpadded_custom_inputs = {"image_attention_mask": image_attention_mask} + padded_custom_inputs = {"image_attention_mask": padded_image_attention_mask} + super().test_left_padding_compatibility( + unpadded_custom_inputs=unpadded_custom_inputs, padded_custom_inputs=padded_custom_inputs + ) + + @unittest.skip(reason="Idefics can't do text-only inference (test filters non-text inputs)") + def test_eager_padding_matches_padding_free_with_position_ids(self): + pass + + @unittest.skip(reason="Idefics can't do text-only inference (test filters non-text inputs)") + def test_sdpa_padding_matches_padding_free_with_position_ids(self): + pass + @require_torch class IdeficsForVisionText2TextTest(IdeficsModelTest, GenerationTesterMixin, unittest.TestCase): @@ -613,66 +638,6 @@ def test_eager_matches_sdpa_inference( ): pass - @pytest.mark.generate - def test_left_padding_compatibility(self): - """Overwrite because IDEFICS needs image attention mask to be also padded""" - # NOTE: left-padding results in small numerical differences. This is expected. 
- # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 - - def _prepare_model_kwargs(input_ids, attention_mask, image_attention_mask, signature): - model_kwargs = { - "input_ids": input_ids, - "attention_mask": attention_mask, - "image_attention_mask": image_attention_mask, - } - if "position_ids" in signature: - position_ids = torch.cumsum(attention_mask, dim=-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - if "cache_position" in signature: - cache_position = torch.arange(input_ids.shape[-1], device=torch_device) - model_kwargs["cache_position"] = cache_position - return model_kwargs - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - input_ids = inputs_dict.pop("input_ids") - attention_mask = inputs_dict.pop("attention_mask") - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) - image_attention_mask = inputs_dict.pop("image_attention_mask", None) - - model = model_class(config).to(torch_device).eval() - signature = inspect.signature(model.forward).parameters.keys() - - # no cache as some models require special cache classes to be init outside forward - model.generation_config.use_cache = False - - # Without padding - model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, image_attention_mask, signature) - next_logits_wo_padding = model(**model_kwargs, **inputs_dict).logits[:, -1, :] - - # With left-padding (length 32) - # can hardcode pad_token to be 0 as we'll do attn masking anyway - pad_token_id = ( - config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 - ) - pad_size = (input_ids.shape[0], 32) - padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id - padded_input_ids = torch.cat((padding, input_ids), dim=1) - padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) - - pad_size_img = (input_ids.shape[0], 32, image_attention_mask.shape[-1]) - extra_img_mask = torch.zeros(pad_size_img, dtype=image_attention_mask.dtype, device=torch_device) - padded_image_attention_mask = torch.cat([extra_img_mask, image_attention_mask], dim=1) - model_kwargs = _prepare_model_kwargs( - padded_input_ids, padded_attention_mask, padded_image_attention_mask, signature - ) - next_logits_with_padding = model(**model_kwargs, **inputs_dict).logits[:, -1, :] - - # They should result in very similar logits - torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5) - @pytest.mark.generate def test_generate_continue_from_past_key_values(self): """Overwrite because IDEFICS needs image attention mask to be also processed""" diff --git a/tests/models/imagegpt/test_modeling_imagegpt.py b/tests/models/imagegpt/test_modeling_imagegpt.py index 1c10ed0797db..9a43671ad975 100644 --- a/tests/models/imagegpt/test_modeling_imagegpt.py +++ b/tests/models/imagegpt/test_modeling_imagegpt.py @@ -316,10 +316,6 @@ def test_forward_signature(self): expected_arg_names = ["input_ids"] self.assertListEqual(arg_names[:1], expected_arg_names) - @unittest.skip(reason="The model doesn't support left padding") # and it's not used enough to be worth fixing :) - def test_left_padding_compatibility(self): - pass - @unittest.skip(reason="Model inputs don't fit test pattern") # and it's not used enough to be worth fixing :) def test_past_key_values_format(self): pass diff --git 
a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 3ce58e4cb24a..17a54da482a2 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -18,7 +18,6 @@ import unittest import numpy as np -import pytest import requests from transformers import ( @@ -566,94 +565,6 @@ def _check_generate_outputs(self, output, config, use_cache=False, num_return_se output, config, use_cache=use_cache, num_return_sequences=num_return_sequences, num_beams=num_beams ) - # overwrite because InstructBLIP cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present - @pytest.mark.generate - def test_left_padding_compatibility(self): - # NOTE: left-padding results in small numerical differences. This is expected. - # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 - - # First, filter out models that don't support left padding - # - The model must have generative capabilities - if len(self.all_generative_model_classes) == 0: - self.skipTest(reason="No generative architecture available for this model.") - - # - The model must support padding - if not self.has_attentions: - self.skipTest(reason="This model doesn't support padding.") - - # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) - decoder_only_classes = [] - for model_class in self.all_generative_model_classes: - config, _ = self.prepare_config_and_inputs_for_generate() - if config.is_encoder_decoder: - continue - else: - decoder_only_classes.append(model_class) - if len(decoder_only_classes) == 0: - self.skipTest(reason="No decoder-only architecture available for this model.") - - # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't - # added support for it yet. We skip these models for now. - has_encoder_attributes = any( - attr_name - for attr_name in config.to_dict() - if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" - ) - if has_encoder_attributes: - self.skipTest( - reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." 
- ) - - # Then, test left-padding - def _prepare_model_kwargs(input_ids, attention_mask, signature): - model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} - if "position_ids" in signature: - position_ids = torch.cumsum(attention_mask, dim=-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - if "cache_position" in signature: - cache_position = torch.arange(input_ids.shape[-1], device=torch_device) - model_kwargs["cache_position"] = cache_position - return model_kwargs - - for model_class in decoder_only_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict.get("attention_mask") - pixel_values = inputs_dict["pixel_values"] - qformer_input_ids = inputs_dict["qformer_input_ids"] - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) - - model = model_class(config).to(torch_device).eval() - signature = inspect.signature(model.forward).parameters.keys() - - # no cache as some models require special cache classes to be init outside forward - model.generation_config.use_cache = False - - # Without padding - model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) - next_logits_wo_padding = model( - **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids - ).logits[:, -1, :] - - # With left-padding (length 32) - # can hardcode pad_token to be 0 as we'll do attn masking anyway - pad_token_id = ( - config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 - ) - pad_size = (input_ids.shape[0], 32) - padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id - padded_input_ids = torch.cat((padding, input_ids), dim=1) - padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) - model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) - next_logits_with_padding = model( - **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids - ).logits[:, -1, :] - - # They should result in very similar logits - torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5) - def test_sdpa_can_dispatch_composite_models(self): """ Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model. diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index a91d31082da9..d6336c8c6840 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -18,7 +18,6 @@ import unittest import numpy as np -import pytest from huggingface_hub import hf_hub_download from transformers import ( @@ -578,94 +577,6 @@ def _check_generate_outputs(self, output, config, use_cache=False, num_return_se output, config, use_cache=use_cache, num_return_sequences=num_return_sequences, num_beams=num_beams ) - # overwrite because InstructBLIPVideo cannot generate only from input ids, and requires `pixel` values and `qformer_input_ids` in all cases to be present - @pytest.mark.generate - def test_left_padding_compatibility(self): - # NOTE: left-padding results in small numerical differences. This is expected. 
- # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 - - # First, filter out models that don't support left padding - # - The model must have generative capabilities - if len(self.all_generative_model_classes) == 0: - self.skipTest(reason="No generative architecture available for this model.") - - # - The model must support padding - if not self.has_attentions: - self.skipTest(reason="This model doesn't support padding.") - - # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) - decoder_only_classes = [] - for model_class in self.all_generative_model_classes: - config, _ = self.prepare_config_and_inputs_for_generate() - if config.is_encoder_decoder: - continue - else: - decoder_only_classes.append(model_class) - if len(decoder_only_classes) == 0: - self.skipTest(reason="No decoder-only architecture available for this model.") - - # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't - # added support for it yet. We skip these models for now. - has_encoder_attributes = any( - attr_name - for attr_name in config.to_dict() - if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" - ) - if has_encoder_attributes: - self.skipTest( - reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." - ) - - # Then, test left-padding - def _prepare_model_kwargs(input_ids, attention_mask, signature): - model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} - if "position_ids" in signature: - position_ids = torch.cumsum(attention_mask, dim=-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - if "cache_position" in signature: - cache_position = torch.arange(input_ids.shape[-1], device=torch_device) - model_kwargs["cache_position"] = cache_position - return model_kwargs - - for model_class in decoder_only_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict.get("attention_mask") - pixel_values = inputs_dict["pixel_values"] - qformer_input_ids = inputs_dict["qformer_input_ids"] - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) - - model = model_class(config).to(torch_device).eval() - signature = inspect.signature(model.forward).parameters.keys() - - # no cache as some models require special cache classes to be init outside forward - model.generation_config.use_cache = False - - # Without padding - model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) - next_logits_wo_padding = model( - **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids - ).logits[:, -1, :] - - # With left-padding (length 32) - # can hardcode pad_token to be 0 as we'll do attn masking anyway - pad_token_id = ( - config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 - ) - pad_size = (input_ids.shape[0], 32) - padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id - padded_input_ids = torch.cat((padding, input_ids), dim=1) - padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) - model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) - next_logits_with_padding = model( - **model_kwargs, pixel_values=pixel_values, qformer_input_ids=qformer_input_ids - 
).logits[:, -1, :] - - # They should result in very similar logits - torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5) - def test_sdpa_can_dispatch_composite_models(self): """ Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model. diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index ac16e62c55f3..38a769229952 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -481,57 +481,24 @@ def test_sdpa_padding_matches_padding_free_with_position_ids(self): @pytest.mark.generate def test_left_padding_compatibility(self): - # Overwrite because Kosmos-2 need to pad pixel values and pad image-attn-mask - - def _prepare_model_kwargs(input_ids, attention_mask, pad_size, signature): - model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} - if "position_ids" in signature: - position_ids = torch.cumsum(attention_mask, dim=-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - if "cache_position" in signature: - cache_position = torch.arange(input_ids.shape[-1], device=torch_device) - model_kwargs["cache_position"] = cache_position - if "image_embeds_position_mask" in signature: - image_embeds_position_mask = torch.zeros_like(input_ids) - image_embeds_position_mask[:, (pad_size + 1) : pad_size + 1 + self.model_tester.latent_query_num] = 1 - model_kwargs["image_embeds_position_mask"] = image_embeds_position_mask - return model_kwargs + # Overwrite -- kosmos2 needs to prepare `image_embeds_position_mask`, and it must be padded accordingly + _, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - input_ids = inputs_dict["input_ids"] - pixel_values = inputs_dict["pixel_values"] - attention_mask = inputs_dict.get("attention_mask") - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) - - model = model_class(config).to(torch_device).eval() - signature = inspect.signature(model.forward).parameters.keys() - - # no cache as some models require special cache classes to be init outside forward - model.generation_config.use_cache = False - - # Without padding - model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, pad_size=0, signature=signature) - next_logits_wo_padding = model(**model_kwargs, pixel_values=pixel_values).logits[:, -1, :] - - # With left-padding (length 32) - # can hardcode pad_token to be 0 as we'll do attn masking anyway - pad_token_id = ( - config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 - ) - pad_size = (input_ids.shape[0], 32) - padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id - padded_input_ids = torch.cat((padding, input_ids), dim=1) - padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) - model_kwargs = _prepare_model_kwargs( - padded_input_ids, padded_attention_mask, pad_size=32, signature=signature + def _prepare_image_embeds_position_mask(input_ids, pad_size): + image_embeds_position_mask = torch.zeros( + input_ids.shape[0], input_ids.shape[1] + pad_size, device=torch_device, dtype=input_ids.dtype ) - next_logits_with_padding = model(**model_kwargs, pixel_values=pixel_values).logits[:, 
-1, :] - - # They should result in very similar logits - torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-3, atol=1e-3) + image_embeds_position_mask[:, (pad_size + 1) : pad_size + 1 + self.model_tester.latent_query_num] = 1 + return image_embeds_position_mask + + # `image_embeds_position_mask` is randomly generated in `prepare_config_and_inputs_for_generate`, and it must + # match its padded version for the test to be valid -- we need to pass both + unpadded_custom_inputs = {"image_embeds_position_mask": _prepare_image_embeds_position_mask(input_ids, 0)} + padded_custom_inputs = {"image_embeds_position_mask": _prepare_image_embeds_position_mask(input_ids, 32)} + super().test_left_padding_compatibility( + unpadded_custom_inputs=unpadded_custom_inputs, padded_custom_inputs=padded_custom_inputs + ) @slow def test_model_from_pretrained(self): diff --git a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py index c2a18cb5b690..b3155915b03d 100644 --- a/tests/models/kosmos2_5/test_modeling_kosmos2_5.py +++ b/tests/models/kosmos2_5/test_modeling_kosmos2_5.py @@ -570,57 +570,24 @@ def test_generate_from_inputs_embeds(self): @pytest.mark.generate def test_left_padding_compatibility(self): - # Overwrite because Kosmos-2.5 need to pad pixel values and pad image-attn-mask - - def _prepare_model_kwargs(input_ids, attention_mask, pad_size, signature): - model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} - if "position_ids" in signature: - position_ids = torch.cumsum(attention_mask, dim=-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - if "cache_position" in signature: - cache_position = torch.arange(input_ids.shape[-1], device=torch_device) - model_kwargs["cache_position"] = cache_position - if "image_embeds_position_mask" in signature: - image_embeds_position_mask = torch.zeros_like(input_ids) - image_embeds_position_mask[:, (pad_size + 1) : pad_size + 1 + self.model_tester.latent_query_num] = 1 - model_kwargs["image_embeds_position_mask"] = image_embeds_position_mask - return model_kwargs - - for model_class in self.all_generative_model_classes: - config, inputs_dict = self.prepare_config_and_inputs_for_generate() - input_ids = inputs_dict["input_ids"] - flattened_patches = inputs_dict["flattened_patches"] - attention_mask = inputs_dict.get("attention_mask") - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) - - model = model_class(config).to(torch_device).eval() - signature = inspect.signature(model.forward).parameters.keys() - - # no cache as some models require special cache classes to be init outside forward - model.generation_config.use_cache = False - - # Without padding - model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, pad_size=0, signature=signature) - next_logits_wo_padding = model(**model_kwargs, flattened_patches=flattened_patches).logits[:, -1, :] - - # With left-padding (length 32) - # can hardcode pad_token to be 0 as we'll do attn masking anyway - pad_token_id = ( - config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 - ) - pad_size = (input_ids.shape[0], 32) - padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id - padded_input_ids = torch.cat((padding, input_ids), dim=1) - padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) - model_kwargs = _prepare_model_kwargs( - 
padded_input_ids, padded_attention_mask, pad_size=32, signature=signature - ) - next_logits_with_padding = model(**model_kwargs, flattened_patches=flattened_patches).logits[:, -1, :] + # Overwrite -- Kosmos-2.5 needs to prepare `image_embeds_position_mask`, and it must be padded accordingly + _, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] - # They should result in very similar logits - self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-3)) + def _prepare_image_embeds_position_mask(input_ids, pad_size): + image_embeds_position_mask = torch.zeros( + input_ids.shape[0], input_ids.shape[1] + pad_size, device=torch_device, dtype=input_ids.dtype + ) + image_embeds_position_mask[:, (pad_size + 1) : pad_size + 1 + self.model_tester.latent_query_num] = 1 + return image_embeds_position_mask + + # `image_embeds_position_mask` is randomly generated in `prepare_config_and_inputs_for_generate`, and it must + # match its padded version for the test to be valid -- we need to pass both + unpadded_custom_inputs = {"image_embeds_position_mask": _prepare_image_embeds_position_mask(input_ids, 0)} + padded_custom_inputs = {"image_embeds_position_mask": _prepare_image_embeds_position_mask(input_ids, 32)} + super().test_left_padding_compatibility( + unpadded_custom_inputs=unpadded_custom_inputs, padded_custom_inputs=padded_custom_inputs + ) @require_vision diff --git a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py index b7c4537006dd..5223720c639d 100644 --- a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py +++ b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py @@ -14,7 +14,6 @@ """Testing suite for the PyTorch Moshi ASR model.""" import gc -import inspect import tempfile import unittest @@ -361,86 +360,9 @@ def test_disk_offload_safetensors(self): @pytest.mark.generate def test_left_padding_compatibility(self): - # NOTE: left-padding results in small numerical differences. This is expected. - # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 - - # First, filter out models that don't support left padding - # - The model must have generative capabilities - if len(self.all_generative_model_classes) == 0: - self.skipTest(reason="No generative architecture available for this model.") - - # - The model must support padding - if not self.has_attentions: - self.skipTest(reason="This model doesn't support padding.") - - # - The model must be a decoder-only architecture (encoder-based architectures use right-padding) - decoder_only_classes = [] - for model_class in self.all_generative_model_classes: - config, _ = self.prepare_config_and_inputs_for_generate() - if config.is_encoder_decoder: - continue - else: - decoder_only_classes.append(model_class) - if len(decoder_only_classes) == 0: - self.skipTest(reason="No decoder-only architecture available for this model.") - - # - Decoder-only architectures derived from encoder-decoder models could support it in theory, but we haven't - # added support for it yet. We skip these models for now. 
- has_encoder_attributes = any( - attr_name - for attr_name in config.to_dict() - if attr_name.startswith("encoder") and attr_name != "encoder_no_repeat_ngram_size" - ) - if has_encoder_attributes: - self.skipTest( - reason="The decoder-only derived from encoder-decoder models are not expected to support left-padding." - ) - - # Then, test left-padding - def _prepare_model_kwargs(input_ids, attention_mask, signature): - model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} - if "position_ids" in signature: - position_ids = torch.cumsum(attention_mask, dim=-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - if "cache_position" in signature: - cache_position = torch.arange(input_ids.shape[1], device=torch_device) - model_kwargs["cache_position"] = cache_position - return model_kwargs - - for model_class in decoder_only_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict["input_ids"] - attention_mask = inputs_dict.get("attention_mask") - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) - - model = model_class(config).to(torch_device).eval() - signature = inspect.signature(model.forward).parameters.keys() - - # no cache as some models require special cache classes to be init outside forward - model.generation_config.use_cache = False - - # Without padding - model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) - next_logits_wo_padding = model(**model_kwargs).logits[:, -1, :] - - # With left-padding (length 32) - # can hardcode pad_token to be 0 as we'll do attn masking anyway - pad_token_id = ( - config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 - ) - pad_size = (input_ids.shape[0], 32, *input_ids.shape[2:]) - padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id - padded_input_ids = torch.cat((padding, input_ids), dim=1) - padded_attention_mask = torch.cat( - (torch.zeros(pad_size[:2], dtype=input_ids.dtype, device=torch_device), attention_mask), dim=1 - ) - model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) - next_logits_with_padding = model(**model_kwargs).logits[:, -1, :] - - # They should result in very similar logits - torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5) + # TODO: this tester has non-standard input mokey-patching ☠️ + unpadded_custom_inputs = self.model_tester.prepare_config_and_inputs_for_common()[1] + super().test_left_padding_compatibility(unpadded_custom_inputs=unpadded_custom_inputs) def test_generate_continue_from_past_key_values(self): # Tests that we can continue generating from past key values, returned from a previous `generate` call diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index ca5579ecb058..3bf2a5f37c53 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -505,6 +505,25 @@ def test_generate_text_only_with_cache(self): model.generate(input_ids, use_cache=True) + @pytest.mark.generate + def test_left_padding_compatibility(self): + # Overwrite -- mllama needs to prepare `cross_attention_mask`, and it must be padded accordingly + _, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + cross_attention_mask = inputs_dict["cross_attention_mask"] + + 
pad_cross_attn_size = (input_ids.shape[0], 32, *cross_attention_mask.shape[2:]) + extra_cross_attn_mask = torch.zeros(pad_cross_attn_size, dtype=cross_attention_mask.dtype, device=torch_device) + padded_cross_attention_mask = torch.cat([extra_cross_attn_mask, cross_attention_mask], dim=1) + + # `cross_attention_mask` is randomly generated in `prepare_config_and_inputs_for_generate`, and it must match + # its padded version for the test to be valid -- we need to pass both + unpadded_custom_inputs = {"cross_attention_mask": cross_attention_mask} + padded_custom_inputs = {"cross_attention_mask": padded_cross_attention_mask} + super().test_left_padding_compatibility( + unpadded_custom_inputs=unpadded_custom_inputs, padded_custom_inputs=padded_custom_inputs + ) + @require_torch class MllamaForConditionalGenerationIntegrationTest(unittest.TestCase): diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index 21f56e1bc56d..d4815a140d69 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -629,54 +629,30 @@ def test_sdpa_can_compile_dynamic(self): @pytest.mark.generate def test_left_padding_compatibility(self): - # NOTE: left-padding results in small numerical differences. This is expected. - # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 - - # Then, test left-padding - - for model_class in self.all_generative_model_classes: - config, input_ids, attention_mask, input_dict = self._get_input_ids_and_config() - model = model_class(config).to(torch_device).eval() - - # no cache as some models require special cache classes to be init outside forward - model.generation_config.use_cache = False - - # Without padding - next_logits_wo_padding = model(input_ids=input_ids, attention_mask=attention_mask, **input_dict).logits[ - :, -1, : - ] - - # With left-padding (length 32) - # can hardcode pad_token to be 0 as we'll do attn masking anyway - pad_token_id = ( - config.get_text_config().pad_token_id if config.get_text_config().pad_token_id is not None else 0 - ) - pad_size = (input_ids.shape[0], 32) - padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * pad_token_id - padded_input_ids = torch.cat((padding, input_ids), dim=1) - - padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) - - padding = ( - torch.ones( - (pad_size[0], self.model_tester.num_codebooks, 32), dtype=input_ids.dtype, device=torch_device - ) - * config.audio_vocab_size - ) - padded_moshi_audio_codes = torch.cat((padding, input_dict["moshi_audio_codes"]), dim=2) - padded_user_audio_codes = torch.cat((padding, input_dict["user_audio_codes"]), dim=2) - - model_kwargs = { - "input_ids": padded_input_ids, - "attention_mask": padded_attention_mask, - "moshi_audio_codes": padded_moshi_audio_codes, - "user_audio_codes": padded_user_audio_codes, - } - - next_logits_with_padding = model(**model_kwargs).logits[:, -1, :] - - # They should result in very similar logits - torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=1e-5, atol=1e-5) + # Overwrite -- Moshi needs to prepare the audio codes, and they must be padded accordingly + config, inputs_dict = self.prepare_config_and_inputs_for_generate() + input_ids = inputs_dict["input_ids"] + moshi_audio_codes = inputs_dict["moshi_audio_codes"] + user_audio_codes = inputs_dict["user_audio_codes"] + + pad_size = (input_ids.shape[0], 32) + padding = ( + torch.ones((pad_size[0], 
self.model_tester.num_codebooks, 32), dtype=input_ids.dtype, device=torch_device) + * config.audio_vocab_size + ) + padded_moshi_audio_codes = torch.cat((padding, moshi_audio_codes), dim=2) + padded_user_audio_codes = torch.cat((padding, user_audio_codes), dim=2) + + # the audio codes are randomly generated in `prepare_config_and_inputs_for_generate`, and they must match + # their padded version for the test to be valid -- we need to pass both + unpadded_custom_inputs = {"moshi_audio_codes": moshi_audio_codes, "user_audio_codes": user_audio_codes} + padded_custom_inputs = { + "moshi_audio_codes": padded_moshi_audio_codes, + "user_audio_codes": padded_user_audio_codes, + } + super().test_left_padding_compatibility( + unpadded_custom_inputs=unpadded_custom_inputs, padded_custom_inputs=padded_custom_inputs + ) @slow @is_flaky(max_attempts=5, description="flaky on some models.") diff --git a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py index b1f809892c8f..538353fee44d 100644 --- a/tests/models/qwen2_audio/test_modeling_qwen2_audio.py +++ b/tests/models/qwen2_audio/test_modeling_qwen2_audio.py @@ -63,6 +63,7 @@ def __init__( "use_labels": True, "use_mrope": False, "vocab_size": 99, + "pad_token_id": 1, # can't be the same as the audio token id }, is_training=True, audio_config={ diff --git a/tests/models/voxtral/test_modeling_voxtral.py b/tests/models/voxtral/test_modeling_voxtral.py index 123bec730f4e..d6662ebd5532 100644 --- a/tests/models/voxtral/test_modeling_voxtral.py +++ b/tests/models/voxtral/test_modeling_voxtral.py @@ -59,7 +59,7 @@ def __init__( "use_mrope": False, "vocab_size": 99, "head_dim": 8, - "pad_token_id": 0, + "pad_token_id": 1, # can't be the same as the audio token id }, is_training=True, audio_config={ diff --git a/tests/models/zamba/test_modeling_zamba.py b/tests/models/zamba/test_modeling_zamba.py index b601b280558b..070e84733092 100644 --- a/tests/models/zamba/test_modeling_zamba.py +++ b/tests/models/zamba/test_modeling_zamba.py @@ -480,51 +480,6 @@ def _get_input_ids_and_config(self): ) = config_and_inputs return config, input_ids, input_mask - def test_left_padding_compatibility(self): - r""" - Overriding the test_left_padding_compatibility test as the mamba layers accentuate the numerical differences - effect of the left padding discussed in the issue in the note. Using a more permissive tolerance value. - """ - import inspect - # NOTE: left-padding results in small numerical differences. This is expected. - # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 - - # First, filter out models that don't support left padding - generative and decoder-only. 
- # Zamba is a decoder-only architecture - decoder_only_classes = self.all_generative_model_classes - - # Then, test left-padding - def _prepare_model_kwargs(input_ids, attention_mask, signature): - model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} - if "position_ids" in signature: - position_ids = torch.cumsum(attention_mask, dim=-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - if "cache_position" in signature: - cache_position = torch.arange(input_ids.shape[-1], device=torch_device) - model_kwargs["cache_position"] = cache_position - return model_kwargs - - for model_class in decoder_only_classes: - config, input_ids, attention_mask = self._get_input_ids_and_config() - model = model_class(config).to(torch_device).eval() - signature = inspect.signature(model.forward).parameters.keys() - - # Without padding - model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) - next_logits_wo_padding = model(**model_kwargs).logits[:, -1, :] - - # With left-padding (length 32) - pad_size = (input_ids.shape[0], 32) - padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * config.pad_token_id - padded_input_ids = torch.cat((padding, input_ids), dim=1) - padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) - model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) - next_logits_with_padding = model(**model_kwargs).logits[:, -1, :] - - # They should result in very similar logits - torch.testing.assert_close(next_logits_wo_padding, next_logits_with_padding, rtol=3e-3, atol=3e-3) - @require_flash_attn @require_torch_gpu @require_bitsandbytes diff --git a/tests/models/zamba2/test_modeling_zamba2.py b/tests/models/zamba2/test_modeling_zamba2.py index c6921297d6e7..99c6f5fc53d9 100644 --- a/tests/models/zamba2/test_modeling_zamba2.py +++ b/tests/models/zamba2/test_modeling_zamba2.py @@ -499,51 +499,6 @@ def _get_input_ids_and_config(self): ) = config_and_inputs return config, input_ids, input_mask - def test_left_padding_compatibility(self): - r""" - Overriding the test_left_padding_compatibility test as the mamba layers accentuate the numerical differences - effect of the left padding discussed in the issue in the note. Using a more permissive tolerance value. - """ - import inspect - # NOTE: left-padding results in small numerical differences. This is expected. - # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 - - # First, filter out models that don't support left padding - generative and decoder-only. 
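Likewise, the removed Zamba2 overwrite that continues below existed mainly for its relaxed comparison (its docstring notes that the mamba layers accentuate the numerical effect of left-padding). A small, self-contained illustration of what the relaxed `atol=3e-3` check accepts, using made-up values:

    import torch

    a = torch.tensor([1.0000, 2.0000])
    b = torch.tensor([1.0020, 1.9990])
    print(torch.allclose(a, b, atol=3e-3))  # True: differences of ~2e-3 pass the relaxed check
    print(torch.allclose(a, b))             # False: the defaults (rtol=1e-5, atol=1e-8) are far stricter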
- # Zamba2 is a decoder-only architecture - decoder_only_classes = self.all_generative_model_classes - - # Then, test left-padding - def _prepare_model_kwargs(input_ids, attention_mask, signature): - model_kwargs = {"input_ids": input_ids, "attention_mask": attention_mask} - if "position_ids" in signature: - position_ids = torch.cumsum(attention_mask, dim=-1) - 1 - position_ids.masked_fill_(attention_mask == 0, 1) - model_kwargs["position_ids"] = position_ids - if "cache_position" in signature: - cache_position = torch.arange(input_ids.shape[-1], device=torch_device) - model_kwargs["cache_position"] = cache_position - return model_kwargs - - for model_class in decoder_only_classes: - config, input_ids, attention_mask = self._get_input_ids_and_config() - model = model_class(config).to(torch_device).eval() - signature = inspect.signature(model.forward).parameters.keys() - - # Without padding - model_kwargs = _prepare_model_kwargs(input_ids, attention_mask, signature) - next_logits_wo_padding = model(**model_kwargs).logits[:, -1, :] - - # With left-padding (length 32) - pad_size = (input_ids.shape[0], 32) - padding = torch.ones(pad_size, dtype=input_ids.dtype, device=torch_device) * config.pad_token_id - padded_input_ids = torch.cat((padding, input_ids), dim=1) - padded_attention_mask = torch.cat((torch.zeros_like(padding), attention_mask), dim=1) - model_kwargs = _prepare_model_kwargs(padded_input_ids, padded_attention_mask, signature) - next_logits_with_padding = model(**model_kwargs).logits[:, -1, :] - - # They should result in very similar logits - self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=3e-3)) - @require_flash_attn @require_torch_gpu @require_bitsandbytes From 6ecd0fa59fa491f3e18724ce83339a88ef997489 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Thu, 18 Sep 2025 15:57:31 +0000 Subject: [PATCH 2/3] better test comment --- .../test_modeling_kyutai_speech_to_text.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py index 5223720c639d..95db22036544 100644 --- a/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py +++ b/tests/models/kyutai_speech_to_text/test_modeling_kyutai_speech_to_text.py @@ -360,7 +360,9 @@ def test_disk_offload_safetensors(self): @pytest.mark.generate def test_left_padding_compatibility(self): - # TODO: this tester has non-standard input mokey-patching ☠️ + # TODO: this tester has non-standard input monkey-patching in `prepare_config_and_inputs_for_generate`, + # and the test fails with the monkey-patched test inputs (bad shapes for the test) ☠️ The base inputs work + # fine, though. 
unpadded_custom_inputs = self.model_tester.prepare_config_and_inputs_for_common()[1] super().test_left_padding_compatibility(unpadded_custom_inputs=unpadded_custom_inputs) From acd336d2b804d816b4a467aae063cad20f29b7da Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Thu, 18 Sep 2025 16:00:04 +0000 Subject: [PATCH 3/3] 0 as a default for --- tests/generation/test_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index af60e3fe02da..784c37fa883e 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1024,8 +1024,8 @@ def _prepare_model_kwargs(model_inputs, signature): if token_type_ids is not None: padded_token_type_ids = torch.cat( ( - # Assumption: take the first token type id as the padding token type id - torch.ones(pad_size[:2], dtype=input_ids.dtype, device=torch_device) * token_type_ids[0, 0], + # Assumption: `0` is a good default value for padding token type ids + torch.zeros(pad_size[:2], dtype=input_ids.dtype, device=torch_device), token_type_ids, ), dim=1,
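To make the behavioural change in the hunk above concrete, here is a small, self-contained comparison of the two `token_type_ids` padding strategies (tensors are made up; in the test the padded positions are attention-masked either way):

    import torch

    token_type_ids = torch.tensor([[1, 1, 0, 0]])  # e.g. segment A tokens followed by segment B tokens
    pad_size = (1, 3)

    # Previous behaviour: reuse the first token's type id as the padding value.
    old = torch.cat((torch.ones(pad_size, dtype=torch.long) * token_type_ids[0, 0], token_type_ids), dim=1)

    # New behaviour: always pad with type id 0.
    new = torch.cat((torch.zeros(pad_size, dtype=torch.long), token_type_ids), dim=1)

    print(old)  # tensor([[1, 1, 1, 1, 1, 0, 0]])
    print(new)  # tensor([[0, 0, 0, 1, 1, 0, 0]])

Presumably the motivation is that `0` is always a valid type id, whereas reusing `token_type_ids[0, 0]` silently assumed the first token's type is a sensible padding value.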