diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 310c42222e9d..07a5375be95e 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -3063,7 +3063,7 @@ def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None): raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") if gradient_checkpointing_kwargs is None: - gradient_checkpointing_kwargs = {"use_reentrant": True} + gradient_checkpointing_kwargs = {"use_reentrant": False} gradient_checkpointing_func = functools.partial(checkpoint, **gradient_checkpointing_kwargs) diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py index f607f41dd1dd..4ac1c4b6b357 100644 --- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py @@ -487,6 +487,8 @@ def forward( x_branch = x_branch.unsqueeze(-1) self.conv1d_state = conv_state[:, :, 1:] else: + self.conv1d_state = None + self.rg_lru.recurrent_states = None x_branch = self.conv_1d(x_branch)[..., :seq_len] x_branch = self.rg_lru(x_branch.transpose(1, 2), position_ids) diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index 9b4e877c30f4..863623d2f3e0 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -344,24 +344,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="ALIGN does not use inputs_embeds") diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 4b7e8bc0ad01..d6ddf36295fd 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -172,24 +172,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly 
when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="AltCLIPVisionModel use the same cv backbone with CLIP model.") @@ -309,24 +305,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass def test_model_outputs_equivalence(self): diff --git a/tests/models/aria/test_modeling_aria.py b/tests/models/aria/test_modeling_aria.py index 126da4be3b00..9813f79ba95b 100644 --- a/tests/models/aria/test_modeling_aria.py +++ b/tests/models/aria/test_modeling_aria.py @@ -15,6 +15,7 @@ import unittest +import pytest import requests from transformers import ( @@ -197,23 +198,23 @@ def setUp(self): self.model_tester = AriaVisionText2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=AriaConfig, has_text_modality=False) - @unittest.skip( + @pytest.mark.xfail( reason="This architecture seems to not compute gradients for the last vision-layernorm because the model uses hidden states pre-norm" ) def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( + @pytest.mark.xfail( reason="This architecture seems to not compute gradients for the last vision-layernorm because the model uses hidden states pre-norm" ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass + def test_training_gradient_checkpointing_use_reentrant_false(self): + super().test_training_gradient_checkpointing_use_reentrant_false() - @unittest.skip( + @pytest.mark.xfail( reason="This architecture seems to not compute gradients for the last vision-layernorm because the model uses hidden states pre-norm" ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + def test_training_gradient_checkpointing_use_reentrant_true(self): + 
super().test_training_gradient_checkpointing_use_reentrant_true() SKIP = False diff --git a/tests/models/autoformer/test_modeling_autoformer.py b/tests/models/autoformer/test_modeling_autoformer.py index fd2345f3e94e..0a96be5f73cb 100644 --- a/tests/models/autoformer/test_modeling_autoformer.py +++ b/tests/models/autoformer/test_modeling_autoformer.py @@ -17,6 +17,7 @@ import tempfile import unittest +import pytest from huggingface_hub import hf_hub_download from transformers import is_torch_available @@ -242,23 +243,17 @@ def test_encoder_decoder_model_standalone(self): def test_resize_tokens_embeddings(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() # # Input is 'static_categorical_features' not 'input_ids' def test_model_main_input_name(self): diff --git a/tests/models/aya_vision/test_modeling_aya_vision.py b/tests/models/aya_vision/test_modeling_aya_vision.py index 2fb3dc27cebb..b259255d022f 100644 --- a/tests/models/aya_vision/test_modeling_aya_vision.py +++ b/tests/models/aya_vision/test_modeling_aya_vision.py @@ -184,17 +184,17 @@ def test_config(self): def test_training(self): pass - @unittest.skip(reason="SiglipVisionModel does not support standalone training") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="SiglipVisionModel does not support standalone training") - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip(reason="SiglipVisionModel does not support standalone training") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @unittest.skip(reason="Compile not yet supported because in LLava models") @pytest.mark.torch_compile_test diff --git a/tests/models/beit/test_modeling_beit.py b/tests/models/beit/test_modeling_beit.py index af2c8e65b581..417baf5dcf41 100644 
--- a/tests/models/beit/test_modeling_beit.py +++ b/tests/models/beit/test_modeling_beit.py @@ -340,7 +340,7 @@ def test_training(self): loss = model(**inputs).loss loss.backward() - def test_training_gradient_checkpointing(self): + def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.model_tester.is_training: self.skipTest(reason="model_tester.is_training is set to False") @@ -362,25 +362,13 @@ def test_training_gradient_checkpointing(self): continue model = model_class(config) - model.gradient_checkpointing_enable() + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) model.to(torch_device) model.train() inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) loss = model(**inputs).loss loss.backward() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @slow def test_model_from_pretrained(self): model_name = "microsoft/beit-base-patch16-224" diff --git a/tests/models/big_bird/test_modeling_big_bird.py b/tests/models/big_bird/test_modeling_big_bird.py index 2af061f4ba3d..1646f94c65b4 100644 --- a/tests/models/big_bird/test_modeling_big_bird.py +++ b/tests/models/big_bird/test_modeling_big_bird.py @@ -15,6 +15,8 @@ import unittest +import pytest + from transformers import BigBirdConfig, is_torch_available from transformers.models.auto import get_values from transformers.models.big_bird.tokenization_big_bird import BigBirdTokenizer @@ -579,23 +581,17 @@ def test_for_change_to_full_attn(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_change_to_full_attn(*config_and_inputs) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @require_torch diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index 
5a81b45f71a7..acfe8f405a53 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -188,24 +188,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @slow @@ -325,22 +321,6 @@ def test_model(self): def test_training(self): pass - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @unittest.skip(reason="Blip does not use inputs_embeds") def test_inputs_embeds(self): pass @@ -832,7 +812,7 @@ def test_training(self): loss = model(**inputs).loss loss.backward() - def test_training_gradient_checkpointing(self): + def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): if not self.model_tester.is_training: self.skipTest(reason="ModelTester is not setup for training") @@ -843,7 +823,7 @@ def test_training_gradient_checkpointing(self): model = model_class(config) model.to(torch_device) - model.gradient_checkpointing_enable() + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) model.train() inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) @@ -853,18 +833,6 @@ def test_training_gradient_checkpointing(self): loss = model(**inputs).loss loss.backward() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - def test_load_vision_text_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -960,7 +928,7 @@ def test_training(self): loss = model(**inputs).loss loss.backward() - 
def test_training_gradient_checkpointing(self): + def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): if not self.model_tester.is_training: self.skipTest(reason="ModelTester is not setup for training") @@ -971,7 +939,7 @@ def test_training_gradient_checkpointing(self): model = model_class(config) model.to(torch_device) - model.gradient_checkpointing_enable() + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) model.train() inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) diff --git a/tests/models/blip/test_modeling_blip_text.py b/tests/models/blip/test_modeling_blip_text.py index 52e597b32ecc..129a8b233b00 100644 --- a/tests/models/blip/test_modeling_blip_text.py +++ b/tests/models/blip/test_modeling_blip_text.py @@ -141,22 +141,6 @@ def test_model(self): def test_training(self): pass - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @unittest.skip(reason="Blip does not use inputs_embeds") def test_inputs_embeds(self): pass diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 2448d3221cf7..b3c660fc726f 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -195,24 +195,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @slow @@ -1086,10 +1082,6 @@ def test_model(self): def test_training(self): pass - @unittest.skip(reason="Training is not yet supported") - def test_training_gradient_checkpointing(self): - pass - @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass @@ -1237,20 +1229,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="Training is not yet 
supported") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="Training is not yet supported") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="Training is not yet supported") - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="Training is not yet supported") - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="Blip2VisionModelWithProjection does not use inputs_embeds") @@ -1476,11 +1468,11 @@ def test_training_gradient_checkpointing(self): pass @unittest.skip(reason="Training is not yet supported") - def test_training_gradient_checkpointing_use_reentrant(self): + def test_training_gradient_checkpointing_use_reentrant_false(self): pass @unittest.skip(reason="Training is not yet supported") - def test_training_gradient_checkpointing_use_reentrant_false(self): + def test_training_gradient_checkpointing_use_reentrant_true(self): pass diff --git a/tests/models/canine/test_modeling_canine.py b/tests/models/canine/test_modeling_canine.py index ac7fe07b64fa..67174e159f67 100644 --- a/tests/models/canine/test_modeling_canine.py +++ b/tests/models/canine/test_modeling_canine.py @@ -15,6 +15,8 @@ import unittest +import pytest + from transformers import CanineConfig, is_torch_available from transformers.testing_utils import require_torch, slow, torch_device @@ -450,23 +452,17 @@ def test_inputs_embeds_matches_input_ids(self): def test_model_get_set_embeddings(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @slow def test_model_from_pretrained(self): diff --git a/tests/models/chinese_clip/test_modeling_chinese_clip.py b/tests/models/chinese_clip/test_modeling_chinese_clip.py index f48e8c39e02b..fd3039d8b027 100644 --- a/tests/models/chinese_clip/test_modeling_chinese_clip.py +++ b/tests/models/chinese_clip/test_modeling_chinese_clip.py @@ -375,24 +375,20 @@ 
def test_model_from_pretrained(self): model = ChineseCLIPTextModel.from_pretrained(model_name) self.assertIsNotNone(model) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @@ -449,22 +445,6 @@ def test_model(self): def test_training(self): pass - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @slow def test_model_from_pretrained(self): model_name = "OFA-Sys/chinese-clip-vit-base-patch16" diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 4327f0a15878..3791011c7ee3 100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -242,22 +242,6 @@ def test_model_with_projection(self): def test_training(self): pass - @unittest.skip(reason="ClapAudioModel does not output any loss term in the forward pass") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @slow def test_model_from_pretrained(self): model_name = "laion/clap-htsat-fused" @@ -396,22 +380,6 @@ def test_model_with_projection(self): def test_training(self): pass - @unittest.skip(reason="ClapTextModel does not output any loss term in the forward pass") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: 
https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @unittest.skip(reason="ClapTextModel does not use inputs_embeds") def test_inputs_embeds(self): pass diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index 4ce1b9faa6aa..5cab257ecb63 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -251,24 +251,20 @@ def test_model_with_projection(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_projection(*config_and_inputs) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @slow @@ -413,24 +409,20 @@ def test_model_with_projection(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_projection(*config_and_inputs) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="CLIP does not use inputs_embeds") @@ -637,17 +629,17 @@ def test_inputs_embeds(self): def test_model_get_set_embeddings(self): pass - @unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet") - def test_training_gradient_checkpointing_use_reentrant(self): - pass + 
super().test_training_gradient_checkpointing() - @unittest.skip(reason="CLIPForImageClassification does not support gradient checkpointing yet") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) @slow diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index c3dcf643966c..9cdf45062b10 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ b/tests/models/clipseg/test_modeling_clipseg.py @@ -18,6 +18,7 @@ import unittest import numpy as np +import pytest import requests from transformers import CLIPSegConfig, CLIPSegProcessor, CLIPSegTextConfig, CLIPSegVisionConfig @@ -177,24 +178,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @slow @@ -306,24 +303,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): 
pass @unittest.skip(reason="CLIPSeg does not use inputs_embeds") @@ -467,23 +460,23 @@ def test_retain_grad_hidden_states_attentions(self): def test_model_get_set_embeddings(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + @pytest.mark.xfail( + reason="CLIPSegForImageSegmentation does not expose input embeddings. Gradients cannot flow back to the token embeddings when using gradient checkpointing." ) def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + @pytest.mark.xfail( + reason="CLIPSegForImageSegmentation does not expose input embeddings. Gradients cannot flow back to the token embeddings when using gradient checkpointing." ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass + def test_training_gradient_checkpointing_use_reentrant_false(self): + super().test_training_gradient_checkpointing_use_reentrant_false() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + @pytest.mark.xfail( + reason="CLIPSegForImageSegmentation does not expose input embeddings. Gradients cannot flow back to the token embeddings when using gradient checkpointing." ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() def test_load_vision_text_config(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index c4b6417db2c5..650a0c59c6ea 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -183,10 +183,6 @@ def test_model(self): def test_training(self): pass - @unittest.skip(reason="ClvpEncoder does not output loss") - def test_training_gradient_checkpointing(self): - pass - @unittest.skip(reason="ClvpEncoder does not output loss") def test_gradient_checkpointing_enable_disable(self): pass diff --git a/tests/models/colpali/test_modeling_colpali.py b/tests/models/colpali/test_modeling_colpali.py index 3f42e1a28bf0..779cc4fa4d85 100644 --- a/tests/models/colpali/test_modeling_colpali.py +++ b/tests/models/colpali/test_modeling_colpali.py @@ -205,24 +205,6 @@ def test_colpali_forward_inputs(self): self.assertIsInstance(outputs, ColPaliForRetrievalOutput) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @unittest.skip( reason="From PaliGemma: Some undefined behavior 
encountered with test versions of this model. Skip for now." ) diff --git a/tests/models/convnextv2/test_modeling_convnextv2.py b/tests/models/convnextv2/test_modeling_convnextv2.py index 79fd07b098fc..86592528c257 100644 --- a/tests/models/convnextv2/test_modeling_convnextv2.py +++ b/tests/models/convnextv2/test_modeling_convnextv2.py @@ -204,7 +204,7 @@ def test_training(self): loss = model(**inputs).loss loss.backward() - def test_training_gradient_checkpointing(self): + def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): if not self.model_tester.is_training: self.skipTest(reason="ModelTester is not set to test training") @@ -222,7 +222,7 @@ def test_training_gradient_checkpointing(self): model = model_class(config) model.to(torch_device) - model.gradient_checkpointing_enable() + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) model.train() inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) loss = model(**inputs).loss diff --git a/tests/models/deit/test_modeling_deit.py b/tests/models/deit/test_modeling_deit.py index 372b12baed18..d96aebc0fbb3 100644 --- a/tests/models/deit/test_modeling_deit.py +++ b/tests/models/deit/test_modeling_deit.py @@ -292,7 +292,7 @@ def test_training(self): loss = model(**inputs).loss loss.backward() - def test_training_gradient_checkpointing(self): + def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() if not self.model_tester.is_training: self.skipTest(reason="model_tester.is_training is set to False") @@ -307,25 +307,13 @@ def test_training_gradient_checkpointing(self): if model_class.__name__ == "DeiTForImageClassificationWithTeacher": continue model = model_class(config) - model.gradient_checkpointing_enable() + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) model.to(torch_device) model.train() inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) loss = model(**inputs).loss loss.backward() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - def test_problem_types(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/depth_anything/test_modeling_depth_anything.py b/tests/models/depth_anything/test_modeling_depth_anything.py index 4e5576fbb09e..4a07c9ff94d4 100644 --- a/tests/models/depth_anything/test_modeling_depth_anything.py +++ b/tests/models/depth_anything/test_modeling_depth_anything.py @@ -170,28 +170,24 @@ def test_for_depth_estimation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) - @unittest.skip(reason="Depth Anything does not support training yet") - def test_training(self): + @unittest.skip(reason="Depth Anything with AutoBackbone does not have a base model and hence no input_embeddings") + def test_model_get_set_embeddings(self): pass - @unittest.skip(reason="Depth
Anything does not support training yet") - def test_training_gradient_checkpointing(self): + @unittest.skip(reason="Training is not yet supported") + def test_training(self): pass - @unittest.skip(reason="Depth Anything with AutoBackbone does not have a base model and hence no input_embeddings") - def test_model_get_set_embeddings(self): + @unittest.skip(reason="Training is not yet supported") + def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="Training is not yet supported") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="Training is not yet supported") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @slow diff --git a/tests/models/depth_pro/test_modeling_depth_pro.py b/tests/models/depth_pro/test_modeling_depth_pro.py index 7a5e6e679e6b..91f5dcf2b4c9 100644 --- a/tests/models/depth_pro/test_modeling_depth_pro.py +++ b/tests/models/depth_pro/test_modeling_depth_pro.py @@ -268,7 +268,7 @@ def test_training(self): loss = model(**inputs).loss loss.backward() - def test_training_gradient_checkpointing(self): + def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): for model_class in self.all_model_classes: if model_class.__name__ == "DepthProForDepthEstimation": continue @@ -281,24 +281,12 @@ def test_training_gradient_checkpointing(self): continue model = model_class(config) model.to(torch_device) - model.gradient_checkpointing_enable() + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) model.train() inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) loss = model(**inputs).loss loss.backward() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - # this started when switched from normal initialization to kaiming_normal initialization # maybe because the magnitude of offset values from ViT-encoders increases when followed by many convolution layers def test_batching_equivalence(self, atol=1e-4, rtol=1e-4): diff --git a/tests/models/dinov2/test_modeling_dinov2.py b/tests/models/dinov2/test_modeling_dinov2.py index a3624493c5fb..0c5f56ec4d0a 100644 --- a/tests/models/dinov2/test_modeling_dinov2.py +++ b/tests/models/dinov2/test_modeling_dinov2.py @@ -16,6 +16,8 @@ import unittest from functools import cached_property +import pytest + from transformers import Dinov2Config from transformers.testing_utils import ( require_torch, @@ -241,23 +243,17 @@ def test_config(self): def test_inputs_embeds(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, 
check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() def test_model_get_set_embeddings(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py index dc5fc3b12d66..3e6916d3a890 100644 --- a/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py +++ b/tests/models/dinov2_with_registers/test_modeling_dinov2_with_registers.py @@ -16,6 +16,8 @@ import unittest from functools import cached_property +import pytest + from transformers import Dinov2WithRegistersConfig from transformers.testing_utils import ( require_torch, @@ -249,23 +251,17 @@ def test_config(self): def test_inputs_embeds(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() def test_model_get_set_embeddings(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/dinov3_vit/test_modeling_dinov3_vit.py b/tests/models/dinov3_vit/test_modeling_dinov3_vit.py index c5997e97e831..5ed0cf07da6a 100644 --- a/tests/models/dinov3_vit/test_modeling_dinov3_vit.py +++ b/tests/models/dinov3_vit/test_modeling_dinov3_vit.py @@ -215,24 +215,6 @@ 
def test_config(self): def test_inputs_embeds(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - def test_model_get_set_embeddings(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py index 246750893dbf..0c08d1e02b71 100644 --- a/tests/models/dpt/test_modeling_dpt.py +++ b/tests/models/dpt/test_modeling_dpt.py @@ -223,7 +223,7 @@ def test_training(self): loss = model(**inputs).loss loss.backward() - def test_training_gradient_checkpointing(self): + def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): for model_class in self.all_model_classes: if model_class.__name__ == "DPTForDepthEstimation": continue @@ -236,24 +236,12 @@ def test_training_gradient_checkpointing(self): continue model = model_class(config) model.to(torch_device) - model.gradient_checkpointing_enable() + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) model.train() inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) loss = model(**inputs).loss loss.backward() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @unittest.skip(reason="Inductor error for dynamic shape") @pytest.mark.torch_compile_test def test_sdpa_can_compile_dynamic(self): diff --git a/tests/models/dpt/test_modeling_dpt_auto_backbone.py b/tests/models/dpt/test_modeling_dpt_auto_backbone.py index f2c8bd4b5e45..be7c47b7d3e5 100644 --- a/tests/models/dpt/test_modeling_dpt_auto_backbone.py +++ b/tests/models/dpt/test_modeling_dpt_auto_backbone.py @@ -174,7 +174,7 @@ def test_training(self): loss = model(**inputs).loss loss.backward() - def test_training_gradient_checkpointing(self): + def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): for model_class in self.all_model_classes: if model_class.__name__ == "DPTForDepthEstimation": continue @@ -187,7 +187,7 @@ def test_training_gradient_checkpointing(self): continue model = model_class(config) model.to(torch_device) - model.gradient_checkpointing_enable() + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) model.train() inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) loss = model(**inputs).loss @@ -197,18 +197,6 @@ def test_training_gradient_checkpointing(self): def test_model_get_set_embeddings(self): 
pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @slow def test_model_from_pretrained(self): model_name = "Intel/dpt-large" diff --git a/tests/models/dpt/test_modeling_dpt_hybrid.py b/tests/models/dpt/test_modeling_dpt_hybrid.py index f54ab5484f1a..56b85ac9153f 100644 --- a/tests/models/dpt/test_modeling_dpt_hybrid.py +++ b/tests/models/dpt/test_modeling_dpt_hybrid.py @@ -238,7 +238,7 @@ def test_training(self): loss = model(**inputs).loss loss.backward() - def test_training_gradient_checkpointing(self): + def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): for model_class in self.all_model_classes: if model_class.__name__ == "DPTForDepthEstimation": continue @@ -251,24 +251,12 @@ def test_training_gradient_checkpointing(self): continue model = model_class(config) model.to(torch_device) - model.gradient_checkpointing_enable() + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) model.train() inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) loss = model(**inputs).loss loss.backward() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @slow def test_model_from_pretrained(self): model_name = "Intel/dpt-hybrid-midas" diff --git a/tests/models/efficientloftr/test_modeling_efficientloftr.py b/tests/models/efficientloftr/test_modeling_efficientloftr.py index 8db928d00b58..ba05373a9dd9 100644 --- a/tests/models/efficientloftr/test_modeling_efficientloftr.py +++ b/tests/models/efficientloftr/test_modeling_efficientloftr.py @@ -161,20 +161,20 @@ def test_model_get_set_embeddings(self): def test_feed_forward_chunking(self): pass - @unittest.skip(reason="EfficientLoFTRForKeypointMatching is not trainable") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="EfficientLoFTRForKeypointMatching is not trainable") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="EfficientLoFTRForKeypointMatching is not trainable") - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="EfficientLoFTRForKeypointMatching is not trainable") - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="EfficientLoFTR does 
not output any loss term in the forward pass") diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index 501208452474..9bcf54a79de2 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -19,6 +19,7 @@ import unittest import numpy as np +import pytest import requests from transformers import ( @@ -295,24 +296,20 @@ def check_hidden_states_output(inputs_dict, config, model_class): check_hidden_states_output(inputs_dict, config, model_class) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @slow @@ -440,24 +437,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="FLAVA does not use input_embeds") @@ -595,24 +588,20 @@ def test_forward_signature(self): def test_model_get_set_embeddings(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def 
test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="FLAVA does not use input_embeds") @@ -708,32 +697,28 @@ def test_attention_outputs(self): def test_model_get_set_embeddings(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip - def test_hidden_states_output(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="FlavaImageCodebook has no attentions") - def test_retain_grad_hidden_states_attentions(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip - def test_training_gradient_checkpointing(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip + def test_hidden_states_output(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="FlavaImageCodebook has no attentions") + def test_retain_grad_hidden_states_attentions(self): pass @unittest.skip(reason="FLAVA does not use input_embeds") @@ -1090,23 +1075,17 @@ class FlavaForPreTrainingTest(FlavaModelTest): all_model_classes = (FlavaForPreTraining,) if is_torch_available() else () class_for_tester = FlavaForPreTrainingTester - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + 
super().test_training_gradient_checkpointing_use_reentrant_true() # We will verify our results on an image of cute cats diff --git a/tests/models/fnet/test_modeling_fnet.py b/tests/models/fnet/test_modeling_fnet.py index 94eee5c66bf8..caffebd52dc2 100644 --- a/tests/models/fnet/test_modeling_fnet.py +++ b/tests/models/fnet/test_modeling_fnet.py @@ -15,6 +15,8 @@ import unittest +import pytest + from transformers import FNetConfig, is_torch_available from transformers.models.auto import get_values from transformers.testing_utils import require_tokenizers, require_torch, slow, torch_device @@ -290,23 +292,17 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def test_attention_outputs(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() def test_model_outputs_equivalence(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/fuyu/test_modeling_fuyu.py b/tests/models/fuyu/test_modeling_fuyu.py index 732014fb9f62..3ffd29deaa56 100644 --- a/tests/models/fuyu/test_modeling_fuyu.py +++ b/tests/models/fuyu/test_modeling_fuyu.py @@ -196,24 +196,6 @@ def test_mismatching_image_patches(self): with self.assertRaises(ValueError): _ = model(input_ids=input_ids, image_patches=image_patches) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @parameterized.expand([("random",), ("same",)]) @pytest.mark.generate @unittest.skip("Fuyu doesn't support assisted generation due to the need to crop/extend image patches indices") diff --git a/tests/models/gemma3/test_modeling_gemma3.py b/tests/models/gemma3/test_modeling_gemma3.py index a569919e5bd8..d47ce6e1f02a 100644 --- 
a/tests/models/gemma3/test_modeling_gemma3.py +++ b/tests/models/gemma3/test_modeling_gemma3.py @@ -411,17 +411,17 @@ def test_bidirectional_image_attention(self): # We expect a non-causal mask only within same image and no looking ahead to the future self.assertTrue((attention[..., :4, 7:10] == 0).all().item()) - @unittest.skip(reason="SiglipVisionModel (vision backbone) does not support standalone training") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="SiglipVisionModel (vision backbone) does not support standalone training") - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip(reason="SiglipVisionModel (vision backbone) does not support standalone training") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @unittest.skip("Loading nested configs with overwritten `kwargs` isn't supported yet, FIXME @raushan.") def test_load_with_mismatched_shapes(self): diff --git a/tests/models/gemma3n/test_modeling_gemma3n.py b/tests/models/gemma3n/test_modeling_gemma3n.py index 3cb81cc8aaac..82a50ceb0543 100644 --- a/tests/models/gemma3n/test_modeling_gemma3n.py +++ b/tests/models/gemma3n/test_modeling_gemma3n.py @@ -767,18 +767,6 @@ def setUp(self): text_config={"activation_sparsity_pattern": None}, ) - @unittest.skip(reason="SiglipVisionModel (vision backbone) does not support standalone training") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="SiglipVisionModel (vision backbone) does not support standalone training") - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip(reason="SiglipVisionModel (vision backbone) does not support standalone training") - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @unittest.skip( reason="Siglip has no FLEX attention, and we don't have a proper way to set/test attn in VLMs. 
TODO @raushan" ) diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 269747b32359..4fa301c8c9ee 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -165,24 +165,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @slow diff --git a/tests/models/gpt2/test_modeling_gpt2.py b/tests/models/gpt2/test_modeling_gpt2.py index 36b2beb22a79..9fb8e3f29593 100644 --- a/tests/models/gpt2/test_modeling_gpt2.py +++ b/tests/models/gpt2/test_modeling_gpt2.py @@ -268,18 +268,18 @@ def test_training_gradient_checkpointing(self): super().test_training_gradient_checkpointing() self.all_model_classes = self.original_all_model_classes - def test_training_gradient_checkpointing_use_reentrant(self): + def test_training_gradient_checkpointing_use_reentrant_false(self): # overwritten: GPT2DoubleHeadsModel fails this test, non-standard class self.original_all_model_classes = self.all_model_classes self.all_model_classes = (cls for cls in self.all_model_classes if cls.__name__ != "GPT2DoubleHeadsModel") - super().test_training_gradient_checkpointing_use_reentrant() + super().test_training_gradient_checkpointing_use_reentrant_false() self.all_model_classes = self.original_all_model_classes - def test_training_gradient_checkpointing_use_reentrant_false(self): + def test_training_gradient_checkpointing_use_reentrant_true(self): # overwritten: GPT2DoubleHeadsModel fails this test, non-standard class self.original_all_model_classes = self.all_model_classes self.all_model_classes = (cls for cls in self.all_model_classes if cls.__name__ != "GPT2DoubleHeadsModel") - super().test_training_gradient_checkpointing_use_reentrant_false() + super().test_training_gradient_checkpointing_use_reentrant_true() self.all_model_classes = self.original_all_model_classes diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index 7a656e404d13..6565630e0fbf 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ b/tests/models/groupvit/test_modeling_groupvit.py @@ -249,24 +249,20 @@ def test_attention_outputs(self): ], ) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def 
test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass # override since the attention mask from GroupViT is not used to compute loss, thus no grad @@ -437,24 +433,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="GroupViTTextModel does not use inputs_embeds") diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index a69bcc37b5e0..7e734518b5be 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -447,7 +447,7 @@ def test_training(self): loss = model(**inputs).loss loss.backward() - def test_training_gradient_checkpointing(self): + def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): if not self.model_tester.is_training: self.skipTest(reason="model_tester.is_training is set to False") @@ -463,24 +463,12 @@ def test_training_gradient_checkpointing(self): model = model_class(config) model.to(torch_device) - model.gradient_checkpointing_enable() + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) model.train() inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) loss = model(**inputs).loss loss.backward() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: 
https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @unittest.skip(reason="""IDEFICS does not support retaining the gradients of the hidden states and attention""") def test_retain_grad_hidden_states_attentions(self): return @@ -855,18 +843,6 @@ def test_for_token_classification(self): def test_retain_grad_hidden_states_attentions(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @unittest.skip("Idefics has a hard requirement on SDPA") def test_sdpa_can_dispatch_non_composite_models(self): pass diff --git a/tests/models/imagegpt/test_modeling_imagegpt.py b/tests/models/imagegpt/test_modeling_imagegpt.py index 08cf69199ee1..c07c6a05ab63 100644 --- a/tests/models/imagegpt/test_modeling_imagegpt.py +++ b/tests/models/imagegpt/test_modeling_imagegpt.py @@ -275,24 +275,6 @@ def test_imagegpt_image_classification(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_imagegpt_for_image_classification(*config_and_inputs) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @slow def test_model_from_pretrained(self): model_name = "openai/imagegpt-small" diff --git a/tests/models/informer/test_modeling_informer.py b/tests/models/informer/test_modeling_informer.py index 8b4d6c1eabfe..4dfad7ad3e70 100644 --- a/tests/models/informer/test_modeling_informer.py +++ b/tests/models/informer/test_modeling_informer.py @@ -18,6 +18,7 @@ import unittest import numpy as np +import pytest from huggingface_hub import hf_hub_download from transformers import is_torch_available @@ -293,23 +294,17 @@ def test_determinism(self): def test_batching_equivalence(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: 
https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() # # Input is 'static_categorical_features' not 'input_ids' def test_model_main_input_name(self): diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 6a6050abb146..9ae2011015eb 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -193,24 +193,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="InstructBlipVisionModel is an internal building block, doesn't support standalone training") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="InstructBlipVisionModel is an internal building block, doesn't support standalone training") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @slow diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 0046ccb0bf58..784f9f2083a1 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -199,28 +199,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip( - reason="InstructBlipVideoVisionModel is an internal building block, doesn't support standalone training" - ) + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip( - reason="InstructBlipVideoVisionModel is an internal building block, doesn't support standalone training" - ) + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def 
test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @slow diff --git a/tests/models/layoutlm/test_modeling_layoutlm.py b/tests/models/layoutlm/test_modeling_layoutlm.py index 6c3d45bd2acb..cea811b7512e 100644 --- a/tests/models/layoutlm/test_modeling_layoutlm.py +++ b/tests/models/layoutlm/test_modeling_layoutlm.py @@ -13,6 +13,8 @@ # limitations under the License. import unittest +import pytest + from transformers import LayoutLMConfig, is_torch_available from transformers.testing_utils import require_torch, slow, torch_device @@ -271,23 +273,17 @@ def test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() def prepare_layoutlm_batch_inputs(): diff --git a/tests/models/lfm2_vl/test_modeling_lfm2_vl.py b/tests/models/lfm2_vl/test_modeling_lfm2_vl.py index 6391f2ea817b..5b030f5878ac 100644 --- a/tests/models/lfm2_vl/test_modeling_lfm2_vl.py +++ b/tests/models/lfm2_vl/test_modeling_lfm2_vl.py @@ -217,17 +217,17 @@ def test_attention_outputs(self): def test_sdpa_can_compile_dynamic(self): pass - @unittest.skip(reason="Backbone Siglip2VisionModel does not support standalone training") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="Backbone Siglip2VisionModel does not support standalone training") - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip(reason="Backbone Siglip2VisionModel does not support standalone training") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for 
some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @require_torch_accelerator diff --git a/tests/models/lightglue/test_modeling_lightglue.py b/tests/models/lightglue/test_modeling_lightglue.py index 5dc57707b4ac..697f63b82717 100644 --- a/tests/models/lightglue/test_modeling_lightglue.py +++ b/tests/models/lightglue/test_modeling_lightglue.py @@ -161,20 +161,20 @@ def test_model_get_set_embeddings(self): def test_feed_forward_chunking(self): pass - @unittest.skip(reason="LightGlueForKeypointMatching is not trainable") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="LightGlueForKeypointMatching is not trainable") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="LightGlueForKeypointMatching is not trainable") - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="LightGlueForKeypointMatching is not trainable") - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="LightGlue does not output any loss term in the forward pass") diff --git a/tests/models/lilt/test_modeling_lilt.py b/tests/models/lilt/test_modeling_lilt.py index 33230c0128de..4f1a65791c3e 100644 --- a/tests/models/lilt/test_modeling_lilt.py +++ b/tests/models/lilt/test_modeling_lilt.py @@ -15,6 +15,8 @@ import unittest +import pytest + from transformers import LiltConfig, is_torch_available from transformers.testing_utils import require_torch, slow, torch_device @@ -271,23 +273,13 @@ def test_for_question_answering(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_question_answering(*config_and_inputs) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() @slow def test_model_from_pretrained(self): diff --git 
a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 6dfbec25762e..01f345fb2f11 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -16,6 +16,7 @@ import copy import unittest +import pytest import requests from parameterized import parameterized @@ -264,23 +265,17 @@ def test_vision_feature_layers(self, vision_feature_layer): assert base_model.multi_modal_projector.linear_1.in_features == expected_features model(**input_dict) - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @unittest.skip( "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. 
Can be tested as part of LLM test" diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index cb6a6c934803..c93ed8a18e70 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -16,6 +16,7 @@ import copy import unittest +import pytest import requests from huggingface_hub import hf_hub_download from parameterized import parameterized @@ -294,23 +295,17 @@ def test_vision_feature_layers(self, vision_feature_layer): assert base_model.multi_modal_projector.linear_1.in_features == expected_features model(**input_dict) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @unittest.skip( "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. 
Can be tested as part of LLM test" diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index aba7b644f733..394be32cbba5 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -17,6 +17,7 @@ import unittest import numpy as np +import pytest from huggingface_hub import hf_hub_download from parameterized import parameterized @@ -301,23 +302,17 @@ def test_vision_feature_layers(self, vision_feature_layer): assert base_model.multi_modal_projector.linear_1.in_features == expected_features model(**input_dict) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @unittest.skip("FlashAttention only support fp16 and bf16 data type") def test_flash_attn_2_fp32_ln(self): diff --git a/tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/models/llava_onevision/test_modeling_llava_onevision.py index 603bd260ad75..a4000b5ccef3 100644 --- a/tests/models/llava_onevision/test_modeling_llava_onevision.py +++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import pytest import requests from huggingface_hub import hf_hub_download from parameterized import parameterized @@ -268,23 +269,17 @@ def test_vision_feature_layers(self, vision_feature_layer): assert base_model.multi_modal_projector.linear_1.in_features == expected_features model(**input_dict) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, SiglipVisionModel does not support standalone training" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, SiglipVisionModel does not support standalone training" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, SiglipVisionModel does not support standalone training" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for 
some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @unittest.skip( "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test" diff --git a/tests/models/longcat_flash/test_modeling_longcat_flash.py b/tests/models/longcat_flash/test_modeling_longcat_flash.py index 85c203bae44f..0c916dac558e 100644 --- a/tests/models/longcat_flash/test_modeling_longcat_flash.py +++ b/tests/models/longcat_flash/test_modeling_longcat_flash.py @@ -234,18 +234,6 @@ def _check_past_key_values_for_generate(self, batch_size, past_key_values, seq_l self.assertEqual(past_key_values.layers[layer_idx].keys.shape, expected_key_shape) self.assertEqual(past_key_values.layers[layer_idx].values.shape, expected_value_shape) - @unittest.skip("MoE experts may not receive gradients with small test data") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip("MoE experts may not receive gradients with small test data") - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip("MoE experts may not receive gradients with small test data") - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @unittest.skip("LongcatFlash router uses weight.type() directly in forward which prevents offloading") def test_cpu_offload(self): pass diff --git a/tests/models/luke/test_modeling_luke.py b/tests/models/luke/test_modeling_luke.py index f98ef0070f4a..ac29c9086f21 100644 --- a/tests/models/luke/test_modeling_luke.py +++ b/tests/models/luke/test_modeling_luke.py @@ -15,6 +15,8 @@ import unittest +import pytest + from transformers import LukeConfig, is_torch_available from transformers.testing_utils import require_torch, slow, torch_device @@ -860,23 +862,17 @@ def test_retain_grad_entity_hidden_states(self): self.assertIsNotNone(entity_hidden_states.grad) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @require_torch diff --git a/tests/models/marian/test_modeling_marian.py 
b/tests/models/marian/test_modeling_marian.py index 6e959c182d86..6d249b422b4a 100644 --- a/tests/models/marian/test_modeling_marian.py +++ b/tests/models/marian/test_modeling_marian.py @@ -17,6 +17,8 @@ import unittest from functools import cached_property +import pytest + from transformers import MarianConfig, is_torch_available from transformers.testing_utils import ( require_sentencepiece, @@ -329,23 +331,17 @@ def test_resize_decoder_token_embeddings(self): def test_tie_word_embeddings_decoder(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() def assert_tensors_close(a, b, atol=1e-12, prefix=""): diff --git a/tests/models/metaclip_2/test_modeling_metaclip_2.py b/tests/models/metaclip_2/test_modeling_metaclip_2.py index 40e17e652098..575bbd28a0cc 100644 --- a/tests/models/metaclip_2/test_modeling_metaclip_2.py +++ b/tests/models/metaclip_2/test_modeling_metaclip_2.py @@ -18,6 +18,7 @@ import unittest import numpy as np +import pytest import requests from parameterized import parameterized @@ -252,24 +253,20 @@ def test_model_with_projection(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_projection(*config_and_inputs) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @slow @@ -419,24 +416,20 @@ def test_model_with_projection(self): config_and_inputs = 
self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_with_projection(*config_and_inputs) - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="MetaClip2 does not use inputs_embeds") @@ -648,17 +641,17 @@ def test_inputs_embeds(self): def test_model_get_set_embeddings(self): pass - @unittest.skip(reason="MetaClip2ForImageClassification does not support gradient checkpointing yet") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="MetaClip2ForImageClassification does not support gradient checkpointing yet") - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip(reason="MetaClip2ForImageClassification does not support gradient checkpointing yet") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @parameterized.expand(TEST_EAGER_MATCHES_SDPA_INFERENCE_PARAMETERIZATION) @slow diff --git a/tests/models/moshi/test_modeling_moshi.py b/tests/models/moshi/test_modeling_moshi.py index 9cd6d63c1e84..545b69c3faef 100644 --- a/tests/models/moshi/test_modeling_moshi.py +++ b/tests/models/moshi/test_modeling_moshi.py @@ -713,17 +713,17 @@ def test_generate_without_input_ids(self): print(output_ids_generate) self.assertIsNotNone(output_ids_generate) - @unittest.skip(reason="The audio encoder has no gradients.") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="The audio encoder has no gradients.") - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip(reason="The audio encoder has no gradients.") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + 
@pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() def test_generate_from_input_values(self): for model_class in self.all_generative_model_classes: diff --git a/tests/models/mra/test_modeling_mra.py b/tests/models/mra/test_modeling_mra.py index a3cb330c0a3c..663dd43f49f4 100644 --- a/tests/models/mra/test_modeling_mra.py +++ b/tests/models/mra/test_modeling_mra.py @@ -15,6 +15,8 @@ import unittest +import pytest + from transformers import MraConfig, is_torch_available from transformers.testing_utils import require_torch, slow, torch_device @@ -317,23 +319,17 @@ def test_model_from_pretrained(self): def test_attention_outputs(self): return - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @unittest.skip( reason="Model has `nan` in hidden_states, see https://github.com/huggingface/transformers/issues/29373." 
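The hunks above and below converge on one convention for the gradient-checkpointing tests: internal building blocks that cannot train standalone keep a plain unittest.skip, architectures with a known gradient gap are marked pytest.mark.xfail but still call the shared test through super(), the reentrant variants become explicit test_training_gradient_checkpointing_use_reentrant_false / _use_reentrant_true methods, and suites that need custom handling (DPT-Hybrid, IDEFICS) override a single check_training_gradient_checkpointing(gradient_checkpointing_kwargs=None) helper instead of the tests themselves. Below is a minimal sketch of that layout, assuming the shared mixin dispatches the three public tests to the helper with the matching use_reentrant setting; the names mirror the diff, but the bodies are illustrative only, not the actual implementation of the common test mixin.

import unittest


class GradientCheckpointingTestMixin:
    def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
        # Model test classes override this: build the model, call
        # model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=...),
        # put it in train mode, run a forward pass and loss.backward(),
        # as the DPT-Hybrid and IDEFICS overrides in this diff do.
        raise NotImplementedError

    def test_training_gradient_checkpointing(self):
        # Exercises whatever default gradient_checkpointing_enable() applies.
        self.check_training_gradient_checkpointing()

    def test_training_gradient_checkpointing_use_reentrant_false(self):
        self.check_training_gradient_checkpointing(gradient_checkpointing_kwargs={"use_reentrant": False})

    def test_training_gradient_checkpointing_use_reentrant_true(self):
        self.check_training_gradient_checkpointing(gradient_checkpointing_kwargs={"use_reentrant": True})


class ToyModelTest(GradientCheckpointingTestMixin, unittest.TestCase):
    def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
        # Stand-in for a real forward/backward check; only records what was requested.
        self.seen_kwargs = gradient_checkpointing_kwargs


if __name__ == "__main__":
    unittest.main()

The practical difference between the two decorators is visible in the hunks themselves: an xfail override still executes the real check, so it starts reporting XPASS as soon as the underlying gradient issue is fixed, whereas a unittest.skip never runs it at all.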
diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py index 76dba9c18f00..5d95d30c7e17 100644 --- a/tests/models/owlv2/test_modeling_owlv2.py +++ b/tests/models/owlv2/test_modeling_owlv2.py @@ -182,24 +182,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="OwlV2 does not support training yet") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="OwlV2 does not support training yet") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @slow @@ -315,24 +311,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="OwlV2 does not support training yet") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="OwlV2 does not support training yet") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="OWLV2 does not use inputs_embeds") @@ -574,24 +566,20 @@ def test_model_get_set_embeddings(self): def test_forward_signature(self): pass - @unittest.skip(reason="OwlV2 does not support training yet") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="OwlV2 does not support training yet") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def 
test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @slow diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index bff850b038ab..cd0ae82a94ef 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -180,24 +180,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="OWL-ViT does not support training yet") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="OWL-ViT does not support training yet") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @slow @@ -311,24 +307,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="OWL-ViT does not support training yet") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="OWL-ViT does not support training yet") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="OWLVIT does not use inputs_embeds") @@ -567,24 +559,20 @@ def 
test_model_get_set_embeddings(self): def test_forward_signature(self): pass - @unittest.skip(reason="OWL-ViT does not support training yet") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="OWL-ViT does not support training yet") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @slow diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index d681bffd878a..5d8adacceada 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -16,6 +16,7 @@ import copy import unittest +import pytest import requests from transformers import ( @@ -228,23 +229,17 @@ def test_mismatching_num_image_tokens(self): pixel_values = torch.cat([pixel_values, pixel_values], dim=0) _ = model(input_ids=input_ids, pixel_values=pixel_values) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. 
Skip for now.") def test_cpu_offload(self): diff --git a/tests/models/paligemma2/test_modeling_paligemma2.py b/tests/models/paligemma2/test_modeling_paligemma2.py index a000d68d20a1..0238b48c2232 100644 --- a/tests/models/paligemma2/test_modeling_paligemma2.py +++ b/tests/models/paligemma2/test_modeling_paligemma2.py @@ -16,6 +16,8 @@ import copy import unittest +import pytest + from transformers import ( PaliGemmaConfig, PaliGemmaForConditionalGeneration, @@ -208,23 +210,17 @@ def test_mismatching_num_image_tokens(self): pixel_values = torch.cat([pixel_values, pixel_values], dim=0) _ = model(input_ids=input_ids, pixel_values=pixel_values) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @unittest.skip(reason="Some undefined behavior encountered with test versions of this model. 
Skip for now.") def test_cpu_offload(self): diff --git a/tests/models/pegasus/test_modeling_pegasus.py b/tests/models/pegasus/test_modeling_pegasus.py index 9753d15a3a08..a04b8a729551 100644 --- a/tests/models/pegasus/test_modeling_pegasus.py +++ b/tests/models/pegasus/test_modeling_pegasus.py @@ -17,6 +17,8 @@ import unittest from functools import cached_property +import pytest + from transformers import PegasusConfig, is_torch_available from transformers.testing_utils import ( require_sentencepiece, @@ -273,23 +275,17 @@ def test_generate_fp16(self): model.generate(input_ids, attention_mask=attention_mask) model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() def assert_tensors_close(a, b, atol=1e-12, prefix=""): diff --git a/tests/models/perception_lm/test_modeling_perception_lm.py b/tests/models/perception_lm/test_modeling_perception_lm.py index c679571bf132..a1aba5565482 100644 --- a/tests/models/perception_lm/test_modeling_perception_lm.py +++ b/tests/models/perception_lm/test_modeling_perception_lm.py @@ -281,14 +281,14 @@ def test_training_gradient_checkpointing(self): self.all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else () super().test_training_gradient_checkpointing() - def test_training_gradient_checkpointing_use_reentrant(self): - self.all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else () - super().test_training_gradient_checkpointing_use_reentrant() - def test_training_gradient_checkpointing_use_reentrant_false(self): self.all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else () super().test_training_gradient_checkpointing_use_reentrant_false() + def test_training_gradient_checkpointing_use_reentrant_true(self): + self.all_model_classes = (PerceptionLMForConditionalGeneration,) if is_torch_available() else () + super().test_training_gradient_checkpointing_use_reentrant_true() + @unittest.skip( reason="PE/TIMM's attention implementation is self configured and won't raise ValueError on global attention implementation." 
) diff --git a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py index cc6b60690fa5..0fb1ed87fd00 100644 --- a/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py +++ b/tests/models/phi4_multimodal/test_modeling_phi4_multimodal.py @@ -208,17 +208,17 @@ def setUp(self): self.model_tester = Phi4MultimodalModelTester(self) self.config_tester = ConfigTester(self, config_class=Phi4MultimodalConfig) - @unittest.skip(reason="Depending on input modalities, some params may not have gradients") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="Depending on input modalities, some params may not have gradients") - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip(reason="Depending on input modalities, some params may not have gradients") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @unittest.skip(reason="Test tries to instantiate dynamic cache with an arg") def test_multi_gpu_data_parallel_forward(self): diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index 68a52d235af0..7f7a950c5664 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -183,24 +183,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="Training is tested directly on `Pix2StructTextImageModelTest`") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="Training is tested directly on `Pix2StructTextImageModelTest`") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="Training is tested directly on `Pix2StructTextImageModelTest`") @@ -321,24 +317,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="Training 
is tested directly on `Pix2StructTextImageModelTest`") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="Training is tested directly on `Pix2StructTextImageModelTest`") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="Pix2Struct does not use inputs_embeds") @@ -496,7 +488,7 @@ def test_training(self): loss = model(**inputs).loss loss.backward() - def test_training_gradient_checkpointing(self): + def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None): if not self.model_tester.is_training: self.skipTest(reason="model_tester.is_training is set to False") @@ -507,7 +499,7 @@ def test_training_gradient_checkpointing(self): model = model_class(config) model.to(torch_device) - model.gradient_checkpointing_enable() + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs) model.train() inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) diff --git a/tests/models/prompt_depth_anything/test_modeling_prompt_depth_anything.py b/tests/models/prompt_depth_anything/test_modeling_prompt_depth_anything.py index 4910c1cfaa9b..a57b73d27128 100644 --- a/tests/models/prompt_depth_anything/test_modeling_prompt_depth_anything.py +++ b/tests/models/prompt_depth_anything/test_modeling_prompt_depth_anything.py @@ -169,30 +169,26 @@ def test_for_depth_estimation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs) - @unittest.skip(reason="Prompt Depth Anything does not support training yet") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="Prompt Depth Anything does not support training yet") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="Prompt Depth Anything with AutoBackbone does not have a base model and hence no input_embeddings" - ) - def test_model_get_set_embeddings(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass 
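The Pix2Struct hunk above turns the old `test_training_gradient_checkpointing` override into a `check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None)` helper that forwards its kwargs to `gradient_checkpointing_enable`. As a rough, hypothetical sketch of how the three renamed entry points can drive such a helper (the real dispatch lives in the shared tester mixin in `tests/test_modeling_common.py`, which is not part of this patch; the class below and its trivial check are invented for illustration):

```python
# Hypothetical, self-contained sketch of the dispatch pattern; the real helper in the
# shared mixin builds an actual model and checks per-parameter gradients.
import unittest


class GradientCheckpointingDispatchSketch(unittest.TestCase):
    def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
        # Stand-in for the real check: instantiate the model, call
        # model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs),
        # run a forward/backward pass, and assert every trainable parameter received a gradient.
        self.assertTrue(gradient_checkpointing_kwargs is None or isinstance(gradient_checkpointing_kwargs, dict))

    def test_training_gradient_checkpointing(self):
        # Default path: no explicit kwargs are passed.
        self.check_training_gradient_checkpointing()

    def test_training_gradient_checkpointing_use_reentrant_false(self):
        self.check_training_gradient_checkpointing(gradient_checkpointing_kwargs={"use_reentrant": False})

    def test_training_gradient_checkpointing_use_reentrant_true(self):
        self.check_training_gradient_checkpointing(gradient_checkpointing_kwargs={"use_reentrant": True})


if __name__ == "__main__":
    unittest.main()
```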
@unittest.skip( - reason="This architecture seems to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + reason="Prompt Depth Anything with AutoBackbone does not have a base model and hence no input_embeddings" ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + def test_model_get_set_embeddings(self): pass @slow diff --git a/tests/models/pvt_v2/test_modeling_pvt_v2.py b/tests/models/pvt_v2/test_modeling_pvt_v2.py index a9ef785f9228..536fc50aa362 100644 --- a/tests/models/pvt_v2/test_modeling_pvt_v2.py +++ b/tests/models/pvt_v2/test_modeling_pvt_v2.py @@ -17,6 +17,8 @@ import tempfile import unittest +import pytest + from transformers import PvtV2Backbone, PvtV2Config, is_torch_available, is_vision_available from transformers.models.auto.modeling_auto import MODEL_MAPPING_NAMES from transformers.testing_utils import ( @@ -175,17 +177,6 @@ def test_inputs_embeds(self): def test_model_get_set_embeddings(self): pass - @unittest.skip(reason="This architecture does not work with using reentrant.") - def test_training_gradient_checkpointing(self): - # Scenario - 1 default behaviour - self.check_training_gradient_checkpointing() - - @unittest.skip(reason="This architecture does not work with using reentrant.") - def test_training_gradient_checkpointing_use_reentrant(self): - # Scenario - 2 with `use_reentrant=True` - this is the default value that is used in pytorch's - # torch.utils.checkpoint.checkpoint - self.check_training_gradient_checkpointing(gradient_checkpointing_kwargs={"use_reentrant": True}) - def test_hidden_states_output(self): def check_hidden_states_output(inputs_dict, config, model_class): model = model_class(config) @@ -257,6 +248,10 @@ def test_model_from_pretrained(self): model = PvtV2Model.from_pretrained(model_name) self.assertIsNotNone(model) + @pytest.mark.xfail(reason="This architecture does not seem to be compatible with use_reentrant=True.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() + @require_torch class PvtV2ModelIntegrationTest(unittest.TestCase): diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 5852b474fc4f..4b57ccaae859 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -376,7 +376,7 @@ def test_enable_input_require_grads_with_gradient_checkpointing(self): model = model_class(config) model.to(torch_device) - model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": True}) + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False}) model.enable_input_require_grads() model.train() diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py index 574cabf54dee..b63e734bab4d 100644 --- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py +++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -51,10 +51,6 @@ class RecurrentGemmaModelTest(CausalLMModelTest, unittest.TestCase): def test_eager_matches_sdpa_generate(self): pass - @unittest.skip(reason="SQRBound is known to have issues with gc") - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @unittest.skip(reason="Past key values are not returned") def test_prompt_lookup_decoding_matches_greedy_search(self): pass diff --git 
a/tests/models/roformer/test_modeling_roformer.py b/tests/models/roformer/test_modeling_roformer.py index bf4fe458ebbc..9e605e1fa276 100644 --- a/tests/models/roformer/test_modeling_roformer.py +++ b/tests/models/roformer/test_modeling_roformer.py @@ -16,6 +16,8 @@ import copy import unittest +import pytest + from transformers import RoFormerConfig, is_torch_available from transformers.testing_utils import require_torch, slow, torch_device @@ -468,23 +470,17 @@ def test_model_from_pretrained(self): model = RoFormerModel.from_pretrained(model_name) self.assertIsNotNone(model) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @require_torch diff --git a/tests/models/sam/test_modeling_sam.py b/tests/models/sam/test_modeling_sam.py index 935126f57907..4286f8c2bee0 100644 --- a/tests/models/sam/test_modeling_sam.py +++ b/tests/models/sam/test_modeling_sam.py @@ -226,24 +226,20 @@ def test_attention_outputs(self): list(expected_attention_shape), ) - @unittest.skip(reason="SamVisionModel does not support training") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="SamVisionModel does not support training") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="SamVisionModel does not support training") @@ -618,24 +614,20 @@ def test_attention_outputs(self): list(expected_mask_decoder_attention_shape), ) - @unittest.skip(reason="SamModel does not support training") + 
@unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="SamModel does not support training") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="SamModel does not support training") diff --git a/tests/models/sam_hq/test_modeling_sam_hq.py b/tests/models/sam_hq/test_modeling_sam_hq.py index 77969508d737..dcb56c9b5da7 100644 --- a/tests/models/sam_hq/test_modeling_sam_hq.py +++ b/tests/models/sam_hq/test_modeling_sam_hq.py @@ -233,24 +233,20 @@ def test_attention_outputs(self): list(expected_attention_shape), ) - @unittest.skip(reason="SamVisionModel does not support training") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="SamVisionModel does not support training") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="SamVisionModel does not support training") @@ -661,24 +657,20 @@ def test_attention_outputs(self): list(expected_mask_decoder_attention_shape), ) - @unittest.skip(reason="SamHQModel does not support training") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="SamHQModel does not support training") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This 
architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="SamHQModel does not support training") diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py index b2ac9930a4f9..6ddf803879b9 100644 --- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py +++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py @@ -18,6 +18,8 @@ import unittest from functools import cached_property +import pytest + from transformers import SeamlessM4TConfig, is_speech_available, is_torch_available from transformers.testing_utils import require_speech, require_torch, slow, torch_device from transformers.trainer_utils import set_seed @@ -387,23 +389,17 @@ def test_model_weights_reload_no_missing_tied_weights(self): def test_forward_signature(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @unittest.skip( reason="This architecture has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245" @@ -587,23 +583,17 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: 
https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @unittest.skip( reason="In training model, the first encoder layer is sometimes skipped. Training is not supported yet, so the test is ignored." diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py index 1a07f8b7d3c7..bd4852ae3efd 100644 --- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py +++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py @@ -18,6 +18,8 @@ import unittest from functools import cached_property +import pytest + from transformers import SeamlessM4Tv2Config, is_speech_available, is_torch_available from transformers.testing_utils import require_speech, require_torch, slow, torch_device from transformers.trainer_utils import set_seed @@ -413,23 +415,17 @@ def test_model_weights_reload_no_missing_tied_weights(self): def test_forward_signature(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @unittest.skip( reason="This architecture has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245" @@ -600,23 +596,17 @@ def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip( - reason="This architecture seem to not compute gradients 
properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() @unittest.skip( reason="This architecture has tied weights by default and there is no way to remove it, check: https://github.com/huggingface/transformers/pull/31771#issuecomment-2210915245" diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py index 4cc51ebaaba4..dd3a3d56346b 100644 --- a/tests/models/siglip/test_modeling_siglip.py +++ b/tests/models/siglip/test_modeling_siglip.py @@ -18,6 +18,7 @@ import unittest import numpy as np +import pytest import requests from parameterized import parameterized @@ -241,18 +242,6 @@ def test_model(self): def test_training(self): pass - @unittest.skip(reason="SiglipVisionModel does not support standalone training") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="SiglipVisionModel does not support standalone training") - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip(reason="SiglipVisionModel does not support standalone training") - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @slow def test_model_from_pretrained(self): model_name = "google/siglip-base-patch16-224" @@ -372,20 +361,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="SiglipTextModel does not support standalone training") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="SiglipTextModel does not support standalone training") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="SiglipTextModel does not support standalone training") - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="SiglipTextModel does not support standalone training") - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="Siglip does not use inputs_embeds") @@ -569,17 +558,17 @@ def test_inputs_embeds(self): def test_model_get_set_embeddings(self): pass - @unittest.skip(reason="SiglipForImageClassification does not support gradient checkpointing yet") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + 
super().test_training_gradient_checkpointing() - @unittest.skip(reason="SiglipForImageClassification does not support gradient checkpointing yet") - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip(reason="SiglipForImageClassification does not support gradient checkpointing yet") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() # We will verify our results on an image of cute cats diff --git a/tests/models/siglip2/test_modeling_siglip2.py b/tests/models/siglip2/test_modeling_siglip2.py index f45165cbf200..2c2108407c3d 100644 --- a/tests/models/siglip2/test_modeling_siglip2.py +++ b/tests/models/siglip2/test_modeling_siglip2.py @@ -18,6 +18,7 @@ import unittest import numpy as np +import pytest from parameterized import parameterized from pytest import mark @@ -314,22 +315,6 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="Siglip2VisionModel does not support standalone training") - def test_training(self): - pass - - @unittest.skip(reason="Siglip2VisionModel does not support standalone training") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip(reason="Siglip2VisionModel does not support standalone training") - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip(reason="Siglip2VisionModel does not support standalone training") - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @slow def test_model_from_pretrained(self): model_name = "google/siglip2-base-patch16-naflex" @@ -446,20 +431,20 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skip(reason="Siglip2TextModel does not support standalone training") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="Siglip2TextModel does not support standalone training") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="Siglip2TextModel does not support standalone training") - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="Siglip2TextModel does not support standalone training") - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="Siglip2 does not use inputs_embeds") @@ -657,17 +642,17 @@ def test_inputs_embeds(self): def test_model_get_set_embeddings(self): pass - @unittest.skip(reason="Siglip2ForImageClassification does not support gradient checkpointing yet") + @pytest.mark.xfail(reason="This architecture seems to 
not compute gradients for some layer.") def test_training_gradient_checkpointing(self): - pass + super().test_training_gradient_checkpointing() - @unittest.skip(reason="Siglip2ForImageClassification does not support gradient checkpointing yet") - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip(reason="Siglip2ForImageClassification does not support gradient checkpointing yet") + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") def test_training_gradient_checkpointing_use_reentrant_false(self): - pass + super().test_training_gradient_checkpointing_use_reentrant_false() + + @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.") + def test_training_gradient_checkpointing_use_reentrant_true(self): + super().test_training_gradient_checkpointing_use_reentrant_true() # Draw a circle on an images with different aspect ratios diff --git a/tests/models/smolvlm/test_modeling_smolvlm.py b/tests/models/smolvlm/test_modeling_smolvlm.py index 8005b7f88a87..e61db1f9022a 100644 --- a/tests/models/smolvlm/test_modeling_smolvlm.py +++ b/tests/models/smolvlm/test_modeling_smolvlm.py @@ -354,22 +354,6 @@ def test_flash_attn_2_inference_padding_right(self): def test_generate_methods_with_logits_to_keep(self): super().test_generate_methods_with_logits_to_keep() - @unittest.skip - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - @unittest.skip(reason="Unsupported") def test_generate_with_static_cache(self): pass diff --git a/tests/models/speech_to_text/test_modeling_speech_to_text.py b/tests/models/speech_to_text/test_modeling_speech_to_text.py index ef58d0017213..8f31d7f9f51b 100644 --- a/tests/models/speech_to_text/test_modeling_speech_to_text.py +++ b/tests/models/speech_to_text/test_modeling_speech_to_text.py @@ -301,24 +301,20 @@ def test_encoder_decoder_model_standalone(self): def test_inputs_embeds(self): pass - @unittest.skip(reason="Training is not supported yet") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @require_torch_fp16 diff --git 
a/tests/models/speecht5/test_modeling_speecht5.py b/tests/models/speecht5/test_modeling_speecht5.py index ade0a31217ff..8db2942c7ccd 100644 --- a/tests/models/speecht5/test_modeling_speecht5.py +++ b/tests/models/speecht5/test_modeling_speecht5.py @@ -645,22 +645,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_training(self): pass - @unittest.skip(reason="Training is not supported yet") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: @@ -932,22 +916,6 @@ def test_retain_grad_hidden_states_attentions(self): def test_training(self): pass - @unittest.skip(reason="training is not supported yet") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: @@ -1592,22 +1560,6 @@ def test_save_load(self): def test_training(self): pass - @unittest.skip(reason="Training is not supported yet") - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - - @unittest.skip( - reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" - ) - def test_training_gradient_checkpointing_use_reentrant_false(self): - pass - # overwrite from test_modeling_common def _mock_init_weights(self, module): if hasattr(module, "weight") and module.weight is not None: diff --git a/tests/models/splinter/test_modeling_splinter.py b/tests/models/splinter/test_modeling_splinter.py index 59d4537171b2..97a948d6eb9e 100644 --- a/tests/models/splinter/test_modeling_splinter.py +++ b/tests/models/splinter/test_modeling_splinter.py @@ -360,18 +360,6 @@ def test_multi_gpu_data_parallel_forward(self): with torch.no_grad(): _ = model(**self._prepare_for_class(inputs_dict, model_class)) - @unittest.skip( - "Splinter GC with `use_reentrant` fails after #38751, FIXME raushan after deprecated args are removed" - ) - def test_training_gradient_checkpointing(self): - pass - - @unittest.skip( - "Splinter GC with `use_reentrant` fails after #38751, FIXME raushan after deprecated args are removed" - ) - def test_training_gradient_checkpointing_use_reentrant(self): - pass - 
@require_torch class SplinterModelIntegrationTest(unittest.TestCase): diff --git a/tests/models/superglue/test_modeling_superglue.py b/tests/models/superglue/test_modeling_superglue.py index cc524712e793..fbc37d2b3078 100644 --- a/tests/models/superglue/test_modeling_superglue.py +++ b/tests/models/superglue/test_modeling_superglue.py @@ -147,20 +147,20 @@ def test_model_get_set_embeddings(self): def test_feed_forward_chunking(self): pass - @unittest.skip(reason="SuperGlueForKeypointMatching is not trainable") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="SuperGlueForKeypointMatching is not trainable") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="SuperGlueForKeypointMatching is not trainable") - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="SuperGlueForKeypointMatching is not trainable") - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="SuperGlue does not output any loss term in the forward pass") diff --git a/tests/models/superpoint/test_modeling_superpoint.py b/tests/models/superpoint/test_modeling_superpoint.py index 09f159c571fd..dcd0b0ecb75e 100644 --- a/tests/models/superpoint/test_modeling_superpoint.py +++ b/tests/models/superpoint/test_modeling_superpoint.py @@ -147,20 +147,20 @@ def test_model_get_set_embeddings(self): def test_feed_forward_chunking(self): pass - @unittest.skip(reason="SuperPointForKeypointDetection does not support training") + @unittest.skip(reason="This module does not support standalone training") def test_training(self): pass - @unittest.skip(reason="SuperPointForKeypointDetection does not support training") + @unittest.skip(reason="This module does not support standalone training") def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="SuperPointForKeypointDetection does not support training") - def test_training_gradient_checkpointing_use_reentrant(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_false(self): pass - @unittest.skip(reason="SuperPointForKeypointDetection does not support training") - def test_training_gradient_checkpointing_use_reentrant_false(self): + @unittest.skip(reason="This module does not support standalone training") + def test_training_gradient_checkpointing_use_reentrant_true(self): pass @unittest.skip(reason="SuperPoint does not output any loss term in the forward pass") diff --git a/tests/models/swin/test_modeling_swin.py b/tests/models/swin/test_modeling_swin.py index 41e74a6a0a66..825c1aabfd48 100644 --- a/tests/models/swin/test_modeling_swin.py +++ b/tests/models/swin/test_modeling_swin.py @@ -261,9 +261,6 @@ def test_model(self): def test_multi_gpu_data_parallel_forward(self): pass - def test_training_gradient_checkpointing(self): - super().test_training_gradient_checkpointing() - def test_backbone(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_backbone(*config_and_inputs) diff --git 
a/tests/models/swin2sr/test_modeling_swin2sr.py b/tests/models/swin2sr/test_modeling_swin2sr.py
index 9710c1a4c4af..679f1c6a6e1f 100644
--- a/tests/models/swin2sr/test_modeling_swin2sr.py
+++ b/tests/models/swin2sr/test_modeling_swin2sr.py
@@ -199,24 +199,20 @@ def test_multi_gpu_data_parallel_forward(self):
     def test_inputs_embeds(self):
         pass

-    @unittest.skip(reason="Swin2SR does not support training yet")
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training(self):
         pass

-    @unittest.skip(reason="Swin2SR does not support training yet")
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training_gradient_checkpointing(self):
         pass

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
         pass

     def test_model_get_set_embeddings(self):
diff --git a/tests/models/t5gemma/test_modeling_t5gemma.py b/tests/models/t5gemma/test_modeling_t5gemma.py
index 5897f08601f2..a22c0ea67a53 100644
--- a/tests/models/t5gemma/test_modeling_t5gemma.py
+++ b/tests/models/t5gemma/test_modeling_t5gemma.py
@@ -1504,20 +1504,20 @@ def test_with_token_classification_head(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_with_token_classification_head(*config_and_inputs)

-    @unittest.skip("No loss in the output of T5GemmaEncoderModel")
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training(self):
         pass

-    @unittest.skip("No loss in the output of T5GemmaEncoderModel")
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training_gradient_checkpointing(self):
         pass

-    @unittest.skip("No loss in the output of T5GemmaEncoderModel")
-    def test_training_gradient_checkpointing_use_reentrant(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass

-    @unittest.skip("No loss in the output of T5GemmaEncoderModel")
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
         pass

     # Based on tests.test_modeling_common.ModelTesterMixin.test_flex_attention_with_grads
diff --git a/tests/models/t5gemma2/test_modeling_t5gemma2.py b/tests/models/t5gemma2/test_modeling_t5gemma2.py
index de6fbc2c7699..d30932f4bf35 100644
--- a/tests/models/t5gemma2/test_modeling_t5gemma2.py
+++ b/tests/models/t5gemma2/test_modeling_t5gemma2.py
@@ -988,17 +988,17 @@ def test_custom_4d_attention_mask(self):
         torch.testing.assert_close(normalized_0[2], normalized_1[2], rtol=1e-3, atol=1e-4)
         torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-4)

-    @unittest.skip(reason="SiglipVisionModel (vision backbone) does not support standalone training")
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
     def test_training_gradient_checkpointing(self):
-        pass
-
-    @unittest.skip(reason="SiglipVisionModel (vision backbone) does not support standalone training")
-    def test_training_gradient_checkpointing_use_reentrant(self):
-        pass
+        super().test_training_gradient_checkpointing()

-    @unittest.skip(reason="SiglipVisionModel (vision backbone) does not support standalone training")
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
     def test_training_gradient_checkpointing_use_reentrant_false(self):
-        pass
+        super().test_training_gradient_checkpointing_use_reentrant_false()
+
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
+        super().test_training_gradient_checkpointing_use_reentrant_true()

     @unittest.skip(reason="SiglipVisionModel (vision backbone) does not support standalone training")
     def test_torch_compile_for_training(self):
diff --git a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py
index 7cf421a10404..74cda31cb083 100644
--- a/tests/models/time_series_transformer/test_modeling_time_series_transformer.py
+++ b/tests/models/time_series_transformer/test_modeling_time_series_transformer.py
@@ -17,6 +17,7 @@
 import tempfile
 import unittest

+import pytest
 from huggingface_hub import hf_hub_download
 from parameterized import parameterized

@@ -361,23 +362,17 @@ def test_attention_outputs(self):
             [self.model_tester.num_attention_heads, encoder_seq_length, encoder_seq_length],
         )

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
     def test_training_gradient_checkpointing(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
-        pass
+        super().test_training_gradient_checkpointing()

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
     def test_training_gradient_checkpointing_use_reentrant_false(self):
-        pass
+        super().test_training_gradient_checkpointing_use_reentrant_false()
+
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
+        super().test_training_gradient_checkpointing_use_reentrant_true()

     @parameterized.expand(
         [
diff --git a/tests/models/udop/test_modeling_udop.py b/tests/models/udop/test_modeling_udop.py
index 1e6f6d2af8d5..7052923a93e2 100644
--- a/tests/models/udop/test_modeling_udop.py
+++ b/tests/models/udop/test_modeling_udop.py
@@ -17,6 +17,7 @@
 import unittest
 from functools import cached_property

+import pytest
 from datasets import load_dataset

 from transformers import UdopConfig, is_torch_available
@@ -314,21 +315,17 @@ def test_model_fp16_forward(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)

-    @unittest.skip(reason="Gradient checkpointing is not supported by this model")
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
     def test_training_gradient_checkpointing(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
-        pass
+        super().test_training_gradient_checkpointing()

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
     def test_training_gradient_checkpointing_use_reentrant_false(self):
-        pass
+        super().test_training_gradient_checkpointing_use_reentrant_false()
+
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
+        super().test_training_gradient_checkpointing_use_reentrant_true()

     @unittest.skip(reason="Udop has no separate base model without a head.")
     def test_model_base_model_prefix(self):
         pass
diff --git a/tests/models/umt5/test_modeling_umt5.py b/tests/models/umt5/test_modeling_umt5.py
index a6f90b5edc06..dc4fd60dc608 100644
--- a/tests/models/umt5/test_modeling_umt5.py
+++ b/tests/models/umt5/test_modeling_umt5.py
@@ -343,24 +343,6 @@ def test_model_fp16_forward(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
-        pass
-
     @unittest.skip(reason="UMT5 has no separate base model without a head.")
     def test_model_base_model_prefix(self):
         pass
diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py
index 3bcea272dd51..4d4db3f3a482 100644
--- a/tests/models/video_llava/test_modeling_video_llava.py
+++ b/tests/models/video_llava/test_modeling_video_llava.py
@@ -17,6 +17,7 @@
 import unittest

 import numpy as np
+import pytest
 import requests
 from huggingface_hub import hf_hub_download
 from parameterized import parameterized
@@ -215,23 +216,17 @@ def setUp(self):
     def test_config(self):
         self.config_tester.run_common_tests()

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
     def test_training_gradient_checkpointing(self):
-        pass
+        super().test_training_gradient_checkpointing()

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
     def test_training_gradient_checkpointing_use_reentrant_false(self):
-        pass
+        super().test_training_gradient_checkpointing_use_reentrant_false()
+
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
+        super().test_training_gradient_checkpointing_use_reentrant_true()

     @unittest.skip(
         "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test"
diff --git a/tests/models/vilt/test_modeling_vilt.py b/tests/models/vilt/test_modeling_vilt.py
index 207414f2bd72..39fc3b4bf4bb 100644
--- a/tests/models/vilt/test_modeling_vilt.py
+++ b/tests/models/vilt/test_modeling_vilt.py
@@ -291,7 +291,7 @@ def test_training(self):
             loss = model(**inputs).loss
             loss.backward()

-    def test_training_gradient_checkpointing(self):
+    def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
         if not self.model_tester.is_training:
             self.skipTest(reason="model_tester.is_training is set to False.")

@@ -309,24 +309,12 @@ def test_training_gradient_checkpointing(self):
             model = model_class(config)
             model.to(torch_device)
-            model.gradient_checkpointing_enable()
+            model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=gradient_checkpointing_kwargs)
             model.train()
             inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)

             loss = model(**inputs).loss
             loss.backward()

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
-        pass
-
     @unittest.skip(
         reason="""VilT samples image tokens from a multinomial distribution, resulting in not deterministic hidden states"""
diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py
index 36dd68df81c1..f08668eecd5b 100644
--- a/tests/models/vipllava/test_modeling_vipllava.py
+++ b/tests/models/vipllava/test_modeling_vipllava.py
@@ -16,6 +16,7 @@
 import copy
 import unittest

+import pytest
 import requests
 from parameterized import parameterized

@@ -255,23 +256,17 @@ def test_vision_feature_layers(self, vision_feature_layers):
         assert base_model.multi_modal_projector.linear_1.in_features == expected_features
         model(**input_dict)

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
     def test_training_gradient_checkpointing(self):
-        pass
+        super().test_training_gradient_checkpointing()

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
     def test_training_gradient_checkpointing_use_reentrant_false(self):
-        pass
+        super().test_training_gradient_checkpointing_use_reentrant_false()
+
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
+        super().test_training_gradient_checkpointing_use_reentrant_true()

     @unittest.skip(
         "VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test"
diff --git a/tests/models/visual_bert/test_modeling_visual_bert.py b/tests/models/visual_bert/test_modeling_visual_bert.py
index 67fe5af6e8c0..0fa3ad4b461b 100644
--- a/tests/models/visual_bert/test_modeling_visual_bert.py
+++ b/tests/models/visual_bert/test_modeling_visual_bert.py
@@ -16,6 +16,8 @@
 import copy
 import unittest

+import pytest
+
 from transformers import VisualBertConfig, is_torch_available
 from transformers.testing_utils import require_torch, slow, torch_device

@@ -546,23 +548,17 @@ def test_model_from_pretrained(self):
         model = VisualBertModel.from_pretrained(model_name)
         self.assertIsNotNone(model)

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
     def test_training_gradient_checkpointing(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
-        pass
+        super().test_training_gradient_checkpointing()

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
     def test_training_gradient_checkpointing_use_reentrant_false(self):
-        pass
+        super().test_training_gradient_checkpointing_use_reentrant_false()
+
+    @pytest.mark.xfail(reason="This architecture seems to not compute gradients for some layer.")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
+        super().test_training_gradient_checkpointing_use_reentrant_true()


 @require_torch
diff --git a/tests/models/vitmatte/test_modeling_vitmatte.py b/tests/models/vitmatte/test_modeling_vitmatte.py
index 64ff2e582b77..6857dafdfbbd 100644
--- a/tests/models/vitmatte/test_modeling_vitmatte.py
+++ b/tests/models/vitmatte/test_modeling_vitmatte.py
@@ -160,24 +160,20 @@ def test_config(self):
     def test_inputs_embeds(self):
         pass

-    @unittest.skip(reason="Training is not yet supported")
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training(self):
         pass

-    @unittest.skip(reason="Training is not yet supported")
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training_gradient_checkpointing(self):
         pass

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
         pass

     @unittest.skip(reason="ViTMatte does not support input and output embeddings")
diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py
index 645204331bf7..8653dde6a025 100644
--- a/tests/models/vitpose/test_modeling_vitpose.py
+++ b/tests/models/vitpose/test_modeling_vitpose.py
@@ -182,20 +182,20 @@ def test_inputs_embeds(self):
     def test_model_get_set_embeddings(self):
         pass

-    @unittest.skip(reason="VitPose does not support training yet")
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training(self):
         pass

-    @unittest.skip(reason="VitPose does not support training yet")
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training_gradient_checkpointing(self):
         pass

-    @unittest.skip(reason="VitPose does not support training yet")
-    def test_training_gradient_checkpointing_use_reentrant(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass

-    @unittest.skip(reason="VitPose does not support training yet")
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
         pass

     def test_forward_signature(self):
diff --git a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
index a3671b61b477..f6a5c69b4917 100644
--- a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
+++ b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
@@ -164,18 +164,6 @@ def test_retain_grad_hidden_states_attentions(self):
     def test_training(self):
         pass

-    @unittest.skip(reason="VitPoseBackbone does not support training yet")
-    def test_training_gradient_checkpointing(self):
-        pass
-
-    @unittest.skip(reason="VitPoseBackbone does not support training yet")
-    def test_training_gradient_checkpointing_use_reentrant(self):
-        pass
-
-    @unittest.skip(reason="VitPoseBackbone does not support training yet")
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
-        pass
-
     def test_forward_signature(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/vjepa2/test_modeling_vjepa2.py b/tests/models/vjepa2/test_modeling_vjepa2.py
index 00716696fe6f..84bfb09b5d7f 100644
--- a/tests/models/vjepa2/test_modeling_vjepa2.py
+++ b/tests/models/vjepa2/test_modeling_vjepa2.py
@@ -169,24 +169,6 @@ def test_config(self):
     def test_inputs_embeds(self):
         pass

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
-        pass
-
-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
-        pass
-
     def test_model_get_set_embeddings(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index f0739460f46d..b609b86c9f7b 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -506,25 +506,20 @@ def test_beam_search_output(self):
         self.assertEqual(output.beam_indices.shape[0], input_features.shape[0] * 3)
         self.assertEqual(output.sequences_scores.shape[0], input_features.shape[0] * 3)

-    # training is not supported yet
-    @unittest.skip(reason="Training is not supported yet")
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training(self):
         pass

-    @unittest.skip(reason="Training is not supported yet")
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training_gradient_checkpointing(self):
         pass

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
         pass

     @parameterized.expand([("offloaded",)])
diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py
index dfa7084403f8..dbd09deaa4c2 100644
--- a/tests/models/x_clip/test_modeling_x_clip.py
+++ b/tests/models/x_clip/test_modeling_x_clip.py
@@ -188,24 +188,20 @@ def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)

-    @unittest.skip
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training(self):
         pass

-    @unittest.skip
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training_gradient_checkpointing(self):
         pass

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
         pass

     @slow
@@ -408,24 +404,20 @@ def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)

-    @unittest.skip
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training(self):
         pass

-    @unittest.skip
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training_gradient_checkpointing(self):
         pass

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass

-    @unittest.skip(
-        reason="This architecture seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
-    )
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
         pass

     @unittest.skip(reason="X-CLIP does not use inputs_embeds")
diff --git a/tests/models/zoedepth/test_modeling_zoedepth.py b/tests/models/zoedepth/test_modeling_zoedepth.py
index 88cd9d1f9a64..e46bb6cbbd7d 100644
--- a/tests/models/zoedepth/test_modeling_zoedepth.py
+++ b/tests/models/zoedepth/test_modeling_zoedepth.py
@@ -173,20 +173,20 @@ def test_for_depth_estimation(self):
     def test_model_common_attributes(self):
         pass

-    @unittest.skip(reason="ZoeDepth does not support training yet")
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training(self):
         pass

-    @unittest.skip(reason="ZoeDepth does not support training yet")
+    @unittest.skip(reason="This module does not support standalone training")
     def test_training_gradient_checkpointing(self):
         pass

-    @unittest.skip(reason="ZoeDepth does not support training yet")
-    def test_training_gradient_checkpointing_use_reentrant(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass

-    @unittest.skip(reason="ZoeDepth does not support training yet")
-    def test_training_gradient_checkpointing_use_reentrant_false(self):
+    @unittest.skip(reason="This module does not support standalone training")
+    def test_training_gradient_checkpointing_use_reentrant_true(self):
         pass

     @slow
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 67a645e8b180..79b826f88e28 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -1617,16 +1617,15 @@ def test_training_gradient_checkpointing(self):
         # Scenario - 1 default behaviour
         self.check_training_gradient_checkpointing()

-    def test_training_gradient_checkpointing_use_reentrant(self):
-        # Scenario - 2 with `use_reentrant=True` - this is the default value that is used in pytorch's
-        # torch.utils.checkpoint.checkpoint
-        self.check_training_gradient_checkpointing(gradient_checkpointing_kwargs={"use_reentrant": True})
-
     def test_training_gradient_checkpointing_use_reentrant_false(self):
-        # Scenario - 3 with `use_reentrant=False` pytorch suggests users to use this value for
-        # future releases: https://pytorch.org/docs/stable/checkpoint.html
+        # Scenario - 2 with `use_reentrant=False` - this is the default value that is used in pytorch's
+        # torch.utils.checkpoint.checkpoint
         self.check_training_gradient_checkpointing(gradient_checkpointing_kwargs={"use_reentrant": False})

+    def test_training_gradient_checkpointing_use_reentrant_true(self):
+        # Scenario - 3 with `use_reentrant=True` (old default behaviour, not recommended)
+        self.check_training_gradient_checkpointing(gradient_checkpointing_kwargs={"use_reentrant": True})
+
     def test_attention_outputs(self):
         if not self.has_attentions:
             self.skipTest(reason="Model does not output attentions")
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index 35df83b70e67..e900fcb23430 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1040,7 +1040,6 @@ def test_gradient_checkpointing(self):
             per_device_train_batch_size=1,
             learning_rate=0.1,
             gradient_checkpointing=True,
-            gradient_checkpointing_kwargs={"use_reentrant": False},
             output_dir=tmp_dir,
         )
         previous_params = {k: v.detach().clone() for k, v in trainer.model.named_parameters()}
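
For reference, the three scenarios exercised by the rewritten common tests reduce to the minimal sketch below: the library default (which, after the modeling_utils.py change, means use_reentrant=False), an explicit use_reentrant=False, and the legacy use_reentrant=True. The tiny checkpoint name is an illustrative assumption, not something pinned by this patch.

# Sketch only: how the three gradient-checkpointing scenarios map onto
# `gradient_checkpointing_enable`. Assumes a small checkpoint such as
# "hf-internal-testing/tiny-random-bert" is available for download.
import torch
from transformers import AutoModelForMaskedLM

model = AutoModelForMaskedLM.from_pretrained("hf-internal-testing/tiny-random-bert")
model.train()
input_ids = torch.randint(0, model.config.vocab_size, (1, 8))

for kwargs in (None, {"use_reentrant": False}, {"use_reentrant": True}):
    # Passing None now resolves to {"use_reentrant": False} inside gradient_checkpointing_enable.
    model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=kwargs)
    loss = model(input_ids=input_ids, labels=input_ids).loss
    loss.backward()
    model.zero_grad()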