diff --git a/src/transformers/models/janus/modeling_janus.py b/src/transformers/models/janus/modeling_janus.py
index 959cdc6856f4..a526ce5d7af1 100644
--- a/src/transformers/models/janus/modeling_janus.py
+++ b/src/transformers/models/janus/modeling_janus.py
@@ -58,7 +58,7 @@ class JanusPreTrainedModel(PreTrainedModel):
     config_class = JanusConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["LlamaDecoderLayer"]
+    _no_split_modules = ["LlamaDecoderLayer", "JanusVisionEncoderLayer"]
     _skip_keys_device_placement = ["past_key_values", "causal_mask"]
     _supports_flash_attn_2 = True
     _supports_sdpa = True
@@ -1133,6 +1133,7 @@ def forward(
 
         image_features = image_embeds.reshape(-1, embed_dim)
         image_attention_mask = image_attention_mask.unsqueeze(-1).expand(-1, -1, embed_dim)
+        image_attention_mask = image_attention_mask.to(inputs_embeds.device)
         image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
         inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features)
 
diff --git a/src/transformers/models/janus/modular_janus.py b/src/transformers/models/janus/modular_janus.py
index a6965687781c..0d484ffb0c05 100644
--- a/src/transformers/models/janus/modular_janus.py
+++ b/src/transformers/models/janus/modular_janus.py
@@ -379,7 +379,7 @@ class JanusPreTrainedModel(PreTrainedModel):
     config_class = JanusConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["LlamaDecoderLayer"]
+    _no_split_modules = ["LlamaDecoderLayer", "JanusVisionEncoderLayer"]
     _skip_keys_device_placement = ["past_key_values", "causal_mask"]
     _supports_flash_attn_2 = True
     _supports_sdpa = True
@@ -971,6 +971,7 @@ def forward(
 
         image_features = image_embeds.reshape(-1, embed_dim)
         image_attention_mask = image_attention_mask.unsqueeze(-1).expand(-1, -1, embed_dim)
+        image_attention_mask = image_attention_mask.to(inputs_embeds.device)
         image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
         inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features)
 
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index 0e741fd9db71..efe9fbc17d2d 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -130,6 +130,7 @@
     is_seqio_available,
     is_soundfile_available,
     is_spacy_available,
+    is_speech_available,
     is_spqr_available,
     is_sudachi_available,
     is_sudachi_projection_available,
@@ -1476,6 +1477,13 @@ def require_tiktoken(test_case):
     return unittest.skipUnless(is_tiktoken_available(), "test requires TikToken")(test_case)
 
 
+def require_speech(test_case):
+    """
+    Decorator marking a test that requires torchaudio. These tests are skipped when torchaudio isn't available.
+    """
+    return unittest.skipUnless(is_speech_available(), "test requires torchaudio")(test_case)
+
+
 def get_gpu_count():
     """
     Return the number of available gpus (regardless of whether torch, tf or jax is used)
diff --git a/tests/models/janus/test_modeling_janus.py b/tests/models/janus/test_modeling_janus.py
index 48cf7ebc2f5e..2729c0718c3a 100644
--- a/tests/models/janus/test_modeling_janus.py
+++ b/tests/models/janus/test_modeling_janus.py
@@ -35,6 +35,7 @@
 from transformers.models.auto import get_values
 from transformers.models.auto.modeling_auto import MODEL_FOR_BACKBONE_MAPPING_NAMES, MODEL_MAPPING_NAMES
 from transformers.testing_utils import (
+    Expectations,
     require_torch,
     slow,
     torch_device,
@@ -538,12 +539,21 @@ def test_model_generate_images(self):
         self.assertTrue(out.shape[1] == 576)
 
         # fmt: off
-        expected_tokens = torch.tensor([4484, 4015, 15750, 506, 3758, 11651, 8597, 5739, 4861, 971,
-                                        14985, 14834, 15438, 7548, 1820, 1465, 13529, 12761, 10503, 12761,
-                                        14303, 6155, 4015, 11766, 705, 15736, 14146, 10417, 1951, 7713,
-                                        14305, 15617, 6169, 2706, 8006, 14893, 3855, 10188, 15652, 6297,
-                                        1097, 12108, 15038, 311, 14998, 15165, 897, 4044, 1762, 4676,
-                                        ]).to(model.device)
+        expected_tokens = Expectations(
+            {
+                ("rocm", None): [10367, 1380, 4841, 15155, 1224, 16361, 15834, 13722, 15258, 8321, 10496, 14532, 8770,
+                                 12353, 5481, 11484, 2585, 8587, 3201, 14292, 3356, 2037, 3077, 6107, 3758, 2572, 9376,
+                                 13219, 6007, 14292, 12696, 10666, 10046, 13483, 8282, 9101, 5208, 4260, 13886, 13335,
+                                 6135, 2316, 15423, 311, 5460, 12218, 14172, 8583, 14577, 3648
+                ],
+                ("cuda", None): [4484, 4015, 15750, 506, 3758, 11651, 8597, 5739, 4861, 971, 14985, 14834, 15438, 7548,
+                                 1820, 1465, 13529, 12761, 10503, 12761, 14303, 6155, 4015, 11766, 705, 15736, 14146,
+                                 10417, 1951, 7713, 14305, 15617, 6169, 2706, 8006, 14893, 3855, 10188, 15652, 6297,
+                                 1097, 12108, 15038, 311, 14998, 15165, 897, 4044, 1762, 4676
+                ],
+            }
+        )
+        expected_tokens = torch.tensor(expected_tokens.get_expectation()).to(model.device)
         # fmt: on
 
         # Compare the first 50 generated tokens.
diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
index e802e8cfb921..505237239376 100644
--- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
+++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
@@ -18,7 +18,7 @@
 import unittest
 
 from transformers import SeamlessM4TConfig, is_speech_available, is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
+from transformers.testing_utils import require_speech, require_torch, slow, torch_device
 from transformers.trainer_utils import set_seed
 from transformers.utils import cached_property
 
@@ -1028,6 +1028,7 @@ def test_to_swh_text(self):
 
         self.assertListAlmostEqual(expected_wav_slice, output.waveform.squeeze().tolist()[50:60])
 
+    @require_speech
     @slow
     def test_to_rus_speech(self):
         model = SeamlessM4TModel.from_pretrained(self.repo_id).to(torch_device)
@@ -1066,6 +1067,7 @@ def test_text_to_text_model(self):
         }
         self.factory_test_task(SeamlessM4TModel, SeamlessM4TForTextToText, self.input_text, kwargs1, kwargs2)
 
+    @require_speech
     @slow
     def test_speech_to_text_model(self):
         kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True, "generate_speech": False}
@@ -1077,6 +1079,7 @@ def test_speech_to_text_model(self):
         }
         self.factory_test_task(SeamlessM4TModel, SeamlessM4TForSpeechToText, self.input_audio, kwargs1, kwargs2)
 
+    @require_speech
     @slow
     def test_speech_to_speech_model(self):
         kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True}
diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
index 75ff7edccbdd..eab2b0bd282f 100644
--- a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
+++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
@@ -18,7 +18,7 @@
 import unittest
 
 from transformers import SeamlessM4Tv2Config, is_speech_available, is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
+from transformers.testing_utils import require_speech, require_torch, slow, torch_device
 from transformers.trainer_utils import set_seed
 from transformers.utils import cached_property
 
@@ -1095,6 +1095,7 @@ def test_to_swh_text(self):
             [-2.001826e-04, 8.580012e-02], [output.waveform.mean().item(), output.waveform.std().item()]
         )
 
+    @require_speech
     @slow
     def test_to_rus_speech(self):
         model = SeamlessM4Tv2Model.from_pretrained(self.repo_id).to(torch_device)
@@ -1139,6 +1140,7 @@ def test_text_to_text_model(self):
         }
         self.factory_test_task(SeamlessM4Tv2Model, SeamlessM4Tv2ForTextToText, self.input_text, kwargs1, kwargs2)
 
+    @require_speech
     @slow
     def test_speech_to_text_model(self):
         kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True, "generate_speech": False}
@@ -1150,6 +1152,7 @@ def test_speech_to_text_model(self):
         }
         self.factory_test_task(SeamlessM4Tv2Model, SeamlessM4Tv2ForSpeechToText, self.input_audio, kwargs1, kwargs2)
 
+    @require_speech
     @slow
     def test_speech_to_speech_model(self):
         kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True}