diff --git a/tests/models/aya_vision/test_modeling_aya_vision.py b/tests/models/aya_vision/test_modeling_aya_vision.py
index 8911d39ec10c..b4a2f345b895 100644
--- a/tests/models/aya_vision/test_modeling_aya_vision.py
+++ b/tests/models/aya_vision/test_modeling_aya_vision.py
@@ -71,7 +71,7 @@ def __init__(
             "vocab_size": 99,
             "hidden_size": 128,
             "intermediate_size": 37,
-            "num_hidden_layers": 4,
+            "num_hidden_layers": 2,
             "num_attention_heads": 4,
             "output_channels": 64,
             "hidden_act": "silu",
diff --git a/tests/models/bamba/test_modeling_bamba.py b/tests/models/bamba/test_modeling_bamba.py
index c2e7c435dbfa..06f99fc1c6ac 100644
--- a/tests/models/bamba/test_modeling_bamba.py
+++ b/tests/models/bamba/test_modeling_bamba.py
@@ -73,7 +73,7 @@ def __init__(
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=4,
         num_key_value_heads=2,
         intermediate_size=64,
diff --git a/tests/models/bitnet/test_modeling_bitnet.py b/tests/models/bitnet/test_modeling_bitnet.py
index 75d885ba4d51..19bc0c45eb2e 100644
--- a/tests/models/bitnet/test_modeling_bitnet.py
+++ b/tests/models/bitnet/test_modeling_bitnet.py
@@ -49,7 +49,7 @@ def __init__(
         use_input_mask=True,
         vocab_size=99,
         hidden_size=64,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         num_key_value_heads=2,
         intermediate_size=37,
diff --git a/tests/models/bros/test_modeling_bros.py b/tests/models/bros/test_modeling_bros.py
index 3a80497cafc6..8f3f5957e02e 100644
--- a/tests/models/bros/test_modeling_bros.py
+++ b/tests/models/bros/test_modeling_bros.py
@@ -49,7 +49,7 @@ def __init__(
         use_labels=True,
         vocab_size=99,
         hidden_size=64,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/cohere/test_modeling_cohere.py b/tests/models/cohere/test_modeling_cohere.py
index 427a7f447d74..436d1f9d4226 100644
--- a/tests/models/cohere/test_modeling_cohere.py
+++ b/tests/models/cohere/test_modeling_cohere.py
@@ -54,7 +54,7 @@ def __init__(
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/cohere2_vision/test_modeling_cohere2_vision.py b/tests/models/cohere2_vision/test_modeling_cohere2_vision.py
index 7a12c2ad9fca..776b2b254f17 100644
--- a/tests/models/cohere2_vision/test_modeling_cohere2_vision.py
+++ b/tests/models/cohere2_vision/test_modeling_cohere2_vision.py
@@ -65,7 +65,7 @@ def __init__(
             "vocab_size": 99,
             "hidden_size": 128,
             "intermediate_size": 37,
-            "num_hidden_layers": 4,
+            "num_hidden_layers": 2,
             "num_attention_heads": 4,
             "output_channels": 64,
             "hidden_act": "silu",
diff --git a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py
index 9ed521509408..62bb9c999958 100644
--- a/tests/models/deepseek_v3/test_modeling_deepseek_v3.py
+++ b/tests/models/deepseek_v3/test_modeling_deepseek_v3.py
@@ -65,7 +65,7 @@ def __init__(
         hidden_size=32,
         intermediate_size=37,
         moe_intermediate_size=12,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         num_key_value_heads=4,
         n_shared_experts=1,
diff --git a/tests/models/eomt/test_modeling_eomt.py b/tests/models/eomt/test_modeling_eomt.py
index 1c92692f2795..f0d4a7c1fa9e 100644
--- a/tests/models/eomt/test_modeling_eomt.py
+++ b/tests/models/eomt/test_modeling_eomt.py
@@ -47,7 +47,7 @@ def __init__(
         num_labels=4,
         hidden_size=8,
         num_attention_heads=2,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
     ):
         self.parent = parent
         self.batch_size = batch_size
diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py
index f15b86d425f1..14e160fe594f 100644
--- a/tests/models/falcon/test_modeling_falcon.py
+++ b/tests/models/falcon/test_modeling_falcon.py
@@ -208,7 +208,7 @@ def test_falcon_alibi_sdpa_matches_eager(self):
         config = FalconConfig(
             vocab_size=1000,
             hidden_size=64,
-            num_hidden_layers=3,
+            num_hidden_layers=2,
             num_attention_heads=4,
             new_decoder_architecture=True,
             alibi=True,
diff --git a/tests/models/falcon_h1/test_modeling_falcon_h1.py b/tests/models/falcon_h1/test_modeling_falcon_h1.py
index cc78f7bf7c1d..3e475ef70802 100644
--- a/tests/models/falcon_h1/test_modeling_falcon_h1.py
+++ b/tests/models/falcon_h1/test_modeling_falcon_h1.py
@@ -55,7 +55,7 @@ def __init__(
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=4,
         num_key_value_heads=2,
         intermediate_size=64,
diff --git a/tests/models/got_ocr2/test_modeling_got_ocr2.py b/tests/models/got_ocr2/test_modeling_got_ocr2.py
index 59577106b069..3ece8d3aabaf 100644
--- a/tests/models/got_ocr2/test_modeling_got_ocr2.py
+++ b/tests/models/got_ocr2/test_modeling_got_ocr2.py
@@ -59,7 +59,7 @@ def __init__(
             "vocab_size": 99,
             "hidden_size": 128,
             "intermediate_size": 37,
-            "num_hidden_layers": 4,
+            "num_hidden_layers": 2,
             "num_attention_heads": 4,
             "num_key_value_heads": 2,
             "output_channels": 64,
diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py
index 2cf220fd6dfd..5539d6a0b075 100644
--- a/tests/models/idefics/test_modeling_idefics.py
+++ b/tests/models/idefics/test_modeling_idefics.py
@@ -67,7 +67,7 @@ def __init__(
         use_labels=True,
         vocab_size=99,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
@@ -85,7 +85,7 @@ def __init__(
         vision_patch_size=2,
         vision_image_size=30,
         vision_num_attention_heads=4,
-        vision_num_hidden_layers=5,
+        vision_num_hidden_layers=2,
         vision_intermediate_size=37,
         perceiver_qk_layer_norms_perceiver=False,
         perceiver_resampler_depth=2,
diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py
index a500d8bf4946..6603f3604e0b 100644
--- a/tests/models/idefics2/test_modeling_idefics2.py
+++ b/tests/models/idefics2/test_modeling_idefics2.py
@@ -86,7 +86,7 @@ def __init__(
             "vocab_size": 100,
             "hidden_size": 64,
             "intermediate_size": 56,
-            "num_hidden_layers": 3,
+            "num_hidden_layers": 2,
             "num_attention_heads": 2,
             "num_key_value_heads": 2,
             "hidden_act": "silu",
diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py
index b4434f34b81c..fe05eda8c0fb 100644
--- a/tests/models/idefics3/test_modeling_idefics3.py
+++ b/tests/models/idefics3/test_modeling_idefics3.py
@@ -74,7 +74,7 @@ def __init__(
             "vocab_size": 100,
             "hidden_size": 64,
             "intermediate_size": 56,
-            "num_hidden_layers": 3,
+            "num_hidden_layers": 2,
             "num_attention_heads": 2,
             "num_key_value_heads": 2,
             "hidden_act": "silu",
diff --git a/tests/models/internvl/test_modeling_internvl.py b/tests/models/internvl/test_modeling_internvl.py
index 297dc6cffe85..8704fccb6a1c 100644
--- a/tests/models/internvl/test_modeling_internvl.py
+++ b/tests/models/internvl/test_modeling_internvl.py
@@ -74,7 +74,7 @@ def __init__(
             "vocab_size": 99,
             "hidden_size": 128,
             "intermediate_size": 37,
"num_hidden_layers": 4, + "num_hidden_layers": 2, "num_attention_heads": 4, "num_key_value_heads": 2, "output_channels": 64, diff --git a/tests/models/longcat_flash/test_modeling_longcat_flash.py b/tests/models/longcat_flash/test_modeling_longcat_flash.py index bc52e890ce0a..ecfda972339d 100644 --- a/tests/models/longcat_flash/test_modeling_longcat_flash.py +++ b/tests/models/longcat_flash/test_modeling_longcat_flash.py @@ -60,7 +60,7 @@ def __init__( hidden_size=144, ffn_hidden_size=288, expert_ffn_hidden_size=48, - num_layers=2, + num_layers=1, # We have `self.num_hidden_layers = 2 * num_layers` in the body. See `LongcatFlashConfig`. num_attention_heads=8, num_key_value_heads=8, kv_lora_rank=16, @@ -96,7 +96,7 @@ def __init__( self.expert_ffn_hidden_size = expert_ffn_hidden_size self.num_layers = num_layers self.num_hidden_layers = 2 * num_layers # for compatibility - self.expected_num_hidden_layers = 3 # embedding + 2 layers + self.expected_num_hidden_layers = 2 # embedding + 2 layers self.num_attention_heads = num_attention_heads self.num_key_value_heads = num_key_value_heads self.kv_lora_rank = kv_lora_rank diff --git a/tests/models/lxmert/test_modeling_lxmert.py b/tests/models/lxmert/test_modeling_lxmert.py index 033fcc0605d6..3d9a88d561ce 100644 --- a/tests/models/lxmert/test_modeling_lxmert.py +++ b/tests/models/lxmert/test_modeling_lxmert.py @@ -59,7 +59,7 @@ def __init__( num_object_labels=16, num_attr_labels=4, num_visual_features=10, - l_layers=2, + l_layers=1, x_layers=1, r_layers=1, visual_feat_dim=128, diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index ca5579ecb058..0d151602ffce 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -145,7 +145,7 @@ def __init__( "model_type": "mllama", "vocab_size": 99, "hidden_size": 32, - "num_hidden_layers": 4, + "num_hidden_layers": 2, "num_attention_heads": 4, "num_key_value_heads": 4, "intermediate_size": 37, @@ -166,7 +166,7 @@ def __init__( "intermediate_layers_indices": [0], "vision_output_dim": 32, "projection_dim": 32, - "num_hidden_layers": 6, + "num_hidden_layers": 2, "num_global_layers": 2, "num_attention_heads": 4, "intermediate_size": 37, diff --git a/tests/models/pop2piano/test_modeling_pop2piano.py b/tests/models/pop2piano/test_modeling_pop2piano.py index 0a4a773faac2..91e25f6093b2 100644 --- a/tests/models/pop2piano/test_modeling_pop2piano.py +++ b/tests/models/pop2piano/test_modeling_pop2piano.py @@ -57,7 +57,7 @@ def __init__( use_attention_mask=True, use_labels=True, hidden_size=64, - num_hidden_layers=5, + num_hidden_layers=2, num_attention_heads=4, d_ff=37, relative_attention_num_buckets=8, diff --git a/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py b/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py index 32ebdd0ab036..61fa18153902 100644 --- a/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py +++ b/tests/models/qwen2_5_omni/test_modeling_qwen2_5_omni.py @@ -99,7 +99,7 @@ def __init__( "vocab_size": 99, "hidden_size": 32, "intermediate_size": 37, - "num_hidden_layers": 4, + "num_hidden_layers": 2, "num_attention_heads": 4, "num_key_value_heads": 2, "hidden_act": "silu", diff --git a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py index 650f8b05d3b1..d90dff9f13ff 100644 --- a/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py +++ b/tests/models/qwen2_5_vl/test_modeling_qwen2_5_vl.py @@ -85,7 +85,7 @@ def __init__( 
         max_window_layers=3,
         model_type="qwen2_5_vl",
         num_attention_heads=4,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_key_value_heads=2,
         rope_theta=10000,
         tie_word_embeddings=True,
diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
index ef109fb7cca7..37f315b5dc38 100644
--- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py
@@ -79,7 +79,7 @@ def __init__(
         max_window_layers=3,
         model_type="qwen2_vl",
         num_attention_heads=4,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_key_value_heads=2,
         rope_theta=10000,
         tie_word_embeddings=True,
diff --git a/tests/models/qwen3_vl/test_modeling_qwen3_vl.py b/tests/models/qwen3_vl/test_modeling_qwen3_vl.py
index 35031bf542aa..6074efecf4a9 100644
--- a/tests/models/qwen3_vl/test_modeling_qwen3_vl.py
+++ b/tests/models/qwen3_vl/test_modeling_qwen3_vl.py
@@ -61,7 +61,7 @@ def __init__(
             "max_position_embeddings": 512,
             "model_type": "qwen3_vl",
             "num_attention_heads": 4,
-            "num_hidden_layers": 4,
+            "num_hidden_layers": 2,
             "num_key_value_heads": 2,
             "rope_theta": 10000,
             "tie_word_embeddings": True,
diff --git a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
index adae69a81fa8..411845fcbfa5 100644
--- a/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
+++ b/tests/models/qwen3_vl_moe/test_modeling_qwen3_vl_moe.py
@@ -61,7 +61,7 @@ def __init__(
             "model_type": "qwen3_vl_moe",
             "num_attention_heads": 4,
             "num_key_value_heads": 2,
-            "num_hidden_layers": 4,
+            "num_hidden_layers": 2,
             "moe_intermediate_size": 16,
             "num_experts_per_tok": 4,
             "num_experts": 8,
diff --git a/tests/models/reformer/test_modeling_reformer.py b/tests/models/reformer/test_modeling_reformer.py
index 8f2b1cdc9957..48df1559e991 100644
--- a/tests/models/reformer/test_modeling_reformer.py
+++ b/tests/models/reformer/test_modeling_reformer.py
@@ -83,7 +83,7 @@ def __init__(
         axial_pos_embds=True,
         axial_pos_shape=[4, 8],
         axial_pos_embds_dim=[16, 16],
-        attn_layers=["local", "local", "local", "local"],
+        attn_layers=["local", "local"],
         pad_token_id=0,
         eos_token_id=2,
         scope=None,
diff --git a/tests/models/smolvlm/test_modeling_smolvlm.py b/tests/models/smolvlm/test_modeling_smolvlm.py
index 6a3c8c5fa346..7856afd2c9eb 100644
--- a/tests/models/smolvlm/test_modeling_smolvlm.py
+++ b/tests/models/smolvlm/test_modeling_smolvlm.py
@@ -77,7 +77,7 @@ def __init__(
             "vocab_size": 100,
             "hidden_size": 64,
             "intermediate_size": 56,
-            "num_hidden_layers": 3,
+            "num_hidden_layers": 2,
             "num_attention_heads": 2,
             "num_key_value_heads": 2,
             "hidden_act": "silu",
diff --git a/tests/models/udop/test_modeling_udop.py b/tests/models/udop/test_modeling_udop.py
index 3ec5df33d2b9..4e6aa707ee20 100644
--- a/tests/models/udop/test_modeling_udop.py
+++ b/tests/models/udop/test_modeling_udop.py
@@ -55,7 +55,7 @@ def __init__(
         use_attention_mask=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         d_ff=37,
         relative_attention_num_buckets=32,
@@ -425,7 +425,7 @@ def __init__(
         is_training=False,
         use_attention_mask=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         decoder_layers=2,
         num_attention_heads=4,
         d_ff=37,
diff --git a/tests/models/vitpose/test_modeling_vitpose.py b/tests/models/vitpose/test_modeling_vitpose.py
index 7cb92e10f005..d5dddc74a3bc 100644
--- a/tests/models/vitpose/test_modeling_vitpose.py
+++ b/tests/models/vitpose/test_modeling_vitpose.py
@@ -51,7 +51,7 @@ def __init__(
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
index 5a35795a7495..6f8ee5eb9ed4 100644
--- a/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
+++ b/tests/models/vitpose_backbone/test_modeling_vitpose_backbone.py
@@ -44,7 +44,7 @@ def __init__(
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=5,
+        num_hidden_layers=2,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
diff --git a/tests/models/vjepa2/test_modeling_vjepa2.py b/tests/models/vjepa2/test_modeling_vjepa2.py
index 1d0004122ab4..c61cb72bc0a0 100644
--- a/tests/models/vjepa2/test_modeling_vjepa2.py
+++ b/tests/models/vjepa2/test_modeling_vjepa2.py
@@ -61,7 +61,7 @@ def __init__(
         patch_size=16,
         num_channels=3,
         hidden_size=32,
-        num_hidden_layers=4,
+        num_hidden_layers=2,
         num_attention_heads=2,
         num_frames=2,
         mlp_ratio=1,
diff --git a/tests/models/xlnet/test_modeling_xlnet.py b/tests/models/xlnet/test_modeling_xlnet.py
index ae0e2b9d56df..9f1fb24d17c7 100644
--- a/tests/models/xlnet/test_modeling_xlnet.py
+++ b/tests/models/xlnet/test_modeling_xlnet.py
@@ -80,7 +80,7 @@ def __init__(
         self.hidden_size = 32
         self.num_attention_heads = 4
         self.d_inner = 128
-        self.num_hidden_layers = 5
+        self.num_hidden_layers = 3
         self.type_sequence_label_size = 2
         self.bi_data = False
         self.same_length = False
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 188c7517d54c..5739bab3a3a6 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -674,6 +674,48 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
 
         return inputs_dict
 
+    def test_num_layers_is_small(self):
+        # TODO (if possible): Avoid exceptional cases, especially for `OwlViT`.
+        # ⛔ DO NOT edit this list (unless there is really nothing to tweak in the model tester class and it is approved by the reviewer) ⛔!
+        exceptional_num_hidden_layers = {
+            # TODO: There might be some way to fix these
+            "FunnelModelTest": 5,
+            "FunnelBaseModelTest": 4,
+            "GroupViTVisionModelTest": 12,
+            "OwlViTModelTest": 12,
+            "OwlViTTextModelTest": 12,
+            "OwlViTForObjectDetectionTest": 12,
+            "Owlv2ModelTest": 12,
+            "Owlv2TextModelTest": 12,
+            "Owlv2ForObjectDetectionTest": 12,
+            "SamHQModelTest": 12,
+            "Swin2SRModelTest": 3,
+            "XLNetModelTest": 3,
+            "DPTModelTest": 4,  # `test_modeling_dpt_hybrid.py`: not able to get it to work after changing `num_hidden_layers` and `neck_hidden_sizes`
+            # Nothing we can do
+            "Gemma3nTextModelTest": 4,  # needs to test the KV shared layer for both attention types: `full_attention` and `sliding_attention`
+            "BeitModelTest": 4,  # BeitForSemanticSegmentation requires config.out_indices to be a list of 4 integers
+            "ZambaModelTest": 5,  # the minimum number needed to test beyond the initial ["mamba", "mamba", "hybrid"] in `ZambaConfig._layers_block_type`
+        }
+        target_num_hidden_layers = exceptional_num_hidden_layers.get(type(self).__name__, 2)
+
+        if hasattr(self.model_tester, "num_hidden_layers") and isinstance(self.model_tester.num_hidden_layers, int):
+            assert self.model_tester.num_hidden_layers <= target_num_hidden_layers
+
+        # `vision_config` / `text_config` may be a plain dict or a config object, so only use `in` on dicts
+        if hasattr(self.model_tester, "vision_config"):
+            if isinstance(self.model_tester.vision_config, dict):
+                if "num_hidden_layers" in self.model_tester.vision_config:
+                    assert self.model_tester.vision_config["num_hidden_layers"] <= target_num_hidden_layers
+            elif hasattr(self.model_tester.vision_config, "num_hidden_layers"):
+                assert self.model_tester.vision_config.num_hidden_layers <= target_num_hidden_layers
+        if hasattr(self.model_tester, "text_config"):
+            if isinstance(self.model_tester.text_config, dict):
+                if "num_hidden_layers" in self.model_tester.text_config:
+                    assert self.model_tester.text_config["num_hidden_layers"] <= target_num_hidden_layers
+            elif hasattr(self.model_tester.text_config, "num_hidden_layers"):
+                assert self.model_tester.text_config.num_hidden_layers <= target_num_hidden_layers
+
     def test_save_load(self):
         def check_save_load(out1, out2):
             # make sure we don't have nans