diff --git a/src/transformers/models/arcee/configuration_arcee.py b/src/transformers/models/arcee/configuration_arcee.py index b4e23ffb3b8f..b9892eaf8b61 100644 --- a/src/transformers/models/arcee/configuration_arcee.py +++ b/src/transformers/models/arcee/configuration_arcee.py @@ -136,7 +136,7 @@ def __init__( bos_token_id: Optional[int] = 128000, eos_token_id: Optional[int] = 128001, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, diff --git a/src/transformers/models/arcee/modular_arcee.py b/src/transformers/models/arcee/modular_arcee.py index cb75888957d8..09fbe14d291f 100644 --- a/src/transformers/models/arcee/modular_arcee.py +++ b/src/transformers/models/arcee/modular_arcee.py @@ -137,7 +137,7 @@ def __init__( bos_token_id: Optional[int] = 128000, eos_token_id: Optional[int] = 128001, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py index 78669c78bcbb..192d78776679 100644 --- a/src/transformers/models/aria/configuration_aria.py +++ b/src/transformers/models/aria/configuration_aria.py @@ -134,7 +134,7 @@ def __init__( eos_token_id: Optional[int] = 2, pretraining_tp: Optional[int] = 1, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, diff --git a/src/transformers/models/bitnet/configuration_bitnet.py b/src/transformers/models/bitnet/configuration_bitnet.py index 1bcf84d0c6c4..0473ad6ac407 100644 --- a/src/transformers/models/bitnet/configuration_bitnet.py +++ b/src/transformers/models/bitnet/configuration_bitnet.py @@ -117,7 +117,7 @@ def __init__( tie_word_embeddings: Optional[bool] = False, attention_bias: Optional[bool] = False, attention_dropout: Optional[str] = 0.0, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, **kwargs, ): self.vocab_size = vocab_size diff --git a/src/transformers/models/blt/configuration_blt.py b/src/transformers/models/blt/configuration_blt.py index b20ae8c6dad3..7459346645ea 100644 --- a/src/transformers/models/blt/configuration_blt.py +++ b/src/transformers/models/blt/configuration_blt.py @@ -44,7 +44,7 @@ def __init__( rms_norm_eps: Optional[float] = 1e-5, dropout: Optional[float] = 0.0, max_position_embeddings: Optional[int] = 24576, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, hidden_act: Optional[str] = "silu", intermediate_size: Optional[int] = 2816, initializer_range: Optional[float] = 0.02, @@ -99,7 +99,7 @@ def __init__( rms_norm_eps: Optional[float] = 1e-5, dropout: Optional[float] = 0.0, max_position_embeddings: Optional[int] = 
24576, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, hidden_act: Optional[str] = "silu", intermediate_size: Optional[int] = 2816, initializer_range: Optional[float] = 0.02, @@ -150,7 +150,7 @@ def __init__( rms_norm_eps: Optional[float] = 1e-5, dropout: Optional[float] = 0.0, max_position_embeddings: Optional[int] = 4096, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, hidden_act: Optional[str] = "silu", intermediate_size: Optional[int] = 5632, initializer_range: Optional[float] = 0.02, @@ -231,7 +231,7 @@ def __init__( rms_norm_eps: Optional[float] = 1e-5, dropout: Optional[float] = 0.0, intermediate_size: Optional[int] = 2048, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, initializer_range: Optional[float] = 0.02, **kwargs, ): @@ -356,7 +356,7 @@ def __init__( global_config: Optional[dict] = None, tie_word_embeddings: Optional[bool] = False, initializer_range: Optional[float] = 0.02, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, **kwargs, ): # Basic model configuration diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index 72e6eccca2a3..bfa8a9f33469 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -203,7 +203,7 @@ def __init__( bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[int] = False, attention_dropout: Optional[float] = 0.0, model_parallel_size: Optional[int] = 1, diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py index ac75ea93c864..18afd5fd32e9 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -139,7 +139,7 @@ def __init__( bos_token_id: Optional[int] = 5, eos_token_id: Optional[int] = 255001, tie_word_embeddings: Optional[bool] = True, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, use_qk_norm: Optional[bool] = False, diff --git a/src/transformers/models/cohere2/configuration_cohere2.py b/src/transformers/models/cohere2/configuration_cohere2.py index 7bf87307ee1d..910dc6dcb80a 100644 --- a/src/transformers/models/cohere2/configuration_cohere2.py +++ b/src/transformers/models/cohere2/configuration_cohere2.py @@ -138,7 +138,7 @@ def __init__( bos_token_id: Optional[int] = 5, eos_token_id: Optional[int] = 255001, tie_word_embeddings: Optional[bool] = True, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, 
sliding_window: Optional[int] = 4096, diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index dab998730c77..af9fa871f391 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ b/src/transformers/models/cohere2/modular_cohere2.py @@ -162,7 +162,7 @@ def __init__( bos_token_id: Optional[int] = 5, eos_token_id: Optional[int] = 255001, tie_word_embeddings: Optional[bool] = True, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, sliding_window: Optional[int] = 4096, diff --git a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index 227609c2f1aa..ce1ad2dd5993 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -122,7 +122,7 @@ def __init__( pad_token_id: Optional[int] = None, bos_token_id: Optional[int] = None, eos_token_id: Optional[int] = None, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, @@ -291,7 +291,7 @@ def __init__( eos_token_id: Optional[int] = None, audio_token_id: Optional[int] = 128002, audio_eos_token_id: Optional[int] = 128003, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 987cb8a8ac06..82182c49bd3f 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -189,7 +189,7 @@ def __init__( use_cache: Optional[bool] = True, initializer_range: Optional[float] = 0.02, output_router_logits: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, **kwargs: Any, ): if attn_config is None: diff --git a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py index 7e5a8c93feec..aad76507d3a6 100644 --- a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py @@ -154,7 +154,7 @@ def __init__( bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, diff --git a/src/transformers/models/deepseek_v2/modular_deepseek_v2.py b/src/transformers/models/deepseek_v2/modular_deepseek_v2.py index 0a5e1a8b4f06..7e60d5c858b3 100644 --- a/src/transformers/models/deepseek_v2/modular_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/modular_deepseek_v2.py @@ -167,7 +167,7 @@ def __init__( bos_token_id: 
Optional[int] = 1, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, diff --git a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py index eed1ea34def4..f90c5e175ba5 100644 --- a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py @@ -186,7 +186,7 @@ def __init__( eos_token_id: Optional[int] = 1, pretraining_tp: Optional[int] = 1, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, rope_interleave: Optional[bool] = True, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, diff --git a/src/transformers/models/diffllama/configuration_diffllama.py b/src/transformers/models/diffllama/configuration_diffllama.py index 0eac1f506c72..cbfb5fea5160 100644 --- a/src/transformers/models/diffllama/configuration_diffllama.py +++ b/src/transformers/models/diffllama/configuration_diffllama.py @@ -118,7 +118,7 @@ def __init__( bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, lambda_std_dev: Optional[float] = 0.1, diff --git a/src/transformers/models/doge/configuration_doge.py b/src/transformers/models/doge/configuration_doge.py index 844b9519b45a..db2d3014d978 100644 --- a/src/transformers/models/doge/configuration_doge.py +++ b/src/transformers/models/doge/configuration_doge.py @@ -148,7 +148,7 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, max_position_embeddings: Optional[int] = 2048, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, num_attention_heads: Optional[int] = 8, num_key_value_heads: Optional[int] = None, attention_bias: Optional[bool] = False, diff --git a/src/transformers/models/doge/modular_doge.py b/src/transformers/models/doge/modular_doge.py index 52603d99dcd4..fd71f7479f6b 100644 --- a/src/transformers/models/doge/modular_doge.py +++ b/src/transformers/models/doge/modular_doge.py @@ -176,7 +176,7 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, max_position_embeddings: Optional[int] = 2048, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, num_attention_heads: Optional[int] = 8, num_key_value_heads: Optional[int] = None, attention_bias: Optional[bool] = False, diff --git a/src/transformers/models/dots1/configuration_dots1.py b/src/transformers/models/dots1/configuration_dots1.py index db524dd5789c..a5755ad0a45f 100644 --- a/src/transformers/models/dots1/configuration_dots1.py +++ b/src/transformers/models/dots1/configuration_dots1.py @@ -159,7 +159,7 @@ def 
__init__( rms_norm_eps: Optional[int] = 1e-6, use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, routed_scaling_factor: Optional[float] = 1.0, diff --git a/src/transformers/models/ernie4_5/configuration_ernie4_5.py b/src/transformers/models/ernie4_5/configuration_ernie4_5.py index 03aefe766cf6..346eff50e9f2 100644 --- a/src/transformers/models/ernie4_5/configuration_ernie4_5.py +++ b/src/transformers/models/ernie4_5/configuration_ernie4_5.py @@ -125,7 +125,7 @@ def __init__( bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = True, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, use_bias: Optional[bool] = False, head_dim: Optional[int] = 128, **kwargs, diff --git a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py index 0fd108a28b40..19ed1853db33 100644 --- a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +++ b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py @@ -161,7 +161,7 @@ def __init__( rms_norm_eps: Optional[int] = 1e-5, use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = True, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, use_bias: Optional[int] = False, moe_intermediate_size: Optional[int] = 1536, moe_k: Optional[int] = 6, diff --git a/src/transformers/models/evolla/configuration_evolla.py b/src/transformers/models/evolla/configuration_evolla.py index 218bc50ad964..4dab03fb9314 100644 --- a/src/transformers/models/evolla/configuration_evolla.py +++ b/src/transformers/models/evolla/configuration_evolla.py @@ -203,7 +203,7 @@ def __init__( hidden_act: Optional[str] = "silu", # llama activation function max_position_embeddings: Optional[int] = 8192, # llama rope max length rms_norm_eps: Optional[int] = 1e-05, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py index 68bdaf5ce9b3..a968bcc6f07b 100644 --- a/src/transformers/models/exaone4/configuration_exaone4.py +++ b/src/transformers/models/exaone4/configuration_exaone4.py @@ -143,7 +143,7 @@ def __init__( bos_token_id: Optional[int] = 0, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_dropout: Optional[float] = 0.0, sliding_window: Optional[int] = 4096, sliding_window_pattern: Optional[int] = 4, diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index d03510d54d46..4ddc3466ffd9 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ 
b/src/transformers/models/exaone4/modular_exaone4.py @@ -176,7 +176,7 @@ def __init__( bos_token_id: Optional[int] = 0, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_dropout: Optional[float] = 0.0, sliding_window: Optional[int] = 4096, sliding_window_pattern: Optional[int] = 4, diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py index 2a6da686b72e..3e7b437954dc 100644 --- a/src/transformers/models/falcon/configuration_falcon.py +++ b/src/transformers/models/falcon/configuration_falcon.py @@ -128,7 +128,7 @@ def __init__( parallel_attn: Optional[bool] = True, bias: Optional[bool] = False, max_position_embeddings: Optional[int] = 2048, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, bos_token_id: Optional[int] = 11, eos_token_id: Optional[int] = 11, ffn_hidden_size: Optional[int] = None, diff --git a/src/transformers/models/falcon_h1/configuration_falcon_h1.py b/src/transformers/models/falcon_h1/configuration_falcon_h1.py index 85a7e76f3901..6ba590f15025 100644 --- a/src/transformers/models/falcon_h1/configuration_falcon_h1.py +++ b/src/transformers/models/falcon_h1/configuration_falcon_h1.py @@ -164,7 +164,7 @@ def __init__( mamba_norm_before_gate: Optional[bool] = True, mamba_rms_norm: Optional[bool] = False, projectors_bias: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, lm_head_multiplier: Optional[float] = 1.0, embedding_multiplier: Optional[float] = 1.0, mlp_multipliers: Optional[int] = None, diff --git a/src/transformers/models/flex_olmo/configuration_flex_olmo.py b/src/transformers/models/flex_olmo/configuration_flex_olmo.py index 0f0f63f2916b..515301b93c0c 100644 --- a/src/transformers/models/flex_olmo/configuration_flex_olmo.py +++ b/src/transformers/models/flex_olmo/configuration_flex_olmo.py @@ -141,7 +141,7 @@ def __init__( bos_token_id: Optional[int] = None, eos_token_id: Optional[int] = 100257, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, num_experts_per_tok: Optional[int] = 5, diff --git a/src/transformers/models/flex_olmo/modular_flex_olmo.py b/src/transformers/models/flex_olmo/modular_flex_olmo.py index e5c738aa4bc5..f6cee224c0ee 100644 --- a/src/transformers/models/flex_olmo/modular_flex_olmo.py +++ b/src/transformers/models/flex_olmo/modular_flex_olmo.py @@ -152,7 +152,7 @@ def __init__( bos_token_id: Optional[int] = None, eos_token_id: Optional[int] = 100257, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, num_experts_per_tok: Optional[int] = 5, diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index ae3b692cb474..bbe4a5ec22d8 100644 --- 
a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -118,7 +118,7 @@ def __init__( layer_norm_eps: Optional[int] = 1e-5, use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, qk_layernorm: Optional[bool] = True, hidden_dropout: Optional[float] = 0.0, attention_dropout: Optional[float] = 0.0, diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index 986ab2c9aa94..a2c6ac12f008 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -131,7 +131,7 @@ def __init__( eos_token_id: Optional[int] = 1, bos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = True, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, use_bidirectional_attention: Optional[bool] = None, diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index cc4cf066958a..aa64cc9e63e8 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -158,7 +158,7 @@ def __init__( eos_token_id: Optional[int] = 1, bos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = True, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, use_bidirectional_attention: Optional[bool] = None, diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py index 7fa77dbb8347..460fb7000354 100644 --- a/src/transformers/models/gemma2/configuration_gemma2.py +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -142,7 +142,7 @@ def __init__( eos_token_id: Optional[int] = 1, bos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = True, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, query_pre_attn_scalar: Optional[int] = 256, diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index 411c75ac516a..4e36cc22e030 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -171,7 +171,7 @@ def __init__( eos_token_id: Optional[int] = 1, bos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = True, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, query_pre_attn_scalar: Optional[int] = 256, diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index cbc0e890d9cc..796822cf4e37 100644 --- 
a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -177,7 +177,7 @@ def __init__( pad_token_id: int = 0, eos_token_id: int = 1, bos_token_id: int = 2, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: bool = False, attention_dropout: float = 0.0, sliding_window: int = 512, diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index adbcd029d7c2..6d431e9acc55 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -187,7 +187,7 @@ def __init__( pad_token_id: int = 0, eos_token_id: int = 1, bos_token_id: int = 2, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: bool = False, attention_dropout: float = 0.0, sliding_window: int = 512, diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 63685ce76729..e0d2c3d6492a 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -121,7 +121,7 @@ def __init__( rms_norm_eps: Optional[float] = 0.00000015625, use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, pad_token_id: Optional[int] = 151329, eos_token_id: Optional[list[int]] = [151329, 151336, 151338], bos_token_id: Optional[int] = None, diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index 026658fa0793..43e6323b0060 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -122,7 +122,7 @@ def __init__( rms_norm_eps: Optional[float] = 0.00000015625, use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, pad_token_id: Optional[int] = 151329, eos_token_id: Optional[list[int]] = [151329, 151336, 151338], bos_token_id: Optional[int] = None, diff --git a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py index a35dec5f4e3f..33d9afd756e5 100644 --- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py +++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py @@ -152,7 +152,7 @@ def __init__( rms_norm_eps: Optional[int] = 1e-5, use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, moe_intermediate_size: Optional[int] = 1408, diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py index db1f22e58e45..471d06d69ff9 100644 --- a/src/transformers/models/glm4_moe/modular_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py @@ -166,7 +166,7 
@@ def __init__( rms_norm_eps: Optional[int] = 1e-5, use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, moe_intermediate_size: Optional[int] = 1408, diff --git a/src/transformers/models/glm4v/configuration_glm4v.py b/src/transformers/models/glm4v/configuration_glm4v.py index e316c14079bd..c8f2ef75ca71 100644 --- a/src/transformers/models/glm4v/configuration_glm4v.py +++ b/src/transformers/models/glm4v/configuration_glm4v.py @@ -220,7 +220,7 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, image_token_id: Optional[int] = None, video_token_id: Optional[int] = None, **kwargs, diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index b34fef15b642..8ae513b63d44 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -257,7 +257,7 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, image_token_id: Optional[int] = None, video_token_id: Optional[int] = None, **kwargs, diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index dc0923801243..05a9a58089dd 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -242,7 +242,7 @@ def __init__( rms_norm_eps: Optional[int] = 1e-5, use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = True, attention_dropout: Optional[float] = 0.0, moe_intermediate_size: Optional[int] = 1408, diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index 62b5fee670df..9e7557c9ecf5 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -183,7 +183,7 @@ def __init__( rms_norm_eps: Optional[int] = 1e-5, use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = True, attention_dropout: Optional[float] = 0.0, moe_intermediate_size: Optional[int] = 1408, diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 1a2dafdf2668..744e0316146c 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -131,7 +131,7 @@ def __init__( eos_token_id: 
Optional[int] = 2, tie_word_embeddings: Optional[bool] = False, use_parallel_residual: Optional[bool] = True, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = True, **kwargs, ): diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index f09bc8810da0..409232145f2a 100644 --- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -100,7 +100,7 @@ def __init__( use_cache: Optional[bool] = True, bos_token_id: Optional[int] = 31996, eos_token_id: Optional[int] = 31999, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_dropout: Optional[float] = 0.1, hidden_dropout: Optional[float] = 0.0, **kwargs, diff --git a/src/transformers/models/granite/configuration_granite.py b/src/transformers/models/granite/configuration_granite.py index 65c04c3a67e1..97d3eca0aafe 100644 --- a/src/transformers/models/granite/configuration_granite.py +++ b/src/transformers/models/granite/configuration_granite.py @@ -141,7 +141,7 @@ def __init__( bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, diff --git a/src/transformers/models/granitemoe/configuration_granitemoe.py b/src/transformers/models/granitemoe/configuration_granitemoe.py index f1263f080630..98460ec8a363 100644 --- a/src/transformers/models/granitemoe/configuration_granitemoe.py +++ b/src/transformers/models/granitemoe/configuration_granitemoe.py @@ -130,7 +130,7 @@ def __init__( bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, embedding_multiplier: Optional[float] = 1.0, diff --git a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py index 55e1546fa435..9a58272ec428 100644 --- a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py @@ -147,7 +147,7 @@ def __init__( bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, embedding_multiplier: Optional[float] = 1.0, diff --git a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py index 00f87604bf51..b94545710e35 100644 --- 
a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py @@ -132,7 +132,7 @@ def __init__( bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, embedding_multiplier: Optional[float] = 1.0, diff --git a/src/transformers/models/helium/configuration_helium.py b/src/transformers/models/helium/configuration_helium.py index db7ccaf185ae..3f3ee841991f 100644 --- a/src/transformers/models/helium/configuration_helium.py +++ b/src/transformers/models/helium/configuration_helium.py @@ -124,7 +124,7 @@ def __init__( rms_norm_eps: Optional[int] = 1e-8, use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, pad_token_id: Optional[int] = 3, eos_token_id: Optional[int] = 2, bos_token_id: Optional[int] = 1, diff --git a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py index 29dd3ac34f98..3dfa5388d1f7 100644 --- a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +++ b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py @@ -116,7 +116,7 @@ def __init__( eod_token_id: Optional[int] = 3, pretraining_tp: Optional[int] = 1, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, head_dim: Optional[int] = None, diff --git a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py index 497a5674f4f3..5ee86b218ae0 100644 --- a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +++ b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py @@ -127,7 +127,7 @@ def __init__( sep_token_id: Optional[int] = 4, pretraining_tp: Optional[int] = 1, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, num_experts: Union[int, list] = 1, diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py index 9f5367d1c01c..43a7b069a32e 100644 --- a/src/transformers/models/jetmoe/configuration_jetmoe.py +++ b/src/transformers/models/jetmoe/configuration_jetmoe.py @@ -119,7 +119,7 @@ def __init__( bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = True, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, rms_norm_eps: Optional[int] = 1e-6, initializer_range: Optional[float] = 0.01, attention_dropout: Optional[float] = 0.0, diff --git 
a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py index d43856daa96a..05c901d96dd4 100644 --- a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py @@ -129,7 +129,7 @@ def __init__( num_attention_heads: Optional[int] = 32, num_key_value_heads: Optional[int] = None, max_position_embeddings: Optional[int] = 750, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, hidden_act: Optional[str] = "silu", head_dim: Optional[int] = None, initializer_range: Optional[float] = 0.02, diff --git a/src/transformers/models/lfm2/configuration_lfm2.py b/src/transformers/models/lfm2/configuration_lfm2.py index 4999f6ab433f..6ee32698cc85 100644 --- a/src/transformers/models/lfm2/configuration_lfm2.py +++ b/src/transformers/models/lfm2/configuration_lfm2.py @@ -117,7 +117,7 @@ def __init__( bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = True, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, conv_bias: Optional[bool] = False, conv_L_cache: Optional[int] = 3, block_multiple_of: Optional[int] = 256, diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 3b2543983e06..add6c8ee2f74 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -143,7 +143,7 @@ def __init__( eos_token_id: Optional[int] = 2, pretraining_tp: Optional[int] = 1, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, diff --git a/src/transformers/models/llama4/configuration_llama4.py b/src/transformers/models/llama4/configuration_llama4.py index 7d457cf8523c..a37301a17741 100644 --- a/src/transformers/models/llama4/configuration_llama4.py +++ b/src/transformers/models/llama4/configuration_llama4.py @@ -104,7 +104,7 @@ def __init__( multi_modal_projector_bias: Optional[bool] = False, projector_dropout: Optional[float] = 0.0, attention_dropout: Optional[float] = 0.0, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, **kwargs, ): self.hidden_size = hidden_size @@ -290,7 +290,7 @@ def __init__( output_router_logits=False, router_aux_loss_coef=0.001, router_jitter_noise=0.0, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, no_rope_layers=None, no_rope_layer_interval=4, attention_chunk_size=8192, diff --git a/src/transformers/models/longcat_flash/configuration_longcat_flash.py b/src/transformers/models/longcat_flash/configuration_longcat_flash.py index 7933cb5bb0dc..6163a0cad785 100644 --- a/src/transformers/models/longcat_flash/configuration_longcat_flash.py +++ b/src/transformers/models/longcat_flash/configuration_longcat_flash.py @@ -157,7 
+157,7 @@ def __init__( bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, ffn_hidden_size: Optional[int] = 12288, diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py index 733221273016..5453817e3ea4 100644 --- a/src/transformers/models/mimi/configuration_mimi.py +++ b/src/transformers/models/mimi/configuration_mimi.py @@ -179,7 +179,7 @@ def __init__( norm_eps: Optional[int] = 1e-5, use_cache: Optional[bool] = False, use_streaming: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, sliding_window: Optional[int] = 250, attention_dropout: Optional[float] = 0.0, layer_scale_initial_scale: Optional[float] = 0.01, diff --git a/src/transformers/models/minimax/configuration_minimax.py b/src/transformers/models/minimax/configuration_minimax.py index 8c4737cc5b67..b99a61a277ea 100644 --- a/src/transformers/models/minimax/configuration_minimax.py +++ b/src/transformers/models/minimax/configuration_minimax.py @@ -176,7 +176,7 @@ def __init__( output_router_logits: Optional[bool] = False, router_aux_loss_coef: Optional[float] = 0.001, router_jitter_noise: Optional[float] = 0.0, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, layer_types: Optional[list[str]] = None, block_size: Optional[int] = 256, full_attn_alpha_factor: Optional[int] = 1, diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index 50a42c9d5cec..d1bbb96bb5c1 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -200,7 +200,7 @@ def __init__( output_router_logits: Optional[bool] = False, router_aux_loss_coef: Optional[float] = 0.001, router_jitter_noise: Optional[float] = 0.0, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, layer_types: Optional[list[str]] = None, block_size: Optional[int] = 256, full_attn_alpha_factor: Optional[int] = 1, diff --git a/src/transformers/models/mistral/configuration_mistral.py b/src/transformers/models/mistral/configuration_mistral.py index e17bd2a65423..0fac55d26e2a 100644 --- a/src/transformers/models/mistral/configuration_mistral.py +++ b/src/transformers/models/mistral/configuration_mistral.py @@ -136,7 +136,7 @@ def __init__( bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, sliding_window: Optional[int] = 4096, attention_dropout: Optional[float] = 0.0, **kwargs, diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index 9a8e2280c252..6784b7eb5f19 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -158,7 
+158,7 @@ def __init__( output_router_logits: Optional[bool] = False, router_aux_loss_coef: Optional[float] = 0.001, router_jitter_noise: Optional[float] = 0.0, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, **kwargs, ): self.vocab_size = vocab_size diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index 6d378425284d..b3a045ae324a 100644 --- a/src/transformers/models/modernbert/configuration_modernbert.py +++ b/src/transformers/models/modernbert/configuration_modernbert.py @@ -154,7 +154,7 @@ def __init__( attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, layer_types: Optional[list[str]] = None, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, local_attention: Optional[int] = 128, embedding_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index 9e535d345f2f..795d00743f93 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -181,7 +181,7 @@ def __init__( attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, layer_types: Optional[list[str]] = None, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, local_attention: Optional[int] = 128, embedding_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py index cc17f6ce6711..be60950fa593 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -154,7 +154,7 @@ def __init__( local_attention: Optional[int] = 128, global_attn_every_n_layers: Optional[int] = 3, layer_types: Optional[list[str]] = None, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, **kwargs, ): super().__init__( diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index ffa7da7c130a..e7935b9f2159 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -174,7 +174,7 @@ def __init__( local_attention: Optional[int] = 128, global_attn_every_n_layers: Optional[int] = 3, layer_types: Optional[list[str]] = None, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, **kwargs, ): super().__init__( diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index 5237cd4e3d8c..e04909e1f7eb 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ 
b/src/transformers/models/moonshine/configuration_moonshine.py @@ -141,7 +141,7 @@ def __init__( initializer_range: Optional[float] = 0.02, decoder_start_token_id: Optional[int] = 1, use_cache: Optional[bool] = True, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, partial_rotary_factor: Optional[float] = 0.9, is_encoder_decoder: Optional[bool] = True, attention_bias: Optional[bool] = False, diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 1e035bdb87c6..bb66a7916f00 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -164,7 +164,7 @@ def __init__( initializer_range: Optional[float] = 0.02, decoder_start_token_id: Optional[int] = 1, use_cache: Optional[bool] = True, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, partial_rotary_factor: Optional[float] = 0.9, is_encoder_decoder: Optional[bool] = True, attention_bias: Optional[bool] = False, diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index 8d2e2aef339a..fea1a7cff985 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -252,7 +252,7 @@ def __init__( num_key_value_heads: Optional[int] = None, audio_vocab_size: Optional[int] = None, max_position_embeddings: Optional[int] = 3000, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, hidden_act: Optional[str] = "silu", head_dim: Optional[int] = None, initializer_range: Optional[float] = 0.02, diff --git a/src/transformers/models/nemotron/configuration_nemotron.py b/src/transformers/models/nemotron/configuration_nemotron.py index 084a674fc345..c5f888ac6d36 100644 --- a/src/transformers/models/nemotron/configuration_nemotron.py +++ b/src/transformers/models/nemotron/configuration_nemotron.py @@ -122,7 +122,7 @@ def __init__( bos_token_id: Optional[int] = 2, eos_token_id: Optional[int] = 3, tie_word_embeddings: Optional[bool] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, partial_rotary_factor: Optional[float] = 0.5, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, diff --git a/src/transformers/models/olmo/configuration_olmo.py b/src/transformers/models/olmo/configuration_olmo.py index 6a1fb4f96526..f01e33ead00a 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -134,7 +134,7 @@ def __init__( bos_token_id: Optional[int] = None, eos_token_id: Optional[int] = 50279, tie_word_embeddings: Optional[int] = False, - rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None, + rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, clip_qkv: Optional[bool] = None, diff --git a/src/transformers/models/olmo2/configuration_olmo2.py b/src/transformers/models/olmo2/configuration_olmo2.py index 2b4af4c7523c..3ba97d4f162b 100644 --- 
a/src/transformers/models/olmo2/configuration_olmo2.py
+++ b/src/transformers/models/olmo2/configuration_olmo2.py
@@ -135,7 +135,7 @@ def __init__(
         bos_token_id: Optional[int] = None,
         eos_token_id: Optional[int] = 50279,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         rms_norm_eps: Optional[int] = 1e-5,
diff --git a/src/transformers/models/olmo2/modular_olmo2.py b/src/transformers/models/olmo2/modular_olmo2.py
index 74eddd2d5af4..12705dce6e8c 100644
--- a/src/transformers/models/olmo2/modular_olmo2.py
+++ b/src/transformers/models/olmo2/modular_olmo2.py
@@ -150,7 +150,7 @@ def __init__(
         bos_token_id: Optional[int] = None,
         eos_token_id: Optional[int] = 50279,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         rms_norm_eps: Optional[int] = 1e-5,
diff --git a/src/transformers/models/olmo3/configuration_olmo3.py b/src/transformers/models/olmo3/configuration_olmo3.py
index 08762d09ff61..6e3f5594cbb5 100644
--- a/src/transformers/models/olmo3/configuration_olmo3.py
+++ b/src/transformers/models/olmo3/configuration_olmo3.py
@@ -135,7 +135,7 @@ def __init__(
         bos_token_id: Optional[int] = None,
         eos_token_id: Optional[int] = 50279,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         rms_norm_eps: Optional[float] = 1e-5,
diff --git a/src/transformers/models/olmo3/modular_olmo3.py b/src/transformers/models/olmo3/modular_olmo3.py
index ab1b63752721..d8bec6e9f15d 100644
--- a/src/transformers/models/olmo3/modular_olmo3.py
+++ b/src/transformers/models/olmo3/modular_olmo3.py
@@ -151,7 +151,7 @@ def __init__(
         bos_token_id: Optional[int] = None,
         eos_token_id: Optional[int] = 50279,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         rms_norm_eps: Optional[float] = 1e-5,
diff --git a/src/transformers/models/olmoe/configuration_olmoe.py b/src/transformers/models/olmoe/configuration_olmoe.py
index 5dae49098a29..511d7968fb78 100644
--- a/src/transformers/models/olmoe/configuration_olmoe.py
+++ b/src/transformers/models/olmoe/configuration_olmoe.py
@@ -122,7 +122,7 @@ def __init__(
         bos_token_id: Optional[int] = None,
         eos_token_id: Optional[int] = 50279,
         tie_word_embeddings: Optional[int] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         clip_qkv: Optional[bool] = None,
diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py
index 5c2452526635..f9dbe11580b2 100644
--- a/src/transformers/models/persimmon/configuration_persimmon.py
+++ b/src/transformers/models/persimmon/configuration_persimmon.py
@@ -98,7 +98,7 @@ def __init__(
         layer_norm_eps: Optional[int] = 1e-5,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         qk_layernorm: Optional[bool] = True,
         hidden_dropout: Optional[float] = 0.0,
         attention_dropout: Optional[float] = 0.0,
diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py
index 427b453db981..5476cb1b6c7c 100644
--- a/src/transformers/models/phi/configuration_phi.py
+++ b/src/transformers/models/phi/configuration_phi.py
@@ -137,7 +137,7 @@ def __init__(
         layer_norm_eps: Optional[int] = 1e-5,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         partial_rotary_factor: Optional[float] = 0.5,
         qk_layernorm: Optional[bool] = False,
         bos_token_id: Optional[int] = 1,
diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py
index ed096dd8a319..35eb2df30c9d 100644
--- a/src/transformers/models/phi3/configuration_phi3.py
+++ b/src/transformers/models/phi3/configuration_phi3.py
@@ -139,7 +139,7 @@ def __init__(
         rms_norm_eps: Optional[int] = 1e-5,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         partial_rotary_factor: Optional[float] = 1.0,
         bos_token_id: Optional[int] = 1,
         eos_token_id: Optional[int] = 32000,
diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py
index c9ea706b2c4c..46c104d027a7 100644
--- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py
@@ -366,7 +366,7 @@ def __init__(
         rms_norm_eps: Optional[int] = 1e-5,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         partial_rotary_factor: Optional[int] = 1,
         bos_token_id: Optional[int] = 199999,
         eos_token_id: Optional[list[int]] = [199999, 200020],
diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
index 17458f141f12..9095c4375c7e 100644
--- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py
@@ -388,7 +388,7 @@ def __init__(
         rms_norm_eps: Optional[int] = 1e-5,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         partial_rotary_factor: Optional[int] = 1,
         bos_token_id: Optional[int] = 199999,
         eos_token_id: Optional[list[int]] = [199999, 200020],
diff --git a/src/transformers/models/phimoe/configuration_phimoe.py b/src/transformers/models/phimoe/configuration_phimoe.py
index 8af5508daf93..f7a9b528211f 100644
--- a/src/transformers/models/phimoe/configuration_phimoe.py
+++ b/src/transformers/models/phimoe/configuration_phimoe.py
@@ -128,7 +128,7 @@ def __init__(
         bos_token_id: Optional[int] = 1,
         eos_token_id: Optional[int] = 2,
         tie_word_embeddings: Optional[int] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         sliding_window: Optional[int] = None,
         attention_dropout: Optional[float] = 0.0,
         num_experts_per_tok: Optional[int] = 2,
diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py
index 3e2098adcd94..62c179b20edc 100644
--- a/src/transformers/models/pixtral/configuration_pixtral.py
+++ b/src/transformers/models/pixtral/configuration_pixtral.py
@@ -86,7 +86,7 @@ def __init__(
         patch_size: Optional[int] = 16,
         hidden_act: Optional[str] = "gelu",
         attention_dropout: Optional[float] = 0.0,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         initializer_range: Optional[float] = 0.02,
         **kwargs,
     ):
diff --git a/src/transformers/models/qwen2/configuration_qwen2.py b/src/transformers/models/qwen2/configuration_qwen2.py
index 418b18027350..bda8bb8abfc7 100644
--- a/src/transformers/models/qwen2/configuration_qwen2.py
+++ b/src/transformers/models/qwen2/configuration_qwen2.py
@@ -129,7 +129,7 @@ def __init__(
         rms_norm_eps: Optional[int] = 1e-6,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         use_sliding_window: Optional[bool] = False,
         sliding_window: Optional[int] = 4096,
         max_window_layers: Optional[int] = 28,
diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
index 69a4e3e0c66f..af96e9a3163f 100644
--- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py
@@ -322,7 +322,7 @@ def __init__(
         rms_norm_eps: Optional[int] = 1e-6,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         use_sliding_window: Optional[bool] = False,
         sliding_window: Optional[int] = 32768,
         max_window_layers: Optional[int] = 28,
@@ -650,7 +650,7 @@ def __init__(
         sliding_window=32768,
         max_window_layers=28,
         attention_dropout=0.0,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         position_id_per_seconds=25,
         seconds_per_chunk=2,
         audio_start_token_id=151647,
@@ -781,7 +781,7 @@ def __init__(
         ff_mult=2,
         emb_dim=512,
         head_dim=64,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         max_position_embeddings=32768,
         block_size=24,
         look_ahead_layers=[10],
diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
index c37b321ce38b..329e1b798dd6 100644
--- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
+++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py
@@ -355,7 +355,7 @@ def __init__(
         rms_norm_eps: Optional[int] = 1e-6,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         use_sliding_window: Optional[bool] = False,
         sliding_window: Optional[int] = 32768,
         max_window_layers: Optional[int] = 28,
@@ -683,7 +683,7 @@ def __init__(
         sliding_window=32768,
         max_window_layers=28,
         attention_dropout=0.0,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         position_id_per_seconds=25,
         seconds_per_chunk=2,
         audio_start_token_id=151647,
@@ -814,7 +814,7 @@ def __init__(
         ff_mult=2,
         emb_dim=512,
         head_dim=64,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         max_position_embeddings=32768,
         block_size=24,
         look_ahead_layers=[10],
diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py
index 41ae4195f63d..5469b3226f3b 100644
--- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py
@@ -179,7 +179,7 @@ def __init__(
         max_window_layers: Optional[int] = 80,
         layer_types: Optional[list[str]] = None,
         attention_dropout: Optional[float] = 0.0,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
diff --git a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py
index f45577e91516..256d663d3114 100644
--- a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py
+++ b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py
@@ -150,7 +150,7 @@ def __init__(
         rms_norm_eps: Optional[int] = 1e-6,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         use_sliding_window: Optional[bool] = False,
         sliding_window: Optional[int] = 4096,
         max_window_layers: Optional[int] = 28,
diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py
index 02eee9de1ddb..62fb4a815a4c 100644
--- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py
@@ -167,7 +167,7 @@ def __init__(
         max_window_layers: Optional[int] = 80,
         layer_types: Optional[list[str]] = None,
         attention_dropout: Optional[float] = 0.0,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         **kwargs,
     ):
         self.vocab_size = vocab_size
diff --git a/src/transformers/models/qwen3/configuration_qwen3.py b/src/transformers/models/qwen3/configuration_qwen3.py
index 90edaff6aaa3..a1cf6a1ea861 100644
--- a/src/transformers/models/qwen3/configuration_qwen3.py
+++ b/src/transformers/models/qwen3/configuration_qwen3.py
@@ -134,7 +134,7 @@ def __init__(
         rms_norm_eps: Optional[int] = 1e-6,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         use_sliding_window: Optional[bool] = False,
         sliding_window: Optional[int] = 4096,
diff --git a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py
index e5003c509118..5043a3f38a07 100644
--- a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py
+++ b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py
@@ -148,7 +148,7 @@ def __init__(
         rms_norm_eps: Optional[int] = 1e-6,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         use_sliding_window: Optional[bool] = False,
         sliding_window: Optional[int] = 4096,
diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py b/src/transformers/models/qwen3_next/configuration_qwen3_next.py
index da6dde8c9db7..1e5df811d866 100644
--- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py
+++ b/src/transformers/models/qwen3_next/configuration_qwen3_next.py
@@ -165,7 +165,7 @@ def __init__(
         rms_norm_eps: Optional[float] = 1e-6,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         partial_rotary_factor: Optional[float] = 0.25,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
index 281c2a2bf509..4c2b86d4da20 100644
--- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
+++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py
@@ -295,7 +295,7 @@ def __init__(
         rms_norm_eps: Optional[float] = 1e-6,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         sliding_window: Optional[int] = None,
         attention_dropout: Optional[int] = 0,
@@ -738,7 +738,7 @@ def __init__(
         rms_norm_eps: Optional[float] = 0.000001,
         use_cache: Optional[int] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         sliding_window: Optional[int] = None,
         attention_dropout: Optional[int] = 0,
diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py
index 526169066dc1..7b96007aa6ef 100644
--- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py
+++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py
@@ -169,7 +169,7 @@ def __init__(
         rms_norm_eps: Optional[float] = 1e-6,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         sliding_window: Optional[int] = None,
         attention_dropout: Optional[int] = 0,
@@ -380,7 +380,7 @@ def __init__(
         rms_norm_eps: Optional[float] = 0.000001,
         use_cache: Optional[int] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         sliding_window: Optional[int] = None,
         attention_dropout: Optional[int] = 0,
diff --git a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py
index f4228ddb3f87..546a3da5bb7b 100644
--- a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py
@@ -146,7 +146,7 @@ def __init__(
         rms_norm_eps: Optional[float] = 1e-6,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         **kwargs,
diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
index 9216b9951398..7758a23e2970 100644
--- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
+++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py
@@ -187,7 +187,7 @@ def __init__(
         rms_norm_eps: Optional[float] = 1e-6,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         **kwargs,
diff --git a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py
index 3a3aca4ddacd..130044ee099d 100644
--- a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py
+++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py
@@ -120,7 +120,7 @@ def __init__(
         bos_token_id: Optional[int] = 2,
         hidden_activation: Optional[str] = "gelu_pytorch_tanh",
         partial_rotary_factor: Optional[float] = 0.5,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         block_types: Optional[list[str]] = ("recurrent", "recurrent", "attention"),
         attention_dropout: Optional[float] = 0.0,
         num_key_value_heads: Optional[int] = None,
diff --git a/src/transformers/models/seed_oss/configuration_seed_oss.py b/src/transformers/models/seed_oss/configuration_seed_oss.py
index 7961646ae2d8..240cb03bac77 100644
--- a/src/transformers/models/seed_oss/configuration_seed_oss.py
+++ b/src/transformers/models/seed_oss/configuration_seed_oss.py
@@ -139,7 +139,7 @@ def __init__(
         eos_token_id: Optional[int] = 2,
         pretraining_tp: Optional[int] = 1,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = True,
         attention_out_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.1,
diff --git a/src/transformers/models/smollm3/configuration_smollm3.py b/src/transformers/models/smollm3/configuration_smollm3.py
index 2ffdf53008c6..04e8e78e575c 100644
--- a/src/transformers/models/smollm3/configuration_smollm3.py
+++ b/src/transformers/models/smollm3/configuration_smollm3.py
@@ -140,7 +140,7 @@ def __init__(
         pad_token_id: Optional[int] = 128004,
         bos_token_id: Optional[int] = 128000,
         eos_token_id: Optional[int] = 128001,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         use_sliding_window: Optional[bool] = False,
         sliding_window: Optional[int] = None,
         no_rope_layers: Optional[int] = None,
diff --git a/src/transformers/models/smollm3/modular_smollm3.py b/src/transformers/models/smollm3/modular_smollm3.py
index fe8bcb52080d..e5551d414c1b 100644
--- a/src/transformers/models/smollm3/modular_smollm3.py
+++ b/src/transformers/models/smollm3/modular_smollm3.py
@@ -157,7 +157,7 @@ def __init__(
         pad_token_id: Optional[int] = 128004,
         bos_token_id: Optional[int] = 128000,
         eos_token_id: Optional[int] = 128001,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         use_sliding_window: Optional[bool] = False,
         sliding_window: Optional[int] = None,
         no_rope_layers: Optional[int] = None,
diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py
index 9beed377ad69..0efdcd94adcd 100644
--- a/src/transformers/models/stablelm/configuration_stablelm.py
+++ b/src/transformers/models/stablelm/configuration_stablelm.py
@@ -119,7 +119,7 @@ def __init__(
         layer_norm_eps: Optional[float] = 1.0e-5,
         use_cache: Optional[bool] = True,
         tie_word_embeddings: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         use_qkv_bias: Optional[bool] = False,
         qk_layernorm: Optional[bool] = False,
         use_parallel_residual: Optional[bool] = False,
diff --git a/src/transformers/models/starcoder2/configuration_starcoder2.py b/src/transformers/models/starcoder2/configuration_starcoder2.py
index 9d87dd6eefa1..cb34ad1d9157 100644
--- a/src/transformers/models/starcoder2/configuration_starcoder2.py
+++ b/src/transformers/models/starcoder2/configuration_starcoder2.py
@@ -131,7 +131,7 @@ def __init__(
         use_cache: Optional[bool] = True,
         bos_token_id: Optional[int] = 50256,
         eos_token_id: Optional[int] = 50256,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         sliding_window: Optional[int] = None,
         attention_dropout: Optional[float] = 0.0,
         residual_dropout: Optional[float] = 0.0,
diff --git a/src/transformers/models/t5gemma/configuration_t5gemma.py b/src/transformers/models/t5gemma/configuration_t5gemma.py
index bea8916d3e6b..6e6f9784e951 100644
--- a/src/transformers/models/t5gemma/configuration_t5gemma.py
+++ b/src/transformers/models/t5gemma/configuration_t5gemma.py
@@ -140,7 +140,7 @@ def __init__(
         eos_token_id: Optional[int] = 1,
         bos_token_id: Optional[int] = 2,
         tie_word_embeddings: Optional[bool] = True,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         query_pre_attn_scalar: Optional[int] = 256,
diff --git a/src/transformers/models/t5gemma/modular_t5gemma.py b/src/transformers/models/t5gemma/modular_t5gemma.py
index 6d49e5c241ad..86ecf53ae6e4 100644
--- a/src/transformers/models/t5gemma/modular_t5gemma.py
+++ b/src/transformers/models/t5gemma/modular_t5gemma.py
@@ -159,7 +159,7 @@ def __init__(
         eos_token_id: Optional[int] = 1,
         bos_token_id: Optional[int] = 2,
         tie_word_embeddings: Optional[bool] = True,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         query_pre_attn_scalar: Optional[int] = 256,
diff --git a/src/transformers/models/vaultgemma/configuration_vaultgemma.py b/src/transformers/models/vaultgemma/configuration_vaultgemma.py
index d50cf5ed93d7..0a784c02c1e6 100644
--- a/src/transformers/models/vaultgemma/configuration_vaultgemma.py
+++ b/src/transformers/models/vaultgemma/configuration_vaultgemma.py
@@ -140,7 +140,7 @@ def __init__(
         eos_token_id: Optional[int] = 1,
         bos_token_id: Optional[int] = 2,
         tie_word_embeddings: Optional[bool] = True,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         query_pre_attn_scalar: Optional[int] = 256,
diff --git a/src/transformers/models/vaultgemma/modular_vaultgemma.py b/src/transformers/models/vaultgemma/modular_vaultgemma.py
index e8b3a4ee6773..a0a9fc207692 100644
--- a/src/transformers/models/vaultgemma/modular_vaultgemma.py
+++ b/src/transformers/models/vaultgemma/modular_vaultgemma.py
@@ -121,7 +121,7 @@ def __init__(
         eos_token_id: Optional[int] = 1,
         bos_token_id: Optional[int] = 2,
         tie_word_embeddings: Optional[bool] = True,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         attention_bias: Optional[bool] = False,
         attention_dropout: Optional[float] = 0.0,
         query_pre_attn_scalar: Optional[int] = 256,
diff --git a/src/transformers/models/zamba2/configuration_zamba2.py b/src/transformers/models/zamba2/configuration_zamba2.py
index 40e30822cf59..4d6c92439da5 100644
--- a/src/transformers/models/zamba2/configuration_zamba2.py
+++ b/src/transformers/models/zamba2/configuration_zamba2.py
@@ -162,7 +162,7 @@ def __init__(
         use_shared_attention_adapter: Optional[bool] = False,
         adapter_rank: Optional[int] = 128,
         use_mem_rope: Optional[bool] = False,
-        rope_parameters: Optional[RopeParameters | dict[RopeParameters]] = None,
+        rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None,
         initializer_range: Optional[float] = 0.02,
         rms_norm_eps: Optional[int] = 1e-5,
         use_cache: Optional[bool] = True,
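
Note on the change repeated above: every hunk in this patch makes the same one-line fix. `dict[RopeParameters]` parameterizes `dict` with a single type argument, which type checkers flag because `dict` is generic over a key type and a value type; `dict[str, RopeParameters]` spells out the intended shape, a string key mapping to one set of RoPE parameters. Runtime behaviour is unchanged, since only the annotation differs. The sketch below is illustrative only: the `RopeParameters` fields and the layer-type keys are assumptions made for the example, not the actual transformers schema.

    from __future__ import annotations

    from typing import Optional, TypedDict


    class RopeParameters(TypedDict, total=False):
        # Illustrative stand-in for transformers' RopeParameters; the real fields may differ.
        rope_type: str
        factor: float


    # Old annotation (flagged by type checkers): dict[RopeParameters]
    # New annotation: either a single RopeParameters entry, or a mapping from a
    # string key (for example a per-layer-type name) to one entry each.
    rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = {
        # Keys chosen for illustration only.
        "full_attention": {"rope_type": "default"},
        "sliding_attention": {"rope_type": "linear", "factor": 2.0},
    }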