From b8f8dd8cbfc4249b0a2d282bba34d76b4dcb6196 Mon Sep 17 00:00:00 2001 From: raushan Date: Mon, 17 Nov 2025 18:23:18 +0100 Subject: [PATCH 01/23] tmp --- src/transformers/modeling_rope_utils.py | 101 +++++++----------- .../models/bamba/configuration_bamba.py | 3 +- .../configuration_efficientloftr.py | 21 ++-- .../efficientloftr/modeling_efficientloftr.py | 2 +- .../models/fuyu/configuration_fuyu.py | 9 +- .../models/glm/configuration_glm.py | 4 +- src/transformers/models/glm/modeling_glm.py | 2 +- src/transformers/models/glm/modular_glm.py | 2 +- .../models/glm4/configuration_glm4.py | 5 +- src/transformers/models/glm4/modeling_glm4.py | 2 +- .../models/glm4_moe/configuration_glm4_moe.py | 5 +- .../models/glm4_moe/modeling_glm4_moe.py | 2 +- .../models/glm4_moe/modular_glm4_moe.py | 5 +- .../glm4v/convert_glm4v_mgt_weights_to_hf.py | 8 +- .../models/glm4v/modeling_glm4v.py | 2 +- .../glm4v_moe/configuration_glm4v_moe.py | 4 +- .../convert_glm4v_moe_mgt_weights_to_hf.py | 7 +- .../models/glm4v_moe/modeling_glm4v_moe.py | 2 +- .../models/glm4v_moe/modular_glm4v_moe.py | 6 +- .../models/gpt_neox/configuration_gpt_neox.py | 12 +-- .../models/gpt_neox/modeling_gpt_neox.py | 2 +- .../models/gpt_neox/modular_gpt_neox.py | 2 +- .../configuration_gpt_neox_japanese.py | 9 +- .../moonshine/configuration_moonshine.py | 5 +- .../models/moonshine/modeling_moonshine.py | 2 +- .../models/moonshine/modular_moonshine.py | 5 +- .../models/nemotron/configuration_nemotron.py | 4 +- .../models/nemotron/modeling_nemotron.py | 2 +- .../persimmon/configuration_persimmon.py | 5 +- .../models/persimmon/modeling_persimmon.py | 2 +- .../models/phi/configuration_phi.py | 5 +- src/transformers/models/phi/modeling_phi.py | 2 +- src/transformers/models/phi/modular_phi.py | 2 +- .../models/phi3/configuration_phi3.py | 5 +- src/transformers/models/phi3/modeling_phi3.py | 2 +- .../configuration_phi4_multimodal.py | 5 +- .../modeling_phi4_multimodal.py | 2 +- 
.../qwen3_next/configuration_qwen3_next.py | 5 +- .../models/qwen3_next/modeling_qwen3_next.py | 2 +- .../models/qwen3_next/modular_qwen3_next.py | 2 +- .../configuration_recurrent_gemma.py | 5 +- .../modeling_recurrent_gemma.py | 2 +- .../models/stablelm/configuration_stablelm.py | 5 +- .../models/stablelm/modeling_stablelm.py | 2 +- 44 files changed, 104 insertions(+), 184 deletions(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index c6a66ba1c4b3..5d9bfc936e81 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -16,7 +16,7 @@ from functools import wraps from typing import Optional, TypedDict -from .configuration_utils import PreTrainedConfig +from .configuration_utils import ALLOWED_LAYER_TYPES, PreTrainedConfig from .utils import is_torch_available, logging @@ -27,57 +27,6 @@ import torch -def standardize_rope_params(config, rope_theta: float | dict[str, float] | None = None): - """ - Helper to standardize the config's rope params field by ensuring the params are defined for each - later type. For old model the fn will duplicate a single rope param in each layer type (backward compatibility) - """ - rope_parameters = getattr(config, "rope_parameters", None) - layer_types = getattr(config, "layer_types", None) - if rope_theta is None: - rope_theta = getattr(config, "rope_theta", None) - - # Case 1: one RoPE theat = one RoPE param per model without nesting - if not isinstance(rope_theta, dict): - if rope_parameters is None: - rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} - else: - # BC: if there is a 'type' field, copy it it to 'rope_type'. 
- rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default")) - rope_theta = rope_parameters.get("rope_theta") or rope_theta - rope_parameters.update({"rope_theta": rope_theta, "rope_type": rope_type}) - config.rope_parameters = rope_parameters - - # Case 2: different RoPE for each layer as nested dict - else: - rope_parameters_per_layer_type = {} - for layer_type in layer_types: - if rope_parameters is None: - rope_parameters_per_layer_type[layer_type] = { - "rope_type": "default", - "rope_theta": rope_theta[layer_type], - } - else: - is_field_in_new_format = any(layer_type in rope_parameters for layer_type in layer_types) - if not is_field_in_new_format: - curr_rope_type = rope_parameters.get("rope_type", rope_parameters.get("type")) - rope_parameters_per_layer_type[layer_type] = { - **rope_parameters, - "rope_type": curr_rope_type, - "rope_theta": rope_theta[layer_type], - } - else: - curr_rope_type = rope_parameters[layer_type].get( - "rope_type", rope_parameters[layer_type].get("type") - ) - rope_parameters_per_layer_type[layer_type] = { - **rope_parameters[layer_type], - "rope_type": curr_rope_type, - "rope_theta": rope_theta[layer_type], - } - config.rope_parameters = rope_parameters_per_layer_type - - def dynamic_rope_update(rope_forward): """ Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE @@ -214,7 +163,7 @@ def _compute_linear_scaling_rope_parameters( # Gets the default RoPE parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) attention_factor = 1.0 # Unused in this type of RoPE @@ -277,7 +226,7 @@ def _compute_dynamic_ntk_parameters( rope_parameters_dict = 
config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) max_position_embeddings = config.max_position_embeddings @@ -364,7 +313,7 @@ def _compute_yarn_parameters( rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) @@ -494,7 +443,7 @@ def _compute_longrope_parameters( rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) @@ -579,7 +528,7 @@ def _compute_llama3_parameters( # Gets the default RoPE parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) 
attention_factor = 1.0 # Unused in this type of RoPE @@ -760,7 +709,7 @@ def _validate_longrope_parameters(rope_parameters: dict, config: PreTrainedConfi rope_type = rope_parameters["rope_type"] _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) @@ -862,9 +811,7 @@ def rope_config_validation(config: PreTrainedConfig, ignore_keys: Optional[set] if rope_parameters_dict is None: return - if getattr(config, "layer_types", None) is not None and all( - key in config.layer_types for key in rope_parameters_dict.keys() - ): + if set(rope_parameters_dict.keys()).issubset(ALLOWED_LAYER_TYPES): pass else: rope_parameters_dict = {"full_attention": rope_parameters_dict} @@ -885,7 +832,7 @@ def rope_config_validation(config: PreTrainedConfig, ignore_keys: Optional[set] ) -class RopeParameters(TypedDict): +class RopeParameters(TypedDict, total=False): """ Args: rope_theta (`float`): @@ -893,6 +840,8 @@ class RopeParameters(TypedDict): rope_type (`str`, *optional*, defaults to "default"): The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation. + partial_rotary_factor (`float`, *optional*): + Percentage of the query and keys which will have rotary embedding. factor (`float`, *optional*): Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. 
In most scaling types, a `factor` of x will enable the model to handle sequences of length x * @@ -926,6 +875,7 @@ class RopeParameters(TypedDict): rope_theta: float rope_type: Optional[str] + partial_rotary_factor: Optional[float] factor: Optional[float] original_max_position_embeddings: Optional[int] attention_factor: Optional[float] @@ -935,3 +885,30 @@ class RopeParameters(TypedDict): long_factor: Optional[list[float]] low_freq_factor: Optional[float] high_freq_factor: Optional[float] + + +def standardize_rope_params(config): + """ + Helper to standardize the config's rope params field by ensuring the params are defined for each + later type. For old model the fn will duplicate a single rope param in each layer type (backward compatibility) + """ + rope_parameters = getattr(config, "rope_parameters", {}) + + # Move `rope_theta` and `partial_rotary_factor` to the params dict, if not there yet + rope_theta = getattr(config, "rope_theta", None) + partial_rotary_factor = getattr(config, "partial_rotary_factor", None) + + # Case 1: one RoPE theat = one RoPE param per model without nesting + if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + rope_parameters.setdefault("rope_type", rope_parameters.get("type", "default")) + rope_parameters.setdefault("rope_theta", rope_theta) + rope_parameters.setdefault("partial_rotary_factor", partial_rotary_factor) + # Case 2: different RoPE for each layer as nested dict + else: + for layer_type in config.layer_types: + rope_parameters[layer_type].setdefault("rope_type", rope_parameters[layer_type].get("type", "default")) + rope_parameters[layer_type].setdefault("rope_theta", rope_theta) + rope_parameters[layer_type].setdefault("partial_rotary_factor", partial_rotary_factor) + + config.rope_parameters = rope_parameters + rope_config_validation(config) diff --git a/src/transformers/models/bamba/configuration_bamba.py b/src/transformers/models/bamba/configuration_bamba.py index 07fd3eaa1aab..f849fbb3cefa 100644 --- 
a/src/transformers/models/bamba/configuration_bamba.py +++ b/src/transformers/models/bamba/configuration_bamba.py @@ -171,10 +171,11 @@ def __init__( self.num_logits_to_keep = num_logits_to_keep self.attn_layer_indices = attn_layer_indices + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - self.partial_rotary_factor = 0.5 rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = 0.5 # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/efficientloftr/configuration_efficientloftr.py b/src/transformers/models/efficientloftr/configuration_efficientloftr.py index 8b57c903dde8..8ba99bd02d23 100644 --- a/src/transformers/models/efficientloftr/configuration_efficientloftr.py +++ b/src/transformers/models/efficientloftr/configuration_efficientloftr.py @@ -14,7 +14,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import standardize_rope_params class EfficientLoFTRConfig(PreTrainedConfig): @@ -67,10 +67,7 @@ class EfficientLoFTRConfig(PreTrainedConfig): fine_kernel_size (`int`, *optional*, defaults to 8): Kernel size used for the fine feature matching batch_norm_eps (`float`, *optional*, defaults to 1e-05): - The epsilon used by the batch normalization layers. - partial_rotary_factor (`float`, *optional*, defaults to 4.0): - Dim factor for the RoPE embeddings, in EfficientLoFTR, frequencies should be generated for - the whole hidden_size, so this factor is used to compensate. + The epsilon used by the batch normalization layers rope_parameters (`RopeParameters`, *optional*): Dictionary containing the configuration parameters for the RoPE embeddings. 
The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE @@ -121,7 +118,6 @@ def __init__( coarse_matching_border_removal: int = 2, fine_kernel_size: int = 8, batch_norm_eps: float = 1e-5, - partial_rotary_factor: float = 4.0, rope_parameters: Optional[dict] = None, fine_matching_slice_dim: int = 8, fine_matching_regress_temperature: float = 10.0, @@ -176,17 +172,16 @@ def __init__( self.fine_matching_regress_temperature = fine_matching_regress_temperature self.num_key_value_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.initializer_range = initializer_range + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_scaling or rope_parameters or {} + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 4.0) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + # Standardize and validate the correctness of rotary position embeddings parameters + standardize_rope_params(self) super().__init__(**kwargs) diff --git a/src/transformers/models/efficientloftr/modeling_efficientloftr.py b/src/transformers/models/efficientloftr/modeling_efficientloftr.py index bdf6dd67ae48..1774bc48e0dd 100644 --- a/src/transformers/models/efficientloftr/modeling_efficientloftr.py +++ b/src/transformers/models/efficientloftr/modeling_efficientloftr.py @@ -125,7 +125,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index bbe4a5ec22d8..e1315ad9cbca 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -77,9 +77,6 @@ class FuyuConfig(PreTrainedConfig): The dropout ratio after applying the MLP to the hidden states. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio after computing the attention scores. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - Percentage of the query and keys which will have rotary embedding. - pad_token_id (`int`, *optional*): The id of the *padding* token. 
bos_token_id (`int`, *optional*, defaults to 1): @@ -122,7 +119,6 @@ def __init__( qk_layernorm: Optional[bool] = True, hidden_dropout: Optional[float] = 0.0, attention_dropout: Optional[float] = 0.0, - partial_rotary_factor: Optional[float] = 0.5, pad_token_id: Optional[int] = None, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, @@ -131,6 +127,8 @@ def __init__( **kwargs, ): if text_config is None: + rope_parameters = rope_parameters if rope_parameters is not None else {} + rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) text_config = { "vocab_size": vocab_size, "max_position_embeddings": max_position_embeddings, @@ -146,7 +144,6 @@ def __init__( "qk_layernorm": qk_layernorm, "hidden_dropout": hidden_dropout, "attention_dropout": attention_dropout, - "partial_rotary_factor": partial_rotary_factor, "pad_token_id": pad_token_id, "bos_token_id": bos_token_id, "eos_token_id": eos_token_id, @@ -172,11 +169,11 @@ def __init__( self.qk_layernorm = qk_layernorm self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout - self.partial_rotary_factor = partial_rotary_factor self.image_token_id = image_token_id # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 25000.0) diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index e0d2c3d6492a..7f7509730d8f 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -48,7 +48,6 @@ class GlmConfig(PreTrainedConfig): by meanpooling all the original heads within that group. 
For more details, check out [this paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `num_attention_heads`. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position. head_dim (`int`, *optional*, defaults to 128): The attention head dimension. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): @@ -112,7 +111,6 @@ def __init__( num_hidden_layers: Optional[int] = 40, num_attention_heads: Optional[int] = 32, num_key_value_heads: Optional[int] = 2, - partial_rotary_factor: Optional[float] = 0.5, head_dim: Optional[int] = 128, hidden_act: Optional[str] = "silu", attention_dropout: Optional[float] = 0.0, @@ -134,7 +132,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.head_dim = head_dim self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -146,6 +143,7 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index a4880c0145e9..19d08d9fa44e 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -101,7 +101,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 059cb296c972..97b47a9b8f7f 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -60,7 +60,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index 43e6323b0060..caf6194bba94 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -48,8 +48,6 @@ class Glm4Config(PreTrainedConfig): by meanpooling all the original heads within that group. For more details, check out [this paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `num_attention_heads`. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - The factor of the partial rotary position. head_dim (`int`, *optional*, defaults to 128): The attention head dimension. 
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): @@ -113,7 +111,6 @@ def __init__( num_hidden_layers: Optional[int] = 40, num_attention_heads: Optional[int] = 32, num_key_value_heads: Optional[int] = 2, - partial_rotary_factor: Optional[float] = 0.5, head_dim: Optional[int] = 128, hidden_act: Optional[str] = "silu", attention_dropout: Optional[float] = 0.0, @@ -135,7 +132,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.head_dim = head_dim self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -147,6 +143,7 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/glm4/modeling_glm4.py b/src/transformers/models/glm4/modeling_glm4.py index ba07da7cab54..a1f6a01c1cc5 100644 --- a/src/transformers/models/glm4/modeling_glm4.py +++ b/src/transformers/models/glm4/modeling_glm4.py @@ -305,7 +305,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py index aa1a16a95b37..10da1ba90aac 100644 --- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py +++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py @@ -47,8 +47,6 @@ class Glm4MoeConfig(PreTrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 96): Number of attention heads for each attention layer in the Transformer encoder. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - The factor of the partial rotary position. num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if @@ -144,7 +142,6 @@ def __init__( intermediate_size: Optional[int] = 10944, num_hidden_layers: Optional[int] = 46, num_attention_heads: Optional[int] = 96, - partial_rotary_factor: Optional[float] = 0.5, num_key_value_heads: Optional[int] = 8, hidden_act: Optional[str] = "silu", max_position_embeddings: Optional[int] = 131072, @@ -173,7 +170,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -185,6 +181,7 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/glm4_moe/modeling_glm4_moe.py b/src/transformers/models/glm4_moe/modeling_glm4_moe.py index 1c6575f3420a..13ab166a6312 100644 --- a/src/transformers/models/glm4_moe/modeling_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modeling_glm4_moe.py @@ -82,7 +82,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py index 0912f2289f2f..b8c4cd222475 100644 --- a/src/transformers/models/glm4_moe/modular_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py @@ -61,8 +61,6 @@ class Glm4MoeConfig(PreTrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 96): Number of attention heads for each attention layer in the Transformer encoder. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - The factor of the partial rotary position. num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if @@ -158,7 +156,6 @@ def __init__( intermediate_size: Optional[int] = 10944, num_hidden_layers: Optional[int] = 46, num_attention_heads: Optional[int] = 96, - partial_rotary_factor: Optional[float] = 0.5, num_key_value_heads: Optional[int] = 8, hidden_act: Optional[str] = "silu", max_position_embeddings: Optional[int] = 131072, @@ -187,7 +184,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -199,6 +195,7 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py b/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py index fb57f66a9ae0..dd9f2fba17d3 100644 --- a/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py +++ b/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py @@ -702,9 +702,13 @@ def offset_layer(x, offset=llm_layer_offset): "dtype": text_config.get("torch_dtype", "bfloat16"), "use_cache": text_config.get("use_cache", True), "vocab_size": text_config.get("vocab_size", 151552), - "partial_rotary_factor": 0.5, "tie_word_embeddings": False, - "rope_parameters": {"rope_type": "default", "rope_theta": 10000.0, "mrope_section": [8, 12, 12]}, + "rope_parameters": { + "rope_type": "default", + "rope_theta": 10000.0, + "mrope_section": [8, 12, 12], + "partial_rotary_factor": 0.5, + }, } 
hf_config["text_config"] = txt_config diff --git a/src/transformers/models/glm4v/modeling_glm4v.py b/src/transformers/models/glm4v/modeling_glm4v.py index 47ad72ac96ce..835fd47cc48e 100644 --- a/src/transformers/models/glm4v/modeling_glm4v.py +++ b/src/transformers/models/glm4v/modeling_glm4v.py @@ -425,7 +425,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index 3d158e1b19cb..44004afd6c9c 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -139,7 +139,6 @@ class Glm4vMoeTextConfig(PreTrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 96): Number of attention heads for each attention layer in the Transformer encoder. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position. num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if @@ -231,7 +230,6 @@ def __init__( intermediate_size: Optional[int] = 10944, num_hidden_layers: Optional[int] = 46, num_attention_heads: Optional[int] = 96, - partial_rotary_factor: Optional[float] = 0.5, num_key_value_heads: Optional[int] = 8, hidden_act: Optional[str] = "silu", max_position_embeddings: Optional[int] = 65536, @@ -261,7 +259,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -273,6 +270,7 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/glm4v_moe/convert_glm4v_moe_mgt_weights_to_hf.py b/src/transformers/models/glm4v_moe/convert_glm4v_moe_mgt_weights_to_hf.py index d8b08716b6c4..54a9564b69c5 100644 --- a/src/transformers/models/glm4v_moe/convert_glm4v_moe_mgt_weights_to_hf.py +++ b/src/transformers/models/glm4v_moe/convert_glm4v_moe_mgt_weights_to_hf.py @@ -707,7 +707,12 @@ def offset_layer(x, offset=llm_layer_offset): "n_shared_experts": text_config.get("n_shared_experts", 1), "norm_topk_prob": text_config.get("norm_topk_prob", True), "num_experts_per_tok": text_config.get("num_experts_per_tok", 8), - "rope_parameters": {"rope_type": "default", "rope_theta": 10000.0, "mrope_section": [8, 12, 12]}, + "rope_parameters": { + "rope_type": "default", + "rope_theta": 10000.0, + "mrope_section": [8, 12, 12], + "partial_rotary_factor": 0.5, + }, } 
hf_config["text_config"] = txt_config diff --git a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py index 631505562bc6..b93d0b03bfe7 100644 --- a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py @@ -129,7 +129,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index c69ca8439315..a0b494664ab1 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -88,7 +88,6 @@ class Glm4vMoeTextConfig(Glm4MoeConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 96): Number of attention heads for each attention layer in the Transformer encoder. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position. num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if @@ -177,7 +176,6 @@ def __init__( intermediate_size: Optional[int] = 10944, num_hidden_layers: Optional[int] = 46, num_attention_heads: Optional[int] = 96, - partial_rotary_factor: Optional[float] = 0.5, num_key_value_heads: Optional[int] = 8, hidden_act: Optional[str] = "silu", max_position_embeddings: Optional[int] = 65536, @@ -207,7 +205,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -219,6 +216,7 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) @@ -376,7 +374,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 744e0316146c..8c2c77a8deb3 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -50,8 +50,6 @@ class GPTNeoXConfig(PreTrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - rotary_pct (`float`, *optional*, defaults to 0.25): - percentage of hidden dimensions to allocate to rotary embeddings attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio probability of the attention score. hidden_dropout (`float`, *optional*, defaults to 0.0): @@ -59,8 +57,7 @@ class GPTNeoXConfig(PreTrainedConfig): hidden states. classifier_dropout (`float`, *optional*, defaults to 0.1): Argument used when doing token classification, used in the model [`GPTNeoXForTokenClassification`]. - - The dropout ratio for the hidden layer. + The dropout ratio for the c;assifier head. max_position_embeddings (`int`, *optional*, defaults to 2048): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). 
@@ -119,7 +116,6 @@ def __init__( num_attention_heads: Optional[int] = 64, intermediate_size: Optional[int] = 24576, hidden_act: Optional[str] = "gelu", - rotary_pct: Optional[float] = 0.25, attention_dropout: Optional[float] = 0.0, hidden_dropout: Optional[float] = 0.0, classifier_dropout: Optional[float] = 0.1, @@ -143,8 +139,6 @@ def __init__( self.num_attention_heads = num_attention_heads self.intermediate_size = intermediate_size self.hidden_act = hidden_act - self.rotary_pct = rotary_pct - self.partial_rotary_factor = rotary_pct self.attention_dropout = attention_dropout self.hidden_dropout = hidden_dropout self.classifier_dropout = classifier_dropout @@ -156,10 +150,8 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 0.25) self.attention_bias = attention_bias - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rotary_emb_base", 10000.0) diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index fc7d6fd40a80..e3cd7c7d4d39 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -88,7 +88,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/gpt_neox/modular_gpt_neox.py b/src/transformers/models/gpt_neox/modular_gpt_neox.py index c267753db350..a2baca515668 100644 --- a/src/transformers/models/gpt_neox/modular_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modular_gpt_neox.py @@ -62,7 +62,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index 409232145f2a..077b6429f510 100644 --- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -49,8 +49,6 @@ class GPTNeoXJapaneseConfig(PreTrainedConfig): intermediate_multiple_size. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. - rotary_pct (`float`, *optional*, defaults to 1.00): - percentage of hidden dimensions to allocate to rotary embeddings max_position_embeddings (`int`, *optional*, defaults to 2048): The maximum sequence length that this model might ever be used with. 
initializer_range (`float`, *optional*, defaults to 0.02): @@ -93,7 +91,6 @@ def __init__( num_attention_heads: Optional[int] = 32, intermediate_multiple_size: Optional[int] = 4, hidden_act: Optional[str] = "gelu", - rotary_pct: Optional[float] = 1.00, max_position_embeddings: Optional[int] = 2048, initializer_range: Optional[float] = 0.02, layer_norm_eps: Optional[int] = 1e-5, @@ -113,19 +110,15 @@ def __init__( self.num_attention_heads = num_attention_heads self.intermediate_multiple_size = intermediate_multiple_size self.hidden_act = hidden_act - self.rotary_pct = rotary_pct - self.partial_rotary_factor = rotary_pct self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 1.0) self.attention_dropout = attention_dropout self.hidden_dropout = hidden_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rotary_emb_base", 10000.0) diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index e04909e1f7eb..59513e50477d 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ b/src/transformers/models/moonshine/configuration_moonshine.py @@ -87,8 +87,6 @@ class MoonshineConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. 
- partial_rotary_factor (`float`, *optional*, defaults to 0.9): - Percentage of the query and keys which will have rotary embedding. is_encoder_decoder (`bool`, *optional*, defaults to `True`): Whether the model is used as an encoder/decoder or not. attention_bias (`bool`, *optional*, defaults to `False`): @@ -142,7 +140,6 @@ def __init__( decoder_start_token_id: Optional[int] = 1, use_cache: Optional[bool] = True, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.9, is_encoder_decoder: Optional[bool] = True, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, @@ -174,13 +171,13 @@ def __init__( self.initializer_range = initializer_range self.decoder_start_token_id = decoder_start_token_id self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.is_encoder_decoder = is_encoder_decoder self.attention_bias = attention_bias self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/moonshine/modeling_moonshine.py b/src/transformers/models/moonshine/modeling_moonshine.py index 0840c1623489..e54541c91b2b 100644 --- a/src/transformers/models/moonshine/modeling_moonshine.py +++ b/src/transformers/models/moonshine/modeling_moonshine.py @@ -118,7 +118,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 38314c4535a6..ab379620345c 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -110,8 +110,6 @@ class MoonshineConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 0.9): - Percentage of the query and keys which will have rotary embedding. is_encoder_decoder (`bool`, *optional*, defaults to `True`): Whether the model is used as an encoder/decoder or not. 
attention_bias (`bool`, *optional*, defaults to `False`): @@ -165,7 +163,6 @@ def __init__( decoder_start_token_id: Optional[int] = 1, use_cache: Optional[bool] = True, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.9, is_encoder_decoder: Optional[bool] = True, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, @@ -197,13 +194,13 @@ def __init__( self.initializer_range = initializer_range self.decoder_start_token_id = decoder_start_token_id self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.is_encoder_decoder = is_encoder_decoder self.attention_bias = attention_bias self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/nemotron/configuration_nemotron.py b/src/transformers/models/nemotron/configuration_nemotron.py index c5f888ac6d36..f9d57bf46b73 100644 --- a/src/transformers/models/nemotron/configuration_nemotron.py +++ b/src/transformers/models/nemotron/configuration_nemotron.py @@ -80,7 +80,6 @@ class NemotronConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): Percentage of the query and keys which will have rotary embedding. 
attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -123,7 +122,6 @@ def __init__( eos_token_id: Optional[int] = 3, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.5, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, @@ -141,13 +139,13 @@ def __init__( self.initializer_range = initializer_range self.norm_eps = norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index af1d14ee2da0..751305b0a0ea 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -132,7 +132,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py index f9dbe11580b2..3760519d4266 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -70,8 +70,6 @@ class PersimmonConfig(PreTrainedConfig): The dropout ratio after applying the MLP to the hidden states. attention_dropout (`float`, *optional*, default to 0.0): The dropout ratio after computing the attention scores. - partial_rotary_factor (`float`, *optional*, default to 0.5): - Percentage of the query and keys which will have rotary embedding. 
Example: @@ -102,7 +100,6 @@ def __init__( qk_layernorm: Optional[bool] = True, hidden_dropout: Optional[float] = 0.0, attention_dropout: Optional[float] = 0.0, - partial_rotary_factor: Optional[float] = 0.5, pad_token_id: Optional[int] = None, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, @@ -121,10 +118,10 @@ def __init__( self.qk_layernorm = qk_layernorm self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout - self.partial_rotary_factor = partial_rotary_factor # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 25000.0) diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index 4b09a2dd75bf..094b26dbabc0 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -99,7 +99,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index 5476cb1b6c7c..d386b30c6959 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -79,8 +79,6 @@ class PhiConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - Percentage of the query and keys which will have rotary embedding. qk_layernorm (`bool`, *optional*, defaults to `False`): Whether or not to normalize the Queries and Keys after projecting the hidden states. 
bos_token_id (`int`, *optional*, defaults to 1): @@ -138,7 +136,6 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.5, qk_layernorm: Optional[bool] = False, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, @@ -162,11 +159,11 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.qk_layernorm = qk_layernorm # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 4a1530b78564..5e91a9f70265 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -70,7 +70,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/phi/modular_phi.py b/src/transformers/models/phi/modular_phi.py index 3ecc9ba9d4f7..a06e3b16cfc6 100644 --- a/src/transformers/models/phi/modular_phi.py +++ b/src/transformers/models/phi/modular_phi.py @@ -54,7 +54,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index 35eb2df30c9d..6581a2ce9b36 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -81,8 +81,6 @@ class Phi3Config(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 1.0): - Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0. bos_token_id (`int`, *optional*, defaults to 1): The id of the "beginning-of-sequence" token. 
eos_token_id (`int`, *optional*, defaults to 32000): @@ -140,7 +138,6 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 1.0, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 32000, pad_token_id: Optional[int] = 32000, @@ -166,10 +163,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 29b3d2847ed1..3f98bb1b0042 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -104,7 +104,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index 46c104d027a7..53f686234fd2 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -296,8 +296,6 @@ class Phi4MultimodalConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to `1.0`): - Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0. bos_token_id (`int`, *optional*, defaults to 199999): The id of the "beginning-of-sequence" token. 
eos_token_id (`int` or `list[int]`, *optional*, defaults to `[199999, 200020]`): @@ -367,7 +365,6 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[int] = 1, bos_token_id: Optional[int] = 199999, eos_token_id: Optional[list[int]] = [199999, 200020], pad_token_id: Optional[int] = 199999, @@ -407,10 +404,10 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py index eab15068d252..304d89e6a5f0 100644 --- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py @@ -1481,7 +1481,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py b/src/transformers/models/qwen3_next/configuration_qwen3_next.py index 83eb062cb6f8..0527148166c0 100644 --- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py +++ b/src/transformers/models/qwen3_next/configuration_qwen3_next.py @@ -71,8 +71,6 @@ class Qwen3NextConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 0.25): - Percentage of the query and keys which will have rotary embedding. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -166,7 +164,6 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.25, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, head_dim: Optional[int] = 256, @@ -199,13 +196,13 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.head_dim = head_dim # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.25) self.layer_types = layer_types if self.layer_types is None: diff --git a/src/transformers/models/qwen3_next/modeling_qwen3_next.py b/src/transformers/models/qwen3_next/modeling_qwen3_next.py index 362c8fab007f..d0bf37e64de2 100644 --- a/src/transformers/models/qwen3_next/modeling_qwen3_next.py +++ b/src/transformers/models/qwen3_next/modeling_qwen3_next.py @@ -213,7 +213,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/qwen3_next/modular_qwen3_next.py b/src/transformers/models/qwen3_next/modular_qwen3_next.py index 7deedb9c868b..d4f9d017d2d3 100644 --- a/src/transformers/models/qwen3_next/modular_qwen3_next.py +++ b/src/transformers/models/qwen3_next/modular_qwen3_next.py @@ -203,7 +203,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py index 130044ee099d..54b482141b42 100644 --- a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py @@ -75,8 +75,6 @@ class RecurrentGemmaConfig(PreTrainedConfig): Beginning of stream token id. hidden_activation (``str` or `function``, *optional*, defaults to `"gelu_pytorch_tanh"`): The hidden activation used in the recurrent block as well as the MLP layer of the decoder layers. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - The partial rotary factor used in the initialization of the rotary embeddings. 
rope_parameters (`RopeParameters`, *optional*): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionaty should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE @@ -119,7 +117,6 @@ def __init__( eos_token_id: Optional[int] = 1, bos_token_id: Optional[int] = 2, hidden_activation: Optional[str] = "gelu_pytorch_tanh", - partial_rotary_factor: Optional[float] = 0.5, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, block_types: Optional[list[str]] = ("recurrent", "recurrent", "attention"), attention_dropout: Optional[float] = 0.0, @@ -139,7 +136,6 @@ def __init__( self.logits_soft_cap = logits_soft_cap self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.block_types = list(block_types) self.hidden_activation = hidden_activation self.head_dim = self.hidden_size // self.num_attention_heads @@ -153,6 +149,7 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py index dc1a3d4951e2..8f3061d495a0 100644 --- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py @@ -102,7 +102,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index 0efdcd94adcd..a0ddcc33a79c 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -86,8 +86,6 @@ class StableLmConfig(PreTrainedConfig): The dropout ratio after applying the MLP to the hidden states. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - partial_rotary_factor (`float`, *optional*, defaults to 0.25): - Percentage of the query and keys which will have rotary embedding. bos_token_id (int, *optional*, defaults to 0): The id of the `BOS` token in the vocabulary. 
eos_token_id (int, *optional*, defaults to 0): @@ -125,7 +123,6 @@ def __init__( use_parallel_residual: Optional[bool] = False, hidden_dropout: Optional[float] = 0.0, attention_dropout: Optional[float] = 0.0, - partial_rotary_factor: Optional[float] = 0.25, bos_token_id: Optional[int] = 0, eos_token_id: Optional[int] = 0, **kwargs, @@ -148,10 +145,10 @@ def __init__( self.use_parallel_residual = use_parallel_residual self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout - self.partial_rotary_factor = partial_rotary_factor # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.25) # Validate the correctness of rotary position embeddings parameters rope_theta = kwargs.get("rope_theta", 10000.0) diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 3b091726fab4..fd56e5642cf0 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -98,7 +98,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) From 2ee00d06bcb60b917760d21c062678855b74f196 Mon Sep 17 00:00:00 2001 From: raushan Date: Tue, 18 Nov 2025 16:02:44 +0100 Subject: [PATCH 02/23] batch push --- src/transformers/modeling_rope_utils.py | 181 +++++++++--------- .../models/apertus/configuration_apertus.py | 11 +- .../models/apertus/modular_apertus.py | 7 +- .../models/arcee/configuration_arcee.py | 10 +- .../models/aria/configuration_aria.py | 10 +- .../models/bamba/configuration_bamba.py | 10 +- .../models/bitnet/configuration_bitnet.py | 10 +- .../models/blt/configuration_blt.py | 39 ++-- .../chameleon/configuration_chameleon.py | 9 +- .../models/cohere/configuration_cohere.py | 9 +- .../models/cohere2/configuration_cohere2.py | 10 +- .../models/cohere2/modular_cohere2.py | 11 +- .../models/csm/configuration_csm.py | 18 +- .../models/cwm/configuration_cwm.py | 10 +- src/transformers/models/cwm/modular_cwm.py | 6 +- .../models/dbrx/configuration_dbrx.py | 9 +- .../deepseek_v2/configuration_deepseek_v2.py | 10 +- .../deepseek_v3/configuration_deepseek_v3.py | 10 +- .../open_llama/configuration_open_llama.py | 3 +- .../models/dia/configuration_dia.py | 18 +- .../diffllama/configuration_diffllama.py | 10 +- .../models/doge/configuration_doge.py | 10 +- src/transformers/models/doge/modular_doge.py | 10 +- .../models/dots1/configuration_dots1.py | 10 +- .../configuration_efficientloftr.py | 7 +- .../models/emu3/configuration_emu3.py | 10 +- .../models/ernie4_5/configuration_ernie4_5.py | 10 +- .../configuration_ernie4_5_moe.py | 10 +- .../models/evolla/configuration_evolla.py | 10 +- .../models/exaone4/configuration_exaone4.py | 10 +- 
.../models/exaone4/modular_exaone4.py | 10 +- .../models/falcon/configuration_falcon.py | 10 +- .../falcon_h1/configuration_falcon_h1.py | 10 +- .../flex_olmo/configuration_flex_olmo.py | 10 +- .../models/flex_olmo/modular_flex_olmo.py | 7 +- .../models/fuyu/configuration_fuyu.py | 10 +- .../models/gemma/configuration_gemma.py | 10 +- .../models/gemma/modular_gemma.py | 10 +- .../models/gemma2/configuration_gemma2.py | 10 +- .../models/gemma2/modular_gemma2.py | 11 +- .../models/gemma3/configuration_gemma3.py | 29 ++- .../models/gemma3/modular_gemma3.py | 28 ++- .../models/gemma3n/configuration_gemma3n.py | 28 +-- .../models/gemma3n/modular_gemma3n.py | 24 ++- .../models/glm/configuration_glm.py | 10 +- .../models/glm4/configuration_glm4.py | 10 +- .../models/glm4_moe/configuration_glm4_moe.py | 10 +- .../models/glm4_moe/modular_glm4_moe.py | 10 +- .../models/glm4v/configuration_glm4v.py | 10 +- .../models/glm4v/modular_glm4v.py | 10 +- .../glm4v_moe/configuration_glm4v_moe.py | 10 +- .../models/glm4v_moe/modular_glm4v_moe.py | 10 +- .../models/gpt_neox/configuration_gpt_neox.py | 10 +- .../configuration_gpt_neox_japanese.py | 10 +- .../models/gpt_oss/configuration_gpt_oss.py | 10 +- .../models/granite/configuration_granite.py | 12 +- .../granitemoe/configuration_granitemoe.py | 12 +- .../configuration_granitemoehybrid.py | 10 +- .../configuration_granitemoeshared.py | 12 +- .../models/helium/configuration_helium.py | 10 +- .../configuration_hunyuan_v1_dense.py | 10 +- .../configuration_hunyuan_v1_moe.py | 10 +- .../models/jetmoe/configuration_jetmoe.py | 10 +- .../configuration_kyutai_speech_to_text.py | 10 +- .../models/lfm2/configuration_lfm2.py | 10 +- .../models/lfm2_moe/configuration_lfm2_moe.py | 10 +- .../models/llama/configuration_llama.py | 10 +- .../models/llama4/configuration_llama4.py | 18 +- .../configuration_longcat_flash.py | 10 +- .../models/mimi/configuration_mimi.py | 10 +- .../models/minimax/configuration_minimax.py | 10 +- 
.../models/minimax/modular_minimax.py | 10 +- .../ministral/configuration_ministral.py | 10 +- .../models/ministral/modular_ministral.py | 10 +- .../models/mistral/configuration_mistral.py | 10 +- .../models/mixtral/configuration_mixtral.py | 10 +- .../models/mllama/configuration_mllama.py | 10 +- .../modernbert/configuration_modernbert.py | 25 ++- .../models/modernbert/modular_modernbert.py | 25 ++- .../configuration_modernbert_decoder.py | 25 ++- .../modular_modernbert_decoder.py | 25 ++- .../moonshine/configuration_moonshine.py | 10 +- .../models/moonshine/modular_moonshine.py | 10 +- .../models/moshi/configuration_moshi.py | 10 +- .../models/nemotron/configuration_nemotron.py | 10 +- .../models/olmo/configuration_olmo.py | 10 +- .../models/olmo2/configuration_olmo2.py | 10 +- .../models/olmo3/configuration_olmo3.py | 10 +- .../models/olmo3/modular_olmo3.py | 10 +- .../models/olmoe/configuration_olmoe.py | 10 +- .../persimmon/configuration_persimmon.py | 10 +- .../models/phi/configuration_phi.py | 10 +- .../models/phi3/configuration_phi3.py | 10 +- .../configuration_phi4_multimodal.py | 10 +- .../models/phimoe/configuration_phimoe.py | 11 +- .../models/pixtral/configuration_pixtral.py | 10 +- .../models/qwen2/configuration_qwen2.py | 10 +- .../configuration_qwen2_5_omni.py | 26 +-- .../qwen2_5_omni/modular_qwen2_5_omni.py | 26 +-- .../qwen2_5_vl/configuration_qwen2_5_vl.py | 10 +- .../qwen2_moe/configuration_qwen2_moe.py | 10 +- .../models/qwen2_vl/configuration_qwen2_vl.py | 10 +- .../models/qwen3/configuration_qwen3.py | 10 +- .../qwen3_moe/configuration_qwen3_moe.py | 10 +- .../qwen3_next/configuration_qwen3_next.py | 10 +- .../configuration_qwen3_omni_moe.py | 35 ++-- .../qwen3_omni_moe/modular_qwen3_omni_moe.py | 15 +- .../models/qwen3_vl/configuration_qwen3_vl.py | 10 +- .../models/qwen3_vl/modular_qwen3_vl.py | 10 +- .../configuration_qwen3_vl_moe.py | 10 +- .../qwen3_vl_moe/modular_qwen3_vl_moe.py | 10 +- .../configuration_recurrent_gemma.py | 10 +- 
.../models/seed_oss/configuration_seed_oss.py | 10 +- .../models/smollm3/configuration_smollm3.py | 7 +- .../models/smollm3/modular_smollm3.py | 7 +- .../models/stablelm/configuration_stablelm.py | 10 +- .../starcoder2/configuration_starcoder2.py | 10 +- .../models/t5gemma/configuration_t5gemma.py | 10 +- .../vaultgemma/configuration_vaultgemma.py | 10 +- .../models/zamba2/configuration_zamba2.py | 10 +- 120 files changed, 806 insertions(+), 790 deletions(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 5d9bfc936e81..aebf3a65b777 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -27,6 +27,87 @@ import torch +class RopeParameters(TypedDict, total=False): + """ + Args: + rope_theta (`float`): + The base period of the RoPE embeddings. + rope_type (`str`, *optional*, defaults to "default"): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + partial_rotary_factor (`float`, *optional*): + Percentage of the query and keys which will have rotary embedding. + factor (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + original_max_position_embeddings (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + attention_factor (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + beta_fast (`float`, *optional*): + Only used with 'yarn'. 
Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + beta_slow (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + short_factor (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + long_factor (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + low_freq_factor (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + high_freq_factor (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + """ + + rope_theta: float + rope_type: Optional[str] + partial_rotary_factor: Optional[float] + factor: Optional[float] + original_max_position_embeddings: Optional[int] + attention_factor: Optional[float] + beta_fast: Optional[float] + beta_slow: Optional[float] + short_factor: Optional[list[float]] + long_factor: Optional[list[float]] + low_freq_factor: Optional[float] + high_freq_factor: Optional[float] + + +def get_standardized_rope_params(config): + """ + Helper to standardize the config's rope params field by ensuring the params are defined for each + later type. 
For old model the fn will duplicate a single rope param in each layer type (backward compatibility) + """ + rope_parameters = getattr(config, "rope_parameters", {}) + + # Move `rope_theta` and `partial_rotary_factor` to the params dict, if not there yet + rope_theta = getattr(config, "rope_theta", None) + partial_rotary_factor = getattr(config, "partial_rotary_factor", None) + + # Case 1: one RoPE theat = one RoPE param per model without nesting + if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + rope_parameters.setdefault("rope_type", rope_parameters.get("type", "default")) + rope_parameters.setdefault("rope_theta", rope_theta) + rope_parameters.setdefault("partial_rotary_factor", partial_rotary_factor) + # Case 2: different RoPE for each layer as nested dict + else: + for layer_type in config.layer_types: + rope_parameters[layer_type].setdefault("rope_type", rope_parameters[layer_type].get("type", "default")) + rope_parameters[layer_type].setdefault("rope_theta", rope_theta) + rope_parameters[layer_type].setdefault("partial_rotary_factor", partial_rotary_factor) + + return rope_parameters + + def dynamic_rope_update(rope_forward): """ Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE @@ -157,8 +238,8 @@ def _compute_linear_scaling_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) - rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters + rope_parameters_dict = get_standardized_rope_params(config) + rope_parameters_dict = rope_parameters_dict[layer_type] if layer_type is not None else rope_parameters_dict factor = rope_parameters_dict["factor"] # Gets the default RoPE parameters @@ -222,7 +303,7 @@ def _compute_dynamic_ntk_parameters( """ # TODO (joao): use the new `original_max_position_embeddings` from rope_parameters # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) + rope_parameters_dict = get_standardized_rope_params(config) rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] @@ -309,7 +390,7 @@ def _compute_yarn_parameters( post-processing scaling factor applied to the computed cos/sin. 
""" # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) + rope_parameters_dict = get_standardized_rope_params(config) rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] @@ -439,7 +520,7 @@ def _compute_longrope_parameters( """ # TODO (joao): use the new `original_max_position_embeddings` from rope_parameters # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) + rope_parameters_dict = get_standardized_rope_params(config) rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] @@ -523,7 +604,7 @@ def _compute_llama3_parameters( post-processing scaling factor applied to the computed cos/sin. """ # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) + rope_parameters_dict = get_standardized_rope_params(config) rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters # Gets the default RoPE parameters @@ -803,14 +884,16 @@ def _validate_llama3_parameters(rope_parameters: dict, config: PreTrainedConfig, } -def rope_config_validation(config: PreTrainedConfig, ignore_keys: Optional[set] = None): +def rope_config_standardize_and_validate(config: PreTrainedConfig, ignore_keys: Optional[set] = None): """ Validate the RoPE config arguments, given a `PreTrainedConfig` object """ - rope_parameters_dict = getattr(config, "rope_parameters", None) # not a default parameter in `PreTrainedConfig` + rope_parameters_dict = get_standardized_rope_params(config) if rope_parameters_dict is None: return + # Update the config with correctly formatted RoPE parameters + config.rope_parameters = rope_parameters_dict if 
set(rope_parameters_dict.keys()).issubset(ALLOWED_LAYER_TYPES): pass else: @@ -830,85 +913,3 @@ def rope_config_validation(config: PreTrainedConfig, ignore_keys: Optional[set] logger.warning( f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" ) - - -class RopeParameters(TypedDict, total=False): - """ - Args: - rope_theta (`float`): - The base period of the RoPE embeddings. - rope_type (`str`, *optional*, defaults to "default"): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - partial_rotary_factor (`float`, *optional*): - Percentage of the query and keys which will have rotary embedding. - factor (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - original_max_position_embeddings (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - attention_factor (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - beta_fast (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - beta_slow (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - short_factor (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - long_factor (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - low_freq_factor (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - high_freq_factor (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE - """ - - rope_theta: float - rope_type: Optional[str] - partial_rotary_factor: Optional[float] - factor: Optional[float] - original_max_position_embeddings: Optional[int] - attention_factor: Optional[float] - beta_fast: Optional[float] - beta_slow: Optional[float] - short_factor: Optional[list[float]] - long_factor: Optional[list[float]] - low_freq_factor: Optional[float] - high_freq_factor: Optional[float] - - -def standardize_rope_params(config): - """ - Helper to standardize the config's rope params field by ensuring the params are defined for each - later type. 
For old model the fn will duplicate a single rope param in each layer type (backward compatibility) - """ - rope_parameters = getattr(config, "rope_parameters", {}) - - # Move `rope_theta` and `partial_rotary_factor` to the params dict, if not there yet - rope_theta = getattr(config, "rope_theta", None) - partial_rotary_factor = getattr(config, "partial_rotary_factor", None) - - # Case 1: one RoPE theat = one RoPE param per model without nesting - if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): - rope_parameters.setdefault("rope_type", rope_parameters.get("type", "default")) - rope_parameters.setdefault("rope_theta", rope_theta) - rope_parameters.setdefault("partial_rotary_factor", partial_rotary_factor) - # Case 2: different RoPE for each layer as nested dict - else: - for layer_type in config.layer_types: - rope_parameters[layer_type].setdefault("rope_type", rope_parameters[layer_type].get("type", "default")) - rope_parameters[layer_type].setdefault("rope_theta", rope_theta) - rope_parameters[layer_type].setdefault("partial_rotary_factor", partial_rotary_factor) - - config.rope_parameters = rope_parameters - rope_config_validation(config) diff --git a/src/transformers/models/apertus/configuration_apertus.py b/src/transformers/models/apertus/configuration_apertus.py index 98fe8e157016..4cf305b19726 100644 --- a/src/transformers/models/apertus/configuration_apertus.py +++ b/src/transformers/models/apertus/configuration_apertus.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class ApertusConfig(PreTrainedConfig): @@ -160,14 +160,15 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use 
`rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 12000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + rope_parameters["rope_theta"] = kwargs.get("rope_theta", 12000000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/apertus/modular_apertus.py b/src/transformers/models/apertus/modular_apertus.py index b52e8bd82344..15498aaf1ec9 100644 --- a/src/transformers/models/apertus/modular_apertus.py +++ b/src/transformers/models/apertus/modular_apertus.py @@ -20,7 +20,7 @@ from torch import nn from ...cache_utils import Cache -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -180,9 +180,8 @@ def __init__( del self.head_dim # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 12000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 12000000.0) + rope_config_standardize_and_validate(self) class ApertusMLP(NemotronMLP): diff --git a/src/transformers/models/arcee/configuration_arcee.py b/src/transformers/models/arcee/configuration_arcee.py index b9892eaf8b61..26e36e123ab9 100644 --- a/src/transformers/models/arcee/configuration_arcee.py +++ b/src/transformers/models/arcee/configuration_arcee.py @@ -22,7 +22,7 @@ 
from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class ArceeConfig(PreTrainedConfig): @@ -165,12 +165,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py index 7e11d4d99d11..6bcd0df6d9fa 100644 --- a/src/transformers/models/aria/configuration_aria.py +++ b/src/transformers/models/aria/configuration_aria.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ..auto import CONFIG_MAPPING, AutoConfig @@ -170,12 +170,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - 
self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/bamba/configuration_bamba.py b/src/transformers/models/bamba/configuration_bamba.py index f849fbb3cefa..943ffb38102d 100644 --- a/src/transformers/models/bamba/configuration_bamba.py +++ b/src/transformers/models/bamba/configuration_bamba.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -174,13 +174,13 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = 0.5 # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) mamba_intermediate = mamba_expand * hidden_size diff --git a/src/transformers/models/bitnet/configuration_bitnet.py 
b/src/transformers/models/bitnet/configuration_bitnet.py index 0473ad6ac407..0918b6470723 100644 --- a/src/transformers/models/bitnet/configuration_bitnet.py +++ b/src/transformers/models/bitnet/configuration_bitnet.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -140,12 +140,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/blt/configuration_blt.py b/src/transformers/models/blt/configuration_blt.py index 7459346645ea..2c2992e3ce25 100644 --- a/src/transformers/models/blt/configuration_blt.py +++ b/src/transformers/models/blt/configuration_blt.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -67,12 +67,12 @@ def __init__( self.initializer_range = initializer_range # Try to set `rope_scaling` if available, 
otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -122,12 +122,12 @@ def __init__( self.initializer_range = initializer_range # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -169,11 +169,12 @@ def __init__( self.initializer_range = initializer_range # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - 
rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -249,11 +250,12 @@ def __init__( self.initializer_range = initializer_range # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -377,11 +379,12 @@ def __init__( self.monotonicity = kwargs.get("monotonicity", False) # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) # Cross attention configurations self.cross_attn_k = cross_attn_k diff --git a/src/transformers/models/chameleon/configuration_chameleon.py 
b/src/transformers/models/chameleon/configuration_chameleon.py index bfa8a9f33469..003265a6cec5 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -232,11 +232,12 @@ def __init__( self.swin_norm = swin_norm # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) if vq_config is None: vq_config = {} diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py index 18afd5fd32e9..23656ae0ad5e 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -167,11 +167,12 @@ def __init__( self.use_qk_norm = use_qk_norm # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - 
self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cohere2/configuration_cohere2.py b/src/transformers/models/cohere2/configuration_cohere2.py index 910dc6dcb80a..12126d4f7b22 100644 --- a/src/transformers/models/cohere2/configuration_cohere2.py +++ b/src/transformers/models/cohere2/configuration_cohere2.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Cohere2Config(PreTrainedConfig): @@ -168,7 +168,8 @@ def __init__( self.layer_types = layer_types # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Need to specify head_dim in the config so it can be used in the attention forward functions self.head_dim = hidden_size // num_attention_heads @@ -193,9 +194,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + 
self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) __all__ = ["Cohere2Config"] diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index af9fa871f391..f769bf7c204b 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ b/src/transformers/models/cohere2/modular_cohere2.py @@ -27,8 +27,7 @@ from ...modeling_rope_utils import ( RopeParameters, dynamic_rope_update, - rope_config_validation, - standardize_rope_params, + rope_config_standardize_and_validate, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack @@ -192,7 +191,8 @@ def __init__( self.layer_types = layer_types # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Need to specify head_dim in the config so it can be used in the attention forward functions self.head_dim = hidden_size // num_attention_heads @@ -217,9 +217,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) class Cohere2RotaryEmbedding(CohereRotaryEmbedding): diff --git a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index ce1ad2dd5993..eac38977457a 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -16,7 +16,7 @@ from typing import Optional 
from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -163,12 +163,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) class CsmConfig(PreTrainedConfig): @@ -350,12 +350,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cwm/configuration_cwm.py 
b/src/transformers/models/cwm/configuration_cwm.py index 765f7f713247..5673657c6fa2 100644 --- a/src/transformers/models/cwm/configuration_cwm.py +++ b/src/transformers/models/cwm/configuration_cwm.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import rope_config_standardize_and_validate class CwmConfig(PreTrainedConfig): @@ -179,12 +179,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1_000_000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cwm/modular_cwm.py b/src/transformers/models/cwm/modular_cwm.py index df2a003438a8..ac093fd13733 100644 --- a/src/transformers/models/cwm/modular_cwm.py +++ b/src/transformers/models/cwm/modular_cwm.py @@ -21,7 +21,7 @@ from ...configuration_utils import layer_type_validation from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import standardize_rope_params +from ...modeling_rope_utils import rope_config_standardize_and_validate from ...processing_utils import Unpack from ...utils import TransformersKwargs, 
logging from ..llama.configuration_llama import LlamaConfig @@ -183,8 +183,8 @@ def __init__( del self.attention_bias # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1_000_000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + rope_config_standardize_and_validate(self) class CwmRotaryEmbedding(Qwen2RotaryEmbedding): diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 82182c49bd3f..193ba0909711 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -17,7 +17,7 @@ from typing import Any, Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -223,11 +223,12 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters["rope_theta"] = 10000.0 # Validate the correctness of rotary position embeddings parameters - standardize_rope_params(self, rope_theta=10000.0) - rope_config_validation(self) + rope_config_standardize_and_validate(self) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py index 101d699194fd..75ed18f6d3f7 100644 --- a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py +++ 
b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class DeepseekV2Config(PreTrainedConfig): @@ -211,12 +211,12 @@ def __init__( self.head_dim = qk_rope_head_dim # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py index 928a0e1fcf7a..19804559fcdf 100644 --- a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py @@ -19,7 +19,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {} @@ -227,17 +227,17 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", 
None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) for key in ["beta_fast", "beta_slow", "factor"]: if key in self.rope_parameters: self.rope_parameters[key] = float(self.rope_parameters[key]) - rope_config_validation(self) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/deprecated/open_llama/configuration_open_llama.py b/src/transformers/models/deprecated/open_llama/configuration_open_llama.py index 64545d7abcf6..96d23881617a 100644 --- a/src/transformers/models/deprecated/open_llama/configuration_open_llama.py +++ b/src/transformers/models/deprecated/open_llama/configuration_open_llama.py @@ -132,7 +132,8 @@ def __init__( self.rope_theta = rope_theta # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self._rope_parameters_validation() super().__init__( diff --git a/src/transformers/models/dia/configuration_dia.py b/src/transformers/models/dia/configuration_dia.py index b54b5620e524..076e9c42c528 100644 --- a/src/transformers/models/dia/configuration_dia.py +++ b/src/transformers/models/dia/configuration_dia.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import 
RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -94,12 +94,12 @@ def __init__( self.initializer_range = initializer_range # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__(**kwargs) @@ -200,12 +200,12 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) diff --git a/src/transformers/models/diffllama/configuration_diffllama.py b/src/transformers/models/diffllama/configuration_diffllama.py index cbfb5fea5160..b946d57c0f7d 100644 --- a/src/transformers/models/diffllama/configuration_diffllama.py +++ b/src/transformers/models/diffllama/configuration_diffllama.py @@ -20,7 +20,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from 
...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class DiffLlamaConfig(PreTrainedConfig): @@ -147,12 +147,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/doge/configuration_doge.py b/src/transformers/models/doge/configuration_doge.py index b1058db36a72..250fb272045c 100644 --- a/src/transformers/models/doge/configuration_doge.py +++ b/src/transformers/models/doge/configuration_doge.py @@ -23,7 +23,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class DogeConfig(PreTrainedConfig): @@ -191,12 +191,12 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if 
rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # for backward compatibility if num_key_value_heads is None: diff --git a/src/transformers/models/doge/modular_doge.py b/src/transformers/models/doge/modular_doge.py index 008466fbf4ac..0090ccaafd68 100644 --- a/src/transformers/models/doge/modular_doge.py +++ b/src/transformers/models/doge/modular_doge.py @@ -31,7 +31,7 @@ from ...integrations.flex_attention import compile_friendly_flex_attention from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import AttentionInterface, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, is_torch_flex_attn_available, logging @@ -220,12 +220,12 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # for 
backward compatibility if num_key_value_heads is None: diff --git a/src/transformers/models/dots1/configuration_dots1.py b/src/transformers/models/dots1/configuration_dots1.py index 71393a7844ba..b645d8df9652 100644 --- a/src/transformers/models/dots1/configuration_dots1.py +++ b/src/transformers/models/dots1/configuration_dots1.py @@ -15,7 +15,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -194,7 +194,8 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -207,9 +208,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/efficientloftr/configuration_efficientloftr.py b/src/transformers/models/efficientloftr/configuration_efficientloftr.py index 8ba99bd02d23..0643b35dd6ed 100644 --- a/src/transformers/models/efficientloftr/configuration_efficientloftr.py +++ b/src/transformers/models/efficientloftr/configuration_efficientloftr.py @@ -14,7 +14,7 @@ from typing import Optional from ...configuration_utils import 
PreTrainedConfig -from ...modeling_rope_utils import standardize_rope_params +from ...modeling_rope_utils import rope_config_standardize_and_validate class EfficientLoFTRConfig(PreTrainedConfig): @@ -176,12 +176,13 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters or {} + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} or {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 4.0) self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) # Standardize and validate the correctness of rotary position embeddings parameters - standardize_rope_params(self) + rope_config_standardize_and_validate(self) super().__init__(**kwargs) diff --git a/src/transformers/models/emu3/configuration_emu3.py b/src/transformers/models/emu3/configuration_emu3.py index 634efd227f9e..7d4aa1d29449 100644 --- a/src/transformers/models/emu3/configuration_emu3.py +++ b/src/transformers/models/emu3/configuration_emu3.py @@ -17,7 +17,7 @@ from typing import Optional, Union from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Emu3VQVAEConfig(PreTrainedConfig): @@ -228,12 +228,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = 
kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/ernie4_5/configuration_ernie4_5.py b/src/transformers/models/ernie4_5/configuration_ernie4_5.py index 346eff50e9f2..f3439fa4b248 100644 --- a/src/transformers/models/ernie4_5/configuration_ernie4_5.py +++ b/src/transformers/models/ernie4_5/configuration_ernie4_5.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Ernie4_5Config(PreTrainedConfig): @@ -150,12 +150,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py index 66a299b04c00..c6286cca3089 100644 --- a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +++ 
b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -183,12 +183,12 @@ def __init__( self.use_bias = use_bias # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/evolla/configuration_evolla.py b/src/transformers/models/evolla/configuration_evolla.py index 4dab03fb9314..ad3248543725 100644 --- a/src/transformers/models/evolla/configuration_evolla.py +++ b/src/transformers/models/evolla/configuration_evolla.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -251,12 +251,12 @@ def __init__( self.resampler_ff_mult = resampler_ff_mult # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or 
rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) # Subconfig if protein_encoder_config is None: diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py index a968bcc6f07b..9c9f044878f9 100644 --- a/src/transformers/models/exaone4/configuration_exaone4.py +++ b/src/transformers/models/exaone4/configuration_exaone4.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Exaone4Config(PreTrainedConfig): @@ -166,7 +166,8 @@ def __init__( self.sliding_window_pattern = sliding_window_pattern # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.sliding_window is None: @@ -183,9 +184,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + 
rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index 4ddc3466ffd9..56a7b7f12a7b 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -30,7 +30,7 @@ BaseModelOutputWithPast, CausalLMOutputWithPast, ) -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import ( @@ -199,7 +199,8 @@ def __init__( self.sliding_window_pattern = sliding_window_pattern # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.sliding_window is None: @@ -216,9 +217,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py index 3e7b437954dc..fce62fa5a929 100644 --- 
a/src/transformers/models/falcon/configuration_falcon.py +++ b/src/transformers/models/falcon/configuration_falcon.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -164,12 +164,12 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/falcon_h1/configuration_falcon_h1.py b/src/transformers/models/falcon_h1/configuration_falcon_h1.py index 6ba590f15025..b02fe9d46466 100644 --- a/src/transformers/models/falcon_h1/configuration_falcon_h1.py +++ b/src/transformers/models/falcon_h1/configuration_falcon_h1.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -199,12 +199,12 @@ def __init__( self.num_logits_to_keep = num_logits_to_keep # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = 
kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) self.projectors_bias = projectors_bias mamba_intermediate = mamba_expand * hidden_size if mamba_d_ssm is None else mamba_d_ssm diff --git a/src/transformers/models/flex_olmo/configuration_flex_olmo.py b/src/transformers/models/flex_olmo/configuration_flex_olmo.py index 635d398b46d9..bd4832be6636 100644 --- a/src/transformers/models/flex_olmo/configuration_flex_olmo.py +++ b/src/transformers/models/flex_olmo/configuration_flex_olmo.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class FlexOlmoConfig(PreTrainedConfig): @@ -177,12 +177,12 @@ def __init__( self.norm_topk_prob = norm_topk_prob # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + 
rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/flex_olmo/modular_flex_olmo.py b/src/transformers/models/flex_olmo/modular_flex_olmo.py index a25362a71f35..062d8ad141d9 100644 --- a/src/transformers/models/flex_olmo/modular_flex_olmo.py +++ b/src/transformers/models/flex_olmo/modular_flex_olmo.py @@ -21,7 +21,7 @@ from ...cache_utils import Cache, DynamicCache from ...masking_utils import create_causal_mask from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring from ...utils.generic import OutputRecorder, check_model_inputs @@ -192,9 +192,8 @@ def __init__( del self.clip_qkv # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) # FlexOlmo RMS norm reuses Olmo2 RMS norm, which handles low precision slightly differently than the original Olmoe. 
diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index e1315ad9cbca..40e6eed2ca65 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -172,13 +172,13 @@ def __init__( self.image_token_id = image_token_id # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 25000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 25000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index a2c6ac12f008..926ed9c406a5 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import 
RopeParameters, rope_config_standardize_and_validate class GemmaConfig(PreTrainedConfig): @@ -154,12 +154,12 @@ def __init__( self.use_bidirectional_attention = use_bidirectional_attention # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index 1445baef96bf..bb4d219d54aa 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -24,7 +24,7 @@ from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...tokenization_utils import AddedToken, PreTrainedTokenizer @@ -182,12 +182,12 @@ def __init__( self.use_bidirectional_attention = use_bidirectional_attention # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = 
rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py index 460fb7000354..a37b79930226 100644 --- a/src/transformers/models/gemma2/configuration_gemma2.py +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Gemma2Config(PreTrainedConfig): @@ -182,7 +182,8 @@ def __init__( self.use_bidirectional_attention = use_bidirectional_attention # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -191,9 +192,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) __all__ = ["Gemma2Config"] diff --git 
a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index 4e36cc22e030..f62769e66ad0 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -30,8 +30,7 @@ ROPE_INIT_FUNCTIONS, RopeParameters, dynamic_rope_update, - rope_config_validation, - standardize_rope_params, + rope_config_standardize_and_validate, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack @@ -211,7 +210,8 @@ def __init__( self.use_bidirectional_attention = use_bidirectional_attention # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -220,9 +220,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) class Gemma2RMSNorm(GemmaRMSNorm): diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index d549bfffddf1..64c699deda5a 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -22,7 +22,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, 
rope_config_standardize_and_validate from ...utils import logging from ..siglip import SiglipVisionConfig @@ -187,16 +187,18 @@ def __init__( self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - if rope_parameters is None: - rope_parameters = {"sliding_attention": {"rope_type": "default"}, "full_attention": rope_scaling} - elif "full_attention" in rope_parameters: - rope_parameters["full_attention"].update(rope_scaling) - else: - rope_parameters.update(rope_scaling) - - self.rope_parameters = rope_parameters + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) + self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: self.sliding_window = (self.sliding_window // 2) + 1 # due to fa we set exclusive bounds @@ -212,12 +214,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1_000_000.0) - rope_local_base_freq = getattr(self, "rope_local_base_freq", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - 
rope_config_validation(self) + rope_config_standardize_and_validate(self) class Gemma3Config(PreTrainedConfig): diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 47e1b49ac7bb..b8aae16ddc4d 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -30,8 +30,7 @@ ROPE_INIT_FUNCTIONS, RopeParameters, dynamic_rope_update, - rope_config_validation, - standardize_rope_params, + rope_config_standardize_and_validate, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack @@ -203,16 +202,18 @@ def __init__( self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - if rope_parameters is None: - rope_parameters = {"sliding_attention": {"rope_type": "default"}, "full_attention": rope_scaling} - elif "full_attention" in rope_parameters: - rope_parameters["full_attention"].update(rope_scaling) - else: - rope_parameters.update(rope_scaling) + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) - self.rope_parameters = rope_parameters self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: self.sliding_window = (self.sliding_window // 2) + 1 # due to fa we set exclusive bounds @@ -228,12 +229,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1_000_000.0) - rope_local_base_freq = getattr(self, "rope_local_base_freq", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) + rope_config_standardize_and_validate(self) class Gemma3Config(PreTrainedConfig): diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index 796822cf4e37..76981434317b 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -23,7 +23,7 @@ from typing import Any, Optional, Union from ...configuration_utils import 
PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import is_timm_available, logging, requires_backends @@ -225,9 +225,21 @@ def __init__( self.sliding_window = sliding_window self.final_logit_softcapping = final_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) + + # Validate the correctness of rotary position embeddings parameters + rope_config_standardize_and_validate(self) if layer_types is None: self.layer_types = [ @@ -238,14 +250,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - rope_local_base_freq = kwargs.get("rope_local_base_freq", 100000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) - 
self.hidden_size_per_layer_input = hidden_size_per_layer_input self.num_kv_shared_layers = num_kv_shared_layers diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index 375bd93f2723..60d760c3f651 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -28,7 +28,7 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -236,9 +236,6 @@ def __init__( self.sliding_window = sliding_window self.final_logit_softcapping = final_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if layer_types is None: self.layer_types = [ @@ -249,13 +246,20 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) + # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - rope_local_base_freq = kwargs.get("rope_local_base_freq", 100000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) + rope_config_standardize_and_validate(self) self.hidden_size_per_layer_input = hidden_size_per_layer_input self.num_kv_shared_layers = num_kv_shared_layers diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 7f7509730d8f..324e779151cc 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class GlmConfig(PreTrainedConfig): @@ -142,13 +142,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = 
rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index caf6194bba94..33e850e07893 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Glm4Config(PreTrainedConfig): @@ -142,13 +142,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git 
a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py index 10da1ba90aac..b64435c28faf 100644 --- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py +++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Glm4MoeConfig(PreTrainedConfig): @@ -180,13 +180,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py index b8c4cd222475..5e470938d05c 100644 --- a/src/transformers/models/glm4_moe/modular_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py @@ -20,7 +20,7 @@ from torch import nn from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import 
RopeParameters, rope_config_standardize_and_validate from ...utils import logging from ..cohere.modeling_cohere import CohereAttention from ..deepseek_v3.modeling_deepseek_v3 import ( @@ -194,13 +194,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/glm4v/configuration_glm4v.py b/src/transformers/models/glm4v/configuration_glm4v.py index 7370a80b52f2..ff63708923b3 100644 --- a/src/transformers/models/glm4v/configuration_glm4v.py +++ b/src/transformers/models/glm4v/configuration_glm4v.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Glm4vVisionConfig(PreTrainedConfig): @@ -240,12 +240,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = 
rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) self.image_token_id = image_token_id self.video_token_id = video_token_id diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 2df8b6f9d04a..1892343d79d6 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -31,7 +31,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -277,12 +277,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters["rope_theta"] = 
kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) self.image_token_id = image_token_id self.video_token_id = video_token_id diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index 44004afd6c9c..a63c6c664fa6 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Glm4vMoeVisionConfig(PreTrainedConfig): @@ -269,13 +269,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index a0b494664ab1..04b140772939 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ 
b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -23,7 +23,7 @@ from ...masking_utils import create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, logging @@ -215,13 +215,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 8c2c77a8deb3..376f2f0189fd 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, 
standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -149,14 +149,14 @@ def __init__( self.use_parallel_residual = use_parallel_residual # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 0.25) self.attention_bias = attention_bias # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rotary_emb_base", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rotary_emb_base", 10000.0) + rope_config_standardize_and_validate(self) if self.hidden_size % self.num_attention_heads != 0: raise ValueError( "The hidden size is not divisible by the number of attention heads! Make sure to update them!" 
diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index 077b6429f510..517f2a3eacf6 100644 --- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -115,15 +115,15 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 1.0) self.attention_dropout = attention_dropout self.hidden_dropout = hidden_dropout # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rotary_emb_base", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rotary_emb_base", 10000.0) + rope_config_standardize_and_validate(self) __all__ = ["GPTNeoXJapaneseConfig"] diff --git a/src/transformers/models/gpt_oss/configuration_gpt_oss.py b/src/transformers/models/gpt_oss/configuration_gpt_oss.py index d7e714079e39..ef249ff05d6f 100644 --- a/src/transformers/models/gpt_oss/configuration_gpt_oss.py +++ b/src/transformers/models/gpt_oss/configuration_gpt_oss.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, 
layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class GptOssConfig(PreTrainedConfig): @@ -111,12 +111,12 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 150000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 150000.0) + rope_config_standardize_and_validate(self) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/granite/configuration_granite.py b/src/transformers/models/granite/configuration_granite.py index 97d3eca0aafe..5a5abdbd13e1 100644 --- a/src/transformers/models/granite/configuration_granite.py +++ b/src/transformers/models/granite/configuration_granite.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -177,12 +177,12 @@ def __init__( self.attention_multiplier = attention_multiplier # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not 
None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, @@ -192,7 +192,5 @@ def __init__( **kwargs, ) - rope_config_validation(self) - __all__ = ["GraniteConfig"] diff --git a/src/transformers/models/granitemoe/configuration_granitemoe.py b/src/transformers/models/granitemoe/configuration_granitemoe.py index 98460ec8a363..615a5b558044 100644 --- a/src/transformers/models/granitemoe/configuration_granitemoe.py +++ b/src/transformers/models/granitemoe/configuration_granitemoe.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -161,12 +161,12 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) self.attention_bias = attention_bias self.attention_dropout = attention_dropout @@ -189,7 +189,5 @@ def __init__( **kwargs, ) - rope_config_validation(self) - __all__ = 
["GraniteMoeConfig"] diff --git a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py index 9a58272ec428..016b94544364 100644 --- a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -200,12 +200,12 @@ def __init__( self.shared_intermediate_size = shared_intermediate_size # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) mamba_intermediate = mamba_expand * hidden_size diff --git a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py index b94545710e35..c5922ef7c6ce 100644 --- a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, 
rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -166,12 +166,12 @@ def __init__( self.position_embedding_type = "rope" # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) self.attention_bias = attention_bias self.attention_dropout = attention_dropout @@ -195,7 +195,5 @@ def __init__( **kwargs, ) - rope_config_validation(self) - __all__ = ["GraniteMoeSharedConfig"] diff --git a/src/transformers/models/helium/configuration_helium.py b/src/transformers/models/helium/configuration_helium.py index 3f3ee841991f..eb6a07d53134 100644 --- a/src/transformers/models/helium/configuration_helium.py +++ b/src/transformers/models/helium/configuration_helium.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class HeliumConfig(PreTrainedConfig): @@ -149,12 +149,12 @@ def __init__( self.mlp_bias = mlp_bias # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = 
rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 100000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 100000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py index 3dfa5388d1f7..fde580e13223 100644 --- a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +++ b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -143,12 +143,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) # TODO needs model-specific validation? + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # TODO needs model-specific validation? 
super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py index 5ee86b218ae0..1a9edbd96b9e 100644 --- a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +++ b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py @@ -17,7 +17,7 @@ from typing import Optional, Union from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -159,12 +159,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py index 43a7b069a32e..660e47d137da 100644 --- a/src/transformers/models/jetmoe/configuration_jetmoe.py +++ b/src/transformers/models/jetmoe/configuration_jetmoe.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import 
RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -149,12 +149,12 @@ def __init__( self.rms_norm_eps = rms_norm_eps # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py index 05c901d96dd4..5091117a1ea0 100644 --- a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -185,12 +185,12 @@ def __init__( self.sliding_window = sliding_window # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + 
self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/lfm2/configuration_lfm2.py b/src/transformers/models/lfm2/configuration_lfm2.py index 6ee32698cc85..28e72b7aa7f2 100644 --- a/src/transformers/models/lfm2/configuration_lfm2.py +++ b/src/transformers/models/lfm2/configuration_lfm2.py @@ -14,7 +14,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Lfm2Config(PreTrainedConfig): @@ -150,7 +150,8 @@ def __init__( self.block_auto_adjust_ff_dim = block_auto_adjust_ff_dim # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -158,9 +159,8 @@ def __init__( self.layer_types = ["full_attention" if i in full_attn_idxs else "conv" for i in range(num_hidden_layers)] # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("theta", kwargs.get("rope_theta", 1000000.0)) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = 
kwargs.get("theta", kwargs.get("rope_theta", 1000000.0)) + rope_config_standardize_and_validate(self) tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings) # to fit original config keys super().__init__( diff --git a/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py b/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py index f65af16d77b6..8d60a8637f92 100644 --- a/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py +++ b/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py @@ -14,7 +14,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Lfm2MoeConfig(PreTrainedConfig): @@ -138,7 +138,8 @@ def __init__( self.num_hidden_layers = num_hidden_layers # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.max_position_embeddings = max_position_embeddings self.use_cache = use_cache self.norm_eps = norm_eps @@ -162,9 +163,8 @@ def __init__( self.layer_types = layer_types # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("theta", kwargs.get("rope_theta", 1000000.0)) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("theta", kwargs.get("rope_theta", 1000000.0)) + rope_config_standardize_and_validate(self) tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings) # to fit original config keys super().__init__( diff --git a/src/transformers/models/llama/configuration_llama.py 
b/src/transformers/models/llama/configuration_llama.py index add6c8ee2f74..150f54b4b9b9 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class LlamaConfig(PreTrainedConfig): @@ -173,12 +173,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/llama4/configuration_llama4.py b/src/transformers/models/llama4/configuration_llama4.py index 88a0f4f82e53..7de3c3e6a830 100644 --- a/src/transformers/models/llama4/configuration_llama4.py +++ b/src/transformers/models/llama4/configuration_llama4.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -126,12 +126,12 @@ 
def __init__( self.vision_feature_select_strategy = vision_feature_select_strategy # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__(**kwargs) @@ -318,7 +318,8 @@ def __init__( self.use_qk_norm = use_qk_norm # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.num_experts_per_tok = num_experts_per_tok self.num_local_experts = num_local_experts @@ -353,9 +354,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) class Llama4Config(PreTrainedConfig): diff --git a/src/transformers/models/longcat_flash/configuration_longcat_flash.py b/src/transformers/models/longcat_flash/configuration_longcat_flash.py index e99c2b8265c2..80868a6ca1d4 100644 --- a/src/transformers/models/longcat_flash/configuration_longcat_flash.py +++ 
b/src/transformers/models/longcat_flash/configuration_longcat_flash.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class LongcatFlashConfig(PreTrainedConfig): @@ -212,17 +212,17 @@ def __init__( self.routed_scaling_factor = routed_scaling_factor # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000000.0) for key in ["beta_fast", "beta_slow", "factor"]: if key in self.rope_parameters: self.rope_parameters[key] = float(self.rope_parameters[key]) - rope_config_validation(self) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py index 5453817e3ea4..52956f181c0c 100644 --- a/src/transformers/models/mimi/configuration_mimi.py +++ b/src/transformers/models/mimi/configuration_mimi.py @@ -20,7 +20,7 @@ import numpy as np from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -223,12 +223,12 @@ def __init__( self.attention_bias = attention_bias # Try to set `rope_scaling` if 
available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # Handle backward compatibility for frame_rate: # If frame_rate is explicitly provided, use it (backward compatibility) diff --git a/src/transformers/models/minimax/configuration_minimax.py b/src/transformers/models/minimax/configuration_minimax.py index 1e582de1bff8..badbf8bbb485 100644 --- a/src/transformers/models/minimax/configuration_minimax.py +++ b/src/transformers/models/minimax/configuration_minimax.py @@ -23,7 +23,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class MiniMaxConfig(PreTrainedConfig): @@ -223,7 +223,8 @@ def __init__( self.mlp_beta_factor = mlp_beta_factor # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -232,9 +233,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, 
"rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index 2f459f770998..b7e341416afc 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -28,7 +28,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging from ...utils.generic import OutputRecorder, check_model_inputs @@ -248,7 +248,8 @@ def __init__( self.mlp_beta_factor = mlp_beta_factor # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -257,9 +258,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self) super().__init__( 
pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/ministral/configuration_ministral.py b/src/transformers/models/ministral/configuration_ministral.py index 3f286cd69a9f..6cc23d9bea76 100644 --- a/src/transformers/models/ministral/configuration_ministral.py +++ b/src/transformers/models/ministral/configuration_ministral.py @@ -7,7 +7,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class MinistralConfig(PreTrainedConfig): @@ -159,7 +159,8 @@ def __init__( self.layer_types = layer_types # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -167,9 +168,8 @@ def __init__( ] * num_hidden_layers # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) __all__ = ["MinistralConfig"] diff --git a/src/transformers/models/ministral/modular_ministral.py b/src/transformers/models/ministral/modular_ministral.py index f79600e82974..309149f4b2de 100644 --- a/src/transformers/models/ministral/modular_ministral.py +++ b/src/transformers/models/ministral/modular_ministral.py @@ -7,7 +7,7 @@ from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import 
BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring from ...utils.generic import check_model_inputs @@ -161,7 +161,8 @@ def __init__( self.layer_types = layer_types # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -169,9 +170,8 @@ def __init__( ] * num_hidden_layers # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) class MinistralMLP(Qwen2MLP): diff --git a/src/transformers/models/mistral/configuration_mistral.py b/src/transformers/models/mistral/configuration_mistral.py index 0fac55d26e2a..a8021d3080bf 100644 --- a/src/transformers/models/mistral/configuration_mistral.py +++ b/src/transformers/models/mistral/configuration_mistral.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -168,12 +168,12 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling 
or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index adc86a035bed..caeb3b530b75 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -188,12 +188,12 @@ def __init__( self.router_jitter_noise = router_jitter_noise # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/mllama/configuration_mllama.py 
b/src/transformers/models/mllama/configuration_mllama.py index 85be3701a84a..2b0263acdca3 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ b/src/transformers/models/mllama/configuration_mllama.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import rope_config_standardize_and_validate from ...utils import logging @@ -249,12 +249,12 @@ def __init__( self.max_position_embeddings = max_position_embeddings # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index b3a045ae324a..19260a95177d 100644 --- a/src/transformers/models/modernbert/configuration_modernbert.py +++ b/src/transformers/models/modernbert/configuration_modernbert.py @@ -22,7 +22,7 @@ from typing import Literal, Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class ModernBertConfig(PreTrainedConfig): @@ -206,9 +206,6 @@ def __init__( 
self.sparse_pred_ignore_index = sparse_pred_ignore_index self.reference_compile = reference_compile self.repad_logits_with_grad = repad_logits_with_grad - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if self.classifier_pooling not in ["cls", "mean"]: raise ValueError( @@ -227,13 +224,21 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["sliding_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = getattr(self, "global_rope_theta", 160_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = getattr(self, "local_rope_theta", 10000.0) + # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "global_rope_theta", 160_000.0) - rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) + rope_config_standardize_and_validate(self) def to_dict(self): output = super().to_dict() diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index 8783517edb8a..b276ca0faef2 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ 
b/src/transformers/models/modernbert/modular_modernbert.py @@ -35,7 +35,7 @@ SequenceClassifierOutput, TokenClassifierOutput, ) -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, is_flash_attn_2_available, logging from ...utils.import_utils import is_triton_available @@ -234,9 +234,6 @@ def __init__( self.sparse_pred_ignore_index = sparse_pred_ignore_index self.reference_compile = reference_compile self.repad_logits_with_grad = repad_logits_with_grad - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if self.classifier_pooling not in ["cls", "mean"]: raise ValueError( @@ -255,13 +252,21 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["sliding_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = getattr(self, "global_rope_theta", 160_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = getattr(self, "local_rope_theta", 10000.0) + # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "global_rope_theta", 160_000.0) - rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) + rope_config_standardize_and_validate(self) def to_dict(self): output = super().to_dict() diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py index be60950fa593..1d8903ecd45d 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class ModernBertDecoderConfig(PreTrainedConfig): @@ -187,9 +187,6 @@ def __init__( self.classifier_activation = classifier_activation self.use_cache = use_cache self.global_attn_every_n_layers = global_attn_every_n_layers - # Try to set `rope_scaling` if 
available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters # for consistency with ModernBert self.reference_compile = False @@ -204,13 +201,21 @@ def __init__( else: self.layer_types.append("full_attention") + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["sliding_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = getattr(self, "global_rope_theta", 160_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = getattr(self, "local_rope_theta", 10000.0) + # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "global_rope_theta", 160_000.0) - rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) + rope_config_standardize_and_validate(self) # NOTE: sliding window numbers matches ModernBERT but is only half of it self.sliding_window = local_attention // 2 if local_attention else -1 diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index 24c63f499bb2..09ddc064bbf4 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ 
b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -28,7 +28,7 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -208,9 +208,6 @@ def __init__( self.classifier_activation = classifier_activation self.use_cache = use_cache self.global_attn_every_n_layers = global_attn_every_n_layers - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters # for consistency with ModernBert self.reference_compile = False @@ -225,13 +222,21 @@ def __init__( else: self.layer_types.append("full_attention") + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: + rope_parameters["full_attention"].update(rope_scaling) + rope_parameters["sliding_attention"].update(rope_scaling) + rope_parameters["full_attention"]["rope_theta"] = getattr(self, "global_rope_theta", 160_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = getattr(self, "local_rope_theta", 10000.0) + # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "global_rope_theta", 160_000.0) - rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) + rope_config_standardize_and_validate(self) # NOTE: sliding window numbers matches ModernBERT but is only half of it self.sliding_window = local_attention // 2 if local_attention else -1 diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index 59513e50477d..cd542ffaf9b4 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ b/src/transformers/models/moonshine/configuration_moonshine.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class MoonshineConfig(PreTrainedConfig): @@ -176,13 +176,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = 
kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index ab379620345c..1964820bb482 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -35,7 +35,7 @@ Seq2SeqLMOutput, Seq2SeqModelOutput, ) -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -199,13 +199,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters - rope_theta = 
kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index fea1a7cff985..8d77b25cc63a 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -284,12 +284,12 @@ def __init__( self.num_codebooks = num_codebooks # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) audio_encoder_config = kwargs.pop("audio_encoder_config", {}) audio_encoder_model_type = audio_encoder_config.pop("model_type", "mimi") diff --git a/src/transformers/models/nemotron/configuration_nemotron.py b/src/transformers/models/nemotron/configuration_nemotron.py index f9d57bf46b73..4de7f35c83d9 100644 --- 
a/src/transformers/models/nemotron/configuration_nemotron.py +++ b/src/transformers/models/nemotron/configuration_nemotron.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -144,13 +144,13 @@ def __init__( self.mlp_bias = mlp_bias # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/olmo/configuration_olmo.py b/src/transformers/models/olmo/configuration_olmo.py index f01e33ead00a..30ba9d1863db 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -160,12 +160,12 @@ def __init__( self.clip_qkv = clip_qkv # Try to set `rope_scaling` if available, otherwise use `rope_parameters` 
rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/olmo2/configuration_olmo2.py b/src/transformers/models/olmo2/configuration_olmo2.py index 3ba97d4f162b..533b6848e065 100644 --- a/src/transformers/models/olmo2/configuration_olmo2.py +++ b/src/transformers/models/olmo2/configuration_olmo2.py @@ -27,7 +27,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Olmo2Config(PreTrainedConfig): @@ -160,12 +160,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git 
a/src/transformers/models/olmo3/configuration_olmo3.py b/src/transformers/models/olmo3/configuration_olmo3.py index 6e3f5594cbb5..179394093f36 100644 --- a/src/transformers/models/olmo3/configuration_olmo3.py +++ b/src/transformers/models/olmo3/configuration_olmo3.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Olmo3Config(PreTrainedConfig): @@ -170,7 +170,8 @@ def __init__( self.rms_norm_eps = rms_norm_eps # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.sliding_window = sliding_window self.layer_types = layer_types @@ -181,9 +182,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) __all__ = ["Olmo3Config"] diff --git a/src/transformers/models/olmo3/modular_olmo3.py b/src/transformers/models/olmo3/modular_olmo3.py index d8bec6e9f15d..b01b56b4b382 100644 --- a/src/transformers/models/olmo3/modular_olmo3.py +++ b/src/transformers/models/olmo3/modular_olmo3.py @@ -25,7 +25,7 @@ from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast 
-from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ..gemma2.modeling_gemma2 import Gemma2RotaryEmbedding @@ -186,7 +186,8 @@ def __init__( self.rms_norm_eps = rms_norm_eps # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.sliding_window = sliding_window self.layer_types = layer_types @@ -197,9 +198,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) class Olmo3RMSNorm(Olmo2RMSNorm): diff --git a/src/transformers/models/olmoe/configuration_olmoe.py b/src/transformers/models/olmoe/configuration_olmoe.py index efc04e8a56bb..2d030f312b69 100644 --- a/src/transformers/models/olmoe/configuration_olmoe.py +++ b/src/transformers/models/olmoe/configuration_olmoe.py @@ -14,7 +14,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class OlmoeConfig(PreTrainedConfig): @@ -160,12 +160,12 @@ def __init__( self.norm_topk_prob = norm_topk_prob # Try to set `rope_scaling` if available, otherwise use `rope_parameters` 
rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py index 3760519d4266..28e0301f1999 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -120,13 +120,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 25000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 25000.0) + 
rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index d386b30c6959..bd8be29e1b80 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -162,13 +162,13 @@ def __init__( self.qk_layernorm = qk_layernorm # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index 6581a2ce9b36..c064ad628a39 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils 
import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -165,13 +165,13 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) self._rope_parameters_adjustment() self._rope_parameters_validation() self.sliding_window = sliding_window diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index 53f686234fd2..fbae60d90d4f 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Phi4MultimodalVisionConfig(PreTrainedConfig): @@ -406,13 +406,13 @@ def __init__( self.use_cache = use_cache # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = 
rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) self._rope_parameters_adjustment() self._rope_parameters_validation() self.sliding_window = sliding_window diff --git a/src/transformers/models/phimoe/configuration_phimoe.py b/src/transformers/models/phimoe/configuration_phimoe.py index f7a9b528211f..687010b38290 100644 --- a/src/transformers/models/phimoe/configuration_phimoe.py +++ b/src/transformers/models/phimoe/configuration_phimoe.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -169,11 +169,12 @@ def __init__( self.input_jitter_noise = input_jitter_noise # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self) if self.rope_parameters["rope_type"] != "default": if "original_max_position_embeddings" in self.rope_parameters: @@ -189,8 +190,6 @@ def 
__init__( f"`rope_parameters`'s long_mscale field must be a number, got {rope_parameters_long_mscale}" ) - rope_config_validation(self) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index 62c179b20edc..b39db091a3bf 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -105,12 +105,12 @@ def __init__( self.initializer_range = initializer_range # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) __all__ = ["PixtralVisionConfig"] diff --git a/src/transformers/models/qwen2/configuration_qwen2.py b/src/transformers/models/qwen2/configuration_qwen2.py index bda8bb8abfc7..4d0eaeeba280 100644 --- a/src/transformers/models/qwen2/configuration_qwen2.py +++ b/src/transformers/models/qwen2/configuration_qwen2.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils 
import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -159,7 +159,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -172,9 +173,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index af96e9a3163f..4fdac248f53a 100644 --- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -356,7 +356,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - 
self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -369,9 +370,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) class Qwen2_5OmniThinkerConfig(PreTrainedConfig): @@ -699,7 +699,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.position_id_per_seconds = position_id_per_seconds # zf self.seconds_per_chunk = seconds_per_chunk # zf self.audio_start_token_id = audio_start_token_id # zf @@ -719,9 +720,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -824,12 +824,12 @@ def __init__( self.enc_se_channels = enc_se_channels # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = 
kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index 5b26bef72601..448af15b8bf3 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -45,7 +45,7 @@ from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...generation import GenerationMixin from ...modeling_outputs import BaseModelOutput, ModelOutput -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import ( @@ -389,7 +389,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -402,9 +403,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - 
rope_theta = getattr(self, "rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) class Qwen2_5OmniThinkerConfig(PreTrainedConfig): @@ -732,7 +732,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.position_id_per_seconds = position_id_per_seconds # zf self.seconds_per_chunk = seconds_per_chunk # zf self.audio_start_token_id = audio_start_token_id # zf @@ -752,9 +753,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -857,12 +857,12 @@ def __init__( self.enc_se_channels = enc_se_channels # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) 
+ self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index 77ce5556c6cf..3e0195193dca 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -27,7 +27,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Qwen2_5_VLVisionConfig(PreTrainedConfig): @@ -204,7 +204,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -217,11 +218,10 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) if self.rope_parameters["rope_type"] == "mrope": self.rope_parameters["rope_type"] = "default" - rope_config_validation(self, ignore_keys={"mrope_section"}) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py 
b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py index 256d663d3114..a287417b3594 100644 --- a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -187,7 +187,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -210,9 +211,8 @@ def __init__( layer_type_validation(self.layer_types) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 58e80e2011d3..2244585fe198 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, 
standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -192,7 +192,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -205,11 +206,10 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) if self.rope_parameters["rope_type"] == "mrope": self.rope_parameters["rope_type"] = "default" - rope_config_validation(self, ignore_keys={"mrope_section"}) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3/configuration_qwen3.py b/src/transformers/models/qwen3/configuration_qwen3.py index a1cf6a1ea861..c6ce972f1e88 100644 --- a/src/transformers/models/qwen3/configuration_qwen3.py +++ b/src/transformers/models/qwen3/configuration_qwen3.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -167,7 +167,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use 
`rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -180,9 +181,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py index 8bc756a17267..c766815e4ba1 100644 --- a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -181,12 +181,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - 
rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # MoE arguments self.decoder_sparse_step = decoder_sparse_step diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py b/src/transformers/models/qwen3_next/configuration_qwen3_next.py index 0527148166c0..de96a59dd918 100644 --- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py +++ b/src/transformers/models/qwen3_next/configuration_qwen3_next.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -201,7 +201,8 @@ def __init__( self.head_dim = head_dim # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.25) self.layer_types = layer_types @@ -214,9 +215,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # linear attention part self.linear_conv_kernel_dim = linear_conv_kernel_dim diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py 
b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index c7746f420514..d03ba4c8ba8b 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -326,12 +326,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -347,7 +347,6 @@ def __init__( tie_word_embeddings=tie_word_embeddings, **kwargs, ) - rope_config_validation(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) class Qwen3OmniMoeThinkerConfig(PreTrainedConfig): @@ -591,7 +590,8 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + 
self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -604,9 +604,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( tie_word_embeddings=tie_word_embeddings, @@ -770,12 +769,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -1039,12 +1038,12 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = 
kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) @property def layer_types(self): diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index ea6ac6860133..3617eab80448 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -42,7 +42,7 @@ MoeCausalLMOutputWithPast, MoeModelOutputWithPast, ) -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import PreTrainedModel from ...processing_utils import ProcessorMixin, Unpack from ...tokenization_utils_base import TextInput @@ -217,9 +217,8 @@ def __init__( self.sliding_window = sliding_window # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) class Qwen3OmniMoeThinkerConfig(Qwen2_5OmniThinkerConfig): @@ -674,12 +673,12 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + 
self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) @property def layer_types(self): diff --git a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py index 546a3da5bb7b..78cf4d35bb01 100644 --- a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Qwen3VLVisionConfig(PreTrainedConfig): @@ -172,12 +172,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 5000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 5000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index 8cbac42f13c3..0e3ca62e7ca1 100644 --- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -30,7 +30,7 @@ from ...masking_utils import 
create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, dynamic_rope_update, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, dynamic_rope_update, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import ProcessingKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -213,12 +213,12 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 5000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 5000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py index eab77fa368a2..f9b7a786fa5a 100644 --- a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from 
...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Qwen3VLMoeTextConfig(PreTrainedConfig): @@ -168,12 +168,12 @@ def __init__( self.head_dim = head_dim or hidden_size // num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 5000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 5000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) # MoE arguments self.decoder_sparse_step = decoder_sparse_step diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py index 006fa186fe44..bdb54bd179a3 100644 --- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py @@ -23,7 +23,7 @@ from ...activations import ACT2FN from ...cache_utils import Cache from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -190,12 +190,12 @@ def __init__( self.head_dim = head_dim or hidden_size // num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = 
kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 5000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 5000000.0) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) # MoE arguments self.decoder_sparse_step = decoder_sparse_step diff --git a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py index 54b482141b42..48a0c34bebe6 100644 --- a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -148,13 +148,13 @@ def __init__( self.final_w_init_variance_scale = 2.0 / self.num_hidden_layers # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 
10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/seed_oss/configuration_seed_oss.py b/src/transformers/models/seed_oss/configuration_seed_oss.py index 240cb03bac77..2ce745b42966 100644 --- a/src/transformers/models/seed_oss/configuration_seed_oss.py +++ b/src/transformers/models/seed_oss/configuration_seed_oss.py @@ -16,7 +16,7 @@ from typing import Optional from transformers.configuration_utils import PreTrainedConfig -from transformers.modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from transformers.modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class SeedOssConfig(PreTrainedConfig): @@ -172,12 +172,12 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/smollm3/configuration_smollm3.py b/src/transformers/models/smollm3/configuration_smollm3.py index 04e8e78e575c..d6a2741e9f29 100644 --- a/src/transformers/models/smollm3/configuration_smollm3.py +++ 
b/src/transformers/models/smollm3/configuration_smollm3.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class SmolLM3Config(PreTrainedConfig): @@ -202,9 +202,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 2000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 2000000.0) + rope_config_standardize_and_validate(self) __all__ = ["SmolLM3Config"] diff --git a/src/transformers/models/smollm3/modular_smollm3.py b/src/transformers/models/smollm3/modular_smollm3.py index e5551d414c1b..f8b140d9def6 100644 --- a/src/transformers/models/smollm3/modular_smollm3.py +++ b/src/transformers/models/smollm3/modular_smollm3.py @@ -21,7 +21,7 @@ from ...cache_utils import Cache from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...modeling_flash_attention_utils import FlashAttentionKwargs -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import logging @@ -219,9 +219,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 2000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = 
kwargs.get("rope_theta", 2000000.0) + rope_config_standardize_and_validate(self) class SmolLM3RotaryEmbedding(Qwen2RotaryEmbedding): diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index a0ddcc33a79c..9ebde6e422ed 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -147,13 +147,13 @@ def __init__( self.attention_dropout = attention_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.25) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/starcoder2/configuration_starcoder2.py b/src/transformers/models/starcoder2/configuration_starcoder2.py index cb34ad1d9157..04fea7033776 100644 --- a/src/transformers/models/starcoder2/configuration_starcoder2.py +++ b/src/transformers/models/starcoder2/configuration_starcoder2.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import 
PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...utils import logging @@ -157,12 +157,12 @@ def __init__( self.embedding_dropout = embedding_dropout # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/t5gemma/configuration_t5gemma.py b/src/transformers/models/t5gemma/configuration_t5gemma.py index ae51ab07269b..1ac2be91c247 100644 --- a/src/transformers/models/t5gemma/configuration_t5gemma.py +++ b/src/transformers/models/t5gemma/configuration_t5gemma.py @@ -22,7 +22,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class T5GemmaModuleConfig(PreTrainedConfig): @@ -178,7 +178,8 @@ def __init__( self.layer_types = layer_types # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if 
rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -187,9 +188,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) class T5GemmaConfig(PreTrainedConfig): diff --git a/src/transformers/models/vaultgemma/configuration_vaultgemma.py b/src/transformers/models/vaultgemma/configuration_vaultgemma.py index 0a784c02c1e6..4edbed15065a 100644 --- a/src/transformers/models/vaultgemma/configuration_vaultgemma.py +++ b/src/transformers/models/vaultgemma/configuration_vaultgemma.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class VaultGemmaConfig(PreTrainedConfig): @@ -178,7 +178,8 @@ def __init__( self.layer_types = layer_types # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -187,9 +188,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = 
kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) __all__ = ["VaultGemmaConfig"] diff --git a/src/transformers/models/zamba2/configuration_zamba2.py b/src/transformers/models/zamba2/configuration_zamba2.py index 4d6c92439da5..385091866c65 100644 --- a/src/transformers/models/zamba2/configuration_zamba2.py +++ b/src/transformers/models/zamba2/configuration_zamba2.py @@ -23,7 +23,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate class Zamba2Config(PreTrainedConfig): @@ -197,12 +197,12 @@ def __init__( self.use_long_context = use_long_context # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + rope_config_standardize_and_validate(self) self.mamba_d_state = mamba_d_state self.mamba_d_conv = mamba_d_conv From b64791e27f01e5a5a7625131c669f39618092f25 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 26 Nov 2025 13:25:49 +0100 Subject: [PATCH 03/23] maybe better pop and break, and we'll have one theta per config in the rope dict --- .../models/apertus/configuration_apertus.py | 2 +- .../models/apertus/modular_apertus.py | 2 +- .../models/arcee/configuration_arcee.py | 2 +- .../models/aria/configuration_aria.py | 2 +- .../models/bamba/configuration_bamba.py | 2 +- 
.../models/bitnet/configuration_bitnet.py | 2 +- .../models/blt/configuration_blt.py | 10 ++++----- .../chameleon/configuration_chameleon.py | 2 +- .../models/cohere/configuration_cohere.py | 2 +- .../models/cohere2/configuration_cohere2.py | 2 +- .../models/cohere2/modular_cohere2.py | 2 +- .../models/csm/configuration_csm.py | 4 ++-- .../models/cwm/configuration_cwm.py | 2 +- src/transformers/models/cwm/modular_cwm.py | 2 +- .../deepseek_v2/configuration_deepseek_v2.py | 2 +- .../deepseek_v3/configuration_deepseek_v3.py | 2 +- .../models/dia/configuration_dia.py | 4 ++-- .../diffllama/configuration_diffllama.py | 2 +- .../models/doge/configuration_doge.py | 2 +- src/transformers/models/doge/modular_doge.py | 2 +- .../models/dots1/configuration_dots1.py | 2 +- .../configuration_efficientloftr.py | 2 +- .../models/emu3/configuration_emu3.py | 2 +- .../models/ernie4_5/configuration_ernie4_5.py | 2 +- .../configuration_ernie4_5_moe.py | 2 +- .../models/evolla/configuration_evolla.py | 2 +- .../models/exaone4/configuration_exaone4.py | 2 +- .../models/exaone4/modular_exaone4.py | 2 +- .../models/falcon/configuration_falcon.py | 2 +- .../falcon_h1/configuration_falcon_h1.py | 2 +- .../flex_olmo/configuration_flex_olmo.py | 2 +- .../models/flex_olmo/modular_flex_olmo.py | 2 +- .../models/fuyu/configuration_fuyu.py | 2 +- .../models/gemma/configuration_gemma.py | 2 +- .../models/gemma/modular_gemma.py | 2 +- .../models/gemma2/configuration_gemma2.py | 2 +- .../models/gemma2/modular_gemma2.py | 2 +- .../models/gemma3/configuration_gemma3.py | 4 ++-- .../models/gemma3/modular_gemma3.py | 4 ++-- .../models/glm/configuration_glm.py | 2 +- .../models/glm4/configuration_glm4.py | 2 +- .../models/glm4_moe/configuration_glm4_moe.py | 2 +- .../models/glm4_moe/modular_glm4_moe.py | 2 +- .../models/glm4v/configuration_glm4v.py | 2 +- .../models/glm4v/modular_glm4v.py | 2 +- .../glm4v_moe/configuration_glm4v_moe.py | 2 +- .../models/glm4v_moe/modular_glm4v_moe.py | 2 +- 
.../models/gpt_neox/configuration_gpt_neox.py | 4 ++-- .../configuration_gpt_neox_japanese.py | 4 ++-- .../models/gpt_oss/configuration_gpt_oss.py | 2 +- .../models/granite/configuration_granite.py | 2 +- .../granitemoe/configuration_granitemoe.py | 2 +- .../configuration_granitemoehybrid.py | 2 +- .../configuration_granitemoeshared.py | 2 +- .../models/helium/configuration_helium.py | 2 +- .../configuration_hunyuan_v1_dense.py | 2 +- .../configuration_hunyuan_v1_moe.py | 2 +- .../models/jetmoe/configuration_jetmoe.py | 2 +- .../configuration_kyutai_speech_to_text.py | 2 +- .../models/llama/configuration_llama.py | 2 +- .../models/llama4/configuration_llama4.py | 4 ++-- .../configuration_longcat_flash.py | 2 +- .../models/mimi/configuration_mimi.py | 2 +- .../models/minimax/configuration_minimax.py | 2 +- .../models/minimax/modular_minimax.py | 2 +- .../ministral/configuration_ministral.py | 2 +- .../models/ministral/modular_ministral.py | 2 +- .../models/mistral/configuration_mistral.py | 2 +- .../models/mixtral/configuration_mixtral.py | 2 +- .../models/mllama/configuration_mllama.py | 2 +- .../modernbert/configuration_modernbert.py | 21 ++++++++++--------- .../models/modernbert/modular_modernbert.py | 21 ++++++++++--------- .../configuration_modernbert_decoder.py | 21 ++++++++++--------- .../modular_modernbert_decoder.py | 21 ++++++++++--------- .../moonshine/configuration_moonshine.py | 2 +- .../models/moonshine/modular_moonshine.py | 2 +- .../models/moshi/configuration_moshi.py | 2 +- .../models/nemotron/configuration_nemotron.py | 2 +- .../models/olmo/configuration_olmo.py | 2 +- .../models/olmo2/configuration_olmo2.py | 2 +- .../models/olmo3/configuration_olmo3.py | 2 +- .../models/olmo3/modular_olmo3.py | 2 +- .../models/olmoe/configuration_olmoe.py | 2 +- .../persimmon/configuration_persimmon.py | 2 +- .../models/phi/configuration_phi.py | 2 +- .../models/phi3/configuration_phi3.py | 2 +- .../configuration_phi4_multimodal.py | 2 +- 
.../models/phimoe/configuration_phimoe.py | 2 +- .../models/pixtral/configuration_pixtral.py | 2 +- .../models/qwen2/configuration_qwen2.py | 2 +- .../configuration_qwen2_5_omni.py | 6 +++--- .../qwen2_5_omni/modular_qwen2_5_omni.py | 6 +++--- .../qwen2_5_vl/configuration_qwen2_5_vl.py | 2 +- .../qwen2_moe/configuration_qwen2_moe.py | 2 +- .../models/qwen2_vl/configuration_qwen2_vl.py | 2 +- .../models/qwen3/configuration_qwen3.py | 2 +- .../qwen3_moe/configuration_qwen3_moe.py | 2 +- .../qwen3_next/configuration_qwen3_next.py | 2 +- .../configuration_qwen3_omni_moe.py | 8 +++---- .../qwen3_omni_moe/modular_qwen3_omni_moe.py | 4 ++-- .../models/qwen3_vl/configuration_qwen3_vl.py | 2 +- .../models/qwen3_vl/modular_qwen3_vl.py | 2 +- .../configuration_qwen3_vl_moe.py | 2 +- .../qwen3_vl_moe/modular_qwen3_vl_moe.py | 2 +- .../configuration_recurrent_gemma.py | 2 +- .../models/seed_oss/configuration_seed_oss.py | 2 +- .../models/smollm3/configuration_smollm3.py | 2 +- .../models/smollm3/modular_smollm3.py | 2 +- .../models/stablelm/configuration_stablelm.py | 2 +- .../starcoder2/configuration_starcoder2.py | 2 +- .../models/t5gemma/configuration_t5gemma.py | 2 +- .../vaultgemma/configuration_vaultgemma.py | 2 +- .../models/zamba2/configuration_zamba2.py | 2 +- 113 files changed, 172 insertions(+), 168 deletions(-) diff --git a/src/transformers/models/apertus/configuration_apertus.py b/src/transformers/models/apertus/configuration_apertus.py index 4cf305b19726..f142214cd577 100644 --- a/src/transformers/models/apertus/configuration_apertus.py +++ b/src/transformers/models/apertus/configuration_apertus.py @@ -167,7 +167,7 @@ def __init__( rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_parameters["rope_theta"] = kwargs.get("rope_theta", 12000000.0) + rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 12000000.0) rope_config_standardize_and_validate(self) 
super().__init__( diff --git a/src/transformers/models/apertus/modular_apertus.py b/src/transformers/models/apertus/modular_apertus.py index 15498aaf1ec9..96c16d8e9aac 100644 --- a/src/transformers/models/apertus/modular_apertus.py +++ b/src/transformers/models/apertus/modular_apertus.py @@ -180,7 +180,7 @@ def __init__( del self.head_dim # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 12000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 12000000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/arcee/configuration_arcee.py b/src/transformers/models/arcee/configuration_arcee.py index 26e36e123ab9..fdf68add7929 100644 --- a/src/transformers/models/arcee/configuration_arcee.py +++ b/src/transformers/models/arcee/configuration_arcee.py @@ -169,7 +169,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py index 6bcd0df6d9fa..3ae1e0518876 100644 --- a/src/transformers/models/aria/configuration_aria.py +++ b/src/transformers/models/aria/configuration_aria.py @@ -174,7 +174,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/bamba/configuration_bamba.py 
b/src/transformers/models/bamba/configuration_bamba.py index 943ffb38102d..a9f5a93863a5 100644 --- a/src/transformers/models/bamba/configuration_bamba.py +++ b/src/transformers/models/bamba/configuration_bamba.py @@ -179,7 +179,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = 0.5 # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) mamba_intermediate = mamba_expand * hidden_size diff --git a/src/transformers/models/bitnet/configuration_bitnet.py b/src/transformers/models/bitnet/configuration_bitnet.py index 0918b6470723..4fdebdd2d40f 100644 --- a/src/transformers/models/bitnet/configuration_bitnet.py +++ b/src/transformers/models/bitnet/configuration_bitnet.py @@ -144,7 +144,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/blt/configuration_blt.py b/src/transformers/models/blt/configuration_blt.py index 2c2992e3ce25..dcbe36314700 100644 --- a/src/transformers/models/blt/configuration_blt.py +++ b/src/transformers/models/blt/configuration_blt.py @@ -71,7 +71,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error @@ -126,7 
+126,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error @@ -173,7 +173,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error @@ -254,7 +254,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error @@ -383,7 +383,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) # Cross attention configurations diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index 003265a6cec5..b8cd4ce3d951 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ 
b/src/transformers/models/chameleon/configuration_chameleon.py @@ -236,7 +236,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) if vq_config is None: diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py index 23656ae0ad5e..c72ed81a3e52 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -171,7 +171,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/cohere2/configuration_cohere2.py b/src/transformers/models/cohere2/configuration_cohere2.py index 12126d4f7b22..157f61aef75f 100644 --- a/src/transformers/models/cohere2/configuration_cohere2.py +++ b/src/transformers/models/cohere2/configuration_cohere2.py @@ -194,7 +194,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index f769bf7c204b..81fdc126335d 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ 
b/src/transformers/models/cohere2/modular_cohere2.py @@ -217,7 +217,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index eac38977457a..024eee365b0e 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -167,7 +167,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) @@ -354,7 +354,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/cwm/configuration_cwm.py b/src/transformers/models/cwm/configuration_cwm.py index 5673657c6fa2..eb6a07212453 100644 --- a/src/transformers/models/cwm/configuration_cwm.py +++ b/src/transformers/models/cwm/configuration_cwm.py @@ -183,7 +183,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + self.rope_parameters["rope_theta"] = 
kwargs.pop("rope_theta", 1_000_000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/cwm/modular_cwm.py b/src/transformers/models/cwm/modular_cwm.py index ac093fd13733..928131731215 100644 --- a/src/transformers/models/cwm/modular_cwm.py +++ b/src/transformers/models/cwm/modular_cwm.py @@ -183,7 +183,7 @@ def __init__( del self.attention_bias # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py index 75ed18f6d3f7..43bfb0c56b6e 100644 --- a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py @@ -215,7 +215,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py index 19804559fcdf..fce2c9777eb9 100644 --- a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py @@ -231,7 +231,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = 
kwargs.pop("rope_theta", 10000.0) for key in ["beta_fast", "beta_slow", "factor"]: if key in self.rope_parameters: diff --git a/src/transformers/models/dia/configuration_dia.py b/src/transformers/models/dia/configuration_dia.py index 076e9c42c528..4485b109d4a1 100644 --- a/src/transformers/models/dia/configuration_dia.py +++ b/src/transformers/models/dia/configuration_dia.py @@ -98,7 +98,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__(**kwargs) @@ -204,7 +204,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) diff --git a/src/transformers/models/diffllama/configuration_diffllama.py b/src/transformers/models/diffllama/configuration_diffllama.py index b946d57c0f7d..66dccfa8f37f 100644 --- a/src/transformers/models/diffllama/configuration_diffllama.py +++ b/src/transformers/models/diffllama/configuration_diffllama.py @@ -151,7 +151,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/doge/configuration_doge.py 
b/src/transformers/models/doge/configuration_doge.py index 250fb272045c..dace12475308 100644 --- a/src/transformers/models/doge/configuration_doge.py +++ b/src/transformers/models/doge/configuration_doge.py @@ -195,7 +195,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # for backward compatibility diff --git a/src/transformers/models/doge/modular_doge.py b/src/transformers/models/doge/modular_doge.py index 0090ccaafd68..6b56798cc48f 100644 --- a/src/transformers/models/doge/modular_doge.py +++ b/src/transformers/models/doge/modular_doge.py @@ -224,7 +224,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # for backward compatibility diff --git a/src/transformers/models/dots1/configuration_dots1.py b/src/transformers/models/dots1/configuration_dots1.py index b645d8df9652..88640541f777 100644 --- a/src/transformers/models/dots1/configuration_dots1.py +++ b/src/transformers/models/dots1/configuration_dots1.py @@ -208,7 +208,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/efficientloftr/configuration_efficientloftr.py 
b/src/transformers/models/efficientloftr/configuration_efficientloftr.py index 0643b35dd6ed..2f6efbb2ae44 100644 --- a/src/transformers/models/efficientloftr/configuration_efficientloftr.py +++ b/src/transformers/models/efficientloftr/configuration_efficientloftr.py @@ -179,7 +179,7 @@ def __init__( rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} or {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 4.0) - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) # Standardize and validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/emu3/configuration_emu3.py b/src/transformers/models/emu3/configuration_emu3.py index 7d4aa1d29449..28b2c7a026e2 100644 --- a/src/transformers/models/emu3/configuration_emu3.py +++ b/src/transformers/models/emu3/configuration_emu3.py @@ -232,7 +232,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/ernie4_5/configuration_ernie4_5.py b/src/transformers/models/ernie4_5/configuration_ernie4_5.py index f3439fa4b248..2d77d98c0b07 100644 --- a/src/transformers/models/ernie4_5/configuration_ernie4_5.py +++ b/src/transformers/models/ernie4_5/configuration_ernie4_5.py @@ -154,7 +154,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = 
kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py index c6286cca3089..9de2be7790c4 100644 --- a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +++ b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py @@ -187,7 +187,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) # MoE arguments diff --git a/src/transformers/models/evolla/configuration_evolla.py b/src/transformers/models/evolla/configuration_evolla.py index ad3248543725..150928e5ea14 100644 --- a/src/transformers/models/evolla/configuration_evolla.py +++ b/src/transformers/models/evolla/configuration_evolla.py @@ -255,7 +255,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) # Subconfig diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py index 9c9f044878f9..40d6e80831d9 100644 --- a/src/transformers/models/exaone4/configuration_exaone4.py +++ b/src/transformers/models/exaone4/configuration_exaone4.py @@ -184,7 +184,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary 
position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index 56a7b7f12a7b..85b3e9adafd7 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -217,7 +217,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py index fce62fa5a929..2afd54d10d39 100644 --- a/src/transformers/models/falcon/configuration_falcon.py +++ b/src/transformers/models/falcon/configuration_falcon.py @@ -168,7 +168,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/falcon_h1/configuration_falcon_h1.py b/src/transformers/models/falcon_h1/configuration_falcon_h1.py index b02fe9d46466..e0e0d32386af 100644 --- a/src/transformers/models/falcon_h1/configuration_falcon_h1.py +++ b/src/transformers/models/falcon_h1/configuration_falcon_h1.py @@ -203,7 +203,7 @@ def __init__( self.rope_parameters = 
rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) self.projectors_bias = projectors_bias diff --git a/src/transformers/models/flex_olmo/configuration_flex_olmo.py b/src/transformers/models/flex_olmo/configuration_flex_olmo.py index bd4832be6636..e1bad8a4a4eb 100644 --- a/src/transformers/models/flex_olmo/configuration_flex_olmo.py +++ b/src/transformers/models/flex_olmo/configuration_flex_olmo.py @@ -181,7 +181,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/flex_olmo/modular_flex_olmo.py b/src/transformers/models/flex_olmo/modular_flex_olmo.py index 062d8ad141d9..337c0e8b8876 100644 --- a/src/transformers/models/flex_olmo/modular_flex_olmo.py +++ b/src/transformers/models/flex_olmo/modular_flex_olmo.py @@ -192,7 +192,7 @@ def __init__( del self.clip_qkv # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index 40e6eed2ca65..7f67ce93643f 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -177,7 +177,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] 
= kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 25000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 25000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index 926ed9c406a5..1c5c792a5874 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -158,7 +158,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index bb4d219d54aa..d3ae7d1864b3 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -186,7 +186,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py index a37b79930226..6625ead036f3 100644 --- a/src/transformers/models/gemma2/configuration_gemma2.py +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -192,7 +192,7 @@ def __init__( layer_type_validation(self.layer_types, 
self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index f62769e66ad0..a256f898e4a4 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -220,7 +220,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index 64c699deda5a..d85b29d9159c 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -196,8 +196,8 @@ def __init__( rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: rope_parameters["full_attention"].update(rope_scaling) - rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) - rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) + rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("rope_local_base_freq", 10000.0) self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: diff --git a/src/transformers/models/gemma3/modular_gemma3.py 
b/src/transformers/models/gemma3/modular_gemma3.py index b8aae16ddc4d..96cea1feab60 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -211,8 +211,8 @@ def __init__( rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: rope_parameters["full_attention"].update(rope_scaling) - rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) - rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) + rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("rope_local_base_freq", 10000.0) self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 324e779151cc..6236df6741ed 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -147,7 +147,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index 33e850e07893..901de77b43a4 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -147,7 +147,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the 
correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py index b64435c28faf..5699d27e9df5 100644 --- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py +++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py @@ -185,7 +185,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # MoE arguments diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py index 5e470938d05c..f9d987610f5f 100644 --- a/src/transformers/models/glm4_moe/modular_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py @@ -199,7 +199,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # MoE arguments diff --git a/src/transformers/models/glm4v/configuration_glm4v.py b/src/transformers/models/glm4v/configuration_glm4v.py index ff63708923b3..19cc278ba2fb 100644 --- a/src/transformers/models/glm4v/configuration_glm4v.py +++ b/src/transformers/models/glm4v/configuration_glm4v.py @@ -244,7 +244,7 @@ def __init__( self.rope_parameters = rope_parameters if 
rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) self.image_token_id = image_token_id self.video_token_id = video_token_id diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 1892343d79d6..7b36d6e58695 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -281,7 +281,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) self.image_token_id = image_token_id self.video_token_id = video_token_id diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index a63c6c664fa6..6bb051e52d3a 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -274,7 +274,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) # MoE arguments diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index 
04b140772939..66d21a8afee6 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -220,7 +220,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) # MoE arguments diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 376f2f0189fd..1eb8b2e9cfff 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -131,7 +131,6 @@ def __init__( attention_bias: Optional[bool] = True, **kwargs, ): - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -155,12 +154,13 @@ def __init__( self.attention_bias = attention_bias # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rotary_emb_base", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rotary_emb_base", 10000.0) rope_config_standardize_and_validate(self) if self.hidden_size % self.num_attention_heads != 0: raise ValueError( "The hidden size is not divisible by the number of attention heads! Make sure to update them!" 
) + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) __all__ = ["GPTNeoXConfig"] diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index 517f2a3eacf6..344fcf95ac4c 100644 --- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -102,7 +102,6 @@ def __init__( hidden_dropout: Optional[float] = 0.0, **kwargs, ): - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -122,8 +121,9 @@ def __init__( self.hidden_dropout = hidden_dropout # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rotary_emb_base", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rotary_emb_base", 10000.0) rope_config_standardize_and_validate(self) + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) __all__ = ["GPTNeoXJapaneseConfig"] diff --git a/src/transformers/models/gpt_oss/configuration_gpt_oss.py b/src/transformers/models/gpt_oss/configuration_gpt_oss.py index ef249ff05d6f..995ebe2c1f85 100644 --- a/src/transformers/models/gpt_oss/configuration_gpt_oss.py +++ b/src/transformers/models/gpt_oss/configuration_gpt_oss.py @@ -115,7 +115,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 150000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 150000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/granite/configuration_granite.py 
b/src/transformers/models/granite/configuration_granite.py index 5a5abdbd13e1..523c86d521a8 100644 --- a/src/transformers/models/granite/configuration_granite.py +++ b/src/transformers/models/granite/configuration_granite.py @@ -181,7 +181,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/granitemoe/configuration_granitemoe.py b/src/transformers/models/granitemoe/configuration_granitemoe.py index 615a5b558044..1c6f2b2aa50e 100644 --- a/src/transformers/models/granitemoe/configuration_granitemoe.py +++ b/src/transformers/models/granitemoe/configuration_granitemoe.py @@ -165,7 +165,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) self.attention_bias = attention_bias diff --git a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py index 016b94544364..f0885f7fd120 100644 --- a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py @@ -204,7 +204,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = 
kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) mamba_intermediate = mamba_expand * hidden_size diff --git a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py index c5922ef7c6ce..e0a09aa84658 100644 --- a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py @@ -170,7 +170,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) self.attention_bias = attention_bias diff --git a/src/transformers/models/helium/configuration_helium.py b/src/transformers/models/helium/configuration_helium.py index eb6a07d53134..b8188e89cf0d 100644 --- a/src/transformers/models/helium/configuration_helium.py +++ b/src/transformers/models/helium/configuration_helium.py @@ -153,7 +153,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 100000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 100000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py index fde580e13223..b5d085bfd240 100644 --- a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +++ b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py @@ -147,7 +147,7 @@ def __init__( self.rope_parameters = 
rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # TODO needs model-specific validation? super().__init__( diff --git a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py index 1a9edbd96b9e..4588dae1e7ad 100644 --- a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +++ b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py @@ -163,7 +163,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py index 660e47d137da..b6a76c5001ab 100644 --- a/src/transformers/models/jetmoe/configuration_jetmoe.py +++ b/src/transformers/models/jetmoe/configuration_jetmoe.py @@ -153,7 +153,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py index 5091117a1ea0..b3ef555abf58 
100644 --- a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py @@ -189,7 +189,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 150f54b4b9b9..2ef6b926ca41 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -177,7 +177,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/llama4/configuration_llama4.py b/src/transformers/models/llama4/configuration_llama4.py index 7de3c3e6a830..dec757e72431 100644 --- a/src/transformers/models/llama4/configuration_llama4.py +++ b/src/transformers/models/llama4/configuration_llama4.py @@ -130,7 +130,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__(**kwargs) @@ -354,7 +354,7 @@ def __init__( layer_type_validation(self.layer_types, 
self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/longcat_flash/configuration_longcat_flash.py b/src/transformers/models/longcat_flash/configuration_longcat_flash.py index 80868a6ca1d4..0ed3ef4bd986 100644 --- a/src/transformers/models/longcat_flash/configuration_longcat_flash.py +++ b/src/transformers/models/longcat_flash/configuration_longcat_flash.py @@ -216,7 +216,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000000.0) for key in ["beta_fast", "beta_slow", "factor"]: if key in self.rope_parameters: diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py index 52956f181c0c..30be2aaf9b51 100644 --- a/src/transformers/models/mimi/configuration_mimi.py +++ b/src/transformers/models/mimi/configuration_mimi.py @@ -227,7 +227,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # Handle backward compatibility for frame_rate: diff --git a/src/transformers/models/minimax/configuration_minimax.py b/src/transformers/models/minimax/configuration_minimax.py index badbf8bbb485..a67a96a1c59f 100644 --- a/src/transformers/models/minimax/configuration_minimax.py +++ 
b/src/transformers/models/minimax/configuration_minimax.py @@ -233,7 +233,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index b7e341416afc..86e25fdb67b1 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -258,7 +258,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/ministral/configuration_ministral.py b/src/transformers/models/ministral/configuration_ministral.py index 6cc23d9bea76..ae8f39d3ba92 100644 --- a/src/transformers/models/ministral/configuration_ministral.py +++ b/src/transformers/models/ministral/configuration_ministral.py @@ -168,7 +168,7 @@ def __init__( ] * num_hidden_layers # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/ministral/modular_ministral.py b/src/transformers/models/ministral/modular_ministral.py index 309149f4b2de..76b6f4c82f57 100644 --- a/src/transformers/models/ministral/modular_ministral.py +++ 
b/src/transformers/models/ministral/modular_ministral.py @@ -170,7 +170,7 @@ def __init__( ] * num_hidden_layers # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/mistral/configuration_mistral.py b/src/transformers/models/mistral/configuration_mistral.py index a8021d3080bf..63349eb8bdc6 100644 --- a/src/transformers/models/mistral/configuration_mistral.py +++ b/src/transformers/models/mistral/configuration_mistral.py @@ -172,7 +172,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index caeb3b530b75..cba5c5d49b86 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -192,7 +192,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index 2b0263acdca3..0816cb97baea 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ 
b/src/transformers/models/mllama/configuration_mllama.py @@ -253,7 +253,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 500000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index 19260a95177d..847b1081f7d9 100644 --- a/src/transformers/models/modernbert/configuration_modernbert.py +++ b/src/transformers/models/modernbert/configuration_modernbert.py @@ -171,14 +171,6 @@ def __init__( repad_logits_with_grad: Optional[bool] = False, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - cls_token_id=cls_token_id, - sep_token_id=sep_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -234,12 +226,21 @@ def __init__( if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: rope_parameters["full_attention"].update(rope_scaling) rope_parameters["sliding_attention"].update(rope_scaling) - rope_parameters["full_attention"]["rope_theta"] = getattr(self, "global_rope_theta", 160_000.0) - rope_parameters["sliding_attention"]["rope_theta"] = getattr(self, "local_rope_theta", 10000.0) + rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + 
cls_token_id=cls_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + def to_dict(self): output = super().to_dict() output.pop("reference_compile", None) diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index b276ca0faef2..774947c2f8f7 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -199,14 +199,6 @@ def __init__( repad_logits_with_grad: Optional[bool] = False, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - cls_token_id=cls_token_id, - sep_token_id=sep_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -262,12 +254,21 @@ def __init__( if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: rope_parameters["full_attention"].update(rope_scaling) rope_parameters["sliding_attention"].update(rope_scaling) - rope_parameters["full_attention"]["rope_theta"] = getattr(self, "global_rope_theta", 160_000.0) - rope_parameters["sliding_attention"]["rope_theta"] = getattr(self, "local_rope_theta", 10000.0) + rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + cls_token_id=cls_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + def to_dict(self): output = super().to_dict() output.pop("reference_compile", None) diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py 
index 1d8903ecd45d..b76288bf8f28 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -157,14 +157,6 @@ def __init__( rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - cls_token_id=cls_token_id, - sep_token_id=sep_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -211,8 +203,8 @@ def __init__( if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: rope_parameters["full_attention"].update(rope_scaling) rope_parameters["sliding_attention"].update(rope_scaling) - rope_parameters["full_attention"]["rope_theta"] = getattr(self, "global_rope_theta", 160_000.0) - rope_parameters["sliding_attention"]["rope_theta"] = getattr(self, "local_rope_theta", 10000.0) + rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) @@ -220,5 +212,14 @@ def __init__( # NOTE: sliding window numbers matches ModernBERT but is only half of it self.sliding_window = local_attention // 2 if local_attention else -1 + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + cls_token_id=cls_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + __all__ = ["ModernBertDecoderConfig"] diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index 09ddc064bbf4..c4455f60c2ef 100644 --- 
a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -178,14 +178,6 @@ def __init__( rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - cls_token_id=cls_token_id, - sep_token_id=sep_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -232,8 +224,8 @@ def __init__( if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: rope_parameters["full_attention"].update(rope_scaling) rope_parameters["sliding_attention"].update(rope_scaling) - rope_parameters["full_attention"]["rope_theta"] = getattr(self, "global_rope_theta", 160_000.0) - rope_parameters["sliding_attention"]["rope_theta"] = getattr(self, "local_rope_theta", 10000.0) + rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) + rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) @@ -241,6 +233,15 @@ def __init__( # NOTE: sliding window numbers matches ModernBERT but is only half of it self.sliding_window = local_attention // 2 if local_attention else -1 + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + cls_token_id=cls_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + class ModernBertDecoderEmbeddings(ModernBertEmbeddings): pass diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index cd542ffaf9b4..bc1b2b4ef58d 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ 
b/src/transformers/models/moonshine/configuration_moonshine.py @@ -181,7 +181,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 1964820bb482..5c618137e2c5 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -204,7 +204,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index 8d77b25cc63a..21ba2cebc201 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -288,7 +288,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) audio_encoder_config = kwargs.pop("audio_encoder_config", {}) diff --git a/src/transformers/models/nemotron/configuration_nemotron.py 
b/src/transformers/models/nemotron/configuration_nemotron.py index 4de7f35c83d9..126395815298 100644 --- a/src/transformers/models/nemotron/configuration_nemotron.py +++ b/src/transformers/models/nemotron/configuration_nemotron.py @@ -149,7 +149,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/olmo/configuration_olmo.py b/src/transformers/models/olmo/configuration_olmo.py index 30ba9d1863db..4b2f26fb9d6c 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -164,7 +164,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/olmo2/configuration_olmo2.py b/src/transformers/models/olmo2/configuration_olmo2.py index 533b6848e065..821847001d7f 100644 --- a/src/transformers/models/olmo2/configuration_olmo2.py +++ b/src/transformers/models/olmo2/configuration_olmo2.py @@ -164,7 +164,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git 
a/src/transformers/models/olmo3/configuration_olmo3.py b/src/transformers/models/olmo3/configuration_olmo3.py index 179394093f36..838d672b2c70 100644 --- a/src/transformers/models/olmo3/configuration_olmo3.py +++ b/src/transformers/models/olmo3/configuration_olmo3.py @@ -182,7 +182,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/olmo3/modular_olmo3.py b/src/transformers/models/olmo3/modular_olmo3.py index b01b56b4b382..b38465df64c2 100644 --- a/src/transformers/models/olmo3/modular_olmo3.py +++ b/src/transformers/models/olmo3/modular_olmo3.py @@ -198,7 +198,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/olmoe/configuration_olmoe.py b/src/transformers/models/olmoe/configuration_olmoe.py index 2d030f312b69..9c5595f4c35d 100644 --- a/src/transformers/models/olmoe/configuration_olmoe.py +++ b/src/transformers/models/olmoe/configuration_olmoe.py @@ -164,7 +164,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/persimmon/configuration_persimmon.py 
b/src/transformers/models/persimmon/configuration_persimmon.py index 28e0301f1999..65921421f992 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -125,7 +125,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 25000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 25000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index bd8be29e1b80..b949a6293869 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -167,7 +167,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index c064ad628a39..e3ec37519135 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -170,7 +170,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) self._rope_parameters_adjustment() 
self._rope_parameters_validation() diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index fbae60d90d4f..f3929bc39bb5 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -411,7 +411,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) self._rope_parameters_adjustment() self._rope_parameters_validation() diff --git a/src/transformers/models/phimoe/configuration_phimoe.py b/src/transformers/models/phimoe/configuration_phimoe.py index 687010b38290..ab3b9ff25a75 100644 --- a/src/transformers/models/phimoe/configuration_phimoe.py +++ b/src/transformers/models/phimoe/configuration_phimoe.py @@ -173,7 +173,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self) if self.rope_parameters["rope_type"] != "default": diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index b39db091a3bf..130cadd39989 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -109,7 +109,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary 
position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/qwen2/configuration_qwen2.py b/src/transformers/models/qwen2/configuration_qwen2.py index 4d0eaeeba280..f73f8bf3e737 100644 --- a/src/transformers/models/qwen2/configuration_qwen2.py +++ b/src/transformers/models/qwen2/configuration_qwen2.py @@ -173,7 +173,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index 4fdac248f53a..72ad2089f9d0 100644 --- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -370,7 +370,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) @@ -720,7 +720,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self) 
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -828,7 +828,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index 448af15b8bf3..f828591a46a2 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -403,7 +403,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) @@ -753,7 +753,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -861,7 +861,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__(**kwargs) diff --git 
a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index 3e0195193dca..e9bcf104c939 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -218,7 +218,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) if self.rope_parameters["rope_type"] == "mrope": self.rope_parameters["rope_type"] = "default" rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) diff --git a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py index a287417b3594..c7075454a540 100644 --- a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py @@ -211,7 +211,7 @@ def __init__( layer_type_validation(self.layer_types) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 2244585fe198..2e463679d519 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -206,7 +206,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + 
self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) if self.rope_parameters["rope_type"] == "mrope": self.rope_parameters["rope_type"] = "default" rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) diff --git a/src/transformers/models/qwen3/configuration_qwen3.py b/src/transformers/models/qwen3/configuration_qwen3.py index c6ce972f1e88..d31e1e4b3c0b 100644 --- a/src/transformers/models/qwen3/configuration_qwen3.py +++ b/src/transformers/models/qwen3/configuration_qwen3.py @@ -181,7 +181,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py index c766815e4ba1..a555cda9abb7 100644 --- a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py @@ -185,7 +185,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # MoE arguments diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py b/src/transformers/models/qwen3_next/configuration_qwen3_next.py index de96a59dd918..fb535189dcb6 100644 --- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py +++ b/src/transformers/models/qwen3_next/configuration_qwen3_next.py @@ -215,7 +215,7 @@ def __init__( layer_type_validation(self.layer_types, 
self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # linear attention part diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index d03ba4c8ba8b..f6a3afb11bf9 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -330,7 +330,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) # MoE arguments @@ -604,7 +604,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( @@ -773,7 +773,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # MoE arguments @@ -1042,7 +1042,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the 
correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) @property diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index 3617eab80448..7be065cbf7b2 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -217,7 +217,7 @@ def __init__( self.sliding_window = sliding_window # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 1000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) @@ -677,7 +677,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) @property diff --git a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py index 78cf4d35bb01..75aba8876957 100644 --- a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py @@ -176,7 +176,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 5000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) 
rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index 0e3ca62e7ca1..aaf786293a06 100644 --- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -217,7 +217,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 5000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py index f9b7a786fa5a..5e0fe5a3a552 100644 --- a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py @@ -172,7 +172,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 5000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) # MoE arguments diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py index bdb54bd179a3..a0e0ad2a0d1d 100644 --- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +++ 
b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py @@ -194,7 +194,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 5000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) # MoE arguments diff --git a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py index 48a0c34bebe6..bdc57d6dbee4 100644 --- a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py @@ -153,7 +153,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/seed_oss/configuration_seed_oss.py b/src/transformers/models/seed_oss/configuration_seed_oss.py index 2ce745b42966..2cc0ccfaa5dd 100644 --- a/src/transformers/models/seed_oss/configuration_seed_oss.py +++ b/src/transformers/models/seed_oss/configuration_seed_oss.py @@ -176,7 +176,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git 
a/src/transformers/models/smollm3/configuration_smollm3.py b/src/transformers/models/smollm3/configuration_smollm3.py index d6a2741e9f29..b8864114918b 100644 --- a/src/transformers/models/smollm3/configuration_smollm3.py +++ b/src/transformers/models/smollm3/configuration_smollm3.py @@ -202,7 +202,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 2000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 2000000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/smollm3/modular_smollm3.py b/src/transformers/models/smollm3/modular_smollm3.py index f8b140d9def6..bc7ca2784238 100644 --- a/src/transformers/models/smollm3/modular_smollm3.py +++ b/src/transformers/models/smollm3/modular_smollm3.py @@ -219,7 +219,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 2000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 2000000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index 9ebde6e422ed..c39930e2a59c 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -152,7 +152,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.25) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git 
a/src/transformers/models/starcoder2/configuration_starcoder2.py b/src/transformers/models/starcoder2/configuration_starcoder2.py index 04fea7033776..d035d5d1d6bf 100644 --- a/src/transformers/models/starcoder2/configuration_starcoder2.py +++ b/src/transformers/models/starcoder2/configuration_starcoder2.py @@ -161,7 +161,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/t5gemma/configuration_t5gemma.py b/src/transformers/models/t5gemma/configuration_t5gemma.py index 1ac2be91c247..86b2fa85dff7 100644 --- a/src/transformers/models/t5gemma/configuration_t5gemma.py +++ b/src/transformers/models/t5gemma/configuration_t5gemma.py @@ -188,7 +188,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/vaultgemma/configuration_vaultgemma.py b/src/transformers/models/vaultgemma/configuration_vaultgemma.py index 4edbed15065a..93cfdbadd8ad 100644 --- a/src/transformers/models/vaultgemma/configuration_vaultgemma.py +++ b/src/transformers/models/vaultgemma/configuration_vaultgemma.py @@ -188,7 +188,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) 
rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/zamba2/configuration_zamba2.py b/src/transformers/models/zamba2/configuration_zamba2.py index 385091866c65..75668907de07 100644 --- a/src/transformers/models/zamba2/configuration_zamba2.py +++ b/src/transformers/models/zamba2/configuration_zamba2.py @@ -201,7 +201,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("rope_theta", 10000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) self.mamba_d_state = mamba_d_state From a2b780be8bfcc7b4d272f483afa34da9c3841a89 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 26 Nov 2025 14:36:56 +0100 Subject: [PATCH 04/23] update a few models? --- src/transformers/modeling_rope_utils.py | 2 +- .../models/apertus/configuration_apertus.py | 4 +-- .../models/gemma3/configuration_gemma3.py | 2 +- .../models/gemma3/modular_gemma3.py | 2 +- .../models/gemma3n/configuration_gemma3n.py | 26 +++++++++---------- .../models/gemma3n/modular_gemma3n.py | 8 +++--- .../models/gpt_neox/configuration_gpt_neox.py | 2 +- .../models/gpt_neox/modeling_gpt_neox.py | 3 ++- .../modeling_gpt_neox_japanese.py | 3 ++- .../modernbert/configuration_modernbert.py | 2 +- .../models/modernbert/modular_modernbert.py | 2 +- .../configuration_modernbert_decoder.py | 2 +- .../modular_modernbert_decoder.py | 2 +- 13 files changed, 31 insertions(+), 29 deletions(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index aebf3a65b777..3c4a6e7345f4 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -91,7 +91,7 @@ def get_standardized_rope_params(config): # Move `rope_theta` and `partial_rotary_factor` to the params dict, if not there yet rope_theta = getattr(config, 
"rope_theta", None) - partial_rotary_factor = getattr(config, "partial_rotary_factor", None) + partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) # Case 1: one RoPE theat = one RoPE param per model without nesting if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): diff --git a/src/transformers/models/apertus/configuration_apertus.py b/src/transformers/models/apertus/configuration_apertus.py index f142214cd577..33c69b0eb4c4 100644 --- a/src/transformers/models/apertus/configuration_apertus.py +++ b/src/transformers/models/apertus/configuration_apertus.py @@ -164,10 +164,10 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters - rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 12000000.0) + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 12000000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index d85b29d9159c..9391c7f11e61 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -193,7 +193,7 @@ def __init__( "sliding_attention": {"rope_type": "default"}, "full_attention": {"rope_type": "default"}, } - rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: rope_parameters["full_attention"].update(rope_scaling) 
rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 96cea1feab60..fdff187cb4f3 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -208,7 +208,7 @@ def __init__( "sliding_attention": {"rope_type": "default"}, "full_attention": {"rope_type": "default"}, } - rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: rope_parameters["full_attention"].update(rope_scaling) rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index 76981434317b..37056c5e7625 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -226,30 +226,30 @@ def __init__( self.final_logit_softcapping = final_logit_softcapping self.layer_types = layer_types + if layer_types is None: + self.layer_types = [ + "full_attention" if (i + 1) % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers) + ] + else: + self.layer_types = layer_types + + layer_type_validation(self.layer_types, self.num_hidden_layers) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format default_rope_params = { "sliding_attention": {"rope_type": "default"}, "full_attention": {"rope_type": "default"}, } - rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - rope_parameters["full_attention"].update(rope_scaling) - rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) - rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) - if layer_types is None: - self.layer_types = [ - "full_attention" if (i + 1) % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers) - ] - else: - self.layer_types = layer_types - - layer_type_validation(self.layer_types, self.num_hidden_layers) - self.hidden_size_per_layer_input = hidden_size_per_layer_input self.num_kv_shared_layers = num_kv_shared_layers diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index 60d760c3f651..73d976a75d4d 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -252,11 +252,11 @@ def __init__( "sliding_attention": {"rope_type": "default"}, "full_attention": {"rope_type": "default"}, } - rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if 
(rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - rope_parameters["full_attention"].update(rope_scaling) - rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) - rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) + self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 1eb8b2e9cfff..693fee7dc2ce 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -150,7 +150,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 0.25) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 0.25) self.attention_bias = attention_bias # Validate the correctness of rotary position embeddings parameters diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index e3cd7c7d4d39..56af644f21d9 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -194,7 +194,8 @@ def __init__(self, config, layer_idx=None): self.config = config self.head_size = config.hidden_size // config.num_attention_heads self.attention_dropout = config.attention_dropout - self.rotary_ndims = int(self.head_size * 
config.rotary_pct) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) + self.rotary_ndims = int(self.head_size * partial_rotary_factor) self.scaling = self.head_size**-0.5 self.is_causal = True self.layer_idx = layer_idx diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index f723defcd088..f2112083d388 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -174,7 +174,8 @@ def __init__(self, config, use_bias=False, layer_idx=None): ) self.layer_idx = layer_idx - self.rotary_ndims = int(self.head_size * config.rotary_pct) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) + self.rotary_ndims = int(self.head_size * partial_rotary_factor) self.attention_dropout = nn.Dropout(config.attention_dropout) self.norm_factor = math.sqrt(self.head_size) diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index 847b1081f7d9..bd9ce2796701 100644 --- a/src/transformers/models/modernbert/configuration_modernbert.py +++ b/src/transformers/models/modernbert/configuration_modernbert.py @@ -222,7 +222,7 @@ def __init__( "sliding_attention": {"rope_type": "default"}, "full_attention": {"rope_type": "default"}, } - rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: rope_parameters["full_attention"].update(rope_scaling) rope_parameters["sliding_attention"].update(rope_scaling) diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index 
774947c2f8f7..194af2315329 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -250,7 +250,7 @@ def __init__( "sliding_attention": {"rope_type": "default"}, "full_attention": {"rope_type": "default"}, } - rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: rope_parameters["full_attention"].update(rope_scaling) rope_parameters["sliding_attention"].update(rope_scaling) diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py index b76288bf8f28..88aced5e1fbb 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -199,7 +199,7 @@ def __init__( "sliding_attention": {"rope_type": "default"}, "full_attention": {"rope_type": "default"}, } - rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: rope_parameters["full_attention"].update(rope_scaling) rope_parameters["sliding_attention"].update(rope_scaling) diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index c4455f60c2ef..dca4a90f572d 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -220,7 +220,7 @@ def __init__( "sliding_attention": {"rope_type": "default"}, "full_attention": 
{"rope_type": "default"}, } - rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params + self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: rope_parameters["full_attention"].update(rope_scaling) rope_parameters["sliding_attention"].update(rope_scaling) From ccc697af2be89dac16238195507741435a2f8f39 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 26 Nov 2025 15:08:48 +0100 Subject: [PATCH 05/23] fix tests that are easu first --- src/transformers/models/gemma3/configuration_gemma3.py | 6 +++--- src/transformers/models/gemma3/modular_gemma3.py | 6 +++--- .../models/modernbert/configuration_modernbert.py | 8 ++++---- src/transformers/models/modernbert/modular_modernbert.py | 8 ++++---- .../configuration_modernbert_decoder.py | 8 ++++---- .../modernbert_decoder/modular_modernbert_decoder.py | 8 ++++---- src/transformers/models/nemotron/modeling_nemotron.py | 2 +- src/transformers/models/persimmon/modeling_persimmon.py | 2 +- src/transformers/models/phi/modeling_phi.py | 2 +- src/transformers/models/phi/modular_phi.py | 2 +- src/transformers/models/phi3/configuration_phi3.py | 4 +++- .../phi4_multimodal/configuration_phi4_multimodal.py | 4 +++- .../models/qwen2_5_vl/configuration_qwen2_5_vl.py | 3 ++- .../models/qwen2_vl/configuration_qwen2_vl.py | 3 ++- src/transformers/models/smollm3/configuration_smollm3.py | 5 +++++ src/transformers/models/smollm3/modular_smollm3.py | 5 +++++ src/transformers/models/stablelm/modeling_stablelm.py | 2 +- 17 files changed, 47 insertions(+), 31 deletions(-) diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index 9391c7f11e61..8a952d7f6a2d 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -195,9 +195,9 @@ def __init__( } self.rope_parameters 
= rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - rope_parameters["full_attention"].update(rope_scaling) - rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) - rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("rope_local_base_freq", 10000.0) + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) + self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("rope_local_base_freq", 10000.0) self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index fdff187cb4f3..7ecb636b6186 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -210,9 +210,9 @@ def __init__( } self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - rope_parameters["full_attention"].update(rope_scaling) - rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) - rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("rope_local_base_freq", 10000.0) + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) + self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("rope_local_base_freq", 10000.0) self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index bd9ce2796701..8dc66b7e185e 100644 --- 
a/src/transformers/models/modernbert/configuration_modernbert.py +++ b/src/transformers/models/modernbert/configuration_modernbert.py @@ -224,10 +224,10 @@ def __init__( } self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - rope_parameters["full_attention"].update(rope_scaling) - rope_parameters["sliding_attention"].update(rope_scaling) - rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) - rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["sliding_attention"].update(rope_scaling) + self.rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) + self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index 194af2315329..fa5a7f6778ab 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -252,10 +252,10 @@ def __init__( } self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - rope_parameters["full_attention"].update(rope_scaling) - rope_parameters["sliding_attention"].update(rope_scaling) - rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) - rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) + self.rope_parameters["full_attention"].update(rope_scaling) + 
self.rope_parameters["sliding_attention"].update(rope_scaling) + self.rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) + self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py index 88aced5e1fbb..ffc1a1c2719c 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -201,10 +201,10 @@ def __init__( } self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - rope_parameters["full_attention"].update(rope_scaling) - rope_parameters["sliding_attention"].update(rope_scaling) - rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) - rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["sliding_attention"].update(rope_scaling) + self.rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) + self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index dca4a90f572d..b7e92f6dd8c9 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ 
b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -222,10 +222,10 @@ def __init__( } self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - rope_parameters["full_attention"].update(rope_scaling) - rope_parameters["sliding_attention"].update(rope_scaling) - rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) - rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["sliding_attention"].update(rope_scaling) + self.rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) + self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index 751305b0a0ea..2b9b19a52c1b 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -250,7 +250,7 @@ def __init__(self, config: NemotronConfig, layer_idx: Optional[int] = None): self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings - self.partial_rotary_factor = config.partial_rotary_factor + self.partial_rotary_factor = config.rope_parameters["partial_rotary_factor"] self.is_causal = True self.rotary_emb = NemotronRotaryEmbedding(config=config) diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index 094b26dbabc0..a7054a2bd989 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ 
b/src/transformers/models/persimmon/modeling_persimmon.py @@ -219,7 +219,7 @@ def __init__(self, config: PersimmonConfig, layer_idx: Optional[int] = None): self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads - self.rotary_ndims = int(self.head_dim * config.partial_rotary_factor) + self.rotary_ndims = int(self.head_dim * config.rope_parameters["partial_rotary_factor"]) self.is_causal = True if (self.head_dim * self.num_heads) != self.hidden_size: diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 5e91a9f70265..196b66df0e3a 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -186,7 +186,7 @@ def __init__(self, config: PhiConfig, layer_idx: int): self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True) self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True) self.dense = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=True) - self.rotary_ndims = int(self.head_dim * config.partial_rotary_factor) + self.rotary_ndims = int(self.head_dim * config.rope_parameters["partial_rotary_factor"]) self.qk_layernorm = config.qk_layernorm if self.qk_layernorm: self.q_layernorm = nn.LayerNorm( diff --git a/src/transformers/models/phi/modular_phi.py b/src/transformers/models/phi/modular_phi.py index a06e3b16cfc6..75e52d934097 100644 --- a/src/transformers/models/phi/modular_phi.py +++ b/src/transformers/models/phi/modular_phi.py @@ -75,7 +75,7 @@ def __init__(self, config: PhiConfig, layer_idx: int): self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True) self.dense = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=True) del self.o_proj - self.rotary_ndims = int(self.head_dim * config.partial_rotary_factor) + self.rotary_ndims = 
int(self.head_dim * config.rope_parameters["partial_rotary_factor"]) self.qk_layernorm = config.qk_layernorm if self.qk_layernorm: self.q_layernorm = nn.LayerNorm( diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index e3ec37519135..5728bbcca6c1 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -203,7 +203,9 @@ def _rope_parameters_validation(self): rope_parameters_type = self.rope_parameters.get("rope_type", None) rope_parameters_short_factor = self.rope_parameters.get("short_factor", None) rope_parameters_long_factor = self.rope_parameters.get("long_factor", None) - rotary_ndims = int(self.hidden_size // self.num_attention_heads * self.partial_rotary_factor) + rotary_ndims = int( + self.hidden_size // self.num_attention_heads * self.rope_parameters["partial_rotary_factor"] + ) if rope_parameters_type not in ["default", "longrope"]: raise ValueError(f"`rope_parameters`'s type field must be one of ['longrope'], got {rope_parameters_type}") diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index f3929bc39bb5..2d6f2d7cfba2 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -444,7 +444,9 @@ def _rope_parameters_validation(self): rope_parameters_type = self.rope_parameters.get("rope_type", None) rope_parameters_short_factor = self.rope_parameters.get("short_factor", None) rope_parameters_long_factor = self.rope_parameters.get("long_factor", None) - rotary_ndims = int(self.hidden_size // self.num_attention_heads * self.partial_rotary_factor) + rotary_ndims = int( + self.hidden_size // self.num_attention_heads * self.rope_parameters["partial_rotary_factor"] + ) if rope_parameters_type not in ["default", 
"longrope"]: raise ValueError(f"`rope_parameters`'s type field must be one of ['longrope'], got {rope_parameters_type}") diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index e9bcf104c939..e900b53aa3e9 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -219,7 +219,8 @@ def __init__( # Validate the correctness of rotary position embeddings parameters self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) - if self.rope_parameters["rope_type"] == "mrope": + rope_type = rope_parameters.get("type") or rope_parameters.get("rope_type") + if rope_type == "mrope": self.rope_parameters["rope_type"] = "default" rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 2e463679d519..222647711113 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -207,7 +207,8 @@ def __init__( # Validate the correctness of rotary position embeddings parameters self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) - if self.rope_parameters["rope_type"] == "mrope": + rope_type = rope_parameters.get("type") or rope_parameters.get("rope_type") + if rope_type == "mrope": self.rope_parameters["rope_type"] = "default" rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/smollm3/configuration_smollm3.py b/src/transformers/models/smollm3/configuration_smollm3.py index b8864114918b..8a040d2a25cd 100644 --- 
a/src/transformers/models/smollm3/configuration_smollm3.py +++ b/src/transformers/models/smollm3/configuration_smollm3.py @@ -201,6 +201,11 @@ def __init__( self.layer_types = layer_types layer_type_validation(self.layer_types, self.num_hidden_layers) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} + # Validate the correctness of rotary position embeddings parameters self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 2000000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/smollm3/modular_smollm3.py b/src/transformers/models/smollm3/modular_smollm3.py index bc7ca2784238..3e8d959e070e 100644 --- a/src/transformers/models/smollm3/modular_smollm3.py +++ b/src/transformers/models/smollm3/modular_smollm3.py @@ -218,6 +218,11 @@ def __init__( self.layer_types = layer_types layer_type_validation(self.layer_types, self.num_hidden_layers) + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} + # Validate the correctness of rotary position embeddings parameters self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 2000000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index fd56e5642cf0..27a9f6b47ce1 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -254,7 +254,7 @@ def __init__(self, config: StableLmConfig, layer_idx: Optional[int] = None): self.num_key_value_heads = config.num_key_value_heads 
self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.rotary_ndims = int(self.head_dim * config.partial_rotary_factor) + self.rotary_ndims = int(self.head_dim * config.rope_parameters["partial_rotary_factor"]) self.is_causal = True self.scaling = self.head_dim**-0.5 From eb282d14efa8e468bde5d1d6b306496f40fe4373 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 26 Nov 2025 17:40:15 +0100 Subject: [PATCH 06/23] dont overwrite if already present!!! --- src/transformers/modeling_rope_utils.py | 4 ++++ .../models/apertus/configuration_apertus.py | 3 ++- .../models/apertus/modular_apertus.py | 3 ++- .../models/arcee/configuration_arcee.py | 3 ++- .../models/aria/configuration_aria.py | 3 ++- .../models/bamba/configuration_bamba.py | 3 ++- .../models/bitnet/configuration_bitnet.py | 3 ++- src/transformers/models/blt/configuration_blt.py | 15 ++++++++++----- .../models/chameleon/configuration_chameleon.py | 3 ++- .../models/cohere/configuration_cohere.py | 3 ++- .../models/cohere2/configuration_cohere2.py | 3 ++- .../models/cohere2/modular_cohere2.py | 3 ++- src/transformers/models/csm/configuration_csm.py | 6 ++++-- src/transformers/models/cwm/configuration_cwm.py | 3 ++- src/transformers/models/cwm/modular_cwm.py | 3 ++- .../deepseek_v2/configuration_deepseek_v2.py | 3 ++- .../deepseek_v3/configuration_deepseek_v3.py | 3 ++- src/transformers/models/dia/configuration_dia.py | 6 ++++-- .../models/diffllama/configuration_diffllama.py | 3 ++- .../models/doge/configuration_doge.py | 3 ++- src/transformers/models/doge/modular_doge.py | 3 ++- .../models/dots1/configuration_dots1.py | 3 ++- .../configuration_efficientloftr.py | 3 ++- .../models/emu3/configuration_emu3.py | 3 ++- .../models/ernie4_5/configuration_ernie4_5.py | 3 ++- .../ernie4_5_moe/configuration_ernie4_5_moe.py | 3 ++- .../models/evolla/configuration_evolla.py | 3 ++- .../models/exaone4/configuration_exaone4.py | 3 ++- .../models/exaone4/modular_exaone4.py | 3 ++- 
.../models/falcon/configuration_falcon.py | 3 ++- .../models/falcon_h1/configuration_falcon_h1.py | 3 ++- .../models/flex_olmo/configuration_flex_olmo.py | 3 ++- .../models/flex_olmo/modular_flex_olmo.py | 3 ++- .../models/fuyu/configuration_fuyu.py | 3 ++- .../models/gemma/configuration_gemma.py | 3 ++- src/transformers/models/gemma/modular_gemma.py | 3 ++- .../models/gemma2/configuration_gemma2.py | 3 ++- src/transformers/models/gemma2/modular_gemma2.py | 3 ++- src/transformers/models/glm/configuration_glm.py | 3 ++- .../models/glm4/configuration_glm4.py | 3 ++- .../models/glm4_moe/configuration_glm4_moe.py | 3 ++- .../models/glm4_moe/modular_glm4_moe.py | 3 ++- .../models/glm4v/configuration_glm4v.py | 3 ++- src/transformers/models/glm4v/modular_glm4v.py | 3 ++- .../models/glm4v_moe/configuration_glm4v_moe.py | 3 ++- .../models/glm4v_moe/modular_glm4v_moe.py | 3 ++- .../models/gpt_neox/configuration_gpt_neox.py | 3 ++- .../configuration_gpt_neox_japanese.py | 3 ++- .../models/gpt_oss/configuration_gpt_oss.py | 3 ++- .../models/granite/configuration_granite.py | 3 ++- .../models/granitemoe/configuration_granitemoe.py | 3 ++- .../configuration_granitemoehybrid.py | 3 ++- .../configuration_granitemoeshared.py | 3 ++- .../models/helium/configuration_helium.py | 3 ++- .../configuration_hunyuan_v1_dense.py | 3 ++- .../configuration_hunyuan_v1_moe.py | 3 ++- .../models/jetmoe/configuration_jetmoe.py | 3 ++- .../configuration_kyutai_speech_to_text.py | 3 ++- .../models/llama/configuration_llama.py | 3 ++- .../models/llama4/configuration_llama4.py | 6 ++++-- .../longcat_flash/configuration_longcat_flash.py | 3 ++- .../models/mimi/configuration_mimi.py | 3 ++- .../models/minimax/configuration_minimax.py | 3 ++- .../models/minimax/modular_minimax.py | 3 ++- .../models/ministral/configuration_ministral.py | 3 ++- .../models/ministral/modular_ministral.py | 3 ++- .../models/mistral/configuration_mistral.py | 3 ++- .../models/mixtral/configuration_mixtral.py | 3 ++- 
.../models/mllama/configuration_mllama.py | 3 ++- .../models/moonshine/configuration_moonshine.py | 3 ++- .../models/moonshine/modular_moonshine.py | 3 ++- .../models/moshi/configuration_moshi.py | 3 ++- .../models/nemotron/configuration_nemotron.py | 3 ++- .../models/olmo/configuration_olmo.py | 3 ++- .../models/olmo2/configuration_olmo2.py | 3 ++- .../models/olmo3/configuration_olmo3.py | 3 ++- src/transformers/models/olmo3/modular_olmo3.py | 3 ++- .../models/olmoe/configuration_olmoe.py | 3 ++- .../models/persimmon/configuration_persimmon.py | 3 ++- src/transformers/models/phi/configuration_phi.py | 3 ++- .../models/phi3/configuration_phi3.py | 3 ++- .../configuration_phi4_multimodal.py | 3 ++- .../models/phimoe/configuration_phimoe.py | 3 ++- .../models/pixtral/configuration_pixtral.py | 3 ++- .../models/qwen2/configuration_qwen2.py | 3 ++- .../qwen2_5_omni/configuration_qwen2_5_omni.py | 9 ++++++--- .../models/qwen2_5_omni/modular_qwen2_5_omni.py | 9 ++++++--- .../models/qwen2_5_vl/configuration_qwen2_5_vl.py | 3 ++- .../models/qwen2_moe/configuration_qwen2_moe.py | 3 ++- .../models/qwen2_vl/configuration_qwen2_vl.py | 3 ++- .../models/qwen3/configuration_qwen3.py | 3 ++- .../models/qwen3_moe/configuration_qwen3_moe.py | 3 ++- .../models/qwen3_next/configuration_qwen3_next.py | 3 ++- .../configuration_qwen3_omni_moe.py | 12 ++++++++---- .../qwen3_omni_moe/modular_qwen3_omni_moe.py | 6 ++++-- .../models/qwen3_vl/configuration_qwen3_vl.py | 3 ++- .../models/qwen3_vl/modular_qwen3_vl.py | 3 ++- .../qwen3_vl_moe/configuration_qwen3_vl_moe.py | 3 ++- .../models/qwen3_vl_moe/modular_qwen3_vl_moe.py | 3 ++- .../configuration_recurrent_gemma.py | 3 ++- .../models/seed_oss/configuration_seed_oss.py | 3 ++- .../models/smollm3/configuration_smollm3.py | 3 ++- .../models/smollm3/modular_smollm3.py | 3 ++- .../models/stablelm/configuration_stablelm.py | 3 ++- .../models/starcoder2/configuration_starcoder2.py | 3 ++- .../models/t5gemma/configuration_t5gemma.py | 3 ++- 
.../models/vaultgemma/configuration_vaultgemma.py | 3 ++- .../models/zamba2/configuration_zamba2.py | 3 ++- 108 files changed, 248 insertions(+), 122 deletions(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 3c4a6e7345f4..2a4807dd06d1 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -907,6 +907,10 @@ def rope_config_standardize_and_validate(config: PreTrainedConfig, ignore_keys: # BC: "rope_theta" was originally saved in config rope_parameters["rope_theta"] = rope_parameters.get("rope_theta", getattr(config, "rope_theta", None)) + # Ignore `partial_rotary_factor` if present for all RoPE types + ignore_keys = ignore_keys if ignore_keys is not None else set() + ignore_keys.update(["partial_rotary_factor"]) + if validation_fn is not None: validation_fn(rope_parameters, config=config, ignore_keys=ignore_keys) else: diff --git a/src/transformers/models/apertus/configuration_apertus.py b/src/transformers/models/apertus/configuration_apertus.py index 33c69b0eb4c4..6acf46260d59 100644 --- a/src/transformers/models/apertus/configuration_apertus.py +++ b/src/transformers/models/apertus/configuration_apertus.py @@ -167,7 +167,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 12000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 12000000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/apertus/modular_apertus.py b/src/transformers/models/apertus/modular_apertus.py index 96c16d8e9aac..b92b89d1ae2e 100644 --- a/src/transformers/models/apertus/modular_apertus.py +++ b/src/transformers/models/apertus/modular_apertus.py @@ -180,7 +180,8 @@ def __init__( del self.head_dim # 
Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 12000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 12000000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/arcee/configuration_arcee.py b/src/transformers/models/arcee/configuration_arcee.py index fdf68add7929..b23dfe266a0b 100644 --- a/src/transformers/models/arcee/configuration_arcee.py +++ b/src/transformers/models/arcee/configuration_arcee.py @@ -169,7 +169,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py index 3ae1e0518876..dc1fdb88fb19 100644 --- a/src/transformers/models/aria/configuration_aria.py +++ b/src/transformers/models/aria/configuration_aria.py @@ -174,7 +174,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/bamba/configuration_bamba.py b/src/transformers/models/bamba/configuration_bamba.py index a9f5a93863a5..87b7f077fec5 100644 --- a/src/transformers/models/bamba/configuration_bamba.py +++ 
b/src/transformers/models/bamba/configuration_bamba.py @@ -179,7 +179,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = 0.5 # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) mamba_intermediate = mamba_expand * hidden_size diff --git a/src/transformers/models/bitnet/configuration_bitnet.py b/src/transformers/models/bitnet/configuration_bitnet.py index 4fdebdd2d40f..e692f4b6c02d 100644 --- a/src/transformers/models/bitnet/configuration_bitnet.py +++ b/src/transformers/models/bitnet/configuration_bitnet.py @@ -144,7 +144,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/blt/configuration_blt.py b/src/transformers/models/blt/configuration_blt.py index dcbe36314700..21659f280e08 100644 --- a/src/transformers/models/blt/configuration_blt.py +++ b/src/transformers/models/blt/configuration_blt.py @@ -71,7 +71,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error @@ -126,7 +127,8 @@ def 
__init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error @@ -173,7 +175,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error @@ -254,7 +257,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error @@ -383,7 +387,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) # Cross attention configurations diff --git a/src/transformers/models/chameleon/configuration_chameleon.py 
b/src/transformers/models/chameleon/configuration_chameleon.py index b8cd4ce3d951..bf18bbbe0781 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -236,7 +236,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) if vq_config is None: diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py index c72ed81a3e52..c1a8a49785ac 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -171,7 +171,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/cohere2/configuration_cohere2.py b/src/transformers/models/cohere2/configuration_cohere2.py index 157f61aef75f..a091cefd1bda 100644 --- a/src/transformers/models/cohere2/configuration_cohere2.py +++ b/src/transformers/models/cohere2/configuration_cohere2.py @@ -194,7 +194,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + 
self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index 81fdc126335d..a10b735dfa03 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ b/src/transformers/models/cohere2/modular_cohere2.py @@ -217,7 +217,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index 024eee365b0e..6c57f131ff00 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -167,7 +167,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) @@ -354,7 +355,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/cwm/configuration_cwm.py 
b/src/transformers/models/cwm/configuration_cwm.py index eb6a07212453..2d0fbefbb254 100644 --- a/src/transformers/models/cwm/configuration_cwm.py +++ b/src/transformers/models/cwm/configuration_cwm.py @@ -183,7 +183,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/cwm/modular_cwm.py b/src/transformers/models/cwm/modular_cwm.py index 928131731215..120988774b2f 100644 --- a/src/transformers/models/cwm/modular_cwm.py +++ b/src/transformers/models/cwm/modular_cwm.py @@ -183,7 +183,8 @@ def __init__( del self.attention_bias # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py index 43bfb0c56b6e..239783a440eb 100644 --- a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py @@ -215,7 +215,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) 
super().__init__( diff --git a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py index fce2c9777eb9..5c624e0c5ecc 100644 --- a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py @@ -231,7 +231,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) for key in ["beta_fast", "beta_slow", "factor"]: if key in self.rope_parameters: diff --git a/src/transformers/models/dia/configuration_dia.py b/src/transformers/models/dia/configuration_dia.py index 4485b109d4a1..79217ea7fa59 100644 --- a/src/transformers/models/dia/configuration_dia.py +++ b/src/transformers/models/dia/configuration_dia.py @@ -98,7 +98,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__(**kwargs) @@ -204,7 +205,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) diff --git 
a/src/transformers/models/diffllama/configuration_diffllama.py b/src/transformers/models/diffllama/configuration_diffllama.py index 66dccfa8f37f..bf483952a1bf 100644 --- a/src/transformers/models/diffllama/configuration_diffllama.py +++ b/src/transformers/models/diffllama/configuration_diffllama.py @@ -151,7 +151,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/doge/configuration_doge.py b/src/transformers/models/doge/configuration_doge.py index dace12475308..09134adaa93f 100644 --- a/src/transformers/models/doge/configuration_doge.py +++ b/src/transformers/models/doge/configuration_doge.py @@ -195,7 +195,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # for backward compatibility diff --git a/src/transformers/models/doge/modular_doge.py b/src/transformers/models/doge/modular_doge.py index 6b56798cc48f..1c2cb1fe1cbe 100644 --- a/src/transformers/models/doge/modular_doge.py +++ b/src/transformers/models/doge/modular_doge.py @@ -224,7 +224,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + 
self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # for backward compatibility diff --git a/src/transformers/models/dots1/configuration_dots1.py b/src/transformers/models/dots1/configuration_dots1.py index 88640541f777..5276c20ab533 100644 --- a/src/transformers/models/dots1/configuration_dots1.py +++ b/src/transformers/models/dots1/configuration_dots1.py @@ -208,7 +208,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/efficientloftr/configuration_efficientloftr.py b/src/transformers/models/efficientloftr/configuration_efficientloftr.py index 2f6efbb2ae44..ee2410ab00cd 100644 --- a/src/transformers/models/efficientloftr/configuration_efficientloftr.py +++ b/src/transformers/models/efficientloftr/configuration_efficientloftr.py @@ -179,7 +179,8 @@ def __init__( rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} or {} self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 4.0) - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) # Standardize and validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/emu3/configuration_emu3.py b/src/transformers/models/emu3/configuration_emu3.py index 28b2c7a026e2..b2f064a3f853 100644 --- a/src/transformers/models/emu3/configuration_emu3.py +++ 
b/src/transformers/models/emu3/configuration_emu3.py @@ -232,7 +232,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/ernie4_5/configuration_ernie4_5.py b/src/transformers/models/ernie4_5/configuration_ernie4_5.py index 2d77d98c0b07..a64ba278afab 100644 --- a/src/transformers/models/ernie4_5/configuration_ernie4_5.py +++ b/src/transformers/models/ernie4_5/configuration_ernie4_5.py @@ -154,7 +154,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py index 9de2be7790c4..fa254966bd3b 100644 --- a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +++ b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py @@ -187,7 +187,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) # MoE arguments 
diff --git a/src/transformers/models/evolla/configuration_evolla.py b/src/transformers/models/evolla/configuration_evolla.py index 150928e5ea14..915eebf8d884 100644 --- a/src/transformers/models/evolla/configuration_evolla.py +++ b/src/transformers/models/evolla/configuration_evolla.py @@ -255,7 +255,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) # Subconfig diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py index 40d6e80831d9..7ee9836bf0ec 100644 --- a/src/transformers/models/exaone4/configuration_exaone4.py +++ b/src/transformers/models/exaone4/configuration_exaone4.py @@ -184,7 +184,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index 85b3e9adafd7..57bdf3390761 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -217,7 +217,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + 
self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py index 2afd54d10d39..548b556153f0 100644 --- a/src/transformers/models/falcon/configuration_falcon.py +++ b/src/transformers/models/falcon/configuration_falcon.py @@ -168,7 +168,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/falcon_h1/configuration_falcon_h1.py b/src/transformers/models/falcon_h1/configuration_falcon_h1.py index e0e0d32386af..b652ec363e17 100644 --- a/src/transformers/models/falcon_h1/configuration_falcon_h1.py +++ b/src/transformers/models/falcon_h1/configuration_falcon_h1.py @@ -203,7 +203,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) self.projectors_bias = projectors_bias diff --git a/src/transformers/models/flex_olmo/configuration_flex_olmo.py b/src/transformers/models/flex_olmo/configuration_flex_olmo.py index e1bad8a4a4eb..0fcde1c8d158 100644 --- a/src/transformers/models/flex_olmo/configuration_flex_olmo.py +++ b/src/transformers/models/flex_olmo/configuration_flex_olmo.py @@ 
-181,7 +181,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/flex_olmo/modular_flex_olmo.py b/src/transformers/models/flex_olmo/modular_flex_olmo.py index 337c0e8b8876..d42b152763e4 100644 --- a/src/transformers/models/flex_olmo/modular_flex_olmo.py +++ b/src/transformers/models/flex_olmo/modular_flex_olmo.py @@ -192,7 +192,8 @@ def __init__( del self.clip_qkv # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index 7f67ce93643f..069a7501f80a 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -177,7 +177,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 25000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 25000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index 1c5c792a5874..980965544308 100644 --- 
a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -158,7 +158,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index d3ae7d1864b3..449a1be5c3e6 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -186,7 +186,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py index 6625ead036f3..12cf0c774b55 100644 --- a/src/transformers/models/gemma2/configuration_gemma2.py +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -192,7 +192,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/gemma2/modular_gemma2.py 
b/src/transformers/models/gemma2/modular_gemma2.py index a256f898e4a4..766a5796bb43 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -220,7 +220,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 6236df6741ed..bcfbec9e06ae 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -147,7 +147,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index 901de77b43a4..23241897b628 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -147,7 +147,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) 
rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py index 5699d27e9df5..42c7d426a154 100644 --- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py +++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py @@ -185,7 +185,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # MoE arguments diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py index f9d987610f5f..2d029d2eeea6 100644 --- a/src/transformers/models/glm4_moe/modular_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py @@ -199,7 +199,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # MoE arguments diff --git a/src/transformers/models/glm4v/configuration_glm4v.py b/src/transformers/models/glm4v/configuration_glm4v.py index 19cc278ba2fb..e0794f60ac35 100644 --- a/src/transformers/models/glm4v/configuration_glm4v.py +++ b/src/transformers/models/glm4v/configuration_glm4v.py @@ -244,7 +244,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - 
self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) self.image_token_id = image_token_id self.video_token_id = video_token_id diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 7b36d6e58695..8a19932b38f6 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -281,7 +281,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) self.image_token_id = image_token_id self.video_token_id = video_token_id diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index 6bb051e52d3a..280a0a0c1bff 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -274,7 +274,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) # MoE arguments diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py 
b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index 66d21a8afee6..f4755c3854ec 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -220,7 +220,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) # MoE arguments diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 693fee7dc2ce..3edc05d98c7f 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -154,7 +154,8 @@ def __init__( self.attention_bias = attention_bias # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rotary_emb_base", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rotary_emb_base", 10000.0) rope_config_standardize_and_validate(self) if self.hidden_size % self.num_attention_heads != 0: raise ValueError( diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index 344fcf95ac4c..5022372e8257 100644 --- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -121,7 +121,8 @@ def __init__( self.hidden_dropout = hidden_dropout # Validate the correctness of rotary position embeddings parameters - 
self.rope_parameters["rope_theta"] = kwargs.pop("rotary_emb_base", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rotary_emb_base", 10000.0) rope_config_standardize_and_validate(self) super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/gpt_oss/configuration_gpt_oss.py b/src/transformers/models/gpt_oss/configuration_gpt_oss.py index 995ebe2c1f85..d6782bf8fec5 100644 --- a/src/transformers/models/gpt_oss/configuration_gpt_oss.py +++ b/src/transformers/models/gpt_oss/configuration_gpt_oss.py @@ -115,7 +115,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 150000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 150000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/granite/configuration_granite.py b/src/transformers/models/granite/configuration_granite.py index 523c86d521a8..b4733888c4c8 100644 --- a/src/transformers/models/granite/configuration_granite.py +++ b/src/transformers/models/granite/configuration_granite.py @@ -181,7 +181,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/granitemoe/configuration_granitemoe.py b/src/transformers/models/granitemoe/configuration_granitemoe.py index 1c6f2b2aa50e..b3291197dc66 100644 --- 
a/src/transformers/models/granitemoe/configuration_granitemoe.py +++ b/src/transformers/models/granitemoe/configuration_granitemoe.py @@ -165,7 +165,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) self.attention_bias = attention_bias diff --git a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py index f0885f7fd120..34f046558a34 100644 --- a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py @@ -204,7 +204,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) mamba_intermediate = mamba_expand * hidden_size diff --git a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py index e0a09aa84658..037d17d42afc 100644 --- a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py @@ -170,7 +170,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - 
self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) self.attention_bias = attention_bias diff --git a/src/transformers/models/helium/configuration_helium.py b/src/transformers/models/helium/configuration_helium.py index b8188e89cf0d..7cf74dbd39f0 100644 --- a/src/transformers/models/helium/configuration_helium.py +++ b/src/transformers/models/helium/configuration_helium.py @@ -153,7 +153,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 100000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 100000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py index b5d085bfd240..8aee7b9f1df0 100644 --- a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +++ b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py @@ -147,7 +147,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # TODO needs model-specific validation? 
super().__init__( diff --git a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py index 4588dae1e7ad..acfdeddaf945 100644 --- a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +++ b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py @@ -163,7 +163,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py index b6a76c5001ab..041ce8269445 100644 --- a/src/transformers/models/jetmoe/configuration_jetmoe.py +++ b/src/transformers/models/jetmoe/configuration_jetmoe.py @@ -153,7 +153,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py index b3ef555abf58..30c35e6591bb 100644 --- a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py @@ -189,7 +189,8 @@ def __init__( self.rope_parameters = 
rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 2ef6b926ca41..3e8023438a35 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -177,7 +177,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/llama4/configuration_llama4.py b/src/transformers/models/llama4/configuration_llama4.py index dec757e72431..5eda79eaef63 100644 --- a/src/transformers/models/llama4/configuration_llama4.py +++ b/src/transformers/models/llama4/configuration_llama4.py @@ -130,7 +130,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__(**kwargs) @@ -354,7 +355,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - 
self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/longcat_flash/configuration_longcat_flash.py b/src/transformers/models/longcat_flash/configuration_longcat_flash.py index 0ed3ef4bd986..6d527503cd20 100644 --- a/src/transformers/models/longcat_flash/configuration_longcat_flash.py +++ b/src/transformers/models/longcat_flash/configuration_longcat_flash.py @@ -216,7 +216,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000000.0) for key in ["beta_fast", "beta_slow", "factor"]: if key in self.rope_parameters: diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py index 30be2aaf9b51..c8fdf20ddb7e 100644 --- a/src/transformers/models/mimi/configuration_mimi.py +++ b/src/transformers/models/mimi/configuration_mimi.py @@ -227,7 +227,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # Handle backward compatibility for frame_rate: diff --git a/src/transformers/models/minimax/configuration_minimax.py b/src/transformers/models/minimax/configuration_minimax.py index a67a96a1c59f..2c5f699cd4d0 100644 --- 
a/src/transformers/models/minimax/configuration_minimax.py +++ b/src/transformers/models/minimax/configuration_minimax.py @@ -233,7 +233,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index 86e25fdb67b1..d3b8b932469f 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -258,7 +258,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/ministral/configuration_ministral.py b/src/transformers/models/ministral/configuration_ministral.py index ae8f39d3ba92..c859528aa10f 100644 --- a/src/transformers/models/ministral/configuration_ministral.py +++ b/src/transformers/models/ministral/configuration_ministral.py @@ -168,7 +168,8 @@ def __init__( ] * num_hidden_layers # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git 
a/src/transformers/models/ministral/modular_ministral.py b/src/transformers/models/ministral/modular_ministral.py index 76b6f4c82f57..66cfee4c8972 100644 --- a/src/transformers/models/ministral/modular_ministral.py +++ b/src/transformers/models/ministral/modular_ministral.py @@ -170,7 +170,8 @@ def __init__( ] * num_hidden_layers # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/mistral/configuration_mistral.py b/src/transformers/models/mistral/configuration_mistral.py index 63349eb8bdc6..ac7a9b47e43a 100644 --- a/src/transformers/models/mistral/configuration_mistral.py +++ b/src/transformers/models/mistral/configuration_mistral.py @@ -172,7 +172,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index cba5c5d49b86..30964c183322 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -192,7 +192,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] 
= kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index 0816cb97baea..a1f91048606b 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ b/src/transformers/models/mllama/configuration_mllama.py @@ -253,7 +253,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index bc1b2b4ef58d..1e0d569e6fff 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ b/src/transformers/models/moonshine/configuration_moonshine.py @@ -181,7 +181,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 5c618137e2c5..8ddfb0f44b96 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -204,7 +204,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = 
kwargs.get("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index 21ba2cebc201..f593c6af0645 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -288,7 +288,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) audio_encoder_config = kwargs.pop("audio_encoder_config", {}) diff --git a/src/transformers/models/nemotron/configuration_nemotron.py b/src/transformers/models/nemotron/configuration_nemotron.py index 126395815298..f15a238beaa5 100644 --- a/src/transformers/models/nemotron/configuration_nemotron.py +++ b/src/transformers/models/nemotron/configuration_nemotron.py @@ -149,7 +149,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/olmo/configuration_olmo.py 
b/src/transformers/models/olmo/configuration_olmo.py index 4b2f26fb9d6c..29b1dc2f8f51 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -164,7 +164,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/olmo2/configuration_olmo2.py b/src/transformers/models/olmo2/configuration_olmo2.py index 821847001d7f..8beacb6ed2bf 100644 --- a/src/transformers/models/olmo2/configuration_olmo2.py +++ b/src/transformers/models/olmo2/configuration_olmo2.py @@ -164,7 +164,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/olmo3/configuration_olmo3.py b/src/transformers/models/olmo3/configuration_olmo3.py index 838d672b2c70..e17805584719 100644 --- a/src/transformers/models/olmo3/configuration_olmo3.py +++ b/src/transformers/models/olmo3/configuration_olmo3.py @@ -182,7 +182,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) 
rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/olmo3/modular_olmo3.py b/src/transformers/models/olmo3/modular_olmo3.py index b38465df64c2..403e7166fec4 100644 --- a/src/transformers/models/olmo3/modular_olmo3.py +++ b/src/transformers/models/olmo3/modular_olmo3.py @@ -198,7 +198,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/olmoe/configuration_olmoe.py b/src/transformers/models/olmoe/configuration_olmoe.py index 9c5595f4c35d..ebcca0121277 100644 --- a/src/transformers/models/olmoe/configuration_olmoe.py +++ b/src/transformers/models/olmoe/configuration_olmoe.py @@ -164,7 +164,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py index 65921421f992..449ee64b4cb6 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -125,7 +125,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 25000.0) + if 
"rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 25000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index b949a6293869..b6e4aae55a9c 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -167,7 +167,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index 5728bbcca6c1..0b7598de2b7c 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -170,7 +170,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) self._rope_parameters_adjustment() self._rope_parameters_validation() diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index 2d6f2d7cfba2..206a3d8cd94f 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ 
b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -411,7 +411,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) self._rope_parameters_adjustment() self._rope_parameters_validation() diff --git a/src/transformers/models/phimoe/configuration_phimoe.py b/src/transformers/models/phimoe/configuration_phimoe.py index ab3b9ff25a75..87441e94394f 100644 --- a/src/transformers/models/phimoe/configuration_phimoe.py +++ b/src/transformers/models/phimoe/configuration_phimoe.py @@ -173,7 +173,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self) if self.rope_parameters["rope_type"] != "default": diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index 130cadd39989..19c70b4ac495 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -109,7 +109,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) 
rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/qwen2/configuration_qwen2.py b/src/transformers/models/qwen2/configuration_qwen2.py index f73f8bf3e737..b42e4368c7a1 100644 --- a/src/transformers/models/qwen2/configuration_qwen2.py +++ b/src/transformers/models/qwen2/configuration_qwen2.py @@ -173,7 +173,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index 72ad2089f9d0..3099061c2982 100644 --- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -370,7 +370,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) @@ -720,7 +721,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -828,7 
+830,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index f828591a46a2..94c69cfc7a11 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -403,7 +403,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) @@ -753,7 +754,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -861,7 +863,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = 
kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index e900b53aa3e9..28e9525d400f 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -218,7 +218,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_type = rope_parameters.get("type") or rope_parameters.get("rope_type") if rope_type == "mrope": self.rope_parameters["rope_type"] = "default" diff --git a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py index c7075454a540..bf35d711da57 100644 --- a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py @@ -211,7 +211,8 @@ def __init__( layer_type_validation(self.layer_types) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 222647711113..77a12064594b 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -206,7 +206,8 @@ def __init__( 
layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_type = rope_parameters.get("type") or rope_parameters.get("rope_type") if rope_type == "mrope": self.rope_parameters["rope_type"] = "default" diff --git a/src/transformers/models/qwen3/configuration_qwen3.py b/src/transformers/models/qwen3/configuration_qwen3.py index d31e1e4b3c0b..f0acca6c8894 100644 --- a/src/transformers/models/qwen3/configuration_qwen3.py +++ b/src/transformers/models/qwen3/configuration_qwen3.py @@ -181,7 +181,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py index a555cda9abb7..ad3f7fc5028c 100644 --- a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py @@ -185,7 +185,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # MoE arguments diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py 
b/src/transformers/models/qwen3_next/configuration_qwen3_next.py index fb535189dcb6..4382e4609067 100644 --- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py +++ b/src/transformers/models/qwen3_next/configuration_qwen3_next.py @@ -215,7 +215,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # linear attention part diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index f6a3afb11bf9..36f7f5a3b920 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -330,7 +330,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) # MoE arguments @@ -604,7 +605,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( @@ -773,7 +775,8 @@ def __init__( self.rope_parameters = 
rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) # MoE arguments @@ -1042,7 +1045,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) @property diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index 7be065cbf7b2..44706b8b9d0d 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -217,7 +217,8 @@ def __init__( self.sliding_window = sliding_window # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) @@ -677,7 +678,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) 
@property diff --git a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py index 75aba8876957..626d290798fa 100644 --- a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py @@ -176,7 +176,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index aaf786293a06..4233e4a65eb9 100644 --- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -217,7 +217,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py index 5e0fe5a3a552..819202fb8db1 100644 --- a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +++ 
b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py @@ -172,7 +172,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) # MoE arguments diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py index a0e0ad2a0d1d..578afeffb300 100644 --- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py @@ -194,7 +194,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) # MoE arguments diff --git a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py index bdc57d6dbee4..f4c82c21ed83 100644 --- a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py @@ -153,7 +153,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" 
not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/seed_oss/configuration_seed_oss.py b/src/transformers/models/seed_oss/configuration_seed_oss.py index 2cc0ccfaa5dd..3d54919d659b 100644 --- a/src/transformers/models/seed_oss/configuration_seed_oss.py +++ b/src/transformers/models/seed_oss/configuration_seed_oss.py @@ -176,7 +176,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/smollm3/configuration_smollm3.py b/src/transformers/models/smollm3/configuration_smollm3.py index 8a040d2a25cd..e83197d146be 100644 --- a/src/transformers/models/smollm3/configuration_smollm3.py +++ b/src/transformers/models/smollm3/configuration_smollm3.py @@ -207,7 +207,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 2000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 2000000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/smollm3/modular_smollm3.py b/src/transformers/models/smollm3/modular_smollm3.py index 3e8d959e070e..7183867a4d2c 100644 --- a/src/transformers/models/smollm3/modular_smollm3.py +++ b/src/transformers/models/smollm3/modular_smollm3.py @@ -224,7 +224,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # 
Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 2000000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 2000000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index c39930e2a59c..537aab5e0ec9 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -152,7 +152,8 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.25) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/starcoder2/configuration_starcoder2.py b/src/transformers/models/starcoder2/configuration_starcoder2.py index d035d5d1d6bf..39c40d314b9d 100644 --- a/src/transformers/models/starcoder2/configuration_starcoder2.py +++ b/src/transformers/models/starcoder2/configuration_starcoder2.py @@ -161,7 +161,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/t5gemma/configuration_t5gemma.py b/src/transformers/models/t5gemma/configuration_t5gemma.py index 86b2fa85dff7..bb9b5ea13e91 100644 --- 
a/src/transformers/models/t5gemma/configuration_t5gemma.py +++ b/src/transformers/models/t5gemma/configuration_t5gemma.py @@ -188,7 +188,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/vaultgemma/configuration_vaultgemma.py b/src/transformers/models/vaultgemma/configuration_vaultgemma.py index 93cfdbadd8ad..ec3f7ef95b45 100644 --- a/src/transformers/models/vaultgemma/configuration_vaultgemma.py +++ b/src/transformers/models/vaultgemma/configuration_vaultgemma.py @@ -188,7 +188,8 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/zamba2/configuration_zamba2.py b/src/transformers/models/zamba2/configuration_zamba2.py index 75668907de07..678d25c5442f 100644 --- a/src/transformers/models/zamba2/configuration_zamba2.py +++ b/src/transformers/models/zamba2/configuration_zamba2.py @@ -201,7 +201,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + if "rope_theta" not in self.rope_parameters: + self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) rope_config_standardize_and_validate(self) self.mamba_d_state = mamba_d_state From 
5a87125f6f4612fa6d127ffcd48addb626f91348 Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 26 Nov 2025 17:44:32 +0100 Subject: [PATCH 07/23] partial rotary factor --- .../configuration_efficientloftr.py | 2 +- .../models/fuyu/configuration_fuyu.py | 2 +- .../models/glm/configuration_glm.py | 2 +- .../models/glm4/configuration_glm4.py | 2 +- .../models/glm4_moe/configuration_glm4_moe.py | 2 +- .../models/glm4_moe/modular_glm4_moe.py | 2 +- .../glm4v_moe/configuration_glm4v_moe.py | 2 +- .../models/glm4v_moe/modular_glm4v_moe.py | 2 +- .../moonshine/configuration_moonshine.py | 2 +- .../models/moonshine/modular_moonshine.py | 2 +- .../models/nemotron/configuration_nemotron.py | 2 +- .../persimmon/configuration_persimmon.py | 2 +- .../models/phi/configuration_phi.py | 2 +- .../models/phi3/configuration_phi3.py | 2 +- .../configuration_phi4_multimodal.py | 2 +- .../qwen3_next/configuration_qwen3_next.py | 2 +- .../configuration_recurrent_gemma.py | 2 +- .../models/stablelm/configuration_stablelm.py | 2 +- tests/causal_lm_tester.py | 23 ++++++++++++++----- 19 files changed, 35 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/efficientloftr/configuration_efficientloftr.py b/src/transformers/models/efficientloftr/configuration_efficientloftr.py index ee2410ab00cd..3147a38134d9 100644 --- a/src/transformers/models/efficientloftr/configuration_efficientloftr.py +++ b/src/transformers/models/efficientloftr/configuration_efficientloftr.py @@ -178,7 +178,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} or {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 4.0) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 4.0) if "rope_theta" not in self.rope_parameters: self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) diff --git 
a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index 069a7501f80a..642255ef219c 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -174,7 +174,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index bcfbec9e06ae..30a4fa13dcea 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -144,7 +144,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index 23241897b628..838249c75c52 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -144,7 +144,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is 
not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py index 42c7d426a154..6048e036d0ca 100644 --- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py +++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py @@ -182,7 +182,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py index 2d029d2eeea6..925296fb14d4 100644 --- a/src/transformers/models/glm4_moe/modular_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py @@ -196,7 +196,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py 
b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index 280a0a0c1bff..8fe8903e42fc 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -271,7 +271,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index f4755c3854ec..aad096eb8faf 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -217,7 +217,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index 1e0d569e6fff..3214f3169319 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ b/src/transformers/models/moonshine/configuration_moonshine.py @@ -178,7 +178,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = 
rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.9) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 8ddfb0f44b96..6b474e46a6a4 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -201,7 +201,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.9) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/src/transformers/models/nemotron/configuration_nemotron.py b/src/transformers/models/nemotron/configuration_nemotron.py index f15a238beaa5..aa10f0085155 100644 --- a/src/transformers/models/nemotron/configuration_nemotron.py +++ b/src/transformers/models/nemotron/configuration_nemotron.py @@ -146,7 +146,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git 
a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py index 449ee64b4cb6..195377ded045 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -122,7 +122,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index b6e4aae55a9c..ced9545daac5 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -164,7 +164,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index 0b7598de2b7c..d7651c10e2f7 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -167,7 +167,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = 
rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 1.0) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index 206a3d8cd94f..c77c165bb63a 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -408,7 +408,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 1.0) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py b/src/transformers/models/qwen3_next/configuration_qwen3_next.py index 4382e4609067..32ef13b87f22 100644 --- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py +++ b/src/transformers/models/qwen3_next/configuration_qwen3_next.py @@ -203,7 +203,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.25) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.25) self.layer_types = layer_types if self.layer_types is None: diff --git 
a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py index f4c82c21ed83..6eee018a438c 100644 --- a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py @@ -150,7 +150,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index 537aab5e0ec9..122cd3af26af 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -149,7 +149,7 @@ def __init__( rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.25) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.25) # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: diff --git a/tests/causal_lm_tester.py b/tests/causal_lm_tester.py index bbcdadc9b2ca..2b368b426553 100644 --- a/tests/causal_lm_tester.py +++ b/tests/causal_lm_tester.py @@ -437,7 +437,7 @@ def test_model_rope_scaling_from_config(self, scaling_type): long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], 
config.vocab_size) set_seed(42) # Fixed seed at init time so the two models get the same random weights - _set_config_rope_params(config, {"rope_type": "default", "rope_theta": 10_000.0}) + _set_config_rope_params(config, {"rope_type": "default", "rope_theta": 10_000.0, "partial_rotary_factor": 1.0}) original_model = self.model_tester_class.base_model_class(config) original_model.to(torch_device) original_model.eval() @@ -445,7 +445,9 @@ def test_model_rope_scaling_from_config(self, scaling_type): original_long_output = original_model(long_input).last_hidden_state set_seed(42) # Fixed seed at init time so the two models get the same random weights - _set_config_rope_params(config, {"rope_type": scaling_type, "factor": 10.0, "rope_theta": 10_000.0}) + _set_config_rope_params( + config, {"rope_type": scaling_type, "factor": 10.0, "rope_theta": 10_000.0, "partial_rotary_factor": 1.0} + ) scaled_model = self.model_tester_class.base_model_class(config) scaled_model.to(torch_device) scaled_model.eval() @@ -497,7 +499,7 @@ def test_model_rope_scaling_frequencies(self): position_ids_long = position_ids_long.unsqueeze(0) # Sanity check original RoPE - _set_config_rope_params(config, {"rope_type": "default", "rope_theta": 10_000.0}) + _set_config_rope_params(config, {"rope_type": "default", "rope_theta": 10_000.0, "partial_rotary_factor": 1.0}) original_rope = rope_class(config=config).to(torch_device) original_cos_short, original_sin_short = original_rope(x, position_ids_short) original_cos_long, original_sin_long = original_rope(x, position_ids_long) @@ -506,7 +508,10 @@ def test_model_rope_scaling_frequencies(self): # Sanity check linear RoPE scaling # New position "x" should match original position with index "x/scaling_factor" - _set_config_rope_params(config, {"rope_type": "linear", "factor": scaling_factor, "rope_theta": 10_000.0}) + _set_config_rope_params( + config, + {"rope_type": "linear", "factor": scaling_factor, "rope_theta": 10_000.0, "partial_rotary_factor": 
1.0}, + ) linear_scaling_rope = rope_class(config=config).to(torch_device) linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) @@ -520,7 +525,10 @@ def test_model_rope_scaling_frequencies(self): # Sanity check Dynamic NTK RoPE scaling # Scaling should only be observed after a long input is fed. We can observe that the frequencies increase # with scaling_factor (or that `inv_freq` decreases) - _set_config_rope_params(config, {"rope_type": "dynamic", "factor": scaling_factor, "rope_theta": 10_000.0}) + _set_config_rope_params( + config, + {"rope_type": "dynamic", "factor": scaling_factor, "rope_theta": 10_000.0, "partial_rotary_factor": 1.0}, + ) ntk_scaling_rope = rope_class(config=config).to(torch_device) ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) @@ -534,7 +542,10 @@ def test_model_rope_scaling_frequencies(self): # Sanity check Yarn RoPE scaling # Scaling should be over the entire input - _set_config_rope_params(config, {"rope_type": "yarn", "factor": scaling_factor, "rope_theta": 10_000.0}) + _set_config_rope_params( + config, + {"rope_type": "yarn", "factor": scaling_factor, "rope_theta": 10_000.0, "partial_rotary_factor": 1.0}, + ) yarn_scaling_rope = rope_class(config=config).to(torch_device) yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) From 6d07c326b735ef2be8a4c917e25d8e9ed0afe35f Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 26 Nov 2025 22:20:39 +0100 Subject: [PATCH 08/23] more fixes to the god of fixes --- src/transformers/modeling_rope_utils.py | 12 ++++------ .../models/gpt_neox/configuration_gpt_neox.py | 6 +++-- .../qwen2_5_vl/configuration_qwen2_5_vl.py | 2 +- .../models/qwen2_vl/configuration_qwen2_vl.py | 2 +- tests/utils/test_modeling_rope_utils.py 
| 24 +++++++++---------- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 2a4807dd06d1..700edbcffbc9 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -91,19 +91,21 @@ def get_standardized_rope_params(config): # Move `rope_theta` and `partial_rotary_factor` to the params dict, if not there yet rope_theta = getattr(config, "rope_theta", None) - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = getattr(config, "partial_rotary_factor", None) # Case 1: one RoPE theat = one RoPE param per model without nesting if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): rope_parameters.setdefault("rope_type", rope_parameters.get("type", "default")) rope_parameters.setdefault("rope_theta", rope_theta) - rope_parameters.setdefault("partial_rotary_factor", partial_rotary_factor) + if partial_rotary_factor is not None: + rope_parameters["partial_rotary_factor"] = partial_rotary_factor # Case 2: different RoPE for each layer as nested dict else: for layer_type in config.layer_types: rope_parameters[layer_type].setdefault("rope_type", rope_parameters[layer_type].get("type", "default")) rope_parameters[layer_type].setdefault("rope_theta", rope_theta) - rope_parameters[layer_type].setdefault("partial_rotary_factor", partial_rotary_factor) + if partial_rotary_factor is not None: + rope_parameters[layer_type]["partial_rotary_factor"] = partial_rotary_factor return rope_parameters @@ -907,10 +909,6 @@ def rope_config_standardize_and_validate(config: PreTrainedConfig, ignore_keys: # BC: "rope_theta" was originally saved in config rope_parameters["rope_theta"] = rope_parameters.get("rope_theta", getattr(config, "rope_theta", None)) - # Ignore `partial_rotary_factor` if present for all RoPE types - ignore_keys = ignore_keys if ignore_keys is not None else set() - 
ignore_keys.update(["partial_rotary_factor"]) - if validation_fn is not None: validation_fn(rope_parameters, config=config, ignore_keys=ignore_keys) else: diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 3edc05d98c7f..87ebc3daa19f 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -144,8 +144,8 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.tie_word_embeddings = tie_word_embeddings self.use_parallel_residual = use_parallel_residual + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters @@ -161,7 +161,9 @@ def __init__( raise ValueError( "The hidden size is not divisible by the number of attention heads! Make sure to update them!" 
) - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + super().__init__( + bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs + ) __all__ = ["GPTNeoXConfig"] diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index 28e9525d400f..31638f9e11b8 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -220,7 +220,7 @@ def __init__( # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) - rope_type = rope_parameters.get("type") or rope_parameters.get("rope_type") + rope_type = self.rope_parameters.get("type") or self.rope_parameters.get("rope_type") if rope_type == "mrope": self.rope_parameters["rope_type"] = "default" rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 77a12064594b..a451bf2d878a 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -208,7 +208,7 @@ def __init__( # Validate the correctness of rotary position embeddings parameters if "rope_theta" not in self.rope_parameters: self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) - rope_type = rope_parameters.get("type") or rope_parameters.get("rope_type") + rope_type = self.rope_parameters.get("type") or self.rope_parameters.get("rope_type") if rope_type == "mrope": self.rope_parameters["rope_type"] = "default" rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) diff --git a/tests/utils/test_modeling_rope_utils.py 
b/tests/utils/test_modeling_rope_utils.py index 3cf063ac116e..5711f199a6a8 100644 --- a/tests/utils/test_modeling_rope_utils.py +++ b/tests/utils/test_modeling_rope_utils.py @@ -24,7 +24,7 @@ import torch from transformers import ROPE_INIT_FUNCTIONS - from transformers.modeling_rope_utils import rope_config_validation + from transformers.modeling_rope_utils import rope_config_standardize_and_validate from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding @@ -35,13 +35,13 @@ def test_rope_validation(self): all_rope_types = ROPE_INIT_FUNCTIONS.keys() # The base config is always valid (default RoPE) - rope_config_validation(config) + rope_config_standardize_and_validate(config) # If we explicitly set the other RoPE types, then validation should fail for rope_type in all_rope_types: config.rope_parameters = {"rope_type": rope_type, "rope_theta": 10000.0} with self.assertRaises(KeyError): - rope_config_validation(config) + rope_config_standardize_and_validate(config) # Parameters are exclusive to their own RoPE type, and should raise an exception if incorrectly passed valid_param_mapping = { @@ -60,16 +60,16 @@ def test_rope_validation(self): continue else: with self.assertRaises(KeyError): - rope_config_validation(config) + rope_config_standardize_and_validate(config) # Any other parameters passed to RoPE will raise a warning that a particular key is not used # But sometimes we can have model-specific RoPE kwargs and bypass warning with `ignore_keys` model_specific_kwarg = "mrope_sections" # e,g in Qwen2-VL config.rope_parameters = {"rope_type": "default", "rope_theta": 10000.0, model_specific_kwarg: True} - rope_config_validation(config, ignore_keys={model_specific_kwarg}) + rope_config_standardize_and_validate(config, ignore_keys={model_specific_kwarg}) with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: - rope_config_validation(config) + rope_config_standardize_and_validate(config) self.assertEqual(len(logs.output), 
1) self.assertIn(model_specific_kwarg, logs.output[0]) @@ -81,10 +81,10 @@ def test_rope_validation(self): "global_attn": {"rope_type": "default", "rope_theta": 10000}, "local_attn": {"rope_type": "linear", "rope_theta": 10000, "factor": 2.0}, } - rope_config_validation(config) + rope_config_standardize_and_validate(config) config.rope_parameters = config.rope_parameters["local_attn"] - rope_config_validation(config) + rope_config_standardize_and_validate(config) def test_yarn_original_original_max_position_embeddings_validation(self): """Tests that models with no/bad `original_max_position_embeddings` raise a warning""" @@ -100,7 +100,7 @@ def test_yarn_original_original_max_position_embeddings_validation(self): config.rope_parameters = rope_config with self.assertRaises(AssertionError): # confirm that no warnings are thrown with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: - rope_config_validation(config) + rope_config_standardize_and_validate(config) # bad rope config, no `original_max_position_embeddings` -> warning rope_config = { @@ -110,7 +110,7 @@ def test_yarn_original_original_max_position_embeddings_validation(self): } config.rope_parameters = rope_config with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: - rope_config_validation(config) + rope_config_standardize_and_validate(config) self.assertEqual(len(logs.output), 1) self.assertIn("is unset", logs.output[0]) @@ -123,7 +123,7 @@ def test_yarn_original_original_max_position_embeddings_validation(self): } config.rope_parameters = rope_config with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: - rope_config_validation(config) + rope_config_standardize_and_validate(config) self.assertEqual(len(logs.output), 1) self.assertIn("implicit factor", logs.output[0]) @@ -373,7 +373,7 @@ def test_longrope_rope_numerically(self): } self.assertEqual(config.rope_parameters.get("attention_factor"), None) # Verify that 
"TypeError: '<' not supported between instances of 'NoneType' and 'int'" is not raised. - rope_config_validation(config) + rope_config_standardize_and_validate(config) # Check 2: seq_len == 0 -> short factor is applied to the default frequencies config.rope_parameters = { From 22f94e2424f25fdbb249a237d7206a0971d46b4b Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 27 Nov 2025 09:36:26 +0100 Subject: [PATCH 09/23] setdefault --- .../models/apertus/configuration_apertus.py | 3 +-- .../models/apertus/modular_apertus.py | 3 +-- .../models/arcee/configuration_arcee.py | 3 +-- .../models/aria/configuration_aria.py | 3 +-- .../models/bamba/configuration_bamba.py | 3 +-- .../models/bitnet/configuration_bitnet.py | 3 +-- .../models/blt/configuration_blt.py | 15 +++++---------- .../chameleon/configuration_chameleon.py | 3 +-- .../models/cohere/configuration_cohere.py | 3 +-- .../models/cohere2/configuration_cohere2.py | 3 +-- .../models/cohere2/modular_cohere2.py | 3 +-- .../models/csm/configuration_csm.py | 6 ++---- .../models/cwm/configuration_cwm.py | 6 ++++-- src/transformers/models/cwm/modular_cwm.py | 3 +-- .../deepseek_v2/configuration_deepseek_v2.py | 3 +-- .../deepseek_v3/configuration_deepseek_v3.py | 3 +-- .../models/dia/configuration_dia.py | 6 ++---- .../diffllama/configuration_diffllama.py | 3 +-- .../models/doge/configuration_doge.py | 3 +-- src/transformers/models/doge/modular_doge.py | 3 +-- .../models/dots1/configuration_dots1.py | 3 +-- .../configuration_efficientloftr.py | 3 +-- .../models/emu3/configuration_emu3.py | 3 +-- .../models/ernie4_5/configuration_ernie4_5.py | 3 +-- .../ernie4_5_moe/configuration_ernie4_5_moe.py | 3 +-- .../models/evolla/configuration_evolla.py | 3 +-- .../models/exaone4/configuration_exaone4.py | 3 +-- .../models/exaone4/modular_exaone4.py | 3 +-- .../models/falcon/configuration_falcon.py | 3 +-- .../falcon_h1/configuration_falcon_h1.py | 3 +-- .../flex_olmo/configuration_flex_olmo.py | 6 ++++-- 
.../models/flex_olmo/modular_flex_olmo.py | 3 +-- .../models/fuyu/configuration_fuyu.py | 3 +-- .../models/gemma/configuration_gemma.py | 3 +-- src/transformers/models/gemma/modular_gemma.py | 3 +-- .../models/gemma2/configuration_gemma2.py | 3 +-- .../models/gemma2/modular_gemma2.py | 3 +-- .../models/gemma3/configuration_gemma3.py | 4 ++-- .../models/gemma3/modular_gemma3.py | 4 ++-- .../models/gemma3n/configuration_gemma3n.py | 4 ++-- .../models/gemma3n/modular_gemma3n.py | 4 ++-- .../models/glm/configuration_glm.py | 3 +-- .../models/glm4/configuration_glm4.py | 3 +-- .../models/glm4_moe/configuration_glm4_moe.py | 3 +-- .../models/glm4_moe/modular_glm4_moe.py | 3 +-- .../models/glm4v/configuration_glm4v.py | 3 +-- src/transformers/models/glm4v/modular_glm4v.py | 3 +-- .../glm4v_moe/configuration_glm4v_moe.py | 3 +-- .../models/glm4v_moe/modular_glm4v_moe.py | 3 +-- .../models/gpt_neox/configuration_gpt_neox.py | 3 +-- .../models/gpt_neox/modular_gpt_neox.py | 3 ++- .../configuration_gpt_neox_japanese.py | 3 +-- .../models/gpt_oss/configuration_gpt_oss.py | 3 +-- .../models/granite/configuration_granite.py | 3 +-- .../granitemoe/configuration_granitemoe.py | 3 +-- .../configuration_granitemoehybrid.py | 3 +-- .../configuration_granitemoeshared.py | 3 +-- .../models/helium/configuration_helium.py | 3 +-- .../configuration_hunyuan_v1_dense.py | 3 +-- .../configuration_hunyuan_v1_moe.py | 3 +-- .../models/jetmoe/configuration_jetmoe.py | 3 +-- .../configuration_kyutai_speech_to_text.py | 3 +-- .../models/llama/configuration_llama.py | 3 +-- .../models/llama4/configuration_llama4.py | 6 ++---- .../configuration_longcat_flash.py | 3 +-- .../models/mimi/configuration_mimi.py | 3 +-- .../models/minimax/configuration_minimax.py | 3 +-- .../models/minimax/modular_minimax.py | 3 +-- .../ministral/configuration_ministral.py | 3 +-- .../models/ministral/modular_ministral.py | 3 +-- .../models/mistral/configuration_mistral.py | 3 +-- 
.../models/mixtral/configuration_mixtral.py | 3 +-- .../models/mllama/configuration_mllama.py | 3 +-- .../modernbert/configuration_modernbert.py | 4 ++-- .../models/modernbert/modular_modernbert.py | 4 ++-- .../configuration_modernbert_decoder.py | 4 ++-- .../modular_modernbert_decoder.py | 4 ++-- .../moonshine/configuration_moonshine.py | 3 +-- .../models/moonshine/modular_moonshine.py | 3 +-- .../models/moshi/configuration_moshi.py | 3 +-- .../models/nemotron/configuration_nemotron.py | 3 +-- .../models/olmo/configuration_olmo.py | 3 +-- .../models/olmo2/configuration_olmo2.py | 3 +-- .../models/olmo3/configuration_olmo3.py | 3 +-- src/transformers/models/olmo3/modular_olmo3.py | 3 +-- .../models/olmoe/configuration_olmoe.py | 3 +-- .../persimmon/configuration_persimmon.py | 3 +-- .../models/phi/configuration_phi.py | 3 +-- .../models/phi3/configuration_phi3.py | 3 +-- .../configuration_phi4_multimodal.py | 3 +-- .../phi4_multimodal/modular_phi4_multimodal.py | 4 ---- .../models/phimoe/configuration_phimoe.py | 3 +-- .../models/pixtral/configuration_pixtral.py | 3 +-- .../models/qwen2/configuration_qwen2.py | 3 +-- .../qwen2_5_omni/configuration_qwen2_5_omni.py | 9 +++------ .../qwen2_5_omni/modular_qwen2_5_omni.py | 9 +++------ .../qwen2_5_vl/configuration_qwen2_5_vl.py | 3 +-- .../qwen2_moe/configuration_qwen2_moe.py | 3 +-- .../models/qwen2_vl/configuration_qwen2_vl.py | 3 +-- .../models/qwen3/configuration_qwen3.py | 3 +-- .../qwen3_moe/configuration_qwen3_moe.py | 3 +-- .../qwen3_next/configuration_qwen3_next.py | 3 +-- .../configuration_qwen3_omni_moe.py | 18 +++++++++--------- .../qwen3_omni_moe/modular_qwen3_omni_moe.py | 6 ++---- .../models/qwen3_vl/configuration_qwen3_vl.py | 3 +-- .../models/qwen3_vl/modular_qwen3_vl.py | 3 +-- .../qwen3_vl_moe/configuration_qwen3_vl_moe.py | 3 +-- .../qwen3_vl_moe/modular_qwen3_vl_moe.py | 3 +-- .../configuration_recurrent_gemma.py | 3 +-- .../models/seed_oss/configuration_seed_oss.py | 3 +-- 
.../models/smollm3/configuration_smollm3.py | 3 +-- .../models/smollm3/modular_smollm3.py | 3 +-- .../models/stablelm/configuration_stablelm.py | 3 +-- .../starcoder2/configuration_starcoder2.py | 3 +-- .../models/t5gemma/configuration_t5gemma.py | 3 +-- .../vaultgemma/configuration_vaultgemma.py | 3 +-- .../models/zamba2/configuration_zamba2.py | 3 +-- 117 files changed, 151 insertions(+), 266 deletions(-) diff --git a/src/transformers/models/apertus/configuration_apertus.py b/src/transformers/models/apertus/configuration_apertus.py index 5ca362817b36..cc7e14a22330 100644 --- a/src/transformers/models/apertus/configuration_apertus.py +++ b/src/transformers/models/apertus/configuration_apertus.py @@ -167,8 +167,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 12000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 12000000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/apertus/modular_apertus.py b/src/transformers/models/apertus/modular_apertus.py index 6f8c03854d8d..f5e81aef1a9f 100644 --- a/src/transformers/models/apertus/modular_apertus.py +++ b/src/transformers/models/apertus/modular_apertus.py @@ -180,8 +180,7 @@ def __init__( del self.head_dim # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 12000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 12000000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/arcee/configuration_arcee.py b/src/transformers/models/arcee/configuration_arcee.py index 79163a73e190..5b8855ec15c7 100644 --- 
a/src/transformers/models/arcee/configuration_arcee.py +++ b/src/transformers/models/arcee/configuration_arcee.py @@ -169,8 +169,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py index fe929b0d3a50..f451798233a0 100644 --- a/src/transformers/models/aria/configuration_aria.py +++ b/src/transformers/models/aria/configuration_aria.py @@ -174,8 +174,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/bamba/configuration_bamba.py b/src/transformers/models/bamba/configuration_bamba.py index a460042a139b..7057bcc7bd52 100644 --- a/src/transformers/models/bamba/configuration_bamba.py +++ b/src/transformers/models/bamba/configuration_bamba.py @@ -179,8 +179,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = 0.5 # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) mamba_intermediate = mamba_expand * hidden_size 
diff --git a/src/transformers/models/bitnet/configuration_bitnet.py b/src/transformers/models/bitnet/configuration_bitnet.py index d60aa27e7c36..e6ff072a8d72 100644 --- a/src/transformers/models/bitnet/configuration_bitnet.py +++ b/src/transformers/models/bitnet/configuration_bitnet.py @@ -144,8 +144,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/blt/configuration_blt.py b/src/transformers/models/blt/configuration_blt.py index ccfb6b6a2f80..c4c15674f875 100644 --- a/src/transformers/models/blt/configuration_blt.py +++ b/src/transformers/models/blt/configuration_blt.py @@ -71,8 +71,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error @@ -127,8 +126,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter 
error @@ -175,8 +173,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error @@ -257,8 +254,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error @@ -387,8 +383,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) # Cross attention configurations diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index 910bad258f67..7e02e7ced92b 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -236,8 +236,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in 
self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) if vq_config is None: diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py index 0c1f1f1e47ba..8399b3b56622 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -171,8 +171,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/cohere2/configuration_cohere2.py b/src/transformers/models/cohere2/configuration_cohere2.py index eeb640aa5838..97914120f55e 100644 --- a/src/transformers/models/cohere2/configuration_cohere2.py +++ b/src/transformers/models/cohere2/configuration_cohere2.py @@ -194,8 +194,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index 513ec5f4c960..7717db42e8f7 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ b/src/transformers/models/cohere2/modular_cohere2.py @@ -217,8 +217,7 @@ def __init__( 
layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index f0f1a8bf07a3..19da4f31405c 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -167,8 +167,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) @@ -355,8 +354,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/cwm/configuration_cwm.py b/src/transformers/models/cwm/configuration_cwm.py index 3c33a3b1a16b..6c9b256ab55d 100644 --- a/src/transformers/models/cwm/configuration_cwm.py +++ b/src/transformers/models/cwm/configuration_cwm.py @@ -183,8 +183,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in 
self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1_000_000.0)) rope_config_standardize_and_validate(self) super().__init__( @@ -195,5 +194,8 @@ def __init__( **kwargs, ) + # Validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1_000_000.0)) + __all__ = ["CwmConfig"] diff --git a/src/transformers/models/cwm/modular_cwm.py b/src/transformers/models/cwm/modular_cwm.py index b9e0ab1b1f39..88a5e2708976 100644 --- a/src/transformers/models/cwm/modular_cwm.py +++ b/src/transformers/models/cwm/modular_cwm.py @@ -183,8 +183,7 @@ def __init__( del self.attention_bias # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1_000_000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py index 5f0ffb4c6dfd..2b3e842da4a0 100644 --- a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py @@ -215,8 +215,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py
b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py index d003a5cb16a0..cc3983f81286 100644 --- a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py @@ -231,8 +231,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) for key in ["beta_fast", "beta_slow", "factor"]: if key in self.rope_parameters: diff --git a/src/transformers/models/dia/configuration_dia.py b/src/transformers/models/dia/configuration_dia.py index cd569fd50abb..c518999a2dac 100644 --- a/src/transformers/models/dia/configuration_dia.py +++ b/src/transformers/models/dia/configuration_dia.py @@ -98,8 +98,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__(**kwargs) @@ -205,8 +204,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) diff --git a/src/transformers/models/diffllama/configuration_diffllama.py 
b/src/transformers/models/diffllama/configuration_diffllama.py index a17b4d056a03..ba11f29847fd 100644 --- a/src/transformers/models/diffllama/configuration_diffllama.py +++ b/src/transformers/models/diffllama/configuration_diffllama.py @@ -151,8 +151,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/doge/configuration_doge.py b/src/transformers/models/doge/configuration_doge.py index b5f430f66e5b..a82c2c2c0b03 100644 --- a/src/transformers/models/doge/configuration_doge.py +++ b/src/transformers/models/doge/configuration_doge.py @@ -195,8 +195,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) # for backward compatibility diff --git a/src/transformers/models/doge/modular_doge.py b/src/transformers/models/doge/modular_doge.py index 9593b38cca7c..e92e0af535ad 100644 --- a/src/transformers/models/doge/modular_doge.py +++ b/src/transformers/models/doge/modular_doge.py @@ -224,8 +224,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + 
self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) # for backward compatibility diff --git a/src/transformers/models/dots1/configuration_dots1.py b/src/transformers/models/dots1/configuration_dots1.py index d66682ce31f7..8e1e4530246f 100644 --- a/src/transformers/models/dots1/configuration_dots1.py +++ b/src/transformers/models/dots1/configuration_dots1.py @@ -208,8 +208,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/efficientloftr/configuration_efficientloftr.py b/src/transformers/models/efficientloftr/configuration_efficientloftr.py index 2e9a072a96b6..8c5604a85ebb 100644 --- a/src/transformers/models/efficientloftr/configuration_efficientloftr.py +++ b/src/transformers/models/efficientloftr/configuration_efficientloftr.py @@ -179,8 +179,7 @@ def __init__( rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} or {} self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 4.0) - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) # Standardize and validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/emu3/configuration_emu3.py b/src/transformers/models/emu3/configuration_emu3.py index cacd6d791b83..84d2d12495a9 100644 --- 
a/src/transformers/models/emu3/configuration_emu3.py +++ b/src/transformers/models/emu3/configuration_emu3.py @@ -232,8 +232,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/ernie4_5/configuration_ernie4_5.py b/src/transformers/models/ernie4_5/configuration_ernie4_5.py index 0624c2b87771..6d6236868b79 100644 --- a/src/transformers/models/ernie4_5/configuration_ernie4_5.py +++ b/src/transformers/models/ernie4_5/configuration_ernie4_5.py @@ -154,8 +154,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py index e2e1f47a1e87..e5f453f6db9c 100644 --- a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +++ b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py @@ -187,8 +187,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", 
kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) # MoE arguments diff --git a/src/transformers/models/evolla/configuration_evolla.py b/src/transformers/models/evolla/configuration_evolla.py index 915eebf8d884..0a975073e038 100644 --- a/src/transformers/models/evolla/configuration_evolla.py +++ b/src/transformers/models/evolla/configuration_evolla.py @@ -255,8 +255,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) # Subconfig diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py index 3407ad51dd20..9c75cb848967 100644 --- a/src/transformers/models/exaone4/configuration_exaone4.py +++ b/src/transformers/models/exaone4/configuration_exaone4.py @@ -184,8 +184,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index af086d9f942c..72a683d2b9f0 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -217,8 +217,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if 
"rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py index 9aee6d4b6246..052c10f69032 100644 --- a/src/transformers/models/falcon/configuration_falcon.py +++ b/src/transformers/models/falcon/configuration_falcon.py @@ -168,8 +168,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/falcon_h1/configuration_falcon_h1.py b/src/transformers/models/falcon_h1/configuration_falcon_h1.py index b652ec363e17..76cd76d76a45 100644 --- a/src/transformers/models/falcon_h1/configuration_falcon_h1.py +++ b/src/transformers/models/falcon_h1/configuration_falcon_h1.py @@ -203,8 +203,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) self.projectors_bias = projectors_bias diff --git a/src/transformers/models/flex_olmo/configuration_flex_olmo.py b/src/transformers/models/flex_olmo/configuration_flex_olmo.py index af1e14c73d63..10c21c3823eb 
100644 --- a/src/transformers/models/flex_olmo/configuration_flex_olmo.py +++ b/src/transformers/models/flex_olmo/configuration_flex_olmo.py @@ -181,8 +181,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( @@ -193,5 +192,8 @@ def __init__( **kwargs, ) + # Validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) + __all__ = ["FlexOlmoConfig"] diff --git a/src/transformers/models/flex_olmo/modular_flex_olmo.py b/src/transformers/models/flex_olmo/modular_flex_olmo.py index e445049bcf2d..4960263040ed 100644 --- a/src/transformers/models/flex_olmo/modular_flex_olmo.py +++ b/src/transformers/models/flex_olmo/modular_flex_olmo.py @@ -192,8 +192,7 @@ def __init__( del self.clip_qkv # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index db1665e5ab42..0fcff0f8fd51 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -177,8 +177,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - 
self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 25000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 25000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index ab55ce52b435..2ee5489b982e 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -158,8 +158,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index c707a7afb874..b9f6d67aef5d 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -186,8 +186,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py index c93f19a11bd3..c197f85d9744 100644 --- a/src/transformers/models/gemma2/configuration_gemma2.py +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -192,8 +192,7 @@ def __init__( layer_type_validation(self.layer_types, 
self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index ad9265356f24..14a89c323cb8 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -220,8 +220,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index 0b337efb8e44..fdabac513e1c 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -196,8 +196,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: self.rope_parameters["full_attention"].update(rope_scaling) - self.rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) - self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("rope_local_base_freq", 10000.0) + self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("rope_theta", 1_000_000.0)) + self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("rope_local_base_freq", 10000.0)) 
self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 312ef6449d10..fd18a92a071a 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -211,8 +211,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: self.rope_parameters["full_attention"].update(rope_scaling) - self.rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("rope_theta", 1_000_000.0) - self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("rope_local_base_freq", 10000.0) + self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("rope_theta", 1_000_000.0)) + self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("rope_local_base_freq", 10000.0)) self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index 1db2b83b698f..43d30f7af8a5 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -244,8 +244,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: self.rope_parameters["full_attention"].update(rope_scaling) - self.rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) - self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) + self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("rope_theta", 1_000_000.0)) + 
self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("rope_local_base_freq", 10000.0)) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index ea7d50422d6a..5b982d93a314 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -255,8 +255,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: self.rope_parameters["full_attention"].update(rope_scaling) - self.rope_parameters["full_attention"]["rope_theta"] = kwargs.get("rope_theta", 1_000_000.0) - self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.get("rope_local_base_freq", 10000.0) + self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("rope_theta", 1_000_000.0)) + self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("rope_local_base_freq", 10000.0)) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 69c7cffc3a59..61054d1975d8 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -147,8 +147,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( 
diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index 608084b2d816..21933cddf48b 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -147,8 +147,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py index 7a996815bd40..0861d319426b 100644 --- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py +++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py @@ -185,8 +185,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) # MoE arguments diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py index b9575ed2f9c4..d38460ca212f 100644 --- a/src/transformers/models/glm4_moe/modular_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py @@ -199,8 +199,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: 
- self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) # MoE arguments diff --git a/src/transformers/models/glm4v/configuration_glm4v.py b/src/transformers/models/glm4v/configuration_glm4v.py index e5e3803a24db..bc10d3118de5 100644 --- a/src/transformers/models/glm4v/configuration_glm4v.py +++ b/src/transformers/models/glm4v/configuration_glm4v.py @@ -238,8 +238,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 22aa877a0ee5..06c6347b165f 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -275,8 +275,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index 9a21dbffe089..9f0e4551dbcf 100644 --- 
a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -274,8 +274,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) # MoE arguments diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index eb95466d15ad..5bb557b51065 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -220,8 +220,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) # MoE arguments diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 7e5e1daea09e..d727294b5ff0 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -154,8 +154,7 @@ def __init__( self.attention_bias = attention_bias # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rotary_emb_base", 10000.0) + self.rope_parameters.setdefault("rope_theta", 
kwargs.pop("rotary_emb_base", 10000.0)) rope_config_standardize_and_validate(self) if self.hidden_size % self.num_attention_heads != 0: raise ValueError( diff --git a/src/transformers/models/gpt_neox/modular_gpt_neox.py b/src/transformers/models/gpt_neox/modular_gpt_neox.py index a2baca515668..f31575b052d2 100644 --- a/src/transformers/models/gpt_neox/modular_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modular_gpt_neox.py @@ -146,7 +146,8 @@ def __init__(self, config, layer_idx=None): self.config = config self.head_size = config.hidden_size // config.num_attention_heads self.attention_dropout = config.attention_dropout - self.rotary_ndims = int(self.head_size * config.rotary_pct) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) + self.rotary_ndims = int(self.head_size * partial_rotary_factor) self.scaling = self.head_size**-0.5 self.is_causal = True self.layer_idx = layer_idx diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index b94f1b72322f..9fced0e6fa04 100644 --- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -121,8 +121,7 @@ def __init__( self.hidden_dropout = hidden_dropout # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rotary_emb_base", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", 10000.0)) rope_config_standardize_and_validate(self) super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/gpt_oss/configuration_gpt_oss.py b/src/transformers/models/gpt_oss/configuration_gpt_oss.py index d6782bf8fec5..c39f0f6b1e54 100644 --- 
a/src/transformers/models/gpt_oss/configuration_gpt_oss.py +++ b/src/transformers/models/gpt_oss/configuration_gpt_oss.py @@ -115,8 +115,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 150000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 150000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/granite/configuration_granite.py b/src/transformers/models/granite/configuration_granite.py index 6c8c15c9f267..0000113cc506 100644 --- a/src/transformers/models/granite/configuration_granite.py +++ b/src/transformers/models/granite/configuration_granite.py @@ -181,8 +181,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/granitemoe/configuration_granitemoe.py b/src/transformers/models/granitemoe/configuration_granitemoe.py index bcfbd783e67e..100aacc4ae37 100644 --- a/src/transformers/models/granitemoe/configuration_granitemoe.py +++ b/src/transformers/models/granitemoe/configuration_granitemoe.py @@ -165,8 +165,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 
10000.0)) rope_config_standardize_and_validate(self) self.attention_bias = attention_bias diff --git a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py index f6be724a4f4c..5546493b1bec 100644 --- a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py @@ -204,8 +204,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) mamba_intermediate = mamba_expand * hidden_size diff --git a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py index 652e5c7a6573..bce50e9c161f 100644 --- a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py @@ -170,8 +170,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) self.attention_bias = attention_bias diff --git a/src/transformers/models/helium/configuration_helium.py b/src/transformers/models/helium/configuration_helium.py index 3f0fb130ffd4..03ca67855307 100644 --- a/src/transformers/models/helium/configuration_helium.py +++ 
b/src/transformers/models/helium/configuration_helium.py @@ -153,8 +153,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 100000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 100000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py index 1917723c5343..5549a6f1fdf7 100644 --- a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +++ b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py @@ -147,8 +147,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) # TODO needs model-specific validation? 
super().__init__( diff --git a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py index fc7410a61055..6b995ffcb8ea 100644 --- a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +++ b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py @@ -163,8 +163,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py index e219e2b3ec16..226b5154bd0a 100644 --- a/src/transformers/models/jetmoe/configuration_jetmoe.py +++ b/src/transformers/models/jetmoe/configuration_jetmoe.py @@ -153,8 +153,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py index 33968484b72f..9eb2d20320bb 100644 --- a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py @@ -189,8 +189,7 @@ def __init__( 
self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index b896d8b89857..1dc0a40d7e15 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -177,8 +177,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/llama4/configuration_llama4.py b/src/transformers/models/llama4/configuration_llama4.py index 78f02b853179..5d16358beeb8 100644 --- a/src/transformers/models/llama4/configuration_llama4.py +++ b/src/transformers/models/llama4/configuration_llama4.py @@ -130,8 +130,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__(**kwargs) @@ -355,8 +354,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness 
of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/longcat_flash/configuration_longcat_flash.py b/src/transformers/models/longcat_flash/configuration_longcat_flash.py index 6d527503cd20..307faae51354 100644 --- a/src/transformers/models/longcat_flash/configuration_longcat_flash.py +++ b/src/transformers/models/longcat_flash/configuration_longcat_flash.py @@ -216,8 +216,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000000.0)) for key in ["beta_fast", "beta_slow", "factor"]: if key in self.rope_parameters: diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py index 0a07834f5412..46263fce9f1d 100644 --- a/src/transformers/models/mimi/configuration_mimi.py +++ b/src/transformers/models/mimi/configuration_mimi.py @@ -227,8 +227,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) # Handle backward compatibility for frame_rate: diff --git a/src/transformers/models/minimax/configuration_minimax.py b/src/transformers/models/minimax/configuration_minimax.py index 
7b569f2daf41..750b98acc81b 100644 --- a/src/transformers/models/minimax/configuration_minimax.py +++ b/src/transformers/models/minimax/configuration_minimax.py @@ -233,8 +233,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index 7aa2362dcbf3..0072cc7d9ea6 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -258,8 +258,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) rope_config_standardize_and_validate(self) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/ministral/configuration_ministral.py b/src/transformers/models/ministral/configuration_ministral.py index 2e327556430d..c3ad12827dbf 100644 --- a/src/transformers/models/ministral/configuration_ministral.py +++ b/src/transformers/models/ministral/configuration_ministral.py @@ -168,8 +168,7 @@ def __init__( ] * num_hidden_layers # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) 
rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/ministral/modular_ministral.py b/src/transformers/models/ministral/modular_ministral.py index 77c918b25511..e67260f12eea 100644 --- a/src/transformers/models/ministral/modular_ministral.py +++ b/src/transformers/models/ministral/modular_ministral.py @@ -170,8 +170,7 @@ def __init__( ] * num_hidden_layers # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/mistral/configuration_mistral.py b/src/transformers/models/mistral/configuration_mistral.py index fec1d79e7b81..454f72c6a0d7 100644 --- a/src/transformers/models/mistral/configuration_mistral.py +++ b/src/transformers/models/mistral/configuration_mistral.py @@ -172,8 +172,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index 5d082f2ebec0..2922f316eb6f 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -192,8 +192,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = 
kwargs.pop("rope_theta", 1000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index f6cc53b333cb..c89085162254 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ b/src/transformers/models/mllama/configuration_mllama.py @@ -253,8 +253,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 500000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index 3dff3c8bd464..40a4d66896fa 100644 --- a/src/transformers/models/modernbert/configuration_modernbert.py +++ b/src/transformers/models/modernbert/configuration_modernbert.py @@ -226,8 +226,8 @@ def __init__( if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: self.rope_parameters["full_attention"].update(rope_scaling) self.rope_parameters["sliding_attention"].update(rope_scaling) - self.rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) - self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) + self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("global_rope_theta", 160_000.0)) + self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("local_rope_theta", 10000.0)) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) diff --git 
a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index 26b056174db6..3c280f44b14a 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -254,8 +254,8 @@ def __init__( if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: self.rope_parameters["full_attention"].update(rope_scaling) self.rope_parameters["sliding_attention"].update(rope_scaling) - self.rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) - self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) + self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("global_rope_theta", 160_000.0)) + self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("local_rope_theta", 10000.0)) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py index 1214b1a7f74a..b693ec38a161 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -203,8 +203,8 @@ def __init__( if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: self.rope_parameters["full_attention"].update(rope_scaling) self.rope_parameters["sliding_attention"].update(rope_scaling) - self.rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) - self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) + self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("global_rope_theta", 160_000.0)) + 
self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("local_rope_theta", 10000.0)) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index 89b32f7a7b00..b137b7383314 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -224,8 +224,8 @@ def __init__( if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: self.rope_parameters["full_attention"].update(rope_scaling) self.rope_parameters["sliding_attention"].update(rope_scaling) - self.rope_parameters["full_attention"]["rope_theta"] = kwargs.pop("global_rope_theta", 160_000.0) - self.rope_parameters["sliding_attention"]["rope_theta"] = kwargs.pop("local_rope_theta", 10000.0) + self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("global_rope_theta", 160_000.0)) + self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("local_rope_theta", 10000.0)) # Validate the correctness of rotary position embeddings parameters rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index e272b4d696ac..674ee14d04f1 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ b/src/transformers/models/moonshine/configuration_moonshine.py @@ -181,8 +181,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + 
self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index f77f743e36dc..5b62c5c53487 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -204,8 +204,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.9) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index ecae0932799e..c96df98ec41d 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -288,8 +288,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) audio_encoder_config = kwargs.pop("audio_encoder_config", {}) diff --git a/src/transformers/models/nemotron/configuration_nemotron.py b/src/transformers/models/nemotron/configuration_nemotron.py index 82315fdf903a..e3dc526b9cb8 100644 --- a/src/transformers/models/nemotron/configuration_nemotron.py +++ 
b/src/transformers/models/nemotron/configuration_nemotron.py @@ -149,8 +149,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/olmo/configuration_olmo.py b/src/transformers/models/olmo/configuration_olmo.py index 4c8fbf36b3fa..bfab2296711f 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -164,8 +164,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/olmo2/configuration_olmo2.py b/src/transformers/models/olmo2/configuration_olmo2.py index 009bbde32656..686c45417425 100644 --- a/src/transformers/models/olmo2/configuration_olmo2.py +++ b/src/transformers/models/olmo2/configuration_olmo2.py @@ -164,8 +164,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git 
a/src/transformers/models/olmo3/configuration_olmo3.py b/src/transformers/models/olmo3/configuration_olmo3.py index 4108a8f86a93..9d5f0d1e08ac 100644 --- a/src/transformers/models/olmo3/configuration_olmo3.py +++ b/src/transformers/models/olmo3/configuration_olmo3.py @@ -182,8 +182,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/olmo3/modular_olmo3.py b/src/transformers/models/olmo3/modular_olmo3.py index c7a0f14d4a10..04e0d67e3012 100644 --- a/src/transformers/models/olmo3/modular_olmo3.py +++ b/src/transformers/models/olmo3/modular_olmo3.py @@ -198,8 +198,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/olmoe/configuration_olmoe.py b/src/transformers/models/olmoe/configuration_olmoe.py index 93f304c21bcd..4b0c9537deb8 100644 --- a/src/transformers/models/olmoe/configuration_olmoe.py +++ b/src/transformers/models/olmoe/configuration_olmoe.py @@ -164,8 +164,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 
10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py index c6e51d9333ea..4adf6a40f884 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -125,8 +125,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 25000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 25000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index fc32bbe3c0f1..a8f17712aa51 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -167,8 +167,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index 6a9ba07eed7f..c4f16246debf 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -170,8 +170,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 1.0) # Validate the correctness of rotary position 
embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) self._rope_parameters_adjustment() self._rope_parameters_validation() diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index 03fb290895d1..38a247b87f8a 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -411,8 +411,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 1.0) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) self._rope_parameters_adjustment() self._rope_parameters_validation() diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py index c5c673cf68c2..131e3db0fc92 100644 --- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py @@ -333,8 +333,6 @@ class Phi4MultimodalConfig(Phi3Config): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to `1.0`): - Percentage of the query and keys which will have rotary embedding. 
Must be between 0.0 and 1.0. bos_token_id (`int`, *optional*, defaults to 199999): The id of the "beginning-of-sequence" token. eos_token_id (`int` or `list[int]`, *optional*, defaults to `[199999, 200020]`): @@ -390,7 +388,6 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[int] = 1, bos_token_id: Optional[int] = 199999, eos_token_id: Optional[list[int]] = [199999, 200020], pad_token_id: Optional[int] = 199999, @@ -429,7 +426,6 @@ def __init__( use_cache=use_cache, tie_word_embeddings=tie_word_embeddings, rope_parameters=rope_parameters, - partial_rotary_factor=partial_rotary_factor, bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, diff --git a/src/transformers/models/phimoe/configuration_phimoe.py b/src/transformers/models/phimoe/configuration_phimoe.py index 0be33b25d0b7..3a1bd9a29173 100644 --- a/src/transformers/models/phimoe/configuration_phimoe.py +++ b/src/transformers/models/phimoe/configuration_phimoe.py @@ -173,8 +173,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) rope_config_standardize_and_validate(self) if self.rope_parameters["rope_type"] != "default": diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index 19c70b4ac495..f040f6de9d94 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -109,8 +109,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None 
else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/qwen2/configuration_qwen2.py b/src/transformers/models/qwen2/configuration_qwen2.py index 09e7bc39c07a..203b88d0dd44 100644 --- a/src/transformers/models/qwen2/configuration_qwen2.py +++ b/src/transformers/models/qwen2/configuration_qwen2.py @@ -173,8 +173,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index 9aa70f6cb118..e4bb8c4bbd5a 100644 --- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -370,8 +370,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) @@ -721,8 +720,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" 
not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) rope_config_standardize_and_validate(self) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -830,8 +828,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index 8fa4e2039f8e..349f23c39e6b 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -403,8 +403,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) @@ -754,8 +753,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) rope_config_standardize_and_validate(self) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ 
-863,8 +861,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index f37600b50395..8c0c4635299f 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -218,8 +218,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) rope_type = self.rope_parameters.get("type") or self.rope_parameters.get("rope_type") if rope_type == "mrope": self.rope_parameters["rope_type"] = "default" diff --git a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py index 47aef719e6cd..91c1c783b5d1 100644 --- a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py @@ -211,8 +211,7 @@ def __init__( layer_type_validation(self.layer_types) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) 
super().__init__( diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 88632e7d93df..935311f87ea3 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -206,8 +206,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) rope_type = self.rope_parameters.get("type") or self.rope_parameters.get("rope_type") if rope_type == "mrope": self.rope_parameters["rope_type"] = "default" diff --git a/src/transformers/models/qwen3/configuration_qwen3.py b/src/transformers/models/qwen3/configuration_qwen3.py index a8b496ee12cd..5827c36d6bd6 100644 --- a/src/transformers/models/qwen3/configuration_qwen3.py +++ b/src/transformers/models/qwen3/configuration_qwen3.py @@ -181,8 +181,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py index 545a8b95afe7..5f30d772e7cf 100644 --- a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py @@ -185,8 +185,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the 
correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) # MoE arguments diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py b/src/transformers/models/qwen3_next/configuration_qwen3_next.py index 6da45d00d86b..f6f307a2d167 100644 --- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py +++ b/src/transformers/models/qwen3_next/configuration_qwen3_next.py @@ -215,8 +215,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) # linear attention part diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index 4b3c68e56c7d..df685698175b 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -330,9 +330,8 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) + rope_config_standardize_and_validate(self) # MoE arguments 
self.decoder_sparse_step = decoder_sparse_step @@ -349,6 +348,10 @@ def __init__( **kwargs, ) + # Validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) + class Qwen3OmniMoeThinkerConfig(PreTrainedConfig): r""" @@ -605,8 +608,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( @@ -775,8 +777,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) # MoE arguments @@ -1045,8 +1046,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) @property diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index e2e3070b560b..f1289d9ac6c9 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ 
b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -217,8 +217,7 @@ def __init__( self.sliding_window = sliding_window # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 1000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) @@ -678,8 +677,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) @property diff --git a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py index 79e4e35a1718..d21b055af6a4 100644 --- a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py @@ -176,8 +176,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 5000000.0)) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index aef00350ce60..8d93dc5612a9 100644 --- 
a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -217,8 +217,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 5000000.0)) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py index 3684f5f3c255..bd642ff3c3f9 100644 --- a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py @@ -172,8 +172,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 5000000.0)) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) # MoE arguments diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py index e3473387302c..2469b533ed09 100644 --- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py @@ -194,8 +194,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if 
"rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 5000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 5000000.0)) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) # MoE arguments diff --git a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py index 7331fa0d6494..1c323dfc156c 100644 --- a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py @@ -153,8 +153,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/seed_oss/configuration_seed_oss.py b/src/transformers/models/seed_oss/configuration_seed_oss.py index 0662f18fb6f7..9a50e32ab3a2 100644 --- a/src/transformers/models/seed_oss/configuration_seed_oss.py +++ b/src/transformers/models/seed_oss/configuration_seed_oss.py @@ -176,8 +176,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/smollm3/configuration_smollm3.py 
b/src/transformers/models/smollm3/configuration_smollm3.py index acf91e465ca7..5954062061ed 100644 --- a/src/transformers/models/smollm3/configuration_smollm3.py +++ b/src/transformers/models/smollm3/configuration_smollm3.py @@ -207,8 +207,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 2000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 2000000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/smollm3/modular_smollm3.py b/src/transformers/models/smollm3/modular_smollm3.py index 6d94417705f6..fce5cac404d1 100644 --- a/src/transformers/models/smollm3/modular_smollm3.py +++ b/src/transformers/models/smollm3/modular_smollm3.py @@ -224,8 +224,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 2000000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 2000000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index 7b7875355bd8..d8f54fa7c6c1 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -152,8 +152,7 @@ def __init__( self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.25) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + 
self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/starcoder2/configuration_starcoder2.py b/src/transformers/models/starcoder2/configuration_starcoder2.py index aafe97939c83..ecac62aa734f 100644 --- a/src/transformers/models/starcoder2/configuration_starcoder2.py +++ b/src/transformers/models/starcoder2/configuration_starcoder2.py @@ -161,8 +161,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) super().__init__( diff --git a/src/transformers/models/t5gemma/configuration_t5gemma.py b/src/transformers/models/t5gemma/configuration_t5gemma.py index 43587485d458..b77f4aac69c5 100644 --- a/src/transformers/models/t5gemma/configuration_t5gemma.py +++ b/src/transformers/models/t5gemma/configuration_t5gemma.py @@ -188,8 +188,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/vaultgemma/configuration_vaultgemma.py b/src/transformers/models/vaultgemma/configuration_vaultgemma.py index 947c3e7f20b4..aa5798160ccf 100644 --- a/src/transformers/models/vaultgemma/configuration_vaultgemma.py +++ b/src/transformers/models/vaultgemma/configuration_vaultgemma.py @@ -188,8 +188,7 @@ def __init__( 
layer_type_validation(self.layer_types, self.num_hidden_layers) # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) diff --git a/src/transformers/models/zamba2/configuration_zamba2.py b/src/transformers/models/zamba2/configuration_zamba2.py index e162cea0fcd0..b39837785805 100644 --- a/src/transformers/models/zamba2/configuration_zamba2.py +++ b/src/transformers/models/zamba2/configuration_zamba2.py @@ -201,8 +201,7 @@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - if "rope_theta" not in self.rope_parameters: - self.rope_parameters["rope_theta"] = kwargs.pop("rope_theta", 10000.0) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) rope_config_standardize_and_validate(self) self.mamba_d_state = mamba_d_state From 6f4ed17f89362e417ee11a5cbcbe9f5927a4c84d Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 27 Nov 2025 10:21:48 +0100 Subject: [PATCH 10/23] fix copies --- .../models/apertus/modular_apertus.py | 59 +++--- .../flex_olmo/configuration_flex_olmo.py | 6 +- .../models/flex_olmo/modular_flex_olmo.py | 63 ++++--- .../configuration_qwen3_omni_moe.py | 117 +----------- .../qwen3_omni_moe/modular_qwen3_omni_moe.py | 173 +++++++++++++++--- 5 files changed, 219 insertions(+), 199 deletions(-) diff --git a/src/transformers/models/apertus/modular_apertus.py b/src/transformers/models/apertus/modular_apertus.py index f5e81aef1a9f..5fb42df98531 100644 --- a/src/transformers/models/apertus/modular_apertus.py +++ b/src/transformers/models/apertus/modular_apertus.py @@ -20,11 +20,11 @@ from torch import nn from ...cache_utils import Cache +from ...configuration_utils import 
PreTrainedConfig from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging -from ..llama.configuration_llama import LlamaConfig from ..llama.modeling_llama import ( LlamaAttention, LlamaDecoderLayer, @@ -43,7 +43,7 @@ logger = logging.get_logger(__name__) -class ApertusConfig(LlamaConfig): +class ApertusConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`ApertusModel`]. It is used to instantiate a Apertus model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -116,6 +116,7 @@ class ApertusConfig(LlamaConfig): ```""" model_type = "apertus" + keys_to_ignore_at_inference = ["past_key_values"] base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k "layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k @@ -124,6 +125,11 @@ class ApertusConfig(LlamaConfig): "layers.*.mlp.up_proj": "colwise", "layers.*.mlp.down_proj": "rowwise", } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } def __init__( self, @@ -154,34 +160,41 @@ def __init__( attention_dropout: Optional[float] = 0.0, **kwargs, ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = 
hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} + + # Validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 12000000.0)) + rope_config_standardize_and_validate(self) + super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - rms_norm_eps=rms_norm_eps, - use_cache=use_cache, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, - rope_parameters=rope_parameters, - attention_bias=attention_bias, - attention_dropout=attention_dropout, **kwargs, ) - del self.pretraining_tp - del self.mlp_bias - del self.head_dim - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 12000000.0)) - rope_config_standardize_and_validate(self) class ApertusMLP(NemotronMLP): diff --git a/src/transformers/models/flex_olmo/configuration_flex_olmo.py b/src/transformers/models/flex_olmo/configuration_flex_olmo.py index 10c21c3823eb..6aa2b28ac27a 100644 --- a/src/transformers/models/flex_olmo/configuration_flex_olmo.py +++ b/src/transformers/models/flex_olmo/configuration_flex_olmo.py @@ -175,13 +175,14 @@ def __init__( self.output_router_logits = 
output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.norm_topk_prob = norm_topk_prob + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) rope_config_standardize_and_validate(self) super().__init__( @@ -192,8 +193,5 @@ def __init__( **kwargs, ) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - __all__ = ["FlexOlmoConfig"] diff --git a/src/transformers/models/flex_olmo/modular_flex_olmo.py b/src/transformers/models/flex_olmo/modular_flex_olmo.py index 4960263040ed..4777802972a8 100644 --- a/src/transformers/models/flex_olmo/modular_flex_olmo.py +++ b/src/transformers/models/flex_olmo/modular_flex_olmo.py @@ -19,6 +19,7 @@ from torch import nn from ...cache_utils import Cache, DynamicCache +from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask from ...modeling_outputs import MoeModelOutputWithPast from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate @@ -27,7 +28,6 @@ from ...utils.generic import OutputRecorder, check_model_inputs from ..mixtral.modeling_mixtral import MixtralModel, MixtralPreTrainedModel from ..olmo2.modeling_olmo2 import Olmo2Attention, Olmo2RMSNorm, Olmo2RotaryEmbedding -from ..olmoe.configuration_olmoe import OlmoeConfig from ..olmoe.modeling_olmoe import ( OlmoeDecoderLayer, OlmoeForCausalLM, @@ -36,7 +36,7 @@ ) -class FlexOlmoConfig(OlmoeConfig): +class FlexOlmoConfig(PreTrainedConfig): r""" This is the 
configuration class to store the configuration of a [`FlexOlmoModel`]. It is used to instantiate an FlexOlmo model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -120,6 +120,7 @@ class FlexOlmoConfig(OlmoeConfig): model_type = "flex_olmo" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_local_experts": "num_experts"} base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k "layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k @@ -162,26 +163,40 @@ def __init__( norm_topk_prob: Optional[bool] = False, **kwargs, ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.num_experts_per_tok = num_experts_per_tok + self.num_experts = num_experts + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + self.norm_topk_prob = norm_topk_prob + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} + + # Validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", 
kwargs.pop("rope_theta", 500000.0)) + rope_config_standardize_and_validate(self) + super().__init__( - vocab_size=vocab_size, - max_position_embeddings=max_position_embeddings, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - hidden_act=hidden_act, - initializer_range=initializer_range, - rms_norm_eps=rms_norm_eps, - use_cache=use_cache, - rope_parameters=rope_parameters, - attention_bias=attention_bias, - attention_dropout=attention_dropout, - num_experts_per_tok=num_experts_per_tok, - num_experts=num_experts, - output_router_logits=output_router_logits, - router_aux_loss_coef=router_aux_loss_coef, - norm_topk_prob=norm_topk_prob, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, @@ -189,12 +204,6 @@ def __init__( **kwargs, ) - del self.clip_qkv - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) - # FlexOlmo RMS norm reuses Olmo2 RMS norm, which handles low precision slightly differently than the original Olmoe. class FlexOlmoRMSNorm(Olmo2RMSNorm): diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index df685698175b..2a57746fc71e 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -172,115 +172,6 @@ def __init__( class Qwen3OmniMoeTextConfig(PreTrainedConfig): - r""" - This is the configuration class to store the configuration of a [`Qwen3OmniMoeTextModel`]. It is used to instantiate a - Qwen3OmniMoeText model according to the specified arguments, defining the model architecture. 
Instantiating a configuration - with the defaults will yield a similar configuration to that of [Qwen/Qwen3-15B-A2B](https://huggingface.co/Qwen/Qwen3-15B-A2B). - - Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the - documentation from [`PreTrainedConfig`] for more information. - - - Args: - vocab_size (`int`, *optional*, defaults to 151936): - Vocabulary size of the Qwen3OmniMoeText model. Defines the number of different tokens that can be represented by the - `inputs_ids` passed when calling [`Qwen3OmniMoeTextModel`] - hidden_size (`int`, *optional*, defaults to 2048): - Dimension of the hidden representations. - intermediate_size (`int`, *optional*, defaults to 6144): - Dimension of the MLP representations. - num_hidden_layers (`int`, *optional*, defaults to 24): - Number of hidden layers in the Transformer encoder. - num_attention_heads (`int`, *optional*, defaults to 32): - Number of attention heads for each attention layer in the Transformer encoder. - num_key_value_heads (`int`, *optional*, defaults to 4): - This is the number of key_value heads that should be used to implement Grouped Query Attention. If - `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if - `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When - converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed - by meanpooling all the original heads within that group. For more details, check out [this - paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`. - - hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): - The non-linear activation function (function or string) in the decoder. - max_position_embeddings (`int`, *optional*, defaults to 32768): - The maximum sequence length that this model might ever be used with. 
- initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - rms_norm_eps (`float`, *optional*, defaults to 1e-06): - The epsilon used by the rms normalization layers. - use_cache (`bool`, *optional*, defaults to `True`): - Whether or not the model should return the last key/values attentions (not used by all models). Only - relevant if `config.is_decoder=True`. - tie_word_embeddings (`bool`, *optional*, defaults to `False`): - Whether the model's input and output word embeddings should be tied. - rope_parameters (`RopeParameters`, *optional*): - Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain - a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE - with longer `max_position_embeddings`. - attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): - Whether to use a bias in the query, key, value and output projection layers during self-attention. - use_sliding_window (`bool`, *optional*, defaults to `False`): - Whether to use sliding window attention. - sliding_window (`int`, *optional*, defaults to 4096): - Sliding window attention (SWA) window size. If not specified, will default to `4096`. - attention_dropout (`float`, *optional*, defaults to 0.0): - The dropout ratio for the attention probabilities. - decoder_sparse_step (`int`, *optional*, defaults to 1): - The frequency of the MoE layer. - moe_intermediate_size (`int`, *optional*, defaults to 768): - Intermediate size of the routed expert. - num_experts_per_tok (`int`, *optional*, defaults to 8): - Number of selected experts. - num_experts (`int`, *optional*, defaults to 128): - Number of routed experts. - norm_topk_prob (`bool`, *optional*, defaults to `False`): - Whether to normalize the topk probabilities. 
- output_router_logits (`bool`, *optional*, defaults to `False`): - Whether or not the router logits should be returned by the model. Enabling this will also - allow the model to output the auxiliary loss, including load balancing loss and router z-loss. - router_aux_loss_coef (`float`, *optional*, defaults to 0.001): - The aux loss factor for the total loss. - mlp_only_layers (`list[int]`, *optional*, defaults to `[]`): - Indicate which layers use Qwen3OmniMoeTextMLP rather than Qwen3OmniMoeTextSparseMoeBlock - The list contains layer index, from 0 to num_layers-1 if we have num_layers layers - If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity. - - ```python - >>> from transformers import Qwen3OmniMoeTextModel, Qwen3OmniMoeTextConfig - - >>> # Initializing a Qwen3OmniMoeText style configuration - >>> configuration = Qwen3OmniMoeTextConfig() - - >>> # Initializing a model from the Qwen3-15B-A2B" style configuration - >>> model = Qwen3OmniMoeTextModel(configuration) - - >>> # Accessing the model configuration - >>> configuration = model.config - ```""" - - model_type = "qwen3_omni_moe_text" - keys_to_ignore_at_inference = ["past_key_values"] - - # Default tensor parallel plan for base model `Qwen3OmniMoeText` - base_model_tp_plan = { - "layers.*.self_attn.q_proj": "colwise", - "layers.*.self_attn.k_proj": "colwise", - "layers.*.self_attn.v_proj": "colwise", - "layers.*.self_attn.o_proj": "rowwise", - "layers.*.mlp.experts.gate_up_proj": "local_rowwise", - "layers.*.mlp.experts.down_proj": "local_rowwise", - "layers.*.mlp.experts": "gather", - "layers.*.mlp.gate_proj": "colwise", - "layers.*.mlp.up_proj": "colwise", - "layers.*.mlp.down_proj": "rowwise", - } - base_model_pp_plan = { - "embed_tokens": (["input_ids"], ["inputs_embeds"]), - "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), - "norm": (["hidden_states"], ["hidden_states"]), - } - def __init__( self, vocab_size: Optional[int] = 3584, @@ -330,8 +221,8 
@@ def __init__( self.rope_parameters = rope_parameters if rope_parameters is not None else {} # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) + rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -348,10 +239,6 @@ def __init__( **kwargs, ) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) - class Qwen3OmniMoeThinkerConfig(PreTrainedConfig): r""" diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index f1289d9ac6c9..8af03d507194 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -156,7 +156,116 @@ class Qwen3OmniMoeVisionEncoderConfig(Qwen3VLMoeVisionConfig): pass -class Qwen3OmniMoeTextConfig(Qwen3MoeConfig): +class Qwen3OmniMoeTextConfig(PreTrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen3OmniMoeTextModel`]. It is used to instantiate a + Qwen3OmniMoeText model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of [Qwen/Qwen3-15B-A2B](https://huggingface.co/Qwen/Qwen3-15B-A2B). + + Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PreTrainedConfig`] for more information. 
+ + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen3OmniMoeText model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen3OmniMoeTextModel`] + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 4): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`. + + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. 
+ use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + decoder_sparse_step (`int`, *optional*, defaults to 1): + The frequency of the MoE layer. + moe_intermediate_size (`int`, *optional*, defaults to 768): + Intermediate size of the routed expert. + num_experts_per_tok (`int`, *optional*, defaults to 8): + Number of selected experts. + num_experts (`int`, *optional*, defaults to 128): + Number of routed experts. + norm_topk_prob (`bool`, *optional*, defaults to `False`): + Whether to normalize the topk probabilities. + output_router_logits (`bool`, *optional*, defaults to `False`): + Whether or not the router logits should be returned by the model. Enabling this will also + allow the model to output the auxiliary loss, including load balancing loss and router z-loss. 
+ router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + mlp_only_layers (`list[int]`, *optional*, defaults to `[]`): + Indicate which layers use Qwen3OmniMoeTextMLP rather than Qwen3OmniMoeTextSparseMoeBlock + The list contains layer index, from 0 to num_layers-1 if we have num_layers layers + If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity. + + ```python + >>> from transformers import Qwen3OmniMoeTextModel, Qwen3OmniMoeTextConfig + + >>> # Initializing a Qwen3OmniMoeText style configuration + >>> configuration = Qwen3OmniMoeTextConfig() + + >>> # Initializing a model from the Qwen3-15B-A2B" style configuration + >>> model = Qwen3OmniMoeTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen3_omni_moe_text" + keys_to_ignore_at_inference = ["past_key_values"] + + # Default tensor parallel plan for base model `Qwen3OmniMoeText` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.experts.gate_up_proj": "local_rowwise", + "layers.*.mlp.experts.down_proj": "local_rowwise", + "layers.*.mlp.experts": "gather", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + def __init__( self, vocab_size: Optional[int] = 3584, @@ -185,41 +294,45 @@ def __init__( mlp_only_layers: Optional[list[int]] = None, **kwargs, ): - super().__init__( - vocab_size, - hidden_size, - intermediate_size, - num_hidden_layers, - num_attention_heads, - num_key_value_heads, - hidden_act, - max_position_embeddings, - 
initializer_range, - rms_norm_eps, - use_cache, - tie_word_embeddings, - rope_parameters, - attention_bias, - False, - sliding_window, - attention_dropout, - decoder_sparse_step, - moe_intermediate_size, - num_experts_per_tok, - num_experts, - norm_topk_prob, - output_router_logits, - router_aux_loss_coef, - mlp_only_layers, - **kwargs, - ) - del self.use_sliding_window + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads self.sliding_window = sliding_window + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + # Try to set `rope_scaling` if available, otherwise use `rope_parameters` + rope_scaling = kwargs.pop("rope_scaling", None) + rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters if rope_parameters is not None else {} + # Validate the correctness of rotary position embeddings parameters self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) + # MoE arguments + self.decoder_sparse_step = decoder_sparse_step + self.moe_intermediate_size = moe_intermediate_size + self.num_experts_per_tok = num_experts_per_tok + self.num_experts = num_experts + self.norm_topk_prob = norm_topk_prob + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers + + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + class 
Qwen3OmniMoeThinkerConfig(Qwen2_5OmniThinkerConfig): r""" From b3fa5cfddf2a419331c39b454433382ff5e03e31 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Thu, 27 Nov 2025 16:19:51 +0100 Subject: [PATCH 11/23] Update src/transformers/modeling_rope_utils.py Co-authored-by: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com> --- src/transformers/modeling_rope_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 700edbcffbc9..2978f3bf190f 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -36,7 +36,7 @@ class RopeParameters(TypedDict, total=False): The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation. partial_rotary_factor (`float`, *optional*): - Percentage of the query and keys which will have rotary embedding. + The percentage of the query and key head embedding on which RoPE will be applied. factor (`float`, *optional*): Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. 
In most scaling types, a `factor` of x will enable the model to handle sequences of length x * From b2ca2eb9473577af0d62ed75f7b0804c958fec21 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Thu, 27 Nov 2025 16:23:50 +0100 Subject: [PATCH 12/23] Update src/transformers/models/efficientloftr/configuration_efficientloftr.py Co-authored-by: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com> --- .../models/efficientloftr/configuration_efficientloftr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/efficientloftr/configuration_efficientloftr.py b/src/transformers/models/efficientloftr/configuration_efficientloftr.py index 8c5604a85ebb..e5bf7ad86709 100644 --- a/src/transformers/models/efficientloftr/configuration_efficientloftr.py +++ b/src/transformers/models/efficientloftr/configuration_efficientloftr.py @@ -177,7 +177,7 @@ def __init__( # Try to set `rope_scaling` if available, otherwise use `rope_parameters` rope_scaling = kwargs.pop("rope_scaling", None) rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} or {} + self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 4.0) self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) From 32adaac69894eea89718a8dfb8d9af17e6ffc0ef Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 27 Nov 2025 17:02:22 +0100 Subject: [PATCH 13/23] attempt one --- src/transformers/modeling_rope_utils.py | 650 +++++++++--------- .../models/apertus/configuration_apertus.py | 14 +- 2 files changed, 323 insertions(+), 341 deletions(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 2978f3bf190f..e1b65e39bf5a 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -27,89 +27,6 @@ import torch 
-class RopeParameters(TypedDict, total=False): - """ - Args: - rope_theta (`float`): - The base period of the RoPE embeddings. - rope_type (`str`, *optional*, defaults to "default"): - The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', - 'llama3'], with 'default' being the original RoPE implementation. - partial_rotary_factor (`float`, *optional*): - The percentage of the query and key head embedding on which RoPE will be applied. - factor (`float`, *optional*): - Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In - most scaling types, a `factor` of x will enable the model to handle sequences of length x * - original maximum pre-trained length. - original_max_position_embeddings (`int`, *optional*): - Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during - pretraining. - attention_factor (`float`, *optional*): - Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention - computation. If unspecified, it defaults to value recommended by the implementation, using the - `factor` field to infer the suggested value. - beta_fast (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear - ramp function. If unspecified, it defaults to 32. - beta_slow (`float`, *optional*): - Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear - ramp function. If unspecified, it defaults to 1. - short_factor (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to short contexts (< - `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - long_factor (`list[float]`, *optional*): - Only used with 'longrope'. The scaling factor to be applied to long contexts (< - `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden - size divided by the number of attention heads divided by 2 - low_freq_factor (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE - high_freq_factor (`float`, *optional*): - Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE - """ - - rope_theta: float - rope_type: Optional[str] - partial_rotary_factor: Optional[float] - factor: Optional[float] - original_max_position_embeddings: Optional[int] - attention_factor: Optional[float] - beta_fast: Optional[float] - beta_slow: Optional[float] - short_factor: Optional[list[float]] - long_factor: Optional[list[float]] - low_freq_factor: Optional[float] - high_freq_factor: Optional[float] - - -def get_standardized_rope_params(config): - """ - Helper to standardize the config's rope params field by ensuring the params are defined for each - later type. For old model the fn will duplicate a single rope param in each layer type (backward compatibility) - """ - rope_parameters = getattr(config, "rope_parameters", {}) - - # Move `rope_theta` and `partial_rotary_factor` to the params dict, if not there yet - rope_theta = getattr(config, "rope_theta", None) - partial_rotary_factor = getattr(config, "partial_rotary_factor", None) - - # Case 1: one RoPE theat = one RoPE param per model without nesting - if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): - rope_parameters.setdefault("rope_type", rope_parameters.get("type", "default")) - rope_parameters.setdefault("rope_theta", rope_theta) - if partial_rotary_factor is not None: - rope_parameters["partial_rotary_factor"] = partial_rotary_factor - # Case 2: different RoPE for each layer as nested dict - else: - for layer_type in config.layer_types: - rope_parameters[layer_type].setdefault("rope_type", rope_parameters[layer_type].get("type", "default")) - rope_parameters[layer_type].setdefault("rope_theta", 
rope_theta) - if partial_rotary_factor is not None: - rope_parameters[layer_type]["partial_rotary_factor"] = partial_rotary_factor - - return rope_parameters - - def dynamic_rope_update(rope_forward): """ Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE @@ -240,8 +157,8 @@ def _compute_linear_scaling_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - rope_parameters_dict = get_standardized_rope_params(config) - rope_parameters_dict = rope_parameters_dict[layer_type] if layer_type is not None else rope_parameters_dict + config.standardize_rope_params() + rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters factor = rope_parameters_dict["factor"] # Gets the default RoPE parameters @@ -305,7 +222,7 @@ def _compute_dynamic_ntk_parameters( """ # TODO (joao): use the new `original_max_position_embeddings` from rope_parameters # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - rope_parameters_dict = get_standardized_rope_params(config) + config.standardize_rope_params() rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] @@ -392,7 +309,7 @@ def _compute_yarn_parameters( post-processing scaling factor applied to the computed cos/sin. 
""" # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - rope_parameters_dict = get_standardized_rope_params(config) + config.standardize_rope_params() rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] @@ -522,7 +439,7 @@ def _compute_longrope_parameters( """ # TODO (joao): use the new `original_max_position_embeddings` from rope_parameters # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - rope_parameters_dict = get_standardized_rope_params(config) + config.standardize_rope_params() rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] @@ -606,7 +523,7 @@ def _compute_llama3_parameters( post-processing scaling factor applied to the computed cos/sin. """ # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - rope_parameters_dict = get_standardized_rope_params(config) + config.standardize_rope_params() rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters # Gets the default RoPE parameters @@ -652,266 +569,337 @@ def _compute_llama3_parameters( } -def _check_received_keys( - rope_type: str, - received_keys: set, - required_keys: set, - optional_keys: Optional[set] = None, - ignore_keys: Optional[set] = None, -): - """Compare the received keys in `config.rope_parameters` against the expected and optional keys""" - # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present - if "type" in received_keys: - received_keys -= {"type"} - required_keys.add("rope_type") - - # Some models need to store model-specific keys, and we don't want to throw warning at them - if ignore_keys is not None: - received_keys -= ignore_keys - - missing_keys = required_keys - 
received_keys - if missing_keys: - raise KeyError(f"Missing required keys in `rope_parameters` for 'rope_type'='{rope_type}': {missing_keys}") - - if optional_keys is not None: - unused_keys = received_keys - required_keys - optional_keys - else: - unused_keys = received_keys - required_keys - if unused_keys: - logger.warning(f"Unrecognized keys in `rope_parameters` for 'rope_type'='{rope_type}': {unused_keys}") - - -def _validate_default_rope_parameters( - rope_parameters: dict, config: Optional[PreTrainedConfig] = None, ignore_keys: Optional[set] = None -): - required_keys = {"rope_type", "rope_theta"} - received_keys = set(rope_parameters.keys()) - rope_type = rope_parameters["rope_type"] - _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) - - -def _validate_linear_scaling_rope_parameters( - rope_parameters: dict, config: Optional[PreTrainedConfig] = None, ignore_keys: Optional[set] = None -): - required_keys = {"rope_type", "factor", "rope_theta"} - received_keys = set(rope_parameters.keys()) - rope_type = rope_parameters["rope_type"] - _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) - - factor = rope_parameters["factor"] - if factor is None or not isinstance(factor, float) or factor < 1.0: - logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") - - -def _validate_dynamic_scaling_rope_parameters( - rope_parameters: dict, config: Optional[PreTrainedConfig] = None, ignore_keys: Optional[set] = None -): - # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` - optional_keys = {"original_max_position_embeddings"} - required_keys = {"rope_type", "factor"} - received_keys = set(rope_parameters.keys()) - rope_type = rope_parameters["rope_type"] - _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) - - factor = rope_parameters["factor"] - if factor is None or not isinstance(factor, 
float) or factor < 1.0: - logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") - - -def _validate_yarn_parameters( - rope_parameters: dict, config: Optional[PreTrainedConfig] = None, ignore_keys: Optional[set] = None -): - required_keys = {"rope_type", "factor", "rope_theta"} - optional_keys = { - "attention_factor", - "beta_fast", - "beta_slow", - "original_max_position_embeddings", - "mscale", - "mscale_all_dim", - } - received_keys = set(rope_parameters.keys()) - rope_type = rope_parameters["rope_type"] - _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) - - factor = rope_parameters["factor"] - if factor is None or not isinstance(factor, float) or factor < 1.0: - logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") - - attention_factor = rope_parameters.get("attention_factor") - if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0): - logger.warning( - f"`rope_parameters`'s attention_factor field must be a float greater than 0, got {attention_factor}" - ) - beta_fast = rope_parameters.get("beta_fast") - if beta_fast is not None and not isinstance(beta_fast, float): - logger.warning(f"`rope_parameters`'s beta_fast field must be a float, got {beta_fast}") - beta_slow = rope_parameters.get("beta_slow") - if beta_slow is not None and not isinstance(beta_slow, float): - logger.warning(f"`rope_parameters`'s beta_slow field must be a float, got {beta_slow}") - - if (beta_fast or 32) < (beta_slow or 1): - logger.warning( - f"`rope_parameters`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} " - f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)" - ) +class RopeParameters(TypedDict, total=False): + """ + Args: + rope_theta (`float`): + The base period of the RoPE embeddings. 
+ rope_type (`str`, *optional*, defaults to "default"): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + partial_rotary_factor (`float`, *optional*): + The percentage of the query and key head embedding on which RoPE will be applied. + factor (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + original_max_position_embeddings (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + attention_factor (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + beta_fast (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + beta_slow (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + short_factor (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + long_factor (`list[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). 
Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + low_freq_factor (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + high_freq_factor (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + """ - # Models should set `config.rope_parameters["original_max_position_embeddings"]` to their original (pre-yarn) context - # length, with `config.max_position_embeddings` corresponding to their post-yarn context length. - # However, for BC purposes, we allow the former to be unset. - original_max_position_embeddings = config.rope_parameters.get("original_max_position_embeddings") - if original_max_position_embeddings is not None: - # Double-check: `factor` should be the ratio between the pre-yarn and post-yarn context lengths. - implicit_factor = config.max_position_embeddings / original_max_position_embeddings - if implicit_factor != factor: - logger.warning_once( - f"The explicitly set RoPE scaling factor (config.rope_parameters['factor'] = {factor}) does not match " - "the ratio implicitly set by other parameters (implicit factor = " - "post-yarn context length / pre-yarn context length = " - "config.max_position_embeddings / config.rope_parameters['original_max_position_embeddings'] = " - f"{implicit_factor}). Using the explicit factor ({factor}) in YaRN. This may cause unexpected " - "behaviour in model usage, please correct the 'max_position_embeddings' fields in the model config." - ) - # No `config.rope_parameters["original_max_position_embeddings"]`. Is `config.max_position_embeddings` the - # pre-yarn or the post-yarn context length? - # BC: we assume it is the pre-yarn context length. - else: - logger.warning_once( - "config.rope_parameters['original_max_position_embeddings'], the pre-yarn context length, is unset. 
We will " - "**assume** config.max_position_embeddings holds the pre-yarn context length. Some use cases may expect " - "config.max_position_embeddings to hold the post-yarn context length (pre-yarn context length * " - "factor) -- we recommend updating both fields for optimal downstream model usage." - ) + rope_theta: float + rope_type: Optional[str] + partial_rotary_factor: Optional[float] + factor: Optional[float] + original_max_position_embeddings: Optional[int] + attention_factor: Optional[float] + beta_fast: Optional[float] + beta_slow: Optional[float] + short_factor: Optional[list[float]] + long_factor: Optional[list[float]] + low_freq_factor: Optional[float] + high_freq_factor: Optional[float] -def _validate_longrope_parameters(rope_parameters: dict, config: PreTrainedConfig, ignore_keys: Optional[set] = None): - required_keys = {"rope_type", "short_factor", "long_factor", "rope_theta"} - # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` - optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"} - received_keys = set(rope_parameters.keys()) - rope_type = rope_parameters["rope_type"] - _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) +class RotaryEmbeddingConfigMixin: + """ + A Mixin containing the functionality to standardize and validate RoPE parameters. 
+ """ - partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0) - head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) - dim = int(head_dim * partial_rotary_factor) + def convert_rope_params_to_dict(self, default_theta=10_000.0, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} - short_factor = rope_parameters.get("short_factor") - if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor): - logger.warning(f"`rope_parameters`'s short_factor field must be a list of numbers, got {short_factor}") - if len(short_factor) != dim // 2: - logger.warning(f"`rope_parameters`'s short_factor field must have length {dim // 2}, got {len(short_factor)}") - - long_factor = rope_parameters.get("long_factor") - if not isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor): - logger.warning(f"`rope_parameters`'s long_factor field must be a list of numbers, got {long_factor}") - if len(long_factor) != dim // 2: - logger.warning(f"`rope_parameters`'s long_factor field must have length {dim // 2}, got {len(long_factor)}") - - # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over - # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_parameters` and is - # unique to longrope (= undesirable) - if hasattr(config, "original_max_position_embeddings"): - logger.warning_once( - "This model has set a `original_max_position_embeddings` field, to be used together with " - "`max_position_embeddings` to determine a scaling factor. 
Please set the `factor` field of `rope_parameters`" - "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, " - "as it is compatible with most model architectures." - ) - else: - factor = rope_parameters.get("factor") - if factor is None: - logger.warning("Missing required keys in `rope_parameters`: 'factor'") - elif not isinstance(factor, float) or factor < 1.0: - logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) + self.standardize_rope_params() + self.validate() + return kwargs - attention_factor = rope_parameters.get("attention_factor") - if attention_factor is not None: - if not isinstance(attention_factor, float) or attention_factor < 0.0: + def standardize_rope_params(self): + """ + Helper to standardize the config's rope params field by ensuring the params are defined for each + later type. 
For old model the fn will duplicate a single rope param in each layer type (backward compatibility) + """ + # Move `rope_theta` and `partial_rotary_factor` to the params dict, if not there yet + rope_theta = getattr(self, "rope_theta", None) + partial_rotary_factor = getattr(self, "partial_rotary_factor", None) + rope_parameters = self.rope_parameters + + # Case 1: RoPE param keys do not intersect with possible `layer_types` -> one global dict + if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + rope_parameters.setdefault("rope_type", rope_parameters.get("type", "default")) + rope_parameters.setdefault("rope_theta", rope_theta) + if partial_rotary_factor is not None: + rope_parameters["partial_rotary_factor"] = partial_rotary_factor + # Case 2: different RoPE for each layer -> several params as nested dict + else: + for layer_type in self.layer_types: + rope_parameters[layer_type].setdefault("rope_type", rope_parameters[layer_type].get("type", "default")) + rope_parameters[layer_type].setdefault("rope_theta", rope_theta) + if partial_rotary_factor is not None: + rope_parameters[layer_type]["partial_rotary_factor"] = partial_rotary_factor + + self.rope_parameters = rope_parameters + + def validate(self: PreTrainedConfig, ignore_keys: Optional[set] = None): + """ + Validate the RoPE config arguments, given a `PreTrainedConfig` object + """ + rope_parameters_dict = self.rope_parameters + if rope_parameters_dict is None: + return + + if set(rope_parameters_dict.keys()).issubset(ALLOWED_LAYER_TYPES): + pass + else: + rope_parameters_dict = {"full_attention": rope_parameters_dict} + + for rope_parameters in rope_parameters_dict.values(): + rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default")) + validation_fn = getattr(self, f"_validate_{rope_type}_parameters") + rope_parameters["rope_type"] = rope_type + + if validation_fn is not None: + validation_fn(rope_parameters, ignore_keys=ignore_keys) + else: logger.warning( - 
f"`rope_parameters`'s attention_factor field must be a float greater than 0, got {attention_factor}" + f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" ) + def _validate_default_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + required_keys = {"rope_type", "rope_theta"} + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] + self._check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) -def _validate_llama3_parameters(rope_parameters: dict, config: PreTrainedConfig, ignore_keys: Optional[set] = None): - required_keys = { - "rope_type", - "factor", - "original_max_position_embeddings", - "low_freq_factor", - "high_freq_factor", - "rope_theta", - } - rope_type = rope_parameters["rope_type"] - received_keys = set(rope_parameters.keys()) - _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) - - factor = rope_parameters["factor"] - if factor is None or not isinstance(factor, float) or factor < 1.0: - logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") - - low_freq_factor = rope_parameters["low_freq_factor"] - high_freq_factor = rope_parameters["high_freq_factor"] - if low_freq_factor is None or not isinstance(low_freq_factor, float): - logger.warning(f"`rope_parameters`'s low_freq_factor field must be a float, got {low_freq_factor}") - if high_freq_factor is None or not isinstance(high_freq_factor, float): - logger.warning(f"`rope_parameters`'s high_freq_factor field must be a float, got {high_freq_factor}") - if high_freq_factor <= low_freq_factor: - logger.warning( - "`rope_parameters`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=" - f"{high_freq_factor} and low_freq_factor={low_freq_factor}" - ) + def _validate_linear_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + required_keys 
= {"rope_type", "factor", "rope_theta"} + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] + self._check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) - original_max_position_embeddings = rope_parameters["original_max_position_embeddings"] - if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int): - logger.warning( - "`rope_parameters`'s original_max_position_embeddings field must be an integer, got " - f"{original_max_position_embeddings}" - ) - if original_max_position_embeddings >= config.max_position_embeddings: - logger.warning( - "`rope_parameters`'s original_max_position_embeddings field must be less than max_position_embeddings, got " - f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}" - ) + factor = rope_parameters["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") + def _validate_dynamic_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` + optional_keys = {"original_max_position_embeddings"} + required_keys = {"rope_type", "factor"} + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] + self._check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) -# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types. 
-ROPE_VALIDATION_FUNCTIONS = { - "default": _validate_default_rope_parameters, - "linear": _validate_linear_scaling_rope_parameters, - "dynamic": _validate_dynamic_scaling_rope_parameters, - "yarn": _validate_yarn_parameters, - "longrope": _validate_longrope_parameters, - "llama3": _validate_llama3_parameters, -} + factor = rope_parameters["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") + def _validate_yarn_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + required_keys = {"rope_type", "factor", "rope_theta"} + optional_keys = { + "attention_factor", + "beta_fast", + "beta_slow", + "original_max_position_embeddings", + "mscale", + "mscale_all_dim", + } + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] + self._check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) + + factor = rope_parameters["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") -def rope_config_standardize_and_validate(config: PreTrainedConfig, ignore_keys: Optional[set] = None): - """ - Validate the RoPE config arguments, given a `PreTrainedConfig` object - """ - rope_parameters_dict = get_standardized_rope_params(config) - if rope_parameters_dict is None: - return - - # Update the config with correctly formatted RoPE parameters - config.rope_parameters = rope_parameters_dict - if set(rope_parameters_dict.keys()).issubset(ALLOWED_LAYER_TYPES): - pass - else: - rope_parameters_dict = {"full_attention": rope_parameters_dict} + attention_factor = rope_parameters.get("attention_factor") + if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0): + logger.warning( + f"`rope_parameters`'s attention_factor field must 
be a float greater than 0, got {attention_factor}" + ) + beta_fast = rope_parameters.get("beta_fast") + if beta_fast is not None and not isinstance(beta_fast, float): + logger.warning(f"`rope_parameters`'s beta_fast field must be a float, got {beta_fast}") + beta_slow = rope_parameters.get("beta_slow") + if beta_slow is not None and not isinstance(beta_slow, float): + logger.warning(f"`rope_parameters`'s beta_slow field must be a float, got {beta_slow}") + + if (beta_fast or 32) < (beta_slow or 1): + logger.warning( + f"`rope_parameters`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} " + f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)" + ) - for rope_parameters in rope_parameters_dict.values(): - rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default")) - validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type) + # Models should set `config.rope_parameters["original_max_position_embeddings"]` to their original (pre-yarn) context + # length, with `config.max_position_embeddings` corresponding to their post-yarn context length. + # However, for BC purposes, we allow the former to be unset. + original_max_position_embeddings = self.rope_parameters.get("original_max_position_embeddings") + if original_max_position_embeddings is not None: + # Double-check: `factor` should be the ratio between the pre-yarn and post-yarn context lengths. + implicit_factor = self.max_position_embeddings / original_max_position_embeddings + if implicit_factor != factor: + logger.warning_once( + f"The explicitly set RoPE scaling factor (config.rope_parameters['factor'] = {factor}) does not match " + "the ratio implicitly set by other parameters (implicit factor = " + "post-yarn context length / pre-yarn context length = " + "config.max_position_embeddings / config.rope_parameters['original_max_position_embeddings'] = " + f"{implicit_factor}). Using the explicit factor ({factor}) in YaRN. 
This may cause unexpected " + "behaviour in model usage, please correct the 'max_position_embeddings' fields in the model config." + ) + # No `config.rope_parameters["original_max_position_embeddings"]`. Is `config.max_position_embeddings` the + # pre-yarn or the post-yarn context length? + # BC: we assume it is the pre-yarn context length. + else: + logger.warning_once( + "config.rope_parameters['original_max_position_embeddings'], the pre-yarn context length, is unset. We will " + "**assume** config.max_position_embeddings holds the pre-yarn context length. Some use cases may expect " + "config.max_position_embeddings to hold the post-yarn context length (pre-yarn context length * " + "factor) -- we recommend updating both fields for optimal downstream model usage." + ) - rope_parameters["rope_type"] = rope_type - # BC: "rope_theta" was originally saved in config - rope_parameters["rope_theta"] = rope_parameters.get("rope_theta", getattr(config, "rope_theta", None)) + def _validate_longrope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + required_keys = {"rope_type", "short_factor", "long_factor", "rope_theta"} + # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` + optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"} + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] + self._check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) + + partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0) + head_dim = getattr(self, "head_dim", self.hidden_size // self.num_attention_heads) + dim = int(head_dim * partial_rotary_factor) + + short_factor = rope_parameters.get("short_factor") + if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor): + logger.warning(f"`rope_parameters`'s short_factor field must be a list of numbers, got {short_factor}") 
+ if len(short_factor) != dim // 2: + logger.warning( + f"`rope_parameters`'s short_factor field must have length {dim // 2}, got {len(short_factor)}" + ) + + long_factor = rope_parameters.get("long_factor") + if not isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor): + logger.warning(f"`rope_parameters`'s long_factor field must be a list of numbers, got {long_factor}") + if len(long_factor) != dim // 2: + logger.warning( + f"`rope_parameters`'s long_factor field must have length {dim // 2}, got {len(long_factor)}" + ) - if validation_fn is not None: - validation_fn(rope_parameters, config=config, ignore_keys=ignore_keys) + # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over + # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_parameters` and is + # unique to longrope (= undesirable) + if hasattr(self, "original_max_position_embeddings"): + logger.warning_once( + "This model has set a `original_max_position_embeddings` field, to be used together with " + "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_parameters`" + "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, " + "as it is compatible with most model architectures." 
+ ) else: + factor = rope_parameters.get("factor") + if factor is None: + logger.warning("Missing required keys in `rope_parameters`: 'factor'") + elif not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") + + attention_factor = rope_parameters.get("attention_factor") + if attention_factor is not None: + if not isinstance(attention_factor, float) or attention_factor < 0.0: + logger.warning( + f"`rope_parameters`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + + def _validate_llama3_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + required_keys = { + "rope_type", + "factor", + "original_max_position_embeddings", + "low_freq_factor", + "high_freq_factor", + "rope_theta", + } + rope_type = rope_parameters["rope_type"] + received_keys = set(rope_parameters.keys()) + self._check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) + + factor = rope_parameters["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") + + low_freq_factor = rope_parameters["low_freq_factor"] + high_freq_factor = rope_parameters["high_freq_factor"] + if low_freq_factor is None or not isinstance(low_freq_factor, float): + logger.warning(f"`rope_parameters`'s low_freq_factor field must be a float, got {low_freq_factor}") + if high_freq_factor is None or not isinstance(high_freq_factor, float): + logger.warning(f"`rope_parameters`'s high_freq_factor field must be a float, got {high_freq_factor}") + if high_freq_factor <= low_freq_factor: logger.warning( - f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" + "`rope_parameters`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=" + f"{high_freq_factor} and 
low_freq_factor={low_freq_factor}" ) + + original_max_position_embeddings = rope_parameters["original_max_position_embeddings"] + if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int): + logger.warning( + "`rope_parameters`'s original_max_position_embeddings field must be an integer, got " + f"{original_max_position_embeddings}" + ) + if original_max_position_embeddings >= self.max_position_embeddings: + logger.warning( + "`rope_parameters`'s original_max_position_embeddings field must be less than max_position_embeddings, got " + f"{original_max_position_embeddings} and max_position_embeddings={self.max_position_embeddings}" + ) + + @staticmethod + def _check_received_keys( + rope_type: str, + received_keys: set, + required_keys: set, + optional_keys: Optional[set] = None, + ignore_keys: Optional[set] = None, + ): + """Compare the received keys in `config.rope_parameters` against the expected and optional keys""" + # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present + if "type" in received_keys: + received_keys -= {"type"} + required_keys.add("rope_type") + + # Some models need to store model-specific keys, and we don't want to throw warning at them + if ignore_keys is not None: + received_keys -= ignore_keys + + missing_keys = required_keys - received_keys + if missing_keys: + raise KeyError(f"Missing required keys in `rope_parameters` for 'rope_type'='{rope_type}': {missing_keys}") + + if optional_keys is not None: + unused_keys = received_keys - required_keys - optional_keys + else: + unused_keys = received_keys - required_keys + if unused_keys: + logger.warning(f"Unrecognized keys in `rope_parameters` for 'rope_type'='{rope_type}': {unused_keys}") diff --git a/src/transformers/models/apertus/configuration_apertus.py b/src/transformers/models/apertus/configuration_apertus.py index cc7e14a22330..f1ce10830e8a 100644 --- 
a/src/transformers/models/apertus/configuration_apertus.py +++ b/src/transformers/models/apertus/configuration_apertus.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class ApertusConfig(PreTrainedConfig): +class ApertusConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`ApertusModel`]. It is used to instantiate a Apertus model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -160,15 +160,9 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout + self.rope_parameters = rope_parameters - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 12000000.0)) - rope_config_standardize_and_validate(self) + kwargs = self.convert_rope_params_to_dict(default_theta=12000000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, From 5bb12c4eb846d23125a496d1bd81119c6c6ad318 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 27 Nov 2025 19:00:22 +0100 Subject: [PATCH 14/23] update all models --- src/transformers/modeling_rope_utils.py | 6 +- .../models/apertus/configuration_apertus.py | 1 - .../models/apertus/modular_apertus.py | 15 +- .../models/arcee/configuration_arcee.py | 14 +- .../models/aria/configuration_aria.py | 14 +- .../models/bamba/configuration_bamba.py | 19 +- 
.../models/bitnet/configuration_bitnet.py | 14 +- .../models/blt/configuration_blt.py | 63 ++----- .../chameleon/configuration_chameleon.py | 14 +- .../models/cohere/configuration_cohere.py | 14 +- .../models/cohere2/configuration_cohere2.py | 30 ++-- .../models/cohere2/modular_cohere2.py | 30 ++-- .../models/csm/configuration_csm.py | 37 ++-- .../models/cwm/configuration_cwm.py | 17 +- src/transformers/models/cwm/modular_cwm.py | 6 +- .../models/dbrx/configuration_dbrx.py | 14 +- .../deepseek_v2/configuration_deepseek_v2.py | 14 +- .../deepseek_v3/configuration_deepseek_v3.py | 15 +- .../models/dia/configuration_dia.py | 23 +-- .../diffllama/configuration_diffllama.py | 12 +- .../models/doge/configuration_doge.py | 14 +- src/transformers/models/doge/modular_doge.py | 14 +- .../models/dots1/configuration_dots1.py | 14 +- .../configuration_efficientloftr.py | 14 +- .../models/emu3/configuration_emu3.py | 14 +- .../models/ernie4_5/configuration_ernie4_5.py | 14 +- .../configuration_ernie4_5_moe.py | 15 +- .../models/evolla/configuration_evolla.py | 14 +- .../models/exaone4/configuration_exaone4.py | 13 +- .../models/exaone4/modular_exaone4.py | 13 +- .../models/falcon/configuration_falcon.py | 14 +- .../falcon_h1/configuration_falcon_h1.py | 16 +- .../flex_olmo/configuration_flex_olmo.py | 15 +- .../models/flex_olmo/modular_flex_olmo.py | 15 +- .../models/fuyu/configuration_fuyu.py | 18 +- .../models/gemma/configuration_gemma.py | 14 +- .../models/gemma/modular_gemma.py | 14 +- .../models/gemma2/configuration_gemma2.py | 27 ++- .../models/gemma2/modular_gemma2.py | 27 ++- .../models/gemma3/configuration_gemma3.py | 60 ++++--- .../models/gemma3/modular_gemma3.py | 57 +++--- .../models/gemma3n/configuration_gemma3n.py | 60 ++++--- .../models/gemma3n/modular_gemma3n.py | 58 ++++--- .../models/glm/configuration_glm.py | 14 +- .../models/glm4/configuration_glm4.py | 14 +- .../models/glm4_moe/configuration_glm4_moe.py | 14 +- .../models/glm4_moe/modular_glm4_moe.py | 14 +- 
.../models/glm4v/configuration_glm4v.py | 14 +- .../models/glm4v/modular_glm4v.py | 14 +- .../glm4v_moe/configuration_glm4v_moe.py | 16 +- .../models/glm4v_moe/modular_glm4v_moe.py | 16 +- .../models/gpt_neox/configuration_gpt_neox.py | 26 +-- .../configuration_gpt_neox_japanese.py | 26 +-- .../models/gpt_oss/configuration_gpt_oss.py | 12 +- .../models/granite/configuration_granite.py | 14 +- .../granitemoe/configuration_granitemoe.py | 16 +- .../configuration_granitemoehybrid.py | 14 +- .../configuration_granitemoeshared.py | 20 +-- .../models/helium/configuration_helium.py | 14 +- .../configuration_hunyuan_v1_dense.py | 17 +- .../configuration_hunyuan_v1_moe.py | 14 +- .../models/janus/configuration_janus.py | 3 +- .../models/jetmoe/configuration_jetmoe.py | 14 +- .../configuration_kyutai_speech_to_text.py | 12 +- .../models/lfm2/configuration_lfm2.py | 12 +- .../models/lfm2_moe/configuration_lfm2_moe.py | 14 +- .../models/llama/configuration_llama.py | 14 +- .../models/llama4/configuration_llama4.py | 41 ++--- .../configuration_longcat_flash.py | 16 +- .../models/mimi/configuration_mimi.py | 14 +- .../models/minimax/configuration_minimax.py | 13 +- .../models/minimax/modular_minimax.py | 13 +- .../ministral/configuration_ministral.py | 27 ++- .../models/ministral/modular_ministral.py | 27 ++- .../models/mistral/configuration_mistral.py | 14 +- .../models/mixtral/configuration_mixtral.py | 14 +- .../models/mllama/configuration_mllama.py | 14 +- .../modernbert/configuration_modernbert.py | 47 +++-- .../models/modernbert/modular_modernbert.py | 47 +++-- .../configuration_modernbert_decoder.py | 48 ++++-- .../modular_modernbert_decoder.py | 48 ++++-- .../moonshine/configuration_moonshine.py | 13 +- .../models/moonshine/modular_moonshine.py | 13 +- .../models/moshi/configuration_moshi.py | 12 +- .../models/nemotron/configuration_nemotron.py | 14 +- .../models/olmo/configuration_olmo.py | 14 +- .../models/olmo2/configuration_olmo2.py | 14 +- 
.../models/olmo3/configuration_olmo3.py | 28 ++- .../models/olmo3/modular_olmo3.py | 28 ++- .../models/olmoe/configuration_olmoe.py | 14 +- .../persimmon/configuration_persimmon.py | 14 +- .../models/phi/configuration_phi.py | 14 +- .../models/phi3/configuration_phi3.py | 24 ++- .../configuration_phi4_multimodal.py | 24 ++- .../models/phimoe/configuration_phimoe.py | 35 ++-- .../models/pixtral/configuration_pixtral.py | 17 +- .../models/qwen2/configuration_qwen2.py | 11 +- .../configuration_qwen2_5_omni.py | 44 ++--- .../qwen2_5_omni/modular_qwen2_5_omni.py | 44 ++--- .../qwen2_5_vl/configuration_qwen2_5_vl.py | 32 ++-- .../qwen2_moe/configuration_qwen2_moe.py | 13 +- .../models/qwen2_vl/configuration_qwen2_vl.py | 32 ++-- .../models/qwen3/configuration_qwen3.py | 13 +- .../qwen3_moe/configuration_qwen3_moe.py | 14 +- .../qwen3_next/configuration_qwen3_next.py | 16 +- .../configuration_qwen3_omni_moe.py | 163 ++++++++++++++---- .../qwen3_omni_moe/modular_qwen3_omni_moe.py | 31 ++-- .../models/qwen3_vl/configuration_qwen3_vl.py | 16 +- .../models/qwen3_vl/modular_qwen3_vl.py | 16 +- .../configuration_qwen3_vl_moe.py | 16 +- .../qwen3_vl_moe/modular_qwen3_vl_moe.py | 16 +- .../configuration_recurrent_gemma.py | 14 +- .../models/seed_oss/configuration_seed_oss.py | 14 +- .../models/smollm3/configuration_smollm3.py | 25 ++- .../models/smollm3/modular_smollm3.py | 25 ++- .../models/stablelm/configuration_stablelm.py | 14 +- .../starcoder2/configuration_starcoder2.py | 14 +- .../models/t5gemma/configuration_t5gemma.py | 27 ++- .../vaultgemma/configuration_vaultgemma.py | 27 ++- .../models/zamba2/configuration_zamba2.py | 26 ++- 120 files changed, 1076 insertions(+), 1512 deletions(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index e1b65e39bf5a..e0582388eaff 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -629,7 +629,9 @@ class RotaryEmbeddingConfigMixin: A Mixin 
containing the functionality to standardize and validate RoPE parameters. """ - def convert_rope_params_to_dict(self, default_theta=10_000.0, **kwargs): + def convert_rope_params_to_dict( + self, default_theta: int | float = 10_000.0, ignore_keys: Optional[set] = None, **kwargs + ): rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or self.rope_parameters self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} @@ -637,7 +639,7 @@ def convert_rope_params_to_dict(self, default_theta=10_000.0, **kwargs): # Standardize and validate the correctness of rotary position embeddings parameters self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) self.standardize_rope_params() - self.validate() + self.validate(ignore_keys=ignore_keys) return kwargs def standardize_rope_params(self): diff --git a/src/transformers/models/apertus/configuration_apertus.py b/src/transformers/models/apertus/configuration_apertus.py index f1ce10830e8a..30d651cf6c9c 100644 --- a/src/transformers/models/apertus/configuration_apertus.py +++ b/src/transformers/models/apertus/configuration_apertus.py @@ -163,7 +163,6 @@ def __init__( self.rope_parameters = rope_parameters kwargs = self.convert_rope_params_to_dict(default_theta=12000000.0, **kwargs) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/apertus/modular_apertus.py b/src/transformers/models/apertus/modular_apertus.py index 5fb42df98531..c0791dafeafd 100644 --- a/src/transformers/models/apertus/modular_apertus.py +++ b/src/transformers/models/apertus/modular_apertus.py @@ -21,7 +21,7 @@ from ...cache_utils import Cache from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import ALL_ATTENTION_FUNCTIONS 
from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -43,7 +43,7 @@ logger = logging.get_logger(__name__) -class ApertusConfig(PreTrainedConfig): +class ApertusConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`ApertusModel`]. It is used to instantiate a Apertus model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -178,16 +178,9 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout + self.rope_parameters = rope_parameters - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 12000000.0)) - rope_config_standardize_and_validate(self) - + kwargs = self.convert_rope_params_to_dict(default_theta=12000000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/arcee/configuration_arcee.py b/src/transformers/models/arcee/configuration_arcee.py index 5b8855ec15c7..cb49d5f35427 100644 --- a/src/transformers/models/arcee/configuration_arcee.py +++ b/src/transformers/models/arcee/configuration_arcee.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class ArceeConfig(PreTrainedConfig): +class ArceeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the 
configuration of a [`ArceeModel`]. It is used to instantiate an Arcee model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -163,14 +163,8 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py index f451798233a0..2144a1b1bec5 100644 --- a/src/transformers/models/aria/configuration_aria.py +++ b/src/transformers/models/aria/configuration_aria.py @@ -21,11 +21,11 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ..auto import CONFIG_MAPPING, AutoConfig -class AriaTextConfig(PreTrainedConfig): +class AriaTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This class handles the configuration for the text component of the Aria model. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria @@ -168,14 +168,8 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/bamba/configuration_bamba.py b/src/transformers/models/bamba/configuration_bamba.py index 7057bcc7bd52..5dd525db2ee3 100644 --- a/src/transformers/models/bamba/configuration_bamba.py +++ b/src/transformers/models/bamba/configuration_bamba.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class BambaConfig(PreTrainedConfig): +class BambaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`BambaModel`]. It is used to instantiate a BambaModel model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -171,17 +171,6 @@ def __init__( self.num_logits_to_keep = num_logits_to_keep self.attn_layer_indices = attn_layer_indices - - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = 0.5 - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - mamba_intermediate = mamba_expand * hidden_size if mamba_intermediate % mamba_n_heads != 0: @@ -204,6 +193,10 @@ def __init__( self.mamba_conv_bias = mamba_conv_bias self.mamba_proj_bias = mamba_proj_bias self.z_loss_coefficient = z_loss_coefficient + self.rope_parameters = rope_parameters + + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) + self.rope_parameters["partial_rotary_factor"] = 0.5 super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/bitnet/configuration_bitnet.py b/src/transformers/models/bitnet/configuration_bitnet.py index e6ff072a8d72..9b3061521abc 100644 --- a/src/transformers/models/bitnet/configuration_bitnet.py +++ b/src/transformers/models/bitnet/configuration_bitnet.py @@ -16,14 +16,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class BitNetConfig(PreTrainedConfig): +class BitNetConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`BitNetModel`]. 
It is used to instantiate an BitNet model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -138,14 +138,8 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/blt/configuration_blt.py b/src/transformers/models/blt/configuration_blt.py index c4c15674f875..461b76de419e 100644 --- a/src/transformers/models/blt/configuration_blt.py +++ b/src/transformers/models/blt/configuration_blt.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class BltLocalEncoderConfig(PreTrainedConfig): +class BltLocalEncoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): """ Configuration class for the Blt Local Encoder component. 
""" @@ -65,21 +65,15 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.hidden_act = hidden_act self.initializer_range = initializer_range - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) super().__init__(**kwargs, tie_word_embeddings=False) -class BltLocalDecoderConfig(PreTrainedConfig): +class BltLocalDecoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): """ Configuration class for the Blt Local Decoder component. 
""" @@ -120,21 +114,15 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.hidden_act = hidden_act self.initializer_range = initializer_range - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) super().__init__(**kwargs, tie_word_embeddings=False) -class BltGlobalTransformerConfig(PreTrainedConfig): +class BltGlobalTransformerConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): """ Configuration class for the Blt Global Transformer component. 
""" @@ -167,21 +155,15 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.hidden_act = hidden_act self.initializer_range = initializer_range - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) super().__init__(**kwargs, tie_word_embeddings=False) -class BltPatcherConfig(PreTrainedConfig): +class BltPatcherConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" Configuration class for the Blt Patcher/Entropy model component. 
@@ -248,21 +230,15 @@ def __init__( self.hidden_act = "silu" # Blt uses silu activation self.intermediate_size = intermediate_size or int(8 * self.hidden_size / 3) self.initializer_range = initializer_range - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) super().__init__(**kwargs, tie_word_embeddings=False) -class BltConfig(PreTrainedConfig): +class BltConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`BltModel`]. It is used to instantiate a Blt model according to the specified arguments, defining the model architecture. 
@@ -377,14 +353,6 @@ def __init__( self.realtime_patching = kwargs.get("realtime_patching", True) self.patching_threshold_add = kwargs.get("patching_threshold_add") self.monotonicity = kwargs.get("monotonicity", False) - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) # Cross attention configurations self.cross_attn_k = cross_attn_k @@ -437,6 +405,9 @@ def __init__( encoder_cross_output_size if encoder_cross_output_size != self.global_config.hidden_size else None ) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) + # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index 7e02e7ced92b..3bbb9f1d1eb7 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class ChameleonVQVAEConfig(PreTrainedConfig): +class ChameleonVQVAEConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store 
the configuration of a [`ChameleonVQModel`]. It is used to instantiate a `ChameleonVQModel` according to the specified arguments, defining the model architecture. @@ -230,14 +230,8 @@ def __init__( self.attention_dropout = attention_dropout self.model_parallel_size = model_parallel_size self.swin_norm = swin_norm - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) if vq_config is None: vq_config = {} diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py index 8399b3b56622..d5ce47d3daa3 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -22,14 +22,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class CohereConfig(PreTrainedConfig): +class CohereConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere model according to the specified arguments, defining the model architecture. 
@@ -165,14 +165,8 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.use_qk_norm = use_qk_norm - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cohere2/configuration_cohere2.py b/src/transformers/models/cohere2/configuration_cohere2.py index 97914120f55e..0e63ab7f10e2 100644 --- a/src/transformers/models/cohere2/configuration_cohere2.py +++ b/src/transformers/models/cohere2/configuration_cohere2.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class Cohere2Config(PreTrainedConfig): +class Cohere2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere model according to the specified arguments, defining the model architecture. 
@@ -166,21 +166,10 @@ def __init__( self.attention_dropout = attention_dropout self.sliding_window = sliding_window self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + # Need to specify head_dim in the config so it can be used in the attention forward functions self.head_dim = hidden_size // num_attention_heads - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 4) @@ -193,9 +182,16 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) __all__ = ["Cohere2Config"] diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index 7717db42e8f7..140cd1ef2e71 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ b/src/transformers/models/cohere2/modular_cohere2.py @@ -26,8 +26,8 @@ from ...modeling_outputs import BaseModelOutputWithPast from ...modeling_rope_utils import ( RopeParameters, + RotaryEmbeddingConfigMixin, dynamic_rope_update, - rope_config_standardize_and_validate, ) from 
...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack @@ -48,7 +48,7 @@ logger = logging.get_logger(__name__) -class Cohere2Config(PreTrainedConfig): +class Cohere2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere model according to the specified arguments, defining the model architecture. @@ -189,21 +189,10 @@ def __init__( self.attention_dropout = attention_dropout self.sliding_window = sliding_window self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + # Need to specify head_dim in the config so it can be used in the attention forward functions self.head_dim = hidden_size // num_attention_heads - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 4) @@ -216,9 +205,16 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class Cohere2RotaryEmbedding(CohereRotaryEmbedding): diff --git 
a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index 19da4f31405c..e28e15ea1545 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -24,7 +24,7 @@ logger = logging.get_logger(__name__) -class CsmDepthDecoderConfig(PreTrainedConfig): +class CsmDepthDecoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`CsmDepthDecoderModel`]. It is used to instantiate an CSM depth decoder model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults will yield @@ -132,13 +132,6 @@ def __init__( if kwargs.pop("tie_word_embeddings", False): raise ValueError("`tie_word_embeddings=True` is not supported for CsmDepthDecoderConfig") - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=False, - **kwargs, - ) self.num_codebooks = num_codebooks self.vocab_size = vocab_size self.backbone_hidden_size = backbone_hidden_size @@ -161,14 +154,16 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs 
= self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=False, + **kwargs, + ) class CsmConfig(PreTrainedConfig): @@ -348,14 +343,8 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cwm/configuration_cwm.py b/src/transformers/models/cwm/configuration_cwm.py index 6c9b256ab55d..57353f666875 100644 --- a/src/transformers/models/cwm/configuration_cwm.py +++ b/src/transformers/models/cwm/configuration_cwm.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_standardize_and_validate +from ...modeling_rope_utils import RotaryEmbeddingConfigMixin -class CwmConfig(PreTrainedConfig): +class CwmConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): """ Configuration for Code World Model (CWM). 
This is an inherited Llama3-compatible configuration with layer-interleaved @@ -177,14 +177,8 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=1_000_000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, @@ -194,8 +188,5 @@ def __init__( **kwargs, ) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1_000_000.0)) - __all__ = ["CwmConfig"] diff --git a/src/transformers/models/cwm/modular_cwm.py b/src/transformers/models/cwm/modular_cwm.py index 88a5e2708976..97be1d987662 100644 --- a/src/transformers/models/cwm/modular_cwm.py +++ b/src/transformers/models/cwm/modular_cwm.py @@ -21,7 +21,6 @@ from ...configuration_utils import layer_type_validation from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import rope_config_standardize_and_validate from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging from ..llama.configuration_llama import LlamaConfig @@ -181,10 +180,7 @@ def __init__( # CWM models don't use attention bias, remove it from config del self.attention_bias - - # Validate the correctness of rotary position embeddings parameters - 
self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1_000_000.0)) - rope_config_standardize_and_validate(self) + kwargs = self.convert_rope_params_to_dict(default_theta=1_000_000.0, **kwargs) class CwmRotaryEmbedding(Qwen2RotaryEmbedding): diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 193ba0909711..bcc250cd0cf8 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -17,7 +17,7 @@ from typing import Any, Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging @@ -111,7 +111,7 @@ def __init__( raise ValueError(f"Found unknown {kwargs=}") -class DbrxConfig(PreTrainedConfig): +class DbrxConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`DbrxModel`]. 
It is used to instantiate a Dbrx model according to the @@ -221,14 +221,8 @@ def __init__( if tie_word_embeddings: raise ValueError("tie_word_embeddings is not supported for DBRX models.") - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["rope_theta"] = 10000.0 - - # Validate the correctness of rotary position embeddings parameters - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000.0, **kwargs) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py index 2b3e842da4a0..34acf9a535b4 100644 --- a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class DeepseekV2Config(PreTrainedConfig): +class DeepseekV2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`DeepseekV2Model`]. It is used to instantiate a DeepSeek model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -209,14 +209,8 @@ def __init__( self.mlp_bias = mlp_bias self.head_dim = qk_rope_head_dim - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py index cc3983f81286..8afc6175de51 100644 --- a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py @@ -19,13 +19,13 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {} -class DeepseekV3Config(PreTrainedConfig): +class DeepseekV3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -225,20 +225,13 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) for key in ["beta_fast", "beta_slow", "factor"]: if key in self.rope_parameters: self.rope_parameters[key] = float(self.rope_parameters[key]) - rope_config_standardize_and_validate(self) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/dia/configuration_dia.py b/src/transformers/models/dia/configuration_dia.py index c518999a2dac..26084c0f0dda 100644 --- a/src/transformers/models/dia/configuration_dia.py +++ b/src/transformers/models/dia/configuration_dia.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -92,14 +92,9 @@ def __init__( self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act self.initializer_range = initializer_range - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - 
self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000.0, **kwargs) + super().__init__(**kwargs) @@ -198,14 +193,8 @@ def __init__( self.num_channels = num_channels self.initializer_range = initializer_range self.use_cache = use_cache - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000.0, **kwargs) super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) diff --git a/src/transformers/models/diffllama/configuration_diffllama.py b/src/transformers/models/diffllama/configuration_diffllama.py index ba11f29847fd..c25ed339e5a3 100644 --- a/src/transformers/models/diffllama/configuration_diffllama.py +++ b/src/transformers/models/diffllama/configuration_diffllama.py @@ -20,7 +20,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters class DiffLlamaConfig(PreTrainedConfig): @@ -145,14 +145,8 @@ def __init__( self.attention_dropout = attention_dropout self.lambda_std_dev = lambda_std_dev self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - 
rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/doge/configuration_doge.py b/src/transformers/models/doge/configuration_doge.py index a82c2c2c0b03..98ce0701cf9e 100644 --- a/src/transformers/models/doge/configuration_doge.py +++ b/src/transformers/models/doge/configuration_doge.py @@ -23,10 +23,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class DogeConfig(PreTrainedConfig): +class DogeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M). 
@@ -189,14 +189,8 @@ def __init__( self.norm_topk_prob = norm_topk_prob self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000.0, **kwargs) # for backward compatibility if num_key_value_heads is None: diff --git a/src/transformers/models/doge/modular_doge.py b/src/transformers/models/doge/modular_doge.py index e92e0af535ad..150010654281 100644 --- a/src/transformers/models/doge/modular_doge.py +++ b/src/transformers/models/doge/modular_doge.py @@ -31,7 +31,7 @@ from ...integrations.flex_attention import compile_friendly_flex_attention from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import AttentionInterface, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, is_torch_flex_attn_available, logging @@ -55,7 +55,7 @@ from torch.nn.attention.flex_attention import BlockMask -class DogeConfig(PreTrainedConfig): +class DogeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`DogeModel`]. 
It is used to instantiate an Doge model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M). @@ -218,14 +218,8 @@ def __init__( self.norm_topk_prob = norm_topk_prob self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000.0, **kwargs) # for backward compatibility if num_key_value_heads is None: diff --git a/src/transformers/models/dots1/configuration_dots1.py b/src/transformers/models/dots1/configuration_dots1.py index 8e1e4530246f..dd39da9c2ade 100644 --- a/src/transformers/models/dots1/configuration_dots1.py +++ b/src/transformers/models/dots1/configuration_dots1.py @@ -15,14 +15,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class Dots1Config(PreTrainedConfig): +class Dots1Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Dots1Model`]. It is used to instantiate a `dots.llm1` model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -192,11 +192,6 @@ def __init__( self.sliding_window = sliding_window self.max_window_layers = max_window_layers - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.layer_types = layer_types if self.layer_types is None: self.layer_types = [ @@ -207,9 +202,8 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/efficientloftr/configuration_efficientloftr.py b/src/transformers/models/efficientloftr/configuration_efficientloftr.py index e5bf7ad86709..de124eff902d 100644 --- a/src/transformers/models/efficientloftr/configuration_efficientloftr.py +++ b/src/transformers/models/efficientloftr/configuration_efficientloftr.py @@ -14,10 +14,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_standardize_and_validate +from ...modeling_rope_utils import RotaryEmbeddingConfigMixin -class EfficientLoFTRConfig(PreTrainedConfig): +class EfficientLoFTRConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`EfficientLoFTRFromKeypointMatching`]. 
It is used to instantiate a EfficientLoFTR model according to the specified arguments, defining the model @@ -173,16 +173,10 @@ def __init__( self.num_key_value_heads = num_attention_heads self.initializer_range = initializer_range - - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 4.0) - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - # Standardize and validate the correctness of rotary position embeddings parameters - rope_config_standardize_and_validate(self) super().__init__(**kwargs) diff --git a/src/transformers/models/emu3/configuration_emu3.py b/src/transformers/models/emu3/configuration_emu3.py index 84d2d12495a9..0e57cfcc3020 100644 --- a/src/transformers/models/emu3/configuration_emu3.py +++ b/src/transformers/models/emu3/configuration_emu3.py @@ -17,7 +17,7 @@ from typing import Optional, Union from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin class Emu3VQVAEConfig(PreTrainedConfig): @@ -110,7 +110,7 @@ def __init__( self.attention_dropout = attention_dropout -class Emu3TextConfig(PreTrainedConfig): +class Emu3TextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Emu3TextModel`]. It is used to instantiate a emu3 model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -226,14 +226,8 @@ def __init__( self.attention_bias = attention_bias self.initializer_range = initializer_range self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=1000000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/ernie4_5/configuration_ernie4_5.py b/src/transformers/models/ernie4_5/configuration_ernie4_5.py index 6d6236868b79..70d4d3e092f6 100644 --- a/src/transformers/models/ernie4_5/configuration_ernie4_5.py +++ b/src/transformers/models/ernie4_5/configuration_ernie4_5.py @@ -16,10 +16,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class Ernie4_5Config(PreTrainedConfig): +class Ernie4_5Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Ernie4_5Model`]. It is used to instantiate an Ernie 4.5 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -148,14 +148,8 @@ def __init__( self.use_cache = use_cache self.use_bias = use_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py index e5f453f6db9c..24417775b698 100644 --- a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +++ b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py @@ -16,14 +16,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class Ernie4_5_MoeConfig(PreTrainedConfig): +class Ernie4_5_MoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Ernie4_5_MoeModel`]. It is used to instantiate a Ernie 4.5 MoE model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -181,14 +181,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.use_bias = use_bias - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) # MoE arguments self.moe_intermediate_size = moe_intermediate_size @@ -202,6 +194,9 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/evolla/configuration_evolla.py b/src/transformers/models/evolla/configuration_evolla.py index 0a975073e038..73fd19acae12 100644 --- a/src/transformers/models/evolla/configuration_evolla.py +++ b/src/transformers/models/evolla/configuration_evolla.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging @@ -99,7 +99,7 @@ def __init__( self.token_dropout = token_dropout -class EvollaConfig(PreTrainedConfig): +class EvollaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`EvollaModel`]. It is used to instantiate an Evolla model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -249,14 +249,8 @@ def __init__( self.resampler_heads = resampler_heads self.resampler_num_latents = resampler_num_latents self.resampler_ff_mult = resampler_ff_mult - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) # Subconfig if protein_encoder_config is None: diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py index 9c75cb848967..ddf11f3cd242 100644 --- a/src/transformers/models/exaone4/configuration_exaone4.py +++ b/src/transformers/models/exaone4/configuration_exaone4.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class Exaone4Config(PreTrainedConfig): +class Exaone4Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Exaone4Model`]. It is used to instantiate a EXAONE 4.0 model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -164,10 +164,6 @@ def __init__( self.attention_dropout = attention_dropout self.sliding_window = sliding_window self.sliding_window_pattern = sliding_window_pattern - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.sliding_window is None: @@ -183,9 +179,8 @@ def __init__( self.cache_implementation = "hybrid" layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index 72a683d2b9f0..68dc0d94f456 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -30,7 +30,7 @@ BaseModelOutputWithPast, CausalLMOutputWithPast, ) -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import ( @@ -58,7 +58,7 @@ _CONFIG_FOR_DOC = "Exaone4Config" -class Exaone4Config(PreTrainedConfig): +class Exaone4Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Exaone4Model`]. 
It is used to instantiate a EXAONE 4.0 model according to the specified arguments, defining the model architecture. Instantiating a @@ -197,10 +197,6 @@ def __init__( self.attention_dropout = attention_dropout self.sliding_window = sliding_window self.sliding_window_pattern = sliding_window_pattern - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.sliding_window is None: @@ -216,9 +212,8 @@ def __init__( self.cache_implementation = "hybrid" layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py index 052c10f69032..3aa3d29edb30 100644 --- a/src/transformers/models/falcon/configuration_falcon.py +++ b/src/transformers/models/falcon/configuration_falcon.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class FalconConfig(PreTrainedConfig): +class FalconConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a 
[`FalconModel`]. It is used to instantiate a Falcon model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -162,14 +162,8 @@ def __init__( else: self.ffn_hidden_size = ffn_hidden_size - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/falcon_h1/configuration_falcon_h1.py b/src/transformers/models/falcon_h1/configuration_falcon_h1.py index 76cd76d76a45..aa0671157ca4 100644 --- a/src/transformers/models/falcon_h1/configuration_falcon_h1.py +++ b/src/transformers/models/falcon_h1/configuration_falcon_h1.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class FalconH1Config(PreTrainedConfig): +class FalconH1Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`FalconH1Model`]. It is used to instantiate a FalconH1Model model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -197,15 +197,6 @@ def __init__( self.use_cache = use_cache self.num_logits_to_keep = num_logits_to_keep - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - self.projectors_bias = projectors_bias mamba_intermediate = mamba_expand * hidden_size if mamba_d_ssm is None else mamba_d_ssm @@ -271,6 +262,9 @@ def __init__( else: self.ssm_out_multiplier = 1.0 + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/flex_olmo/configuration_flex_olmo.py b/src/transformers/models/flex_olmo/configuration_flex_olmo.py index 6aa2b28ac27a..8cf9263a3d4a 100644 --- a/src/transformers/models/flex_olmo/configuration_flex_olmo.py +++ b/src/transformers/models/flex_olmo/configuration_flex_olmo.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class FlexOlmoConfig(PreTrainedConfig): +class FlexOlmoConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`FlexOlmoModel`]. It is used to instantiate an FlexOlmo model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -175,15 +175,8 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.norm_topk_prob = norm_topk_prob - - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/flex_olmo/modular_flex_olmo.py b/src/transformers/models/flex_olmo/modular_flex_olmo.py index 4777802972a8..9e1f14490910 100644 --- a/src/transformers/models/flex_olmo/modular_flex_olmo.py +++ b/src/transformers/models/flex_olmo/modular_flex_olmo.py @@ -22,7 +22,7 @@ from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring from ...utils.generic import OutputRecorder, check_model_inputs @@ -36,7 +36,7 @@ ) -class FlexOlmoConfig(PreTrainedConfig): +class FlexOlmoConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`FlexOlmoModel`]. It is used to instantiate an FlexOlmo model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -186,15 +186,8 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.norm_topk_prob = norm_topk_prob - - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index 0fcff0f8fd51..19f115d03012 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -25,7 +25,7 @@ logger = logging.get_logger(__name__) -class FuyuConfig(PreTrainedConfig): +class FuyuConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`FuyuForCausalLM`]. It is used to instantiate an Fuyu model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -127,8 +127,6 @@ def __init__( **kwargs, ): if text_config is None: - rope_parameters = rope_parameters if rope_parameters is not None else {} - rope_parameters["partial_rotary_factor"] = kwargs.get("partial_rotary_factor", 0.5) text_config = { "vocab_size": vocab_size, "max_position_embeddings": max_position_embeddings, @@ -170,15 +168,11 @@ def __init__( self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout self.image_token_id = image_token_id - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=25000.0, **kwargs) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 25000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) + self.text_config.rope_parameters["partial_rotary_factor"] = self.rope_parameters["partial_rotary_factor"] super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index 2ee5489b982e..822da444e162 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class 
GemmaConfig(PreTrainedConfig): +class GemmaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate an Gemma model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -152,14 +152,8 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.use_bidirectional_attention = use_bidirectional_attention - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index b9f6d67aef5d..e2671ab54685 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -24,7 +24,7 @@ from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...tokenization_utils import AddedToken, PreTrainedTokenizer @@ -53,7 +53,7 @@ logger = logging.get_logger(__name__) -class GemmaConfig(PreTrainedConfig): +class 
GemmaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate an Gemma model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -180,14 +180,8 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.use_bidirectional_attention = use_bidirectional_attention - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py index c197f85d9744..1bae4315d3f6 100644 --- a/src/transformers/models/gemma2/configuration_gemma2.py +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class Gemma2Config(PreTrainedConfig): +class Gemma2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Gemma2Model`]. It is used to instantiate an Gemma2 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -153,13 +153,6 @@ def __init__( use_bidirectional_attention: Optional[bool] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -180,10 +173,6 @@ def __init__( self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types self.use_bidirectional_attention = use_bidirectional_attention - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -191,9 +180,15 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) __all__ = ["Gemma2Config"] diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index 14a89c323cb8..74a010c680f7 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -29,8 +29,8 @@ from ...modeling_rope_utils import ( ROPE_INIT_FUNCTIONS, RopeParameters, + RotaryEmbeddingConfigMixin, dynamic_rope_update, - rope_config_standardize_and_validate, ) from ...modeling_utils import 
ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack @@ -53,7 +53,7 @@ logger = logging.get_logger(__name__) -class Gemma2Config(PreTrainedConfig): +class Gemma2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Gemma2Model`]. It is used to instantiate an Gemma2 model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -181,13 +181,6 @@ def __init__( use_bidirectional_attention: Optional[bool] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -208,10 +201,6 @@ def __init__( self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types self.use_bidirectional_attention = use_bidirectional_attention - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -219,9 +208,15 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class Gemma2RMSNorm(GemmaRMSNorm): diff --git 
a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index fdabac513e1c..c2c1e3e8bf14 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -22,7 +22,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging from ..siglip import SiglipVisionConfig @@ -30,7 +30,7 @@ logger = logging.get_logger(__name__) -class Gemma3TextConfig(PreTrainedConfig): +class Gemma3TextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -160,13 +160,6 @@ def __init__( use_bidirectional_attention: Optional[bool] = False, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -187,18 +180,6 @@ def __init__( self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` - # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format - default_rope_params = { - "sliding_attention": {"rope_type": "default"}, - "full_attention": {"rope_type": "default"}, - } - self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params - if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - self.rope_parameters["full_attention"].update(rope_scaling) - self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("rope_theta", 1_000_000.0)) - self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("rope_local_base_freq", 10000.0)) - self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: self.sliding_window = (self.sliding_window // 2) + 1 # due to fa we set exclusive bounds @@ -213,8 +194,41 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta={"global": 1_000_000.0, "local": 10_000.0}, **kwargs) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + """Build per-layer RoPE params; legacy `rope_scaling` only updates `full_attention`.""" + rope_scaling = kwargs.pop("rope_scaling", None) + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format.
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + ) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate() + return kwargs class Gemma3Config(PreTrainedConfig): diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index fd18a92a071a..b47355418c58 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -30,7 +30,6 @@ ROPE_INIT_FUNCTIONS, RopeParameters, dynamic_rope_update, - rope_config_standardize_and_validate, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack @@ -175,13 +174,6 @@ def __init__( use_bidirectional_attention: Optional[bool] = False, **kwargs, ): - PreTrainedConfig.__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -202,18 +194,6 @@ def __init__( self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` - # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format - default_rope_params = { - "sliding_attention": {"rope_type": "default"}, - "full_attention": {"rope_type": "default"}, - } - self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params - if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - self.rope_parameters["full_attention"].update(rope_scaling) - self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("rope_theta", 1_000_000.0)) - self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("rope_local_base_freq", 10000.0)) - self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: self.sliding_window = (self.sliding_window // 2) + 1 # due to fa we set exclusive bounds @@ -228,8 +208,41 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta={"global": 1_000_000.0, "local": 10_000.0}, **kwargs) + + PreTrainedConfig.__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + """Build per-layer RoPE params; legacy `rope_scaling` only updates `full_attention`.""" + rope_scaling = kwargs.pop("rope_scaling", None) + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format.
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + ) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate() + return kwargs class Gemma3Config(PreTrainedConfig): diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index 43d30f7af8a5..72ad8286a122 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -23,7 +23,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import is_timm_available, logging, requires_backends @@ -34,7 +34,7 @@ logger = logging.get_logger(__name__) -class Gemma3nTextConfig(PreTrainedConfig): +class Gemma3nTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Gemma3nTextModel`]. It is used to instantiate an Gemma3nTextModel model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -192,13 +192,6 @@ def __init__( activation_sparsity_pattern: Optional[Union[float, Sequence[float]]] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs, - ) - if isinstance(intermediate_size, Sequence) and (intsize_len := len(intermediate_size)) != num_hidden_layers: raise ValueError( "intermediate_size must have an explicit intermediate size for every layer or one for all layers. " @@ -235,21 +228,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) - # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` - # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format - default_rope_params = { - "sliding_attention": {"rope_type": "default"}, - "full_attention": {"rope_type": "default"}, - } - self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params - if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - self.rope_parameters["full_attention"].update(rope_scaling) - self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("rope_theta", 1_000_000.0)) - self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("rope_local_base_freq", 10000.0)) - - # Validate the correctness of rotary position embeddings parameters - rope_config_standardize_and_validate(self) - self.hidden_size_per_layer_input = hidden_size_per_layer_input self.num_kv_shared_layers = num_kv_shared_layers @@ -270,6 +248,40 @@ def __init__( f"Expected {num_hidden_layers} values but got {len_asp}." 
) self.activation_sparsity_pattern = activation_sparsity_pattern + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta={"global": 1_000_000.0, "local": 10_000.0}, **kwargs) + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + + def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + """Build per-layer RoPE params; legacy `rope_scaling` only updates `full_attention`.""" + rope_scaling = kwargs.pop("rope_scaling", None) + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + ) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate() + return kwargs class Gemma3nAudioConfig(PreTrainedConfig): diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index 5b982d93a314..1f7cbae7fce0 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -28,7 +28,7 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import
BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -203,13 +203,6 @@ def __init__( activation_sparsity_pattern: Optional[Union[float, Sequence[float]]] = None, **kwargs, ): - PreTrainedConfig.__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs, - ) - if isinstance(intermediate_size, Sequence) and (intsize_len := len(intermediate_size)) != num_hidden_layers: raise ValueError( "intermediate_size must have an explicit intermediate size for every layer or one for all layers. " @@ -246,21 +239,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) - # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` - # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format - default_rope_params = { - "sliding_attention": {"rope_type": "default"}, - "full_attention": {"rope_type": "default"}, - } - self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params - if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - self.rope_parameters["full_attention"].update(rope_scaling) - self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("rope_theta", 1_000_000.0)) - self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("rope_local_base_freq", 10000.0)) - - # Validate the correctness of rotary position embeddings parameters - rope_config_standardize_and_validate(self) - self.hidden_size_per_layer_input = hidden_size_per_layer_input self.num_kv_shared_layers = num_kv_shared_layers @@ -281,6 +259,40 @@ def __init__( f"Expected {num_hidden_layers} values but got {len_asp}." ) self.activation_sparsity_pattern = activation_sparsity_pattern + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta={"global": 1_000_000.0, "local": 10_000.0}, **kwargs) + + PreTrainedConfig.__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + + def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + """Build per-layer RoPE params; legacy `rope_scaling` only updates `full_attention`.""" + rope_scaling = kwargs.pop("rope_scaling", None) + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format.
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + ) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate() + return kwargs class Gemma3nAudioConfig(PreTrainedConfig): diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 61054d1975d8..4b7eda369f5d 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -17,10 +17,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class GlmConfig(PreTrainedConfig): +class GlmConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`GlmModel`]. It is used to instantiate an Glm model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -140,16 +140,10 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index 21933cddf48b..7535b003982f 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -17,10 +17,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class Glm4Config(PreTrainedConfig): +class Glm4Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Glm4Model`]. It is used to instantiate an Glm4 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -140,16 +140,10 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py index 0861d319426b..cb2c762a8c92 100644 --- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py +++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class Glm4MoeConfig(PreTrainedConfig): +class Glm4MoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Glm4MoeModel`]. It is used to instantiate a Glm4Moe model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -178,16 +178,10 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - # MoE arguments self.moe_intermediate_size = moe_intermediate_size self.num_experts_per_tok = num_experts_per_tok diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py index d38460ca212f..a5548c70133b 100644 --- a/src/transformers/models/glm4_moe/modular_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py @@ -20,7 +20,7 @@ from torch import nn from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging from ..cohere.modeling_cohere import CohereAttention from ..deepseek_v3.modeling_deepseek_v3 import ( @@ -39,7 +39,7 @@ logger = logging.get_logger(__name__) -class Glm4MoeConfig(PreTrainedConfig): +class Glm4MoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Glm4MoeModel`]. It is used to instantiate a Glm4Moe model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -192,16 +192,10 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - # MoE arguments self.moe_intermediate_size = moe_intermediate_size self.num_experts_per_tok = num_experts_per_tok diff --git a/src/transformers/models/glm4v/configuration_glm4v.py b/src/transformers/models/glm4v/configuration_glm4v.py index bc10d3118de5..7239054465c9 100644 --- a/src/transformers/models/glm4v/configuration_glm4v.py +++ b/src/transformers/models/glm4v/configuration_glm4v.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin class Glm4vVisionConfig(PreTrainedConfig): @@ -117,7 +117,7 @@ def __init__( self.attention_dropout = attention_dropout -class Glm4vTextConfig(PreTrainedConfig): +class Glm4vTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a GLM-4.1V model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -232,14 +232,8 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, ignore_keys={"mrope_section"}, **kwargs) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 06c6347b165f..5956e6247188 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -31,7 +31,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -154,7 +154,7 @@ def __init__( self.attention_dropout = attention_dropout -class Glm4vTextConfig(PreTrainedConfig): +class Glm4vTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Glm4vModel`]. 
It is used to instantiate a GLM-4.1V model according to the specified arguments, defining the model architecture. Instantiating a @@ -269,14 +269,8 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, ignore_keys={"mrope_section"}, **kwargs) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index 9f0e4551dbcf..ff6028d0159f 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin class Glm4vMoeVisionConfig(PreTrainedConfig): @@ -117,7 +117,7 @@ def __init__( self.attention_dropout = attention_dropout -class Glm4vMoeTextConfig(PreTrainedConfig): +class Glm4vMoeTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Glm4vMoeModel`]. It is used to instantiate a GLM-4.5V model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -252,7 +252,6 @@ def __init__( router_aux_loss_coef: Optional[float] = 0.0001, **kwargs, ): - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -267,16 +266,10 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, ignore_keys={"mrope_section"}, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) - # MoE arguments self.moe_intermediate_size = moe_intermediate_size self.num_experts_per_tok = num_experts_per_tok @@ -288,6 +281,7 @@ def __init__( self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = norm_topk_prob self.router_aux_loss_coef = router_aux_loss_coef + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) class Glm4vMoeConfig(PreTrainedConfig): diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index 5bb557b51065..129080d4876c 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -23,7 +23,7 @@ from ...masking_utils import create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from 
...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, logging @@ -66,7 +66,7 @@ class Glm4vMoeRMSNorm(Glm4MoeRMSNorm): pass -class Glm4vMoeTextConfig(Glm4MoeConfig): +class Glm4vMoeTextConfig(Glm4MoeConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Glm4vMoeModel`]. It is used to instantiate a GLM-4.5V model according to the specified arguments, defining the model architecture. Instantiating a @@ -198,7 +198,6 @@ def __init__( router_aux_loss_coef: Optional[float] = 0.0001, **kwargs, ): - PreTrainedConfig.__init__(self, tie_word_embeddings=tie_word_embeddings, **kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -213,16 +212,10 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, ignore_keys={"mrope_section"}, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) - # MoE arguments self.moe_intermediate_size = moe_intermediate_size 
self.num_experts_per_tok = num_experts_per_tok @@ -234,6 +227,7 @@ def __init__( self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = norm_topk_prob self.router_aux_loss_coef = router_aux_loss_coef + PreTrainedConfig.__init__(self, tie_word_embeddings=tie_word_embeddings, **kwargs) class Glm4vMoeConfig(Glm4vConfig): diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index d727294b5ff0..128c7ab763c2 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class GPTNeoXConfig(PreTrainedConfig): +class GPTNeoXConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`GPTNeoXModel`]. It is used to instantiate an GPTNeoX model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -145,17 +145,12 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache self.use_parallel_residual = use_parallel_residual + self.attention_bias = attention_bias + self.rope_parameters = rope_parameters - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 0.25) - self.attention_bias = attention_bias - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", 10000.0)) - rope_config_standardize_and_validate(self) if self.hidden_size % self.num_attention_heads != 0: raise ValueError( "The hidden size is not divisible by the number of attention heads! Make sure to update them!" 
@@ -164,5 +159,16 @@ def __init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs ) + def convert_rope_params_to_dict(self, default_theta=10_000.0, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", default_theta)) + self.standardize_rope_params() + self.validate() + return kwargs + __all__ = ["GPTNeoXConfig"] diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index 9fced0e6fa04..185772a9d784 100644 --- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class GPTNeoXJapaneseConfig(PreTrainedConfig): +class GPTNeoXJapaneseConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`GPTNeoXModelJapanese`]. It is used to instantiate a GPTNeoX model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -112,18 +112,24 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 1.0) self.attention_dropout = attention_dropout self.hidden_dropout = hidden_dropout + self.rope_parameters = rope_parameters - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", 10000.0)) - rope_config_standardize_and_validate(self) + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 1.0) super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + def convert_rope_params_to_dict(self, default_theta=10_000.0, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", default_theta)) + self.standardize_rope_params() + self.validate() + return kwargs + __all__ = ["GPTNeoXJapaneseConfig"] diff --git a/src/transformers/models/gpt_oss/configuration_gpt_oss.py b/src/transformers/models/gpt_oss/configuration_gpt_oss.py index c39f0f6b1e54..ab2eab4b1535 100644 --- a/src/transformers/models/gpt_oss/configuration_gpt_oss.py +++ b/src/transformers/models/gpt_oss/configuration_gpt_oss.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import
PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters class GptOssConfig(PreTrainedConfig): @@ -109,14 +109,8 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.output_router_logits = output_router_logits self.use_cache = use_cache - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 150000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=150000, **kwargs) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/granite/configuration_granite.py b/src/transformers/models/granite/configuration_granite.py index 0000113cc506..262c2e77dda3 100644 --- a/src/transformers/models/granite/configuration_granite.py +++ b/src/transformers/models/granite/configuration_granite.py @@ -22,14 +22,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class GraniteConfig(PreTrainedConfig): +class GraniteConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`GraniteModel`]. It is used to instantiate an Granite model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -175,14 +175,8 @@ def __init__( self.logits_scaling = logits_scaling self.residual_multiplier = residual_multiplier self.attention_multiplier = attention_multiplier - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/granitemoe/configuration_granitemoe.py b/src/transformers/models/granitemoe/configuration_granitemoe.py index 100aacc4ae37..a2a69c28130b 100644 --- a/src/transformers/models/granitemoe/configuration_granitemoe.py +++ b/src/transformers/models/granitemoe/configuration_granitemoe.py @@ -22,14 +22,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class GraniteMoeConfig(PreTrainedConfig): +class GraniteMoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`GraniteMoeModel`]. It is used to instantiate an GraniteMoe model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -159,15 +159,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - self.attention_bias = attention_bias self.attention_dropout = attention_dropout @@ -181,6 +172,9 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py index 5546493b1bec..ee8b07698b57 100644 --- a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py @@ -18,14 +18,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class GraniteMoeHybridConfig(PreTrainedConfig): +class GraniteMoeHybridConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`GraniteMoeHybridConfig`]. 
It is used to instantiate an GraniteMoeHybrid model according to the specified arguments, defining the model architecture. @@ -198,14 +198,8 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.shared_intermediate_size = shared_intermediate_size - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) mamba_intermediate = mamba_expand * hidden_size diff --git a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py index bce50e9c161f..cd5bd99d78bd 100644 --- a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py @@ -22,14 +22,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class GraniteMoeSharedConfig(PreTrainedConfig): +class GraniteMoeSharedConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`GraniteMoeSharedModel`]. It is used to instantiate an GraniteMoeShared model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -162,17 +162,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - # this model has rope embedding type, hardcoded for BC - self.position_embedding_type = "rope" - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - self.attention_bias = attention_bias self.attention_dropout = attention_dropout @@ -187,6 +176,11 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.shared_intermediate_size = shared_intermediate_size + # this model has rope embedding type, hardcoded for BC + self.position_embedding_type = "rope" + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/helium/configuration_helium.py b/src/transformers/models/helium/configuration_helium.py index 03ca67855307..380ec689a1a6 100644 --- a/src/transformers/models/helium/configuration_helium.py +++ b/src/transformers/models/helium/configuration_helium.py @@ -17,10 +17,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class HeliumConfig(PreTrainedConfig): +class HeliumConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`HeliumModel`]. 
It is used to instantiate an Helium model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -147,14 +147,8 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 100000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=100000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py index 5549a6f1fdf7..479c202aaa7a 100644 --- a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +++ b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class HunYuanDenseV1Config(PreTrainedConfig): +class HunYuanDenseV1Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`HunYuanDenseV1Config`]. It is used to instantiate an HunYuan model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -141,14 +141,8 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) # TODO needs model-specific validation? + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, @@ -158,10 +152,11 @@ def __init__( **kwargs, ) - def _rope_parameters_validation(self): + def validate(self, ignore_keys=None): """ Validate the `rope_parameters` configuration. 
""" + super().validate(ignore_keys=ignore_keys) if self.rope_parameters is None: return diff --git a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py index 6b995ffcb8ea..fd3b4b5d7944 100644 --- a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +++ b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py @@ -17,14 +17,14 @@ from typing import Optional, Union from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class HunYuanMoEV1Config(PreTrainedConfig): +class HunYuanMoEV1Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`HunYuanMoEV1Model`]. It is used to instantiate an HunYuan model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -157,14 +157,8 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/janus/configuration_janus.py b/src/transformers/models/janus/configuration_janus.py index 4f4f29e21741..c7e9eef4e5f3 100644 --- a/src/transformers/models/janus/configuration_janus.py +++ b/src/transformers/models/janus/configuration_janus.py @@ -20,6 +20,7 @@ # limitations under the License. from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RotaryEmbeddingConfigMixin from ...utils import logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -122,7 +123,7 @@ def __init__( self.num_image_tokens = num_image_tokens -class JanusVQVAEConfig(PreTrainedConfig): +class JanusVQVAEConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`JanusVQVAEModel`]. It is used to instantiate a `JanusVQVAEModel` according to the specified arguments, defining the model architecture. 
diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py index 226b5154bd0a..47c71e708ad8 100644 --- a/src/transformers/models/jetmoe/configuration_jetmoe.py +++ b/src/transformers/models/jetmoe/configuration_jetmoe.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class JetMoeConfig(PreTrainedConfig): +class JetMoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`JetMoeModel`]. It is used to instantiate a JetMoe model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -147,14 +147,8 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id self.rms_norm_eps = rms_norm_eps - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py index 
9eb2d20320bb..0b2b3a19e902 100644 --- a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -183,14 +183,8 @@ def __init__( self.attention_dropout = attention_dropout self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.sliding_window = sliding_window - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/lfm2/configuration_lfm2.py b/src/transformers/models/lfm2/configuration_lfm2.py index 9d1acae3f1ea..3e27aa05715c 100644 --- a/src/transformers/models/lfm2/configuration_lfm2.py +++ b/src/transformers/models/lfm2/configuration_lfm2.py @@ -14,7 +14,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters class Lfm2Config(PreTrainedConfig): @@ -148,20 
+148,14 @@ def __init__( self.block_multiple_of = block_multiple_of self.block_ffn_dim_multiplier = block_ffn_dim_multiplier self.block_auto_adjust_ff_dim = block_auto_adjust_ff_dim - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: full_attn_idxs = full_attn_idxs if full_attn_idxs is not None else list(range(num_hidden_layers)) self.layer_types = ["full_attention" if i in full_attn_idxs else "conv" for i in range(num_hidden_layers)] - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("theta", kwargs.get("rope_theta", 1000000.0)) - rope_config_standardize_and_validate(self) - + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=1000000, **kwargs) tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings) # to fit original config keys super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py b/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py index a0e26e2a25f0..dd39648bc0a9 100644 --- a/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py +++ b/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py @@ -14,10 +14,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class Lfm2MoeConfig(PreTrainedConfig): +class Lfm2MoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Lfm2MoeModel`]. 
It is used to instantiate a LFM2 Moe model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -136,10 +136,6 @@ def __init__( self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.max_position_embeddings = max_position_embeddings self.use_cache = use_cache self.norm_eps = norm_eps @@ -162,10 +158,8 @@ def __init__( self.norm_topk_prob = norm_topk_prob self.layer_types = layer_types - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters["rope_theta"] = kwargs.get("theta", kwargs.get("rope_theta", 1000000.0)) - rope_config_standardize_and_validate(self) - + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=1000000, **kwargs) tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings) # to fit original config keys super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index 1dc0a40d7e15..dfdc02f03d91 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class LlamaConfig(PreTrainedConfig): +class LlamaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`LlamaModel`]. 
It is used to instantiate an LLaMA model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -171,14 +171,8 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/llama4/configuration_llama4.py b/src/transformers/models/llama4/configuration_llama4.py index 5d16358beeb8..1e7a0ba39f20 100644 --- a/src/transformers/models/llama4/configuration_llama4.py +++ b/src/transformers/models/llama4/configuration_llama4.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class Llama4VisionConfig(PreTrainedConfig): +class Llama4VisionConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Llama4VisionModel`]. It is used to instantiate a Llama4 vision model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -124,18 +124,13 @@ def __init__( self.projector_dropout = projector_dropout self.attention_dropout = attention_dropout self.vision_feature_select_strategy = vision_feature_select_strategy - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__(**kwargs) -class Llama4TextConfig(PreTrainedConfig): +class Llama4TextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Llama4TextModel`]. It is used to instantiate a Llama4 text model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -286,13 +281,6 @@ def __init__( attn_scale=0.1, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.attn_temperature_tuning = attn_temperature_tuning self.attn_scale = attn_scale self.floor_scale = floor_scale @@ -316,11 +304,6 @@ def __init__( self.attention_dropout = attention_dropout self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.use_qk_norm = use_qk_norm - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.num_experts_per_tok = num_experts_per_tok self.num_local_experts = num_local_experts @@ -353,9 +336,15 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=500000, **kwargs) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class Llama4Config(PreTrainedConfig): diff --git a/src/transformers/models/longcat_flash/configuration_longcat_flash.py b/src/transformers/models/longcat_flash/configuration_longcat_flash.py index 307faae51354..11324a287a7e 100644 --- a/src/transformers/models/longcat_flash/configuration_longcat_flash.py +++ b/src/transformers/models/longcat_flash/configuration_longcat_flash.py @@ -18,10 +18,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig 
-from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class LongcatFlashConfig(PreTrainedConfig): +class LongcatFlashConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`LongcatFlashModel`]. It is used to instantiate a LongCat Flash model according to the specified arguments, defining the model architecture. Instantiating a @@ -210,20 +210,12 @@ def __init__( self.zero_expert_num = zero_expert_num self.expert_ffn_hidden_size = expert_ffn_hidden_size self.routed_scaling_factor = routed_scaling_factor - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000000.0)) - + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000000, **kwargs) for key in ["beta_fast", "beta_slow", "factor"]: if key in self.rope_parameters: self.rope_parameters[key] = float(self.rope_parameters[key]) - rope_config_standardize_and_validate(self) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py index 46263fce9f1d..986fda46900d 100644 --- a/src/transformers/models/mimi/configuration_mimi.py +++ b/src/transformers/models/mimi/configuration_mimi.py @@ -20,14 +20,14 @@ import numpy as np from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, 
RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class MimiConfig(PreTrainedConfig): +class MimiConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of an [`MimiModel`]. It is used to instantiate a Mimi model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -221,14 +221,8 @@ def __init__( self.head_dim = head_dim or hidden_size // num_attention_heads self.layer_scale_initial_scale = layer_scale_initial_scale self.attention_bias = attention_bias - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) # Handle backward compatibility for frame_rate: # If frame_rate is explicitly provided, use it (backward compatibility) diff --git a/src/transformers/models/minimax/configuration_minimax.py b/src/transformers/models/minimax/configuration_minimax.py index 750b98acc81b..10c896204f31 100644 --- a/src/transformers/models/minimax/configuration_minimax.py +++ b/src/transformers/models/minimax/configuration_minimax.py @@ -23,10 +23,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class MiniMaxConfig(PreTrainedConfig): +class MiniMaxConfig(PreTrainedConfig, 
RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate an MiniMax model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -221,10 +221,6 @@ def __init__( self.linear_attn_beta_factor = linear_attn_beta_factor self.mlp_alpha_factor = mlp_alpha_factor self.mlp_beta_factor = mlp_beta_factor - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -232,9 +228,8 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=1000000, **kwargs) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index 0072cc7d9ea6..f071f991d3ea 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -28,7 +28,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging from ...utils.generic import OutputRecorder, 
check_model_inputs @@ -51,7 +51,7 @@ logger = logging.get_logger(__name__) -class MiniMaxConfig(PreTrainedConfig): +class MiniMaxConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate an MiniMax model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -246,10 +246,6 @@ def __init__( self.linear_attn_beta_factor = linear_attn_beta_factor self.mlp_alpha_factor = mlp_alpha_factor self.mlp_beta_factor = mlp_beta_factor - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -257,9 +253,8 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=1000000, **kwargs) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/ministral/configuration_ministral.py b/src/transformers/models/ministral/configuration_ministral.py index c3ad12827dbf..1aef5af2b0e2 100644 --- a/src/transformers/models/ministral/configuration_ministral.py +++ b/src/transformers/models/ministral/configuration_ministral.py @@ -7,10 +7,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class 
MinistralConfig(PreTrainedConfig): +class MinistralConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`MinistralModel`]. It is used to instantiate an Ministral model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -130,13 +130,6 @@ def __init__( layer_types: Optional[list[str]] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -157,19 +150,21 @@ def __init__( self.use_cache = use_cache self.attention_dropout = attention_dropout self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ "sliding_attention" if self.sliding_window is not None else "full_attention" ] * num_hidden_layers - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) __all__ = ["MinistralConfig"] diff --git a/src/transformers/models/ministral/modular_ministral.py b/src/transformers/models/ministral/modular_ministral.py index e67260f12eea..8c26d5dc6f84 100644 --- a/src/transformers/models/ministral/modular_ministral.py +++ 
b/src/transformers/models/ministral/modular_ministral.py @@ -7,7 +7,7 @@ from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring from ...utils.generic import check_model_inputs @@ -131,14 +131,6 @@ def __init__( layer_types: Optional[list[str]] = None, **kwargs, ): - PreTrainedConfig.__init__( - self, - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -159,19 +151,22 @@ def __init__( self.use_cache = use_cache self.attention_dropout = attention_dropout self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ "sliding_attention" if self.sliding_window is not None else "full_attention" ] * num_hidden_layers - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + PreTrainedConfig.__init__( + self, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class 
MinistralMLP(Qwen2MLP): diff --git a/src/transformers/models/mistral/configuration_mistral.py b/src/transformers/models/mistral/configuration_mistral.py index 454f72c6a0d7..0c6cb6e854b4 100644 --- a/src/transformers/models/mistral/configuration_mistral.py +++ b/src/transformers/models/mistral/configuration_mistral.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class MistralConfig(PreTrainedConfig): +class MistralConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an Mistral model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -166,14 +166,8 @@ def __init__( "Detected Mistral model with layer_types. Consider using AutoModel or Ministral classes instead to enable alternating attention compatibility." 
) - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index 2922f316eb6f..fda6269afdf0 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class MixtralConfig(PreTrainedConfig): +class MixtralConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`MixtralModel`]. It is used to instantiate an Mixtral model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -186,14 +186,8 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.router_jitter_noise = router_jitter_noise - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=1000000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index c89085162254..fa4132cc6926 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ b/src/transformers/models/mllama/configuration_mllama.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_standardize_and_validate +from ...modeling_rope_utils import RotaryEmbeddingConfigMixin from ...utils import logging @@ -138,7 +138,7 @@ def max_aspect_ratio_id(self) -> int: return len(self.supported_aspect_ratios) -class MllamaTextConfig(PreTrainedConfig): +class MllamaTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`MllamaTextModel`]. It is used to instantiate an Mllama text model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -247,14 +247,8 @@ def __init__( self.dropout = dropout self.hidden_act = hidden_act self.max_position_embeddings = max_position_embeddings - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 500000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=500000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index 40a4d66896fa..460740a6756e 100644 --- a/src/transformers/models/modernbert/configuration_modernbert.py +++ b/src/transformers/models/modernbert/configuration_modernbert.py @@ -22,10 +22,10 @@ from typing import Literal, Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class ModernBertConfig(PreTrainedConfig): +class ModernBertConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate an ModernBert model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -216,21 +216,8 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. 
If we find `rope_parameters` - # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format - default_rope_params = { - "sliding_attention": {"rope_type": "default"}, - "full_attention": {"rope_type": "default"}, - } - self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params - if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - self.rope_parameters["full_attention"].update(rope_scaling) - self.rope_parameters["sliding_attention"].update(rope_scaling) - self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("global_rope_theta", 160_000.0)) - self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("local_rope_theta", 10000.0)) - - # Validate the correctness of rotary position embeddings parameters - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta={"global": 160_000.0, "local": 10_000.0}, **kwargs) super().__init__( pad_token_id=pad_token_id, @@ -241,6 +228,32 @@ def __init__( **kwargs, ) + def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["sliding_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + ) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate() + return kwargs + def to_dict(self): output = super().to_dict() output.pop("reference_compile", None) diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index 3c280f44b14a..333267ab3add 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -35,7 +35,7 @@ SequenceClassifierOutput, TokenClassifierOutput, ) -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, is_flash_attn_2_available, logging from ...utils.import_utils import is_triton_available @@ -53,7 +53,7 @@ logger = logging.get_logger(__name__) -class ModernBertConfig(PreTrainedConfig): +class ModernBertConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate an ModernBert model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -244,21 +244,8 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` - # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format - default_rope_params = { - "sliding_attention": {"rope_type": "default"}, - "full_attention": {"rope_type": "default"}, - } - self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params - if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - self.rope_parameters["full_attention"].update(rope_scaling) - self.rope_parameters["sliding_attention"].update(rope_scaling) - self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("global_rope_theta", 160_000.0)) - self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("local_rope_theta", 10000.0)) - - # Validate the correctness of rotary position embeddings parameters - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta={"global": 160_000.0, "local": 10_000.0}, **kwargs) super().__init__( pad_token_id=pad_token_id, @@ -269,6 +256,32 @@ def __init__( **kwargs, ) + def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["sliding_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + ) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate() + return kwargs + def to_dict(self): output = super().to_dict() output.pop("reference_compile", None) diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py index b693ec38a161..ce402c1cb647 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class ModernBertDecoderConfig(PreTrainedConfig): +class ModernBertDecoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`ModernBertDecoderModel`]. It is used to instantiate a ModernBert decoder model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -193,24 +193,10 @@ def __init__( else: self.layer_types.append("full_attention") - # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` - # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format - default_rope_params = { - "sliding_attention": {"rope_type": "default"}, - "full_attention": {"rope_type": "default"}, - } - self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params - if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - self.rope_parameters["full_attention"].update(rope_scaling) - self.rope_parameters["sliding_attention"].update(rope_scaling) - self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("global_rope_theta", 160_000.0)) - self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("local_rope_theta", 10000.0)) - - # Validate the correctness of rotary position embeddings parameters - rope_config_standardize_and_validate(self) - # NOTE: sliding window numbers matches ModernBERT but is only half of it self.sliding_window = local_attention // 2 if local_attention else -1 + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta={"global": 160_000.0, "local": 10_000.0}, **kwargs) super().__init__( pad_token_id=pad_token_id, @@ -221,5 +207,31 @@ def __init__( **kwargs, ) + def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["sliding_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + ) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate() + return kwargs + __all__ = ["ModernBertDecoderConfig"] diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index b137b7383314..b15d6ba07621 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -28,7 +28,7 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -46,7 +46,7 @@ logger = logging.get_logger(__name__) -class ModernBertDecoderConfig(PreTrainedConfig): +class 
ModernBertDecoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`ModernBertDecoderModel`]. It is used to instantiate a ModernBert decoder model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -214,24 +214,10 @@ def __init__( else: self.layer_types.append("full_attention") - # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` - # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format - default_rope_params = { - "sliding_attention": {"rope_type": "default"}, - "full_attention": {"rope_type": "default"}, - } - self.rope_parameters = rope_parameters if rope_parameters is not None else default_rope_params - if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - self.rope_parameters["full_attention"].update(rope_scaling) - self.rope_parameters["sliding_attention"].update(rope_scaling) - self.rope_parameters["full_attention"].setdefault("rope_theta", kwargs.pop("global_rope_theta", 160_000.0)) - self.rope_parameters["sliding_attention"].setdefault("rope_theta", kwargs.pop("local_rope_theta", 10000.0)) - - # Validate the correctness of rotary position embeddings parameters - rope_config_standardize_and_validate(self) - # NOTE: sliding window numbers matches ModernBERT but is only half of it self.sliding_window = local_attention // 2 if local_attention else -1 + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta={"global": 160_000.0, "local": 10_000.0}, **kwargs) super().__init__( pad_token_id=pad_token_id, @@ -242,6 +228,32 @@ def __init__( **kwargs, ) + def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + + # Try to set `rope_scaling` if available, 
otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["sliding_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + ) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate() + return kwargs + class ModernBertDecoderEmbeddings(ModernBertEmbeddings): pass diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index 674ee14d04f1..cc29ec6e6680 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ b/src/transformers/models/moonshine/configuration_moonshine.py @@ -21,10 +21,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class MoonshineConfig(PreTrainedConfig): +class MoonshineConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`MoonshineModel`]. It is used to instantiate a Moonshine model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -174,15 +174,10 @@ def __init__( self.is_encoder_decoder = is_encoder_decoder self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.9) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 5b62c5c53487..2260ac657d6d 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -35,7 +35,7 @@ Seq2SeqLMOutput, Seq2SeqModelOutput, ) -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -47,7 +47,7 @@ logger = logging.get_logger(__name__) -class MoonshineConfig(PreTrainedConfig): +class MoonshineConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`MoonshineModel`]. 
It is used to instantiate a Moonshine model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -197,15 +197,10 @@ def __init__( self.is_encoder_decoder = is_encoder_decoder self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.9) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index c96df98ec41d..b8926b934ad9 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -282,14 +282,8 @@ def __init__( self.ffn_dim = ffn_dim self.rms_norm_eps = rms_norm_eps self.num_codebooks = num_codebooks - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if 
rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) audio_encoder_config = kwargs.pop("audio_encoder_config", {}) audio_encoder_model_type = audio_encoder_config.pop("model_type", "mimi") diff --git a/src/transformers/models/nemotron/configuration_nemotron.py b/src/transformers/models/nemotron/configuration_nemotron.py index e3dc526b9cb8..5f58f0672ef7 100644 --- a/src/transformers/models/nemotron/configuration_nemotron.py +++ b/src/transformers/models/nemotron/configuration_nemotron.py @@ -18,14 +18,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class NemotronConfig(PreTrainedConfig): +class NemotronConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`NemotronModel`]. It is used to instantiate an Nemotron model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -142,16 +142,10 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/olmo/configuration_olmo.py b/src/transformers/models/olmo/configuration_olmo.py index bfab2296711f..065fafddd54a 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -22,14 +22,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class OlmoConfig(PreTrainedConfig): +class OlmoConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`OlmoModel`]. It is used to instantiate an OLMo model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -158,14 +158,8 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.clip_qkv = clip_qkv - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/olmo2/configuration_olmo2.py b/src/transformers/models/olmo2/configuration_olmo2.py index 686c45417425..eafce294cad0 100644 --- a/src/transformers/models/olmo2/configuration_olmo2.py +++ b/src/transformers/models/olmo2/configuration_olmo2.py @@ -27,10 +27,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class Olmo2Config(PreTrainedConfig): +class Olmo2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Olmo2Model`]. It is used to instantiate an OLMo2 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -158,14 +158,8 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/olmo3/configuration_olmo3.py b/src/transformers/models/olmo3/configuration_olmo3.py index 9d5f0d1e08ac..42d7873f5b43 100644 --- a/src/transformers/models/olmo3/configuration_olmo3.py +++ b/src/transformers/models/olmo3/configuration_olmo3.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class Olmo3Config(PreTrainedConfig): +class Olmo3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Olmo3Model`]. It is used to instantiate an OLMo3 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -143,13 +143,6 @@ def __init__( layer_types: Optional[list[str]] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -168,11 +161,6 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rms_norm_eps = rms_norm_eps - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.sliding_window = sliding_window self.layer_types = layer_types if self.layer_types is None: @@ -181,9 +169,15 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) __all__ = ["Olmo3Config"] diff --git a/src/transformers/models/olmo3/modular_olmo3.py b/src/transformers/models/olmo3/modular_olmo3.py index 04e0d67e3012..c24bc09a0e75 100644 --- a/src/transformers/models/olmo3/modular_olmo3.py +++ b/src/transformers/models/olmo3/modular_olmo3.py @@ -25,7 +25,7 @@ from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import 
BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ..gemma2.modeling_gemma2 import Gemma2RotaryEmbedding @@ -41,7 +41,7 @@ ) -class Olmo3Config(PreTrainedConfig): +class Olmo3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Olmo3Model`]. It is used to instantiate an OLMo3 model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -159,13 +159,6 @@ def __init__( layer_types: Optional[list[str]] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -184,11 +177,6 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rms_norm_eps = rms_norm_eps - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - self.sliding_window = sliding_window self.layer_types = layer_types if self.layer_types is None: @@ -197,9 +185,15 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + 
super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class Olmo3RMSNorm(Olmo2RMSNorm): diff --git a/src/transformers/models/olmoe/configuration_olmoe.py b/src/transformers/models/olmoe/configuration_olmoe.py index 4b0c9537deb8..eb61e69bd69e 100644 --- a/src/transformers/models/olmoe/configuration_olmoe.py +++ b/src/transformers/models/olmoe/configuration_olmoe.py @@ -14,10 +14,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class OlmoeConfig(PreTrainedConfig): +class OlmoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`OlmoeModel`]. It is used to instantiate an OLMoE model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -158,14 +158,8 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.norm_topk_prob = norm_topk_prob - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py index 4adf6a40f884..77e09a13a125 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class PersimmonConfig(PreTrainedConfig): +class PersimmonConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`PersimmonModel`]. It is used to instantiate an Persimmon model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -118,16 +118,10 @@ def __init__( self.qk_layernorm = qk_layernorm self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 25000.0)) - rope_config_standardize_and_validate(self) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index a8f17712aa51..e073e1d619fa 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -18,14 +18,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class PhiConfig(PreTrainedConfig): +class PhiConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`PhiModel`]. It is used to instantiate an Phi model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -160,16 +160,10 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache self.qk_layernorm = qk_layernorm - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index c4f16246debf..bd0e01a316d4 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -18,14 +18,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class Phi3Config(PreTrainedConfig): +class Phi3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -163,17 +163,9 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 1.0) - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - self._rope_parameters_adjustment() - self._rope_parameters_validation() self.sliding_window = sliding_window super().__init__( @@ -184,20 +176,24 @@ def __init__( **kwargs, ) - def _rope_parameters_adjustment(self): + def standardize_rope_params(self): """ Adjust the `type` of the `rope_parameters` configuration for backward compatibility. """ + super().standardize_rope_params() rope_parameters_type = self.rope_parameters.get("rope_type", None) # For backward compatibility if previous version used "su" or "yarn" if rope_parameters_type is not None and rope_parameters_type in ["su", "yarn"]: self.rope_parameters["rope_type"] = "longrope" - def _rope_parameters_validation(self): + def validate(self, ignore_keys: Optional[set] = None): """ Validate the `rope_parameters` configuration. 
""" + super().validate(ignore_keys=ignore_keys) + + # Run Phi3 specific validation if not isinstance(self.rope_parameters, dict): raise ValueError(f"`rope_parameters` must be a dictionary but got {self.rope_parameters}") rope_parameters_type = self.rope_parameters.get("rope_type", None) diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index 38a247b87f8a..bc4a6813f67d 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin class Phi4MultimodalVisionConfig(PreTrainedConfig): @@ -243,7 +243,7 @@ def __init__( self.nemo_final_size = length -class Phi4MultimodalConfig(PreTrainedConfig): +class Phi4MultimodalConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Phi4MultimodalModel`]. It is used to instantiate a Phi4Multimodal model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -404,17 +404,9 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 1.0) - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - self._rope_parameters_adjustment() - self._rope_parameters_validation() self.sliding_window = sliding_window super().__init__( @@ -425,20 +417,24 @@ def __init__( **kwargs, ) - def _rope_parameters_adjustment(self): + def standardize_rope_params(self): """ Adjust the `type` of the `rope_parameters` configuration for backward compatibility. """ + super().standardize_rope_params() rope_parameters_type = self.rope_parameters.get("rope_type", None) # For backward compatibility if previous version used "su" or "yarn" if rope_parameters_type is not None and rope_parameters_type in ["su", "yarn"]: self.rope_parameters["rope_type"] = "longrope" - def _rope_parameters_validation(self): + def validate(self, ignore_keys: Optional[set] = None): """ Validate the `rope_parameters` configuration. 
""" + super().validate(ignore_keys=ignore_keys) + + # Run Phi4Multimodal specific validation if not isinstance(self.rope_parameters, dict): raise ValueError(f"`rope_parameters` must be a dictionary but got {self.rope_parameters}") rope_parameters_type = self.rope_parameters.get("rope_type", None) diff --git a/src/transformers/models/phimoe/configuration_phimoe.py b/src/transformers/models/phimoe/configuration_phimoe.py index 3a1bd9a29173..3eb5d29bec1a 100644 --- a/src/transformers/models/phimoe/configuration_phimoe.py +++ b/src/transformers/models/phimoe/configuration_phimoe.py @@ -18,14 +18,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class PhimoeConfig(PreTrainedConfig): +class PhimoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`PhimoeModel`]. It is used to instantiate a Phi-moe model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -167,15 +167,24 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.router_jitter_noise = router_jitter_noise self.input_jitter_noise = input_jitter_noise - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=1000000, **kwargs) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) - rope_config_standardize_and_validate(self) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def validate(self, ignore_keys=None): + """ + Validate the `rope_parameters` configuration. 
+ """ + super().validate(ignore_keys=ignore_keys) + # Run model-specific rope validation if self.rope_parameters["rope_type"] != "default": if "original_max_position_embeddings" in self.rope_parameters: self.original_max_position_embeddings = self.rope_parameters["original_max_position_embeddings"] @@ -190,13 +199,5 @@ def __init__( f"`rope_parameters`'s long_mscale field must be a number, got {rope_parameters_long_mscale}" ) - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - __all__ = ["PhimoeConfig"] diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index f040f6de9d94..f1de0f52e22d 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -16,14 +16,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class PixtralVisionConfig(PreTrainedConfig): +class PixtralVisionConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`PixtralVisionModel`]. It is used to instantiate an Pixtral vision encoder according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -90,8 +90,6 @@ def __init__( initializer_range: Optional[float] = 0.02, **kwargs, ): - super().__init__(**kwargs) - self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers @@ -103,14 +101,9 @@ def __init__( self.hidden_act = hidden_act self.head_dim = hidden_size // num_attention_heads self.initializer_range = initializer_range - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__(**kwargs) __all__ = ["PixtralVisionConfig"] diff --git a/src/transformers/models/qwen2/configuration_qwen2.py b/src/transformers/models/qwen2/configuration_qwen2.py index 203b88d0dd44..72a8bd98a954 100644 --- a/src/transformers/models/qwen2/configuration_qwen2.py +++ b/src/transformers/models/qwen2/configuration_qwen2.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -157,10 +157,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if 
rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -172,9 +168,8 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index e4bb8c4bbd5a..5cf841a78e40 100644 --- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging @@ -211,7 +211,7 @@ def __init__( self.output_dim = output_dim -class Qwen2_5OmniTextConfig(PreTrainedConfig): +class Qwen2_5OmniTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen2_5OmniThinkerForConditionalGeneration`]. It is used to instantiate an Qwen2.5-Omni-Thinker model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -330,10 +330,6 @@ def __init__( attention_dropout: Optional[float] = 0.0, **kwargs, ): - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -354,10 +350,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -369,9 +361,12 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=1000000, ignore_keys={"mrope_section"}, **kwargs) + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class Qwen2_5OmniThinkerConfig(PreTrainedConfig): @@ -494,7 +489,7 @@ def __init__( super().__init__(**kwargs) -class Qwen2_5OmniTalkerConfig(PreTrainedConfig): +class Qwen2_5OmniTalkerConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen2_5OmniTalkerForConditionalGeneration`]. It is used to instantiate an Qwen2.5-Omni-Talker model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -697,10 +692,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.position_id_per_seconds = position_id_per_seconds # zf self.seconds_per_chunk = seconds_per_chunk # zf self.audio_start_token_id = audio_start_token_id # zf @@ -719,14 +710,13 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=1000000, ignore_keys={"mrope_section"}, **kwargs) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) -class Qwen2_5OmniDiTConfig(PreTrainedConfig): +class Qwen2_5OmniDiTConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of the Qwen2_5OmniToken2WavDiT used in the Qwen2.5-Omni-Token2Wav model. It defines the architecture of the DiT model, which is used for generating mel-spectrograms from tokens. 
@@ -822,14 +812,8 @@ def __init__( self.enc_attention_channels = enc_attention_channels self.enc_res2net_scale = enc_res2net_scale self.enc_se_channels = enc_se_channels - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index 349f23c39e6b..811cf124c308 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -45,7 +45,7 @@ from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...generation import GenerationMixin from ...modeling_outputs import BaseModelOutput, ModelOutput -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import ( @@ -244,7 +244,7 @@ def __init__( del self.encoder_layerdrop -class Qwen2_5OmniTextConfig(PreTrainedConfig): +class Qwen2_5OmniTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen2_5OmniThinkerForConditionalGeneration`]. It is used to instantiate an Qwen2.5-Omni-Thinker model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -363,10 +363,6 @@ def __init__( attention_dropout: Optional[float] = 0.0, **kwargs, ): - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -387,10 +383,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -402,9 +394,12 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=1000000, ignore_keys={"mrope_section"}, **kwargs) + super().__init__( + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class Qwen2_5OmniThinkerConfig(PreTrainedConfig): @@ -527,7 +522,7 @@ def __init__( super().__init__(**kwargs) -class Qwen2_5OmniTalkerConfig(PreTrainedConfig): +class Qwen2_5OmniTalkerConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen2_5OmniTalkerForConditionalGeneration`]. It is used to instantiate an Qwen2.5-Omni-Talker model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -730,10 +725,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.position_id_per_seconds = position_id_per_seconds # zf self.seconds_per_chunk = seconds_per_chunk # zf self.audio_start_token_id = audio_start_token_id # zf @@ -752,14 +743,13 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=1000000, ignore_keys={"mrope_section"}, **kwargs) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) -class Qwen2_5OmniDiTConfig(PreTrainedConfig): +class Qwen2_5OmniDiTConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of the Qwen2_5OmniToken2WavDiT used in the Qwen2.5-Omni-Token2Wav model. It defines the architecture of the DiT model, which is used for generating mel-spectrograms from tokens. 
@@ -855,14 +845,8 @@ def __init__( self.enc_attention_channels = enc_attention_channels self.enc_res2net_scale = enc_res2net_scale self.enc_se_channels = enc_se_channels - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index 8c0c4635299f..9b59591230c6 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -27,7 +27,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin class Qwen2_5_VLVisionConfig(PreTrainedConfig): @@ -70,7 +70,7 @@ def __init__( self.initializer_range = initializer_range -class Qwen2_5_VLTextConfig(PreTrainedConfig): +class Qwen2_5_VLTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen2_5_VLTextModel`]. It is used to instantiate a Qwen2-VL model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -202,10 +202,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -217,14 +213,26 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) - rope_type = self.rope_parameters.get("type") or self.rope_parameters.get("rope_type") - if rope_type == "mrope": - self.rope_parameters["rope_type"] = "default" - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=1000000, ignore_keys={"mrope_section"}, **kwargs) + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + def convert_rope_params_to_dict( + self, default_theta: int | float = 10_000.0, ignore_keys: Optional[set] = None, **kwargs + ): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) + if self.rope_parameters.get("rope_type", self.rope_parameters.get("type")) == "mrope": + self.rope_parameters["rope_type"] = "default" + self.standardize_rope_params() + self.validate(ignore_keys=ignore_keys) + return kwargs + class Qwen2_5_VLConfig(PreTrainedConfig): r""" diff --git 
a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py index 91c1c783b5d1..d97ca8ffe59a 100644 --- a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class Qwen2MoeConfig(PreTrainedConfig): +class Qwen2MoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen2MoeModel`]. It is used to instantiate a Qwen2MoE model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -185,10 +185,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -210,9 +206,8 @@ def __init__( ] layer_type_validation(self.layer_types) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py 
b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 935311f87ea3..d74001107b1f 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging @@ -58,7 +58,7 @@ def __init__( self.initializer_range = initializer_range -class Qwen2VLTextConfig(PreTrainedConfig): +class Qwen2VLTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen2VLTextModel`]. It is used to instantiate a Qwen2-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -190,10 +190,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -205,14 +201,26 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) - rope_type = self.rope_parameters.get("type") or self.rope_parameters.get("rope_type") - if rope_type == "mrope": - self.rope_parameters["rope_type"] = "default" - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section"}) + self.rope_parameters = rope_parameters + kwargs = 
self.convert_rope_params_to_dict(default_theta=1000000, ignore_keys={"mrope_section"}, **kwargs) + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + def convert_rope_params_to_dict( + self, default_theta: int | float = 10_000.0, ignore_keys: Optional[set] = None, **kwargs + ): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) + if self.rope_parameters.get("rope_type", self.rope_parameters.get("type")) == "mrope": + self.rope_parameters["rope_type"] = "default" + self.standardize_rope_params() + self.validate(ignore_keys=ignore_keys) + return kwargs + class Qwen2VLConfig(PreTrainedConfig): r""" diff --git a/src/transformers/models/qwen3/configuration_qwen3.py b/src/transformers/models/qwen3/configuration_qwen3.py index 5827c36d6bd6..a108700133a8 100644 --- a/src/transformers/models/qwen3/configuration_qwen3.py +++ b/src/transformers/models/qwen3/configuration_qwen3.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class Qwen3Config(PreTrainedConfig): +class Qwen3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen3Model`]. It is used to instantiate a Qwen3 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -165,10 +165,6 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -180,9 +176,8 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py index 5f30d772e7cf..3ceb04c7199c 100644 --- a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class Qwen3MoeConfig(PreTrainedConfig): +class Qwen3MoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen3MoeModel`]. It is used to instantiate a Qwen3MoE model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -179,14 +179,8 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) # MoE arguments self.decoder_sparse_step = decoder_sparse_step diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py b/src/transformers/models/qwen3_next/configuration_qwen3_next.py index f6f307a2d167..1456c18105cb 100644 --- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py +++ b/src/transformers/models/qwen3_next/configuration_qwen3_next.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class Qwen3NextConfig(PreTrainedConfig): +class Qwen3NextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a Qwen3-Next model according to the specified arguments, defining the model architecture. 
@@ -184,7 +184,6 @@ def __init__( layer_types: Optional[list[str]] = None, **kwargs, ): - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -199,10 +198,8 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.head_dim = head_dim - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.25) self.layer_types = layer_types @@ -214,10 +211,6 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - # linear attention part self.linear_conv_kernel_dim = linear_conv_kernel_dim self.linear_key_head_dim = linear_key_head_dim @@ -235,6 +228,7 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.mlp_only_layers = mlp_only_layers + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) __all__ = ["Qwen3NextConfig"] diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index 2a57746fc71e..ed29ec68420a 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -22,7 +22,7 @@ from typing import 
Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging @@ -171,7 +171,116 @@ def __init__( self.deepstack_visual_indexes = deepstack_visual_indexes -class Qwen3OmniMoeTextConfig(PreTrainedConfig): +class Qwen3OmniMoeTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): + r""" + This is the configuration class to store the configuration of a [`Qwen3OmniMoeTextModel`]. It is used to instantiate a + Qwen3OmniMoeText model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of [Qwen/Qwen3-15B-A2B](https://huggingface.co/Qwen/Qwen3-15B-A2B). + + Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PreTrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen3OmniMoeText model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen3OmniMoeTextModel`] + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 4): + This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`. + + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. 
+ sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + decoder_sparse_step (`int`, *optional*, defaults to 1): + The frequency of the MoE layer. + moe_intermediate_size (`int`, *optional*, defaults to 768): + Intermediate size of the routed expert. + num_experts_per_tok (`int`, *optional*, defaults to 8): + Number of selected experts. + num_experts (`int`, *optional*, defaults to 128): + Number of routed experts. + norm_topk_prob (`bool`, *optional*, defaults to `False`): + Whether to normalize the topk probabilities. + output_router_logits (`bool`, *optional*, defaults to `False`): + Whether or not the router logits should be returned by the model. Enabling this will also + allow the model to output the auxiliary loss, including load balancing loss and router z-loss. + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + mlp_only_layers (`list[int]`, *optional*, defaults to `[]`): + Indicate which layers use Qwen3OmniMoeTextMLP rather than Qwen3OmniMoeTextSparseMoeBlock + The list contains layer index, from 0 to num_layers-1 if we have num_layers layers + If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity. 
+ + ```python + >>> from transformers import Qwen3OmniMoeTextModel, Qwen3OmniMoeTextConfig + + >>> # Initializing a Qwen3OmniMoeText style configuration + >>> configuration = Qwen3OmniMoeTextConfig() + + >>> # Initializing a model from the Qwen3-15B-A2B" style configuration + >>> model = Qwen3OmniMoeTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen3_omni_moe_text" + keys_to_ignore_at_inference = ["past_key_values"] + + # Default tensor parallel plan for base model `Qwen3OmniMoeText` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.experts.gate_up_proj": "local_rowwise", + "layers.*.mlp.experts.down_proj": "local_rowwise", + "layers.*.mlp.experts": "gather", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + def __init__( self, vocab_size: Optional[int] = 3584, @@ -215,14 +324,10 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) + self.rope_parameters = 
rope_parameters + kwargs = self.convert_rope_params_to_dict( + default_theta=1000000, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}, **kwargs + ) # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -341,7 +446,7 @@ def __init__( self.video_token_id = video_token_id -class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig): +class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen3OmniMoeTalkerCodePredictorModel`]. It is used to instantiate a Qwen3OmniMoeTalkerCodePredictor model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -479,10 +584,6 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} self.layer_types = layer_types if self.layer_types is None: @@ -494,9 +595,8 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( tie_word_embeddings=tie_word_embeddings, @@ -505,7 +605,7 @@ def __init__( self.num_code_groups = num_code_groups -class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig): +class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen3OmniMoeTalkerTextModel`]. 
It is used to instantiate a Qwen3OmniMoeTalkerText model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -658,14 +758,8 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -819,7 +913,7 @@ def __init__( super().__init__(**kwargs) -class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig): +class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen3OmniMoeCode2WavConfig`]. It is used to instantiate a Qwen3-Omni code-to-waveform decoder, responsible for converting discrete audio codes into high-fidelity waveforms. 
@@ -908,7 +1002,6 @@ def __init__( attention_dropout=0.0, **kwargs, ): - super().__init__(**kwargs) self.codebook_size = codebook_size self.hidden_size = hidden_size self.max_position_embeddings = max_position_embeddings @@ -926,15 +1019,9 @@ def __init__( self.upsampling_ratios = upsampling_ratios self.decoder_dim = decoder_dim self.attention_dropout = attention_dropout - - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__(**kwargs) @property def layer_types(self): diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index 8af03d507194..4983a696f260 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -42,7 +42,7 @@ MoeCausalLMOutputWithPast, MoeModelOutputWithPast, ) -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import PreTrainedModel from ...processing_utils import ProcessorMixin, Unpack from ...tokenization_utils_base import TextInput @@ -156,7 +156,7 @@ class Qwen3OmniMoeVisionEncoderConfig(Qwen3VLMoeVisionConfig): pass -class Qwen3OmniMoeTextConfig(PreTrainedConfig): +class Qwen3OmniMoeTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the 
configuration of a [`Qwen3OmniMoeTextModel`]. It is used to instantiate a Qwen3OmniMoeText model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -309,14 +309,10 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 1000000.0)) - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict( + default_theta=1000000, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}, **kwargs + ) # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -676,7 +672,7 @@ def __init__( super().__init__(**kwargs) -class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig): +class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen3OmniMoeCode2WavConfig`]. It is used to instantiate a Qwen3-Omni code-to-waveform decoder, responsible for converting discrete audio codes into high-fidelity waveforms. 
@@ -765,7 +761,6 @@ def __init__( attention_dropout=0.0, **kwargs, ): - super().__init__(**kwargs) self.codebook_size = codebook_size self.hidden_size = hidden_size self.max_position_embeddings = max_position_embeddings @@ -783,15 +778,9 @@ def __init__( self.upsampling_ratios = upsampling_ratios self.decoder_dim = decoder_dim self.attention_dropout = attention_dropout - - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__(**kwargs) @property def layer_types(self): diff --git a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py index d21b055af6a4..227e8454d03e 100644 --- a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin class Qwen3VLVisionConfig(PreTrainedConfig): @@ -62,7 +62,7 @@ def __init__( self.deepstack_visual_indexes = deepstack_visual_indexes -class Qwen3VLTextConfig(PreTrainedConfig): +class Qwen3VLTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen3VLTextModel`]. 
It is used to instantiate a Qwen3-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -170,14 +170,10 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 5000000.0)) - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict( + default_theta=5000000, ignore_keys={"mrope_section", "mrope_interleaved"}, **kwargs + ) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index 8d93dc5612a9..e14419657a2c 100644 --- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -30,7 +30,7 @@ from ...masking_utils import create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, dynamic_rope_update, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import ProcessingKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -103,7 +103,7 @@ def __init__( self.deepstack_visual_indexes = deepstack_visual_indexes -class
Qwen3VLTextConfig(PreTrainedConfig): +class Qwen3VLTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen3VLTextModel`]. It is used to instantiate a Qwen3-VL model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -211,14 +211,10 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 5000000.0)) - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict( + default_theta=5000000, ignore_keys={"mrope_section", "mrope_interleaved"}, **kwargs + ) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py index bd642ff3c3f9..617da3ad9886 100644 --- a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class Qwen3VLMoeTextConfig(PreTrainedConfig): +class Qwen3VLMoeTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration
class to store the configuration of a [`Qwen3VLMoeTextModel`]. It is used to instantiate a Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -166,14 +166,10 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.head_dim = head_dim or hidden_size // num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 5000000.0)) - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict( + default_theta=5000000, ignore_keys={"mrope_section", "mrope_interleaved"}, **kwargs + ) # MoE arguments self.decoder_sparse_step = decoder_sparse_step diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py index 2469b533ed09..21f59b7d743b 100644 --- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py @@ -23,7 +23,7 @@ from ...activations import ACT2FN from ...cache_utils import Cache from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -47,7 +47,7 @@ logger = logging.get_logger(__name__) -class Qwen3VLMoeTextConfig(PreTrainedConfig): +class
Qwen3VLMoeTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen3VLMoeTextModel`]. It is used to instantiate a Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -188,14 +188,10 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.head_dim = head_dim or hidden_size // num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 5000000.0)) - rope_config_standardize_and_validate(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict( + default_theta=5000000, ignore_keys={"mrope_section", "mrope_interleaved"}, **kwargs + ) # MoE arguments self.decoder_sparse_step = decoder_sparse_step diff --git a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py index 1c323dfc156c..b172b1897686 100644 --- a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class RecurrentGemmaConfig(PreTrainedConfig): +class
RecurrentGemmaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`RecurrentGemmaModel`]. It is used to instantiate a RecurrentGemma model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -146,16 +146,10 @@ def __init__( self.attention_bias = attention_bias self.w_init_variance_scale = w_init_variance_scale self.final_w_init_variance_scale = 2.0 / self.num_hidden_layers - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/seed_oss/configuration_seed_oss.py b/src/transformers/models/seed_oss/configuration_seed_oss.py index 9a50e32ab3a2..28e41e4c3f42 100644 --- a/src/transformers/models/seed_oss/configuration_seed_oss.py +++ b/src/transformers/models/seed_oss/configuration_seed_oss.py @@ -16,10 +16,10 @@ from typing import Optional from transformers.configuration_utils import PreTrainedConfig -from transformers.modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from transformers.modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class SeedOssConfig(PreTrainedConfig): +class SeedOssConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store 
the configuration of a [`SeedOssModel`]. It is used to instantiate an SeedOss model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -170,14 +170,8 @@ def __init__( self.residual_dropout = residual_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/smollm3/configuration_smollm3.py b/src/transformers/models/smollm3/configuration_smollm3.py index 5954062061ed..f6c699f04d63 100644 --- a/src/transformers/models/smollm3/configuration_smollm3.py +++ b/src/transformers/models/smollm3/configuration_smollm3.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class SmolLM3Config(PreTrainedConfig): +class SmolLM3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`SmolLM3Model`]. It is used to instantiate a SmolLM3 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -151,12 +151,6 @@ def __init__( mlp_bias: Optional[bool] = False, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.mlp_bias = mlp_bias @@ -201,14 +195,15 @@ def __init__( self.layer_types = layer_types layer_type_validation(self.layer_types, self.num_hidden_layers) - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=2000000, **kwargs) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 2000000.0)) - rope_config_standardize_and_validate(self) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) __all__ = ["SmolLM3Config"] diff --git a/src/transformers/models/smollm3/modular_smollm3.py b/src/transformers/models/smollm3/modular_smollm3.py index fce5cac404d1..c68f2c1c15d4 100644 --- a/src/transformers/models/smollm3/modular_smollm3.py +++ b/src/transformers/models/smollm3/modular_smollm3.py @@ -21,7 +21,7 @@ from ...cache_utils import Cache from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...modeling_flash_attention_utils import FlashAttentionKwargs -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import logging @@ -42,7 +42,7 @@ logger = 
logging.get_logger(__name__) -class SmolLM3Config(PreTrainedConfig): +class SmolLM3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`SmolLM3Model`]. It is used to instantiate a SmolLM3 model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -168,12 +168,6 @@ def __init__( mlp_bias: Optional[bool] = False, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.mlp_bias = mlp_bias @@ -218,14 +212,15 @@ def __init__( self.layer_types = layer_types layer_type_validation(self.layer_types, self.num_hidden_layers) - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=2000000, **kwargs) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 2000000.0)) - rope_config_standardize_and_validate(self) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) class SmolLM3RotaryEmbedding(Qwen2RotaryEmbedding): diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index d8f54fa7c6c1..907c28c4b56b 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils 
import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class StableLmConfig(PreTrainedConfig): +class StableLmConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`~StableLmModel`]. It is used to instantiate an StableLM model according to the specified arguments, defining the model @@ -145,16 +145,10 @@ def __init__( self.use_parallel_residual = use_parallel_residual self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.25) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) - super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, diff --git a/src/transformers/models/starcoder2/configuration_starcoder2.py b/src/transformers/models/starcoder2/configuration_starcoder2.py index ecac62aa734f..4d24650c1f54 100644 --- a/src/transformers/models/starcoder2/configuration_starcoder2.py +++ b/src/transformers/models/starcoder2/configuration_starcoder2.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin 
from ...utils import logging logger = logging.get_logger(__name__) -class Starcoder2Config(PreTrainedConfig): +class Starcoder2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -155,14 +155,8 @@ def __init__( self.attention_dropout = attention_dropout self.residual_dropout = residual_dropout self.embedding_dropout = embedding_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/t5gemma/configuration_t5gemma.py b/src/transformers/models/t5gemma/configuration_t5gemma.py index b77f4aac69c5..4a513c04341e 100644 --- a/src/transformers/models/t5gemma/configuration_t5gemma.py +++ b/src/transformers/models/t5gemma/configuration_t5gemma.py @@ -22,10 +22,10 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class T5GemmaModuleConfig(PreTrainedConfig): +class T5GemmaModuleConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a 
[`T5GemmaModuleModel`]. It is used to instantiate an T5GemmaModule model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -150,13 +150,6 @@ def __init__( attn_logit_softcapping: Optional[float] = 50.0, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -176,10 +169,6 @@ def __init__( self.final_logit_softcapping = final_logit_softcapping self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -187,9 +176,15 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class T5GemmaConfig(PreTrainedConfig): diff --git a/src/transformers/models/vaultgemma/configuration_vaultgemma.py b/src/transformers/models/vaultgemma/configuration_vaultgemma.py index aa5798160ccf..9b83891bc688 100644 --- a/src/transformers/models/vaultgemma/configuration_vaultgemma.py +++ b/src/transformers/models/vaultgemma/configuration_vaultgemma.py @@ -22,10 
+22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class VaultGemmaConfig(PreTrainedConfig): +class VaultGemmaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`VaultGemmaModel`]. It is used to instantiate an VaultGemma model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -150,13 +150,6 @@ def __init__( attn_logit_softcapping: Optional[float] = 50.0, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -176,10 +169,6 @@ def __init__( self.final_logit_softcapping = final_logit_softcapping self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} if self.layer_types is None: self.layer_types = [ @@ -187,9 +176,15 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + 
eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) __all__ = ["VaultGemmaConfig"] diff --git a/src/transformers/models/zamba2/configuration_zamba2.py b/src/transformers/models/zamba2/configuration_zamba2.py index b39837785805..9ae04a3f3a20 100644 --- a/src/transformers/models/zamba2/configuration_zamba2.py +++ b/src/transformers/models/zamba2/configuration_zamba2.py @@ -23,10 +23,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_standardize_and_validate +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class Zamba2Config(PreTrainedConfig): +class Zamba2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Zamba2Model`]. It is used to instantiate a Zamba2 model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -173,12 +173,6 @@ def __init__( use_long_context: Optional[bool] = False, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -195,14 +189,8 @@ def __init__( self.attention_dropout = attention_dropout self.use_mem_rope = use_mem_rope self.use_long_context = use_long_context - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - rope_parameters = rope_scaling or rope_parameters - self.rope_parameters = rope_parameters if rope_parameters is not None else {} - - # Validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", 10000.0)) - rope_config_standardize_and_validate(self) + self.rope_parameters = rope_parameters + kwargs = 
self.convert_rope_params_to_dict(default_theta=10000, **kwargs) self.mamba_d_state = mamba_d_state self.mamba_d_conv = mamba_d_conv @@ -246,6 +234,12 @@ def __init__( self.num_logits_to_keep = num_logits_to_keep self.hybrid_layer_ids = [index for index, type in enumerate(self.layers_block_type) if type == "hybrid"] self.use_mem_eff_path = use_mem_eff_path + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) __all__ = ["Zamba2Config"] From f5dd9d5e55cc5499be1a04019c1d6c0d75dedc22 Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 27 Nov 2025 19:07:40 +0100 Subject: [PATCH 15/23] fix tests --- src/transformers/modeling_rope_utils.py | 12 +++---- .../configuration_hunyuan_v1_dense.py | 4 +-- .../models/phi3/configuration_phi3.py | 4 +-- .../configuration_phi4_multimodal.py | 4 +-- .../models/phimoe/configuration_phimoe.py | 4 +-- tests/utils/test_modeling_rope_utils.py | 31 +++++++++---------- 6 files changed, 29 insertions(+), 30 deletions(-) diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index e0582388eaff..885bf864c647 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -639,7 +639,7 @@ def convert_rope_params_to_dict( # Standardize and validate the correctness of rotary position embeddings parameters self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) self.standardize_rope_params() - self.validate(ignore_keys=ignore_keys) + self.validate_rope(ignore_keys=ignore_keys) return kwargs def standardize_rope_params(self): @@ -668,7 +668,7 @@ def standardize_rope_params(self): self.rope_parameters = rope_parameters - def validate(self: PreTrainedConfig, ignore_keys: Optional[set] = None): + def validate_rope(self: PreTrainedConfig, ignore_keys: Optional[set] = None): """ Validate the RoPE config arguments, given a `PreTrainedConfig` object """ @@ -683,7 +683,7 @@ def 
validate(self: PreTrainedConfig, ignore_keys: Optional[set] = None): for rope_parameters in rope_parameters_dict.values(): rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default")) - validation_fn = getattr(self, f"_validate_{rope_type}_parameters") + validation_fn = getattr(self, f"_validate_{rope_type}_rope_parameters") rope_parameters["rope_type"] = rope_type if validation_fn is not None: @@ -721,7 +721,7 @@ def _validate_dynamic_rope_parameters(self, rope_parameters: dict, ignore_keys: if factor is None or not isinstance(factor, float) or factor < 1.0: logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") - def _validate_yarn_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + def _validate_yarn_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): required_keys = {"rope_type", "factor", "rope_theta"} optional_keys = { "attention_factor", @@ -784,7 +784,7 @@ def _validate_yarn_parameters(self, rope_parameters: dict, ignore_keys: Optional "factor) -- we recommend updating both fields for optimal downstream model usage." 
) - def _validate_longrope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + def _validate_longrope_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): required_keys = {"rope_type", "short_factor", "long_factor", "rope_theta"} # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"} @@ -836,7 +836,7 @@ def _validate_longrope_parameters(self, rope_parameters: dict, ignore_keys: Opti f"`rope_parameters`'s attention_factor field must be a float greater than 0, got {attention_factor}" ) - def _validate_llama3_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + def _validate_llama3_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): required_keys = { "rope_type", "factor", diff --git a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py index 479c202aaa7a..bcc329da0792 100644 --- a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +++ b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py @@ -152,11 +152,11 @@ def __init__( **kwargs, ) - def validate(self, ignore_keys=None): + def validate_rope(self, ignore_keys=None): """ Validate the `rope_parameters` configuration. 
""" - super().validate(ignore_keys=ignore_keys) + super().validate_rope(ignore_keys=ignore_keys) if self.rope_parameters is None: return diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index bd0e01a316d4..508652241bcb 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -187,11 +187,11 @@ def standardize_rope_params(self): if rope_parameters_type is not None and rope_parameters_type in ["su", "yarn"]: self.rope_parameters["rope_type"] = "longrope" - def validate(self, ignore_keys: Optional[set] = None): + def validate_rope(self, ignore_keys: Optional[set] = None): """ Validate the `rope_parameters` configuration. """ - super().validate(ignore_keys=ignore_keys) + super().validate_rope(ignore_keys=ignore_keys) # Run Phi3 specific validation if not isinstance(self.rope_parameters, dict): diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index bc4a6813f67d..ed4467c61b5b 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -428,11 +428,11 @@ def standardize_rope_params(self): if rope_parameters_type is not None and rope_parameters_type in ["su", "yarn"]: self.rope_parameters["rope_type"] = "longrope" - def validate(self, ignore_keys: Optional[set] = None): + def validate_rope(self, ignore_keys: Optional[set] = None): """ Validate the `rope_parameters` configuration. 
""" - super().validate(ignore_keys=ignore_keys) + super().validate_rope(ignore_keys=ignore_keys) # Run Phi4Multimodal specific validation if not isinstance(self.rope_parameters, dict): diff --git a/src/transformers/models/phimoe/configuration_phimoe.py b/src/transformers/models/phimoe/configuration_phimoe.py index 3eb5d29bec1a..57f40cd3b690 100644 --- a/src/transformers/models/phimoe/configuration_phimoe.py +++ b/src/transformers/models/phimoe/configuration_phimoe.py @@ -178,11 +178,11 @@ def __init__( **kwargs, ) - def validate(self, ignore_keys=None): + def validate_rope(self, ignore_keys=None): """ Validate the `rope_parameters` configuration. """ - super().validate(ignore_keys=ignore_keys) + super().validate_rope(ignore_keys=ignore_keys) # Run model-specific rope validation if self.rope_parameters["rope_type"] != "default": diff --git a/tests/utils/test_modeling_rope_utils.py b/tests/utils/test_modeling_rope_utils.py index 5711f199a6a8..730c4967368e 100644 --- a/tests/utils/test_modeling_rope_utils.py +++ b/tests/utils/test_modeling_rope_utils.py @@ -24,7 +24,6 @@ import torch from transformers import ROPE_INIT_FUNCTIONS - from transformers.modeling_rope_utils import rope_config_standardize_and_validate from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding @@ -35,13 +34,13 @@ def test_rope_validation(self): all_rope_types = ROPE_INIT_FUNCTIONS.keys() # The base config is always valid (default RoPE) - rope_config_standardize_and_validate(config) + config.validate_rope() # If we explicitly set the other RoPE types, then validation should fail for rope_type in all_rope_types: config.rope_parameters = {"rope_type": rope_type, "rope_theta": 10000.0} with self.assertRaises(KeyError): - rope_config_standardize_and_validate(config) + config.validate_rope() # Parameters are exclusive to their own RoPE type, and should raise an exception if incorrectly passed valid_param_mapping = { @@ -60,31 +59,31 @@ def test_rope_validation(self): continue else: 
with self.assertRaises(KeyError): - rope_config_standardize_and_validate(config) + config.validate_rope() # Any other parameters passed to RoPE will raise a warning that a particular key is not used # But sometimes we can have model-specific RoPE kwargs and bypass warning with `ignore_keys` model_specific_kwarg = "mrope_sections" # e,g in Qwen2-VL config.rope_parameters = {"rope_type": "default", "rope_theta": 10000.0, model_specific_kwarg: True} - rope_config_standardize_and_validate(config, ignore_keys={model_specific_kwarg}) + config.validate_rope(ignore_keys={model_specific_kwarg}) with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: - rope_config_standardize_and_validate(config) + config.validate_rope() self.assertEqual(len(logs.output), 1) self.assertIn(model_specific_kwarg, logs.output[0]) # We can indicate Different RoPE params for each attention type # We can also have only one RoPE params defined for all layer, we don't raise an error # because it is not required to have separate RoPE per layer type - config.layer_types = ["global_attn", "local_attn"] + config.layer_types = ["full_attention", "sliding_attention"] config.rope_parameters = { - "global_attn": {"rope_type": "default", "rope_theta": 10000}, - "local_attn": {"rope_type": "linear", "rope_theta": 10000, "factor": 2.0}, + "full_attention": {"rope_type": "default", "rope_theta": 10000}, + "sliding_attention": {"rope_type": "linear", "rope_theta": 10000, "factor": 2.0}, } - rope_config_standardize_and_validate(config) + config.validate_rope() - config.rope_parameters = config.rope_parameters["local_attn"] - rope_config_standardize_and_validate(config) + config.rope_parameters = config.rope_parameters["full_attention"] + config.validate_rope() def test_yarn_original_original_max_position_embeddings_validation(self): """Tests that models with no/bad `original_max_position_embeddings` raise a warning""" @@ -100,7 +99,7 @@ def 
test_yarn_original_original_max_position_embeddings_validation(self): config.rope_parameters = rope_config with self.assertRaises(AssertionError): # confirm that no warnings are thrown with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: - rope_config_standardize_and_validate(config) + config.validate_rope() # bad rope config, no `original_max_position_embeddings` -> warning rope_config = { @@ -110,7 +109,7 @@ def test_yarn_original_original_max_position_embeddings_validation(self): } config.rope_parameters = rope_config with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: - rope_config_standardize_and_validate(config) + config.validate_rope() self.assertEqual(len(logs.output), 1) self.assertIn("is unset", logs.output[0]) @@ -123,7 +122,7 @@ def test_yarn_original_original_max_position_embeddings_validation(self): } config.rope_parameters = rope_config with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: - rope_config_standardize_and_validate(config) + config.validate_rope() self.assertEqual(len(logs.output), 1) self.assertIn("implicit factor", logs.output[0]) @@ -373,7 +372,7 @@ def test_longrope_rope_numerically(self): } self.assertEqual(config.rope_parameters.get("attention_factor"), None) # Verify that "TypeError: '<' not supported between instances of 'NoneType' and 'int'" is not raised. 
- rope_config_standardize_and_validate(config) + config.validate_rope() # Check 2: seq_len == 0 -> short factor is applied to the default frequencies config.rope_parameters = { From a50598ef65e99bf579ecc4cb83c8b1dc54c5808a Mon Sep 17 00:00:00 2001 From: raushan Date: Thu, 27 Nov 2025 19:35:22 +0100 Subject: [PATCH 16/23] fix tests --- .../chameleon/configuration_chameleon.py | 4 +-- .../models/csm/configuration_csm.py | 2 +- .../models/dia/configuration_dia.py | 6 ++-- .../diffllama/configuration_diffllama.py | 4 +-- .../models/gemma3/configuration_gemma3.py | 2 +- .../models/gemma3/modular_gemma3.py | 2 +- .../models/gemma3n/configuration_gemma3n.py | 2 +- .../models/gemma3n/modular_gemma3n.py | 2 +- .../models/gpt_neox/configuration_gpt_neox.py | 2 +- .../configuration_gpt_neox_japanese.py | 2 +- .../models/gpt_oss/configuration_gpt_oss.py | 4 +-- .../configuration_hunyuan_v1_dense.py | 31 ------------------- .../configuration_kyutai_speech_to_text.py | 4 +-- .../models/lfm2/configuration_lfm2.py | 4 +-- .../modernbert/configuration_modernbert.py | 2 +- .../models/modernbert/modular_modernbert.py | 2 +- .../configuration_modernbert_decoder.py | 2 +- .../modular_modernbert_decoder.py | 2 +- .../models/moshi/configuration_moshi.py | 4 +-- .../models/phi3/configuration_phi3.py | 21 ++++++++----- .../configuration_phi4_multimodal.py | 21 ++++++++----- .../models/qwen2/configuration_qwen2.py | 4 +-- .../qwen2_5_vl/configuration_qwen2_5_vl.py | 2 +- .../models/qwen2_vl/configuration_qwen2_vl.py | 2 +- 24 files changed, 58 insertions(+), 75 deletions(-) diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index 3bbb9f1d1eb7..e38824337924 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -24,7 +24,7 @@ logger = logging.get_logger(__name__) -class ChameleonVQVAEConfig(PreTrainedConfig, 
RotaryEmbeddingConfigMixin): +class ChameleonVQVAEConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`ChameleonVQModel`]. It is used to instantiate a `ChameleonVQModel` according to the specified arguments, defining the model architecture. @@ -98,7 +98,7 @@ def __init__( self.initializer_range = initializer_range -class ChameleonConfig(PreTrainedConfig): +class ChameleonConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`ChameleonModel`]. It is used to instantiate a chameleon model according to the specified arguments, defining the model architecture. Instantiating a diff --git a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index e28e15ea1545..f245b6b65c89 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -166,7 +166,7 @@ def __init__( ) -class CsmConfig(PreTrainedConfig): +class CsmConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`CsmForConditionalGeneration`]. It is used to instantiate an CSM model according to the specified arguments, defining the model architecture. 
Instantiating a configuration diff --git a/src/transformers/models/dia/configuration_dia.py b/src/transformers/models/dia/configuration_dia.py index 26084c0f0dda..98427b504377 100644 --- a/src/transformers/models/dia/configuration_dia.py +++ b/src/transformers/models/dia/configuration_dia.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class DiaEncoderConfig(PreTrainedConfig): +class DiaEncoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`DiaEncoder`]. It is used to instantiate a Dia encoder according to the specified arguments, defining the encoder architecture. @@ -98,7 +98,7 @@ def __init__( super().__init__(**kwargs) -class DiaDecoderConfig(PreTrainedConfig): +class DiaDecoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`DiaDecoder`]. It is used to instantiate a Dia decoder according to the specified arguments, defining the decoder architecture. 
diff --git a/src/transformers/models/diffllama/configuration_diffllama.py b/src/transformers/models/diffllama/configuration_diffllama.py index c25ed339e5a3..93b231b8d669 100644 --- a/src/transformers/models/diffllama/configuration_diffllama.py +++ b/src/transformers/models/diffllama/configuration_diffllama.py @@ -20,10 +20,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class DiffLlamaConfig(PreTrainedConfig): +class DiffLlamaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`DiffLlamaModel`]. It is used to instantiate an DiffLlama model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index c2c1e3e8bf14..45b82d5a5077 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -227,7 +227,7 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate() + self.validate_rope() return kwargs diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index b47355418c58..c4fa5bb78adf 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -241,7 +241,7 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate() + self.validate_rope() return kwargs diff --git 
a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index 72ad8286a122..e834734e9039 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -280,7 +280,7 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate() + self.validate_rope() return kwargs diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index 1f7cbae7fce0..4677e173327f 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -291,7 +291,7 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate() + self.validate_rope() return kwargs diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 128c7ab763c2..b004eb220eb5 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -167,7 +167,7 @@ def convert_rope_params_to_dict(self, default_theta=10_000.0, **kwargs): # Standardize and validate the correctness of rotary position embeddings parameters self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", default_theta)) self.standardize_rope_params() - self.validate() + self.validate_rope() return kwargs diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index 185772a9d784..822236419970 100644 --- 
a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -128,7 +128,7 @@ def convert_rope_params_to_dict(self, default_theta=10_000.0, **kwargs): # Standardize and validate the correctness of rotary position embeddings parameters self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", default_theta)) self.standardize_rope_params() - self.validate() + self.validate_rope() return kwargs diff --git a/src/transformers/models/gpt_oss/configuration_gpt_oss.py b/src/transformers/models/gpt_oss/configuration_gpt_oss.py index ab2eab4b1535..d68dc7f270c8 100644 --- a/src/transformers/models/gpt_oss/configuration_gpt_oss.py +++ b/src/transformers/models/gpt_oss/configuration_gpt_oss.py @@ -17,10 +17,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class GptOssConfig(PreTrainedConfig): +class GptOssConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This will yield a configuration to that of the BERT [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) architecture. diff --git a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py index bcc329da0792..187c9013a5a6 100644 --- a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +++ b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py @@ -152,36 +152,5 @@ def __init__( **kwargs, ) - def validate_rope(self, ignore_keys=None): - """ - Validate the `rope_parameters` configuration. 
- """ - super().validate_rope(ignore_keys=ignore_keys) - if self.rope_parameters is None: - return - - if not isinstance(self.rope_parameters, dict) or len(self.rope_parameters) != 2: - raise ValueError( - "`rope_parameters` must be a dictionary with with two fields, `type` and `factor` or `type` and `alpha`, " - f"got {self.rope_parameters}" - ) - rope_parameters_type = self.rope_parameters.get("type", None) - rope_parameters_factor = self.rope_parameters.get("factor", None) - rope_parameters_alpha = self.rope_parameters.get("alpha", None) - if rope_parameters_type is None or rope_parameters_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_parameters`'s type field must be one of ['linear', 'dynamic'], got {rope_parameters_type}" - ) - if rope_parameters_factor is None and rope_parameters_alpha is None: - raise ValueError("`rope_parameters`'s factor or alpha field must be have one, got both of none") - if rope_parameters_factor is not None: - if not isinstance(rope_parameters_factor, float) or rope_parameters_factor <= 1.0: - raise ValueError( - f"`rope_parameters`'s factor field must be a float > 1.0, got {rope_parameters_factor}" - ) - if rope_parameters_alpha is not None: - if not isinstance(rope_parameters_alpha, float) or rope_parameters_alpha <= 1.0: - raise ValueError(f"`rope_parameters`'s alpha field must be a float > 1.0, got {rope_parameters_alpha}") - __all__ = ["HunYuanDenseV1Config"] diff --git a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py index 0b2b3a19e902..b722c1fb70f6 100644 --- a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters +from 
...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -24,7 +24,7 @@ logger = logging.get_logger(__name__) -class KyutaiSpeechToTextConfig(PreTrainedConfig): +class KyutaiSpeechToTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`KyutaiSpeechToTextForConditionalGeneration`]. It is used to instantiate a Kyutai Speech-to-Text model according to the specified arguments, defining the model diff --git a/src/transformers/models/lfm2/configuration_lfm2.py b/src/transformers/models/lfm2/configuration_lfm2.py index 3e27aa05715c..711e8fdbd128 100644 --- a/src/transformers/models/lfm2/configuration_lfm2.py +++ b/src/transformers/models/lfm2/configuration_lfm2.py @@ -14,10 +14,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin -class Lfm2Config(PreTrainedConfig): +class Lfm2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Lfm2Model`]. It is used to instantiate a LFM2 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index 460740a6756e..30417a39b02f 100644 --- a/src/transformers/models/modernbert/configuration_modernbert.py +++ b/src/transformers/models/modernbert/configuration_modernbert.py @@ -251,7 +251,7 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate() + self.validate_rope() return kwargs def to_dict(self): diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index 333267ab3add..21165e38f078 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -279,7 +279,7 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate() + self.validate_rope() return kwargs def to_dict(self): diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py index ce402c1cb647..0fbfb790f0d3 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -230,7 +230,7 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate() + self.validate_rope() return kwargs diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py 
b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index b15d6ba07621..701313455946 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -251,7 +251,7 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate() + self.validate_rope() return kwargs diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index b8926b934ad9..6e0800d33b31 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -149,7 +149,7 @@ def __init__( super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) -class MoshiConfig(PreTrainedConfig): +class MoshiConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`MoshiModel`]. 
It is used to instantiate a Moshi model according to the specified arguments, defining the audio encoder, Moshi depth decoder and Moshi decoder diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index 508652241bcb..63aa62b7cfb6 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -165,7 +165,6 @@ def __init__( self.use_cache = use_cache self.rope_parameters = rope_parameters kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 1.0) self.sliding_window = sliding_window super().__init__( @@ -176,16 +175,24 @@ def __init__( **kwargs, ) - def standardize_rope_params(self): - """ - Adjust the `type` of the `rope_parameters` configuration for backward compatibility. - """ - super().standardize_rope_params() - rope_parameters_type = self.rope_parameters.get("rope_type", None) + def convert_rope_params_to_dict( + self, default_theta: int | float = 10_000.0, ignore_keys: Optional[set] = None, **kwargs + ): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) + self.standardize_rope_params() + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 1.0) # For backward compatibility if previous version used "su" or "yarn" + rope_parameters_type = self.rope_parameters.get("rope_type", None) if rope_parameters_type is not None and rope_parameters_type in ["su", "yarn"]: self.rope_parameters["rope_type"] = "longrope" + self.validate_rope(ignore_keys=ignore_keys) + return kwargs def 
validate_rope(self, ignore_keys: Optional[set] = None): """ diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index ed4467c61b5b..49961cb8b773 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -406,7 +406,6 @@ def __init__( self.use_cache = use_cache self.rope_parameters = rope_parameters kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 1.0) self.sliding_window = sliding_window super().__init__( @@ -417,16 +416,24 @@ def __init__( **kwargs, ) - def standardize_rope_params(self): - """ - Adjust the `type` of the `rope_parameters` configuration for backward compatibility. - """ - super().standardize_rope_params() - rope_parameters_type = self.rope_parameters.get("rope_type", None) + def convert_rope_params_to_dict( + self, default_theta: int | float = 10_000.0, ignore_keys: Optional[set] = None, **kwargs + ): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) + self.standardize_rope_params() + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 1.0) # For backward compatibility if previous version used "su" or "yarn" + rope_parameters_type = self.rope_parameters.get("rope_type", None) if rope_parameters_type is not None and rope_parameters_type in ["su", "yarn"]: self.rope_parameters["rope_type"] = "longrope" + self.validate_rope(ignore_keys=ignore_keys) + return kwargs def 
validate_rope(self, ignore_keys: Optional[set] = None): """ diff --git a/src/transformers/models/qwen2/configuration_qwen2.py b/src/transformers/models/qwen2/configuration_qwen2.py index 72a8bd98a954..893c402e00d2 100644 --- a/src/transformers/models/qwen2/configuration_qwen2.py +++ b/src/transformers/models/qwen2/configuration_qwen2.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...utils import logging logger = logging.get_logger(__name__) -class Qwen2Config(PreTrainedConfig): +class Qwen2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a Qwen2 model according to the specified arguments, defining the model architecture. Instantiating a configuration diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index 9b59591230c6..841ce91e06f4 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -230,7 +230,7 @@ def convert_rope_params_to_dict( if self.rope_parameters.get("rope_type", self.rope_parameters.get("type")) == "mrope": self.rope_parameters["rope_type"] = "default" self.standardize_rope_params() - self.validate(ignore_keys=ignore_keys) + self.validate_rope(ignore_keys=ignore_keys) return kwargs diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index d74001107b1f..3dc35647f71f 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -218,7 +218,7 @@ def convert_rope_params_to_dict( if 
self.rope_parameters.get("rope_type", self.rope_parameters.get("type")) == "mrope": self.rope_parameters["rope_type"] = "default" self.standardize_rope_params() - self.validate(ignore_keys=ignore_keys) + self.validate_rope(ignore_keys=ignore_keys) return kwargs From e4f2b8257dd0cf91eb1d89def5f77c15dd8829c9 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 28 Nov 2025 09:37:43 +0100 Subject: [PATCH 17/23] oops --- .../models/gpt_neox_japanese/configuration_gpt_neox_japanese.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index 822236419970..4af8c51112c7 100644 --- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -117,7 +117,7 @@ def __init__( self.rope_parameters = rope_parameters kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 0.25) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 1.0) super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) def convert_rope_params_to_dict(self, default_theta=10_000.0, **kwargs): From f9260c4cf98604511952a2e9bb058e91d9464905 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 28 Nov 2025 09:42:41 +0100 Subject: [PATCH 18/23] fix slow tests with nested rope models --- src/transformers/models/gemma3/configuration_gemma3.py | 1 - src/transformers/models/gemma3/modular_gemma3.py | 1 - src/transformers/models/gemma3n/configuration_gemma3n.py | 1 - src/transformers/models/gemma3n/modular_gemma3n.py | 1 - src/transformers/models/modernbert/configuration_modernbert.py | 1 - src/transformers/models/modernbert/modular_modernbert.py | 1 - .../modernbert_decoder/configuration_modernbert_decoder.py | 1 - 
.../models/modernbert_decoder/modular_modernbert_decoder.py | 1 - 8 files changed, 8 deletions(-) diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index 45b82d5a5077..d8d1553501fc 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -207,7 +207,6 @@ def __init__( def convert_rope_params_to_dict(self, default_theta=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or self.rope_parameters # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index c4fa5bb78adf..ef0a8b89b590 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -221,7 +221,6 @@ def __init__( def convert_rope_params_to_dict(self, default_theta=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or self.rope_parameters # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index e834734e9039..08eb6207e6be 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -260,7 +260,6 @@ def __init__( def convert_rope_params_to_dict(self, default_theta=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or self.rope_parameters # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index 4677e173327f..2781869a857c 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -271,7 +271,6 @@ def __init__( def convert_rope_params_to_dict(self, default_theta=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or self.rope_parameters # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index 30417a39b02f..de4dbf25da3b 100644 --- a/src/transformers/models/modernbert/configuration_modernbert.py +++ b/src/transformers/models/modernbert/configuration_modernbert.py @@ -230,7 +230,6 @@ def __init__( def convert_rope_params_to_dict(self, default_theta=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or self.rope_parameters # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index 21165e38f078..fad3c675444c 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -258,7 +258,6 @@ def __init__( def convert_rope_params_to_dict(self, default_theta=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or self.rope_parameters # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py index 0fbfb790f0d3..1b47fe02793d 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -209,7 +209,6 @@ def __init__( def convert_rope_params_to_dict(self, default_theta=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or self.rope_parameters # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index 701313455946..fdf612cc9436 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -230,7 +230,6 @@ def __init__( def convert_rope_params_to_dict(self, default_theta=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or self.rope_parameters # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format From a1dbf305c24f88c8e940cfc385ec578b96bac3bf Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 28 Nov 2025 10:25:29 +0100 Subject: [PATCH 19/23] fix copies --- src/transformers/models/janus/configuration_janus.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/transformers/models/janus/configuration_janus.py b/src/transformers/models/janus/configuration_janus.py index c7e9eef4e5f3..4f4f29e21741 100644 --- a/src/transformers/models/janus/configuration_janus.py +++ b/src/transformers/models/janus/configuration_janus.py @@ -20,7 +20,6 @@ # limitations under the License. from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RotaryEmbeddingConfigMixin from ...utils import logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -123,7 +122,7 @@ def __init__( self.num_image_tokens = num_image_tokens -class JanusVQVAEConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class JanusVQVAEConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`JanusVQVAEModel`]. It is used to instantiate a `JanusVQVAEModel` according to the specified arguments, defining the model architecture. 
From 80a12835e3364144cc77857d0098a82578b1588c Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 28 Nov 2025 11:19:46 +0100 Subject: [PATCH 20/23] deal with circular import and move the mixin to base config class --- src/transformers/configuration_utils.py | 7 ++- src/transformers/modeling_rope_utils.py | 49 +++++++++++-------- .../models/apertus/configuration_apertus.py | 6 +-- .../models/apertus/modular_apertus.py | 6 +-- .../models/arcee/configuration_arcee.py | 5 +- .../models/aria/configuration_aria.py | 5 +- .../models/bamba/configuration_bamba.py | 8 ++- .../models/bitnet/configuration_bitnet.py | 6 +-- .../models/blt/configuration_blt.py | 21 ++++---- .../chameleon/configuration_chameleon.py | 5 +- .../models/cohere/configuration_cohere.py | 6 +-- .../models/cohere2/configuration_cohere2.py | 5 +- .../models/cohere2/modular_cohere2.py | 4 +- .../models/csm/configuration_csm.py | 10 ++-- .../models/cwm/configuration_cwm.py | 5 +- src/transformers/models/cwm/modular_cwm.py | 2 +- .../models/dbrx/configuration_dbrx.py | 5 +- .../deepseek_v2/configuration_deepseek_v2.py | 5 +- .../deepseek_v3/configuration_deepseek_v3.py | 25 +++++++--- .../models/dia/configuration_dia.py | 9 ++-- .../diffllama/configuration_diffllama.py | 5 +- .../models/doge/configuration_doge.py | 5 +- src/transformers/models/doge/modular_doge.py | 5 +- .../models/dots1/configuration_dots1.py | 5 +- .../configuration_efficientloftr.py | 6 +-- .../models/emu3/configuration_emu3.py | 6 +-- .../models/ernie4_5/configuration_ernie4_5.py | 6 +-- .../configuration_ernie4_5_moe.py | 7 ++- .../models/evolla/configuration_evolla.py | 6 +-- .../models/exaone4/configuration_exaone4.py | 5 +- .../models/exaone4/modular_exaone4.py | 5 +- .../models/falcon/configuration_falcon.py | 5 +- .../falcon_h1/configuration_falcon_h1.py | 5 +- .../flex_olmo/configuration_flex_olmo.py | 6 +-- .../models/flex_olmo/modular_flex_olmo.py | 6 +-- .../models/fuyu/configuration_fuyu.py | 10 ++-- 
.../models/gemma/configuration_gemma.py | 5 +- .../models/gemma/modular_gemma.py | 5 +- .../models/gemma2/configuration_gemma2.py | 6 +-- .../models/gemma2/modular_gemma2.py | 5 +- .../models/gemma3/configuration_gemma3.py | 15 +++--- .../models/gemma3/modular_gemma3.py | 11 ++--- .../models/gemma3n/configuration_gemma3n.py | 12 ++--- .../models/gemma3n/modular_gemma3n.py | 11 ++--- .../models/glm/configuration_glm.py | 7 ++- .../models/glm4/configuration_glm4.py | 7 ++- .../models/glm4_moe/configuration_glm4_moe.py | 7 ++- .../models/glm4_moe/modular_glm4_moe.py | 7 ++- .../models/glm4v/configuration_glm4v.py | 11 +++-- .../models/glm4v/modular_glm4v.py | 7 ++- .../glm4v_moe/configuration_glm4v_moe.py | 7 ++- .../models/glm4v_moe/modular_glm4v_moe.py | 7 +-- .../models/gpt_neox/configuration_gpt_neox.py | 15 +++--- .../configuration_gpt_neox_japanese.py | 14 +++--- .../models/gpt_oss/configuration_gpt_oss.py | 6 +-- .../models/granite/configuration_granite.py | 5 +- .../granitemoe/configuration_granitemoe.py | 5 +- .../configuration_granitemoehybrid.py | 5 +- .../configuration_granitemoeshared.py | 5 +- .../models/helium/configuration_helium.py | 6 +-- .../configuration_hunyuan_v1_dense.py | 5 +- .../configuration_hunyuan_v1_moe.py | 5 +- .../models/jetmoe/configuration_jetmoe.py | 5 +- .../configuration_kyutai_speech_to_text.py | 5 +- .../models/lfm2/configuration_lfm2.py | 6 +-- .../models/lfm2_moe/configuration_lfm2_moe.py | 6 +-- .../models/llama/configuration_llama.py | 5 +- .../models/llama4/configuration_llama4.py | 10 ++-- .../configuration_longcat_flash.py | 25 +++++++--- .../models/mimi/configuration_mimi.py | 5 +- .../models/minimax/configuration_minimax.py | 6 +-- .../models/minimax/modular_minimax.py | 6 +-- .../ministral/configuration_ministral.py | 6 +-- .../models/ministral/modular_ministral.py | 2 +- .../models/mistral/configuration_mistral.py | 5 +- .../models/mixtral/configuration_mixtral.py | 6 +-- .../models/mllama/configuration_mllama.py | 5 
+- .../modernbert/configuration_modernbert.py | 14 +++--- .../models/modernbert/modular_modernbert.py | 16 +++--- .../configuration_modernbert_decoder.py | 14 +++--- .../modular_modernbert_decoder.py | 16 +++--- .../moonshine/configuration_moonshine.py | 7 ++- .../models/moonshine/modular_moonshine.py | 7 ++- .../models/moshi/configuration_moshi.py | 5 +- .../models/nanochat/configuration_nanochat.py | 10 +--- .../models/nemotron/configuration_nemotron.py | 7 ++- .../models/olmo/configuration_olmo.py | 5 +- .../models/olmo2/configuration_olmo2.py | 5 +- .../models/olmo3/configuration_olmo3.py | 6 +-- .../models/olmo3/modular_olmo3.py | 6 +-- .../models/olmoe/configuration_olmoe.py | 5 +- .../persimmon/configuration_persimmon.py | 7 ++- .../models/phi/configuration_phi.py | 7 ++- .../models/phi3/configuration_phi3.py | 8 +-- .../configuration_phi4_multimodal.py | 8 +-- .../models/phimoe/configuration_phimoe.py | 6 +-- .../models/pixtral/configuration_pixtral.py | 6 +-- .../models/qwen2/configuration_qwen2.py | 5 +- .../configuration_qwen2_5_omni.py | 14 +++--- .../qwen2_5_omni/modular_qwen2_5_omni.py | 18 +++---- .../qwen2_5_vl/configuration_qwen2_5_vl.py | 7 ++- .../qwen2_moe/configuration_qwen2_moe.py | 5 +- .../models/qwen2_vl/configuration_qwen2_vl.py | 8 +-- .../models/qwen3/configuration_qwen3.py | 5 +- .../qwen3_moe/configuration_qwen3_moe.py | 5 +- .../qwen3_next/configuration_qwen3_next.py | 7 ++- .../configuration_qwen3_omni_moe.py | 14 +++--- .../qwen3_omni_moe/modular_qwen3_omni_moe.py | 13 +++-- .../models/qwen3_vl/configuration_qwen3_vl.py | 4 +- .../models/qwen3_vl/modular_qwen3_vl.py | 14 +++--- .../configuration_qwen3_vl_moe.py | 4 +- .../qwen3_vl_moe/modular_qwen3_vl_moe.py | 14 +++--- .../configuration_recurrent_gemma.py | 7 ++- .../models/seed_oss/configuration_seed_oss.py | 5 +- .../models/smollm3/configuration_smollm3.py | 7 ++- .../models/smollm3/modular_smollm3.py | 7 ++- .../models/stablelm/configuration_stablelm.py | 7 ++- 
.../starcoder2/configuration_starcoder2.py | 5 +- .../models/t5gemma/configuration_t5gemma.py | 6 +-- .../vaultgemma/configuration_vaultgemma.py | 6 +-- .../models/zamba2/configuration_zamba2.py | 5 +- 121 files changed, 451 insertions(+), 497 deletions(-) diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 53d6a8a900a8..f9e3d97a3c28 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -26,6 +26,7 @@ from . import __version__ from .dynamic_module_utils import custom_object_save from .modeling_gguf_pytorch_utils import load_gguf_checkpoint +from .modeling_rope_utils import RotaryEmbeddingConfigMixin from .utils import ( CONFIG_NAME, PushToHubMixin, @@ -49,7 +50,7 @@ SpecificPreTrainedConfigType = TypeVar("SpecificPreTrainedConfigType", bound="PreTrainedConfig") -class PreTrainedConfig(PushToHubMixin): +class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin): # no-format r""" Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as @@ -261,6 +262,10 @@ def __init__( dtype = getattr(torch, dtype) + # BC for rotary embeddings. 
We will pop out legacy keys from kwargs and rename to new format + if hasattr(self, "rope_parameters"): + kwargs = self.convert_rope_params_to_dict(**kwargs) + # Attributes common for all models self.return_dict = return_dict self.output_hidden_states = output_hidden_states diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 885bf864c647..01880039deaa 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -14,9 +14,8 @@ import math from functools import wraps -from typing import Optional, TypedDict +from typing import TYPE_CHECKING, Optional, TypedDict -from .configuration_utils import ALLOWED_LAYER_TYPES, PreTrainedConfig from .utils import is_torch_available, logging @@ -26,6 +25,9 @@ if is_torch_available(): import torch +if TYPE_CHECKING: + from .configuration_utils import PreTrainedConfig + def dynamic_rope_update(rope_forward): """ @@ -125,7 +127,7 @@ def wrapper(self, x, position_ids, layer_type=None): def _compute_linear_scaling_rope_parameters( - config: Optional[PreTrainedConfig] = None, + config: Optional["PreTrainedConfig"] = None, device: Optional["torch.device"] = None, seq_len: Optional[int] = None, layer_type: Optional[str] = None, @@ -133,7 +135,7 @@ def _compute_linear_scaling_rope_parameters( """ Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev Args: - config ([`~transformers.PreTrainedConfig`]): + config ([`~transformers."PreTrainedConfig"`]): The model configuration. 
This function assumes that the config will provide at least the following properties: @@ -179,7 +181,7 @@ def _compute_linear_scaling_rope_parameters( def _compute_dynamic_ntk_parameters( - config: Optional[PreTrainedConfig] = None, + config: Optional["PreTrainedConfig"] = None, device: Optional["torch.device"] = None, seq_len: Optional[int] = None, layer_type: Optional[str] = None, @@ -188,7 +190,7 @@ def _compute_dynamic_ntk_parameters( Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla Args: - config ([`~transformers.PreTrainedConfig`]): + config ([`~transformers."PreTrainedConfig"`]): The model configuration. This function assumes that the config will provide at least the following properties: @@ -251,7 +253,7 @@ def _compute_dynamic_ntk_parameters( def _compute_yarn_parameters( - config: PreTrainedConfig, + config: "PreTrainedConfig", device: "torch.device", seq_len: Optional[int] = None, layer_type: Optional[str] = None, @@ -261,7 +263,7 @@ def _compute_yarn_parameters( [original paper](https://huggingface.co/papers/2309.00071) Args: - config ([`~transformers.PreTrainedConfig`]): + config ([`~transformers."PreTrainedConfig"`]): The model configuration. This function assumes that the config will provide at least the following properties: @@ -389,7 +391,7 @@ def linear_ramp_factor(min, max, dim): def _compute_longrope_parameters( - config: PreTrainedConfig, + config: "PreTrainedConfig", device: "torch.device", seq_len: Optional[int] = None, layer_type: Optional[str] = None, @@ -399,7 +401,7 @@ def _compute_longrope_parameters( [original implementation](https://github.com/microsoft/LongRoPE) Args: - config ([`~transformers.PreTrainedConfig`]): + config ([`~transformers."PreTrainedConfig"`]): The model configuration. 
This function assumes that the config will provide at least the following properties: @@ -479,7 +481,7 @@ def _compute_longrope_parameters( def _compute_llama3_parameters( - config: PreTrainedConfig, + config: "PreTrainedConfig", device: "torch.device", seq_len: Optional[int] = None, layer_type: Optional[str] = None, @@ -488,7 +490,7 @@ def _compute_llama3_parameters( Computes the inverse frequencies for llama 3.1. Args: - config ([`~transformers.PreTrainedConfig`]): + config ([`~transformers."PreTrainedConfig"`]): The model configuration. This function assumes that the config will provide at least the following properties: @@ -629,17 +631,20 @@ class RotaryEmbeddingConfigMixin: A Mixin containing the functionality to standardize and validate RoPE parameters. """ - def convert_rope_params_to_dict( - self, default_theta: int | float = 10_000.0, ignore_keys: Optional[set] = None, **kwargs - ): + default_theta = 10_000.0 + + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: Optional[set] = None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or self.rope_parameters self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} # Standardize and validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta)) + if "partial_rotary_factor" in kwargs: + self.rope_parameters.setdefault("partial_rotary_factor", kwargs["partial_rotary_factor"]) + self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys) + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs def standardize_rope_params(self): @@ -653,7 +658,7 @@ def standardize_rope_params(self): rope_parameters = self.rope_parameters # Case 1: RoPE param keys do not intersect with possible `layer_types` -> one 
global dict - if not set(rope_parameters.keys()).issubset(ALLOWED_LAYER_TYPES): + if getattr(self, "layer_types", None) is None or not set(rope_parameters.keys()).issubset(self.layer_types): rope_parameters.setdefault("rope_type", rope_parameters.get("type", "default")) rope_parameters.setdefault("rope_theta", rope_theta) if partial_rotary_factor is not None: @@ -668,15 +673,17 @@ def standardize_rope_params(self): self.rope_parameters = rope_parameters - def validate_rope(self: PreTrainedConfig, ignore_keys: Optional[set] = None): + def validate_rope(self: "PreTrainedConfig", ignore_keys: Optional[set] = None): """ - Validate the RoPE config arguments, given a `PreTrainedConfig` object + Validate the RoPE config arguments, given a `"PreTrainedConfig"` object """ rope_parameters_dict = self.rope_parameters if rope_parameters_dict is None: return - if set(rope_parameters_dict.keys()).issubset(ALLOWED_LAYER_TYPES): + if getattr(self, "layer_types", None) is not None and set(rope_parameters_dict.keys()).issubset( + self.layer_types + ): pass else: rope_parameters_dict = {"full_attention": rope_parameters_dict} diff --git a/src/transformers/models/apertus/configuration_apertus.py b/src/transformers/models/apertus/configuration_apertus.py index 30d651cf6c9c..5f25d3a5d094 100644 --- a/src/transformers/models/apertus/configuration_apertus.py +++ b/src/transformers/models/apertus/configuration_apertus.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class ApertusConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class ApertusConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`ApertusModel`]. It is used to instantiate a Apertus model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -99,6 +99,7 @@ class ApertusConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "apertus" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 12000000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k "layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k @@ -162,7 +163,6 @@ def __init__( self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=12000000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/apertus/modular_apertus.py b/src/transformers/models/apertus/modular_apertus.py index c0791dafeafd..13408fa23919 100644 --- a/src/transformers/models/apertus/modular_apertus.py +++ b/src/transformers/models/apertus/modular_apertus.py @@ -21,7 +21,7 @@ from ...cache_utils import Cache from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -43,7 +43,7 @@ logger = logging.get_logger(__name__) -class ApertusConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class ApertusConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`ApertusModel`]. It is used to instantiate a Apertus model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -117,6 +117,7 @@ class ApertusConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "apertus" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 12000000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k "layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k @@ -180,7 +181,6 @@ def __init__( self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=12000000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/arcee/configuration_arcee.py b/src/transformers/models/arcee/configuration_arcee.py index cb49d5f35427..50d120f3d7cb 100644 --- a/src/transformers/models/arcee/configuration_arcee.py +++ b/src/transformers/models/arcee/configuration_arcee.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class ArceeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class ArceeConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`ArceeModel`]. It is used to instantiate an Arcee model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -164,7 +164,6 @@ def __init__( self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py index 2144a1b1bec5..268a9307c741 100644 --- a/src/transformers/models/aria/configuration_aria.py +++ b/src/transformers/models/aria/configuration_aria.py @@ -21,11 +21,11 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ..auto import CONFIG_MAPPING, AutoConfig -class AriaTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class AriaTextConfig(PreTrainedConfig): r""" This class handles the configuration for the text component of the Aria model. 
Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria @@ -169,7 +169,6 @@ def __init__( self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/bamba/configuration_bamba.py b/src/transformers/models/bamba/configuration_bamba.py index 5dd525db2ee3..fe03c6ad0f66 100644 --- a/src/transformers/models/bamba/configuration_bamba.py +++ b/src/transformers/models/bamba/configuration_bamba.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class BambaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class BambaConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`BambaModel`]. It is used to instantiate a BambaModel model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -194,9 +194,7 @@ def __init__( self.mamba_proj_bias = mamba_proj_bias self.z_loss_coefficient = z_loss_coefficient self.rope_parameters = rope_parameters - - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) - self.rope_parameters["partial_rotary_factor"] = 0.5 + kwargs["partial_rotary_factor"] = 0.5 # hardcode for BC super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/bitnet/configuration_bitnet.py b/src/transformers/models/bitnet/configuration_bitnet.py index 9b3061521abc..27d77785722c 100644 --- a/src/transformers/models/bitnet/configuration_bitnet.py +++ b/src/transformers/models/bitnet/configuration_bitnet.py @@ -16,14 +16,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class BitNetConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class BitNetConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`BitNetModel`]. It is used to instantiate an BitNet model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -97,6 +97,7 @@ class BitNetConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "bitnet" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 def __init__( self, @@ -139,7 +140,6 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/blt/configuration_blt.py b/src/transformers/models/blt/configuration_blt.py index 461b76de419e..326176af5e9a 100644 --- a/src/transformers/models/blt/configuration_blt.py +++ b/src/transformers/models/blt/configuration_blt.py @@ -17,19 +17,20 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class BltLocalEncoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class BltLocalEncoderConfig(PreTrainedConfig): """ Configuration class for the Blt Local Encoder component. """ model_type = "blt_local_encoder" + default_theta = 500000.0 def __init__( self, @@ -66,19 +67,19 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) super().__init__(**kwargs, tie_word_embeddings=False) -class BltLocalDecoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class BltLocalDecoderConfig(PreTrainedConfig): """ Configuration class for the Blt Local Decoder component. 
""" model_type = "blt_local_decoder" + default_theta = 500000.0 def __init__( self, @@ -115,19 +116,19 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) super().__init__(**kwargs, tie_word_embeddings=False) -class BltGlobalTransformerConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class BltGlobalTransformerConfig(PreTrainedConfig): """ Configuration class for the Blt Global Transformer component. """ model_type = "blt_global_transformer" + default_theta = 500000.0 def __init__( self, @@ -156,14 +157,13 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) super().__init__(**kwargs, tie_word_embeddings=False) -class BltPatcherConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class BltPatcherConfig(PreTrainedConfig): r""" Configuration class for the Blt Patcher/Entropy model component. @@ -231,14 +231,13 @@ def __init__( self.intermediate_size = intermediate_size or int(8 * self.hidden_size / 3) self.initializer_range = initializer_range self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) super().__init__(**kwargs, tie_word_embeddings=False) -class BltConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class BltConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`BltModel`]. 
It is used to instantiate a Blt model according to the specified arguments, defining the model architecture. @@ -307,6 +306,7 @@ class BltConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "blt" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 sub_configs = { "patcher_config": BltPatcherConfig, "encoder_config": BltLocalEncoderConfig, @@ -406,7 +406,6 @@ def __init__( ) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index e38824337924..d22cecb87ef1 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -98,7 +98,7 @@ def __init__( self.initializer_range = initializer_range -class ChameleonConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class ChameleonConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`ChameleonModel`]. It is used to instantiate a chameleon model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -231,7 +231,6 @@ def __init__( self.model_parallel_size = model_parallel_size self.swin_norm = swin_norm self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) if vq_config is None: vq_config = {} diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py index d5ce47d3daa3..6b9e40e5e143 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -22,14 +22,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class CohereConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class CohereConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere model according to the specified arguments, defining the model architecture. 
@@ -106,6 +106,7 @@ class CohereConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "cohere" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -166,7 +167,6 @@ def __init__( self.attention_dropout = attention_dropout self.use_qk_norm = use_qk_norm self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cohere2/configuration_cohere2.py b/src/transformers/models/cohere2/configuration_cohere2.py index 0e63ab7f10e2..7aadaaff94ce 100644 --- a/src/transformers/models/cohere2/configuration_cohere2.py +++ b/src/transformers/models/cohere2/configuration_cohere2.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class Cohere2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Cohere2Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere model according to the specified arguments, defining the model architecture. 
@@ -183,7 +183,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index 140cd1ef2e71..e4b83c535dd1 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ b/src/transformers/models/cohere2/modular_cohere2.py @@ -26,7 +26,6 @@ from ...modeling_outputs import BaseModelOutputWithPast from ...modeling_rope_utils import ( RopeParameters, - RotaryEmbeddingConfigMixin, dynamic_rope_update, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS @@ -48,7 +47,7 @@ logger = logging.get_logger(__name__) -class Cohere2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Cohere2Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`CohereModel`]. It is used to instantiate an Cohere model according to the specified arguments, defining the model architecture. 
@@ -206,7 +205,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index f245b6b65c89..d673444bb1c2 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -24,7 +24,7 @@ logger = logging.get_logger(__name__) -class CsmDepthDecoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class CsmDepthDecoderConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`CsmDepthDecoderModel`]. It is used to instantiate an CSM depth decoder model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults will yield @@ -103,6 +103,7 @@ class CsmDepthDecoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "csm_depth_decoder_model" base_config_key = "depth_decoder_config" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 def __init__( self, @@ -155,7 +156,6 @@ def __init__( self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, @@ -166,7 +166,7 @@ def __init__( ) -class CsmConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class CsmConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`CsmForConditionalGeneration`]. It is used to instantiate an CSM model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -259,6 +259,7 @@ class CsmConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "csm" base_config_key = "csm_config" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 sub_configs = { "codec_config": AutoConfig, "depth_decoder_config": CsmDepthDecoderConfig, @@ -344,7 +345,6 @@ def __init__( self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cwm/configuration_cwm.py b/src/transformers/models/cwm/configuration_cwm.py index 57353f666875..b1790aa10696 100644 --- a/src/transformers/models/cwm/configuration_cwm.py +++ b/src/transformers/models/cwm/configuration_cwm.py @@ -22,10 +22,9 @@ from typing import Optional from ...configuration_utils import 
PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RotaryEmbeddingConfigMixin -class CwmConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class CwmConfig(PreTrainedConfig): """ Configuration for Code World Model (CWM). This is an inherited Llama3-compatible configuration with layer-interleaved @@ -92,6 +91,7 @@ class CwmConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "cwm" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1_000_000.0 # Default tensor parallel plan for base model `CwmModel` base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -178,7 +178,6 @@ def __init__( self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=1_000_000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cwm/modular_cwm.py b/src/transformers/models/cwm/modular_cwm.py index 97be1d987662..a2830174ddb0 100644 --- a/src/transformers/models/cwm/modular_cwm.py +++ b/src/transformers/models/cwm/modular_cwm.py @@ -102,6 +102,7 @@ class CwmConfig(LlamaConfig): """ model_type = "cwm" + default_theta = 1_000_000.0 def __init__( self, @@ -180,7 +181,6 @@ def __init__( # CWM models don't use attention bias, remove it from config del self.attention_bias - kwargs = self.convert_rope_params_to_dict(default_theta=1_000_000.0, **kwargs) class CwmRotaryEmbedding(Qwen2RotaryEmbedding): diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index bcc250cd0cf8..3399c8618f5c 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -17,7 +17,7 @@ from typing import Any, Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, 
RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -111,7 +111,7 @@ def __init__( raise ValueError(f"Found unknown {kwargs=}") -class DbrxConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class DbrxConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`DbrxModel`]. It is used to instantiate a Dbrx model according to the @@ -222,7 +222,6 @@ def __init__( raise ValueError("tie_word_embeddings is not supported for DBRX models.") self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000.0, **kwargs) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py index 745fa26578dd..2b0ee668ae69 100644 --- a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class DeepseekV2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class DeepseekV2Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`DeepseekV2Model`]. It is used to instantiate a DeepSeek model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -215,7 +215,6 @@ def __init__( self.head_dim = qk_rope_head_dim self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py index 8afc6175de51..abf68f399b8b 100644 --- a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py @@ -19,13 +19,13 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {} -class DeepseekV3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class DeepseekV3Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate an DeepSeek model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -226,11 +226,6 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) - - for key in ["beta_fast", "beta_slow", "factor"]: - if key in self.rope_parameters: - self.rope_parameters[key] = float(self.rope_parameters[key]) super().__init__( pad_token_id=pad_token_id, @@ -240,5 +235,21 @@ def __init__( **kwargs, ) + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: Optional[set] = None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta)) + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + + # Convert to float because RoPE fn expect a float. 
Models on the hub were saved as int + for key in ["beta_fast", "beta_slow", "factor"]: + if key in self.rope_parameters: + self.rope_parameters[key] = float(self.rope_parameters[key]) + return kwargs + __all__ = ["DeepseekV3Config"] diff --git a/src/transformers/models/dia/configuration_dia.py b/src/transformers/models/dia/configuration_dia.py index 98427b504377..7927d299ca8b 100644 --- a/src/transformers/models/dia/configuration_dia.py +++ b/src/transformers/models/dia/configuration_dia.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class DiaEncoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class DiaEncoderConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`DiaEncoder`]. It is used to instantiate a Dia encoder according to the specified arguments, defining the encoder architecture. @@ -93,12 +93,11 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000.0, **kwargs) super().__init__(**kwargs) -class DiaDecoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class DiaDecoderConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`DiaDecoder`]. It is used to instantiate a Dia decoder according to the specified arguments, defining the decoder architecture. 
@@ -194,7 +193,7 @@ def __init__( self.initializer_range = initializer_range self.use_cache = use_cache self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000.0, **kwargs) + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) diff --git a/src/transformers/models/diffllama/configuration_diffllama.py b/src/transformers/models/diffllama/configuration_diffllama.py index 93b231b8d669..5f9a731f1bd1 100644 --- a/src/transformers/models/diffllama/configuration_diffllama.py +++ b/src/transformers/models/diffllama/configuration_diffllama.py @@ -20,10 +20,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class DiffLlamaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class DiffLlamaConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`DiffLlamaModel`]. It is used to instantiate an DiffLlama model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the defaults @@ -146,7 +146,6 @@ def __init__( self.lambda_std_dev = lambda_std_dev self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/doge/configuration_doge.py b/src/transformers/models/doge/configuration_doge.py index 98ce0701cf9e..aa139d084ddf 100644 --- a/src/transformers/models/doge/configuration_doge.py +++ b/src/transformers/models/doge/configuration_doge.py @@ -23,10 +23,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class DogeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class DogeConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M). 
@@ -190,7 +190,6 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000.0, **kwargs) # for backward compatibility if num_key_value_heads is None: diff --git a/src/transformers/models/doge/modular_doge.py b/src/transformers/models/doge/modular_doge.py index 150010654281..8d53ef5ec681 100644 --- a/src/transformers/models/doge/modular_doge.py +++ b/src/transformers/models/doge/modular_doge.py @@ -31,7 +31,7 @@ from ...integrations.flex_attention import compile_friendly_flex_attention from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import AttentionInterface, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, is_torch_flex_attn_available, logging @@ -55,7 +55,7 @@ from torch.nn.attention.flex_attention import BlockMask -class DogeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class DogeConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`DogeModel`]. It is used to instantiate an Doge model according to the specified arguments, defining the model architecture like [SmallDoge/Doge-320M](https://huggingface.co/SmallDoge/Doge-320M). 
@@ -219,7 +219,6 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000.0, **kwargs) # for backward compatibility if num_key_value_heads is None: diff --git a/src/transformers/models/dots1/configuration_dots1.py b/src/transformers/models/dots1/configuration_dots1.py index dd39da9c2ade..57ded54fcc75 100644 --- a/src/transformers/models/dots1/configuration_dots1.py +++ b/src/transformers/models/dots1/configuration_dots1.py @@ -15,14 +15,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class Dots1Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Dots1Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Dots1Model`]. It is used to instantiate a `dots.llm1` model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -203,7 +203,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/efficientloftr/configuration_efficientloftr.py b/src/transformers/models/efficientloftr/configuration_efficientloftr.py index de124eff902d..19c20a9a5fbf 100644 --- a/src/transformers/models/efficientloftr/configuration_efficientloftr.py +++ b/src/transformers/models/efficientloftr/configuration_efficientloftr.py @@ -14,10 +14,9 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RotaryEmbeddingConfigMixin -class EfficientLoFTRConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class EfficientLoFTRConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`EfficientLoFTRFromKeypointMatching`]. 
It is used to instantiate a EfficientLoFTR model according to the specified arguments, defining the model @@ -174,8 +173,7 @@ def __init__( self.num_key_value_heads = num_attention_heads self.initializer_range = initializer_range self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 4.0) + kwargs.setdefault("partial_rotary_factor", 4.0) # assign default for BC super().__init__(**kwargs) diff --git a/src/transformers/models/emu3/configuration_emu3.py b/src/transformers/models/emu3/configuration_emu3.py index 0e57cfcc3020..546803916db4 100644 --- a/src/transformers/models/emu3/configuration_emu3.py +++ b/src/transformers/models/emu3/configuration_emu3.py @@ -17,7 +17,7 @@ from typing import Optional, Union from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters class Emu3VQVAEConfig(PreTrainedConfig): @@ -110,7 +110,7 @@ def __init__( self.attention_dropout = attention_dropout -class Emu3TextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Emu3TextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Emu3TextModel`]. It is used to instantiate a emu3 model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -188,6 +188,7 @@ class Emu3TextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "emu3_text_model" base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 def __init__( self, @@ -227,7 +228,6 @@ def __init__( self.initializer_range = initializer_range self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=1000000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/ernie4_5/configuration_ernie4_5.py b/src/transformers/models/ernie4_5/configuration_ernie4_5.py index 70d4d3e092f6..13a8d4d94fe7 100644 --- a/src/transformers/models/ernie4_5/configuration_ernie4_5.py +++ b/src/transformers/models/ernie4_5/configuration_ernie4_5.py @@ -16,10 +16,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class Ernie4_5Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Ernie4_5Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Ernie4_5Model`]. It is used to instantiate an Ernie 4.5 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -92,6 +92,7 @@ class Ernie4_5Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "ernie4_5" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 # Default tensor parallel plan for base model `Ernie4_5Model` base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -149,7 +150,6 @@ def __init__( self.use_bias = use_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py index 24417775b698..bf3b8403d782 100644 --- a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +++ b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py @@ -16,14 +16,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class Ernie4_5_MoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Ernie4_5_MoeConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Ernie4_5_MoeModel`]. It is used to instantiate a Ernie 4.5 MoE model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -115,6 +115,7 @@ class Ernie4_5_MoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "ernie4_5_moe" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = {"num_experts": "moe_num_experts", "num_experts_per_tok": "moe_k"} + default_theta = 500000.0 # Default tensor parallel plan for base model `Ernie4_5_MoE` base_model_tp_plan = { @@ -193,9 +194,7 @@ def __init__( self.moe_norm_min = moe_norm_min self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef - self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/evolla/configuration_evolla.py b/src/transformers/models/evolla/configuration_evolla.py index 73fd19acae12..91981b3aaeb0 100644 --- a/src/transformers/models/evolla/configuration_evolla.py +++ b/src/transformers/models/evolla/configuration_evolla.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -99,7 +99,7 @@ def __init__( self.token_dropout = token_dropout -class EvollaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class EvollaConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`EvollaModel`]. It is used to instantiate an Evolla model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -190,6 +190,7 @@ class EvollaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "EvollaModel" sub_configs = {"protein_encoder_config": SaProtConfig} + default_theta = 500000.0 def __init__( self, @@ -250,7 +251,6 @@ def __init__( self.resampler_num_latents = resampler_num_latents self.resampler_ff_mult = resampler_ff_mult self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) # Subconfig if protein_encoder_config is None: diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py index ddf11f3cd242..fbe7de454282 100644 --- a/src/transformers/models/exaone4/configuration_exaone4.py +++ b/src/transformers/models/exaone4/configuration_exaone4.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class Exaone4Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Exaone4Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Exaone4Model`]. It is used to instantiate a EXAONE 4.0 model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -180,7 +180,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index 68dc0d94f456..8c054f693f7a 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -30,7 +30,7 @@ BaseModelOutputWithPast, CausalLMOutputWithPast, ) -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import ( @@ -58,7 +58,7 @@ _CONFIG_FOR_DOC = "Exaone4Config" -class Exaone4Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Exaone4Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Exaone4Model`]. It is used to instantiate a EXAONE 4.0 model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -213,7 +213,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py index 3aa3d29edb30..d8d0266c3bee 100644 --- a/src/transformers/models/falcon/configuration_falcon.py +++ b/src/transformers/models/falcon/configuration_falcon.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class FalconConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class FalconConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`FalconModel`]. It is used to instantiate a Falcon model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -163,7 +163,6 @@ def __init__( self.ffn_hidden_size = ffn_hidden_size self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/falcon_h1/configuration_falcon_h1.py b/src/transformers/models/falcon_h1/configuration_falcon_h1.py index aa0671157ca4..23d5ca53d4cc 100644 --- a/src/transformers/models/falcon_h1/configuration_falcon_h1.py +++ b/src/transformers/models/falcon_h1/configuration_falcon_h1.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class FalconH1Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class FalconH1Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`FalconH1Model`]. It is used to instantiate a FalconH1Model model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -263,7 +263,6 @@ def __init__( self.ssm_out_multiplier = 1.0 self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/flex_olmo/configuration_flex_olmo.py b/src/transformers/models/flex_olmo/configuration_flex_olmo.py index 8cf9263a3d4a..a61148f95ee7 100644 --- a/src/transformers/models/flex_olmo/configuration_flex_olmo.py +++ b/src/transformers/models/flex_olmo/configuration_flex_olmo.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class FlexOlmoConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class FlexOlmoConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`FlexOlmoModel`]. It is used to instantiate an FlexOlmo model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -110,6 +110,7 @@ class FlexOlmoConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "flex_olmo" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = {"num_local_experts": "num_experts"} + default_theta = 500000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k "layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k @@ -176,7 +177,6 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.norm_topk_prob = norm_topk_prob self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/flex_olmo/modular_flex_olmo.py b/src/transformers/models/flex_olmo/modular_flex_olmo.py index 9e1f14490910..e3b24c5d02fe 100644 --- a/src/transformers/models/flex_olmo/modular_flex_olmo.py +++ b/src/transformers/models/flex_olmo/modular_flex_olmo.py @@ -22,7 +22,7 @@ from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring from ...utils.generic import OutputRecorder, check_model_inputs @@ -36,7 +36,7 @@ ) -class FlexOlmoConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class FlexOlmoConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`FlexOlmoModel`]. It is used to instantiate an FlexOlmo model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -121,6 +121,7 @@ class FlexOlmoConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "flex_olmo" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = {"num_local_experts": "num_experts"} + default_theta = 500000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k "layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k @@ -187,7 +188,6 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.norm_topk_prob = norm_topk_prob self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index 19f115d03012..dbe828e01fbe 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -25,7 +25,7 @@ logger = logging.get_logger(__name__) -class FuyuConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class FuyuConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`FuyuForCausalLM`]. It is used to instantiate an Fuyu model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -98,6 +98,7 @@ class FuyuConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "fuyu" sub_configs = {"text_config": AutoConfig} keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 25000.0 def __init__( self, @@ -169,10 +170,7 @@ def __init__( self.attention_dropout = attention_dropout self.image_token_id = image_token_id self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=25000.0, **kwargs) - - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) - self.text_config.rope_parameters["partial_rotary_factor"] = self.rope_parameters["partial_rotary_factor"] + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index 822da444e162..df0a94c014f1 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class GemmaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class GemmaConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate an Gemma model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -153,7 +153,6 @@ def __init__( self.attention_dropout = attention_dropout self.use_bidirectional_attention = use_bidirectional_attention self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index f0ab6fc47d2d..e305ba56a561 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -23,7 +23,7 @@ from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -50,7 +50,7 @@ logger = logging.get_logger(__name__) -class GemmaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class GemmaConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate an Gemma model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -178,7 +178,6 @@ def __init__( self.attention_dropout = attention_dropout self.use_bidirectional_attention = use_bidirectional_attention self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py index 1bae4315d3f6..a26cffab7bca 100644 --- a/src/transformers/models/gemma2/configuration_gemma2.py +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class Gemma2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Gemma2Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Gemma2Model`]. It is used to instantiate an Gemma2 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -181,7 +181,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index 1bf3bf27fa91..b991f2d2fc65 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -29,7 +29,6 @@ from ...modeling_rope_utils import ( ROPE_INIT_FUNCTIONS, RopeParameters, - RotaryEmbeddingConfigMixin, dynamic_rope_update, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS @@ -53,7 +52,7 @@ logger = logging.get_logger(__name__) -class Gemma2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Gemma2Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Gemma2Model`]. It is used to instantiate an Gemma2 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -209,7 +208,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index d8d1553501fc..17c25e7f506a 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -22,7 +22,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging from ..siglip import SiglipVisionConfig @@ -30,7 +30,7 @@ logger = logging.get_logger(__name__) -class Gemma3TextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Gemma3TextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Gemma3TextModel`]. It is used to instantiate an Gemma3Text model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -116,6 +116,7 @@ class Gemma3TextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "gemma3_text" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = {"global": 1_000_000.0, "local": 10_000.0} base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -195,8 +196,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta={"global": 1_000_000.0, "local": 10_000.0}, **kwargs) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -205,7 +204,7 @@ def __init__( **kwargs, ) - def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. 
If we find `rope_parameters` @@ -218,15 +217,15 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): if rope_scaling is not None: self.rope_parameters["full_attention"].update(rope_scaling) self.rope_parameters["full_attention"].setdefault( - "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + "rope_theta", kwargs.pop("rope_theta", self.default_theta["global"]) ) self.rope_parameters["sliding_attention"].setdefault( - "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + "rope_theta", kwargs.pop("rope_local_base_freq", self.default_theta["local"]) ) # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 66c7bb4824e8..2d489d77f5c7 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -144,6 +144,7 @@ class Gemma3TextConfig(Gemma2Config, PreTrainedConfig): """ model_type = "gemma3_text" + default_theta = {"global": 1_000_000.0, "local": 10_000.0} def __init__( self, @@ -209,8 +210,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta={"global": 1_000_000.0, "local": 10_000.0}, **kwargs) - PreTrainedConfig.__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -219,7 +218,7 @@ def __init__( **kwargs, ) - def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. 
If we find `rope_parameters` @@ -232,15 +231,15 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): if rope_scaling is not None: self.rope_parameters["full_attention"].update(rope_scaling) self.rope_parameters["full_attention"].setdefault( - "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + "rope_theta", kwargs.pop("rope_theta", self.default_theta["global"]) ) self.rope_parameters["sliding_attention"].setdefault( - "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + "rope_theta", kwargs.pop("rope_local_base_freq", self.default_theta["local"]) ) # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index 08eb6207e6be..60c709ec55c7 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -23,7 +23,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import is_timm_available, logging, requires_backends @@ -34,7 +34,7 @@ logger = logging.get_logger(__name__) -class Gemma3nTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Gemma3nTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Gemma3nTextModel`]. It is used to instantiate an Gemma3nTextModel model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -143,6 +143,7 @@ class Gemma3nTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "gemma3n_text" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = {"global": 1_000_000.0, "local": 10_000.0} base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -249,7 +250,6 @@ def __init__( ) self.activation_sparsity_pattern = activation_sparsity_pattern self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta={"global": 1_000_000.0, "local": 10_000.0}, **kwargs) super().__init__( pad_token_id=pad_token_id, @@ -258,7 +258,7 @@ def __init__( **kwargs, ) - def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` @@ -271,10 +271,10 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): if rope_scaling is not None: self.rope_parameters["full_attention"].update(rope_scaling) self.rope_parameters["full_attention"].setdefault( - "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + "rope_theta", kwargs.pop("rope_theta", self.default_theta["global"]) ) self.rope_parameters["sliding_attention"].setdefault( - "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + "rope_theta", kwargs.pop("rope_local_base_freq", self.default_theta["local"]) ) # Standardize and validate the correctness of rotary position embeddings parameters diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index 76b81c62b382..a97ef8bceeeb 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -168,6 +168,7 @@ class Gemma3nTextConfig(Gemma2Config, PreTrainedConfig): """ model_type = 
"gemma3n_text" + default_theta = {"global": 1_000_000.0, "local": 10_000.0} def __init__( self, @@ -260,8 +261,6 @@ def __init__( ) self.activation_sparsity_pattern = activation_sparsity_pattern self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta={"global": 1_000_000.0, "local": 10_000.0}, **kwargs) - PreTrainedConfig.__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -269,7 +268,7 @@ def __init__( **kwargs, ) - def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` @@ -282,15 +281,15 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): if rope_scaling is not None: self.rope_parameters["full_attention"].update(rope_scaling) self.rope_parameters["full_attention"].setdefault( - "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + "rope_theta", kwargs.pop("rope_theta", self.default_theta["global"]) ) self.rope_parameters["sliding_attention"].setdefault( - "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + "rope_theta", kwargs.pop("rope_local_base_freq", self.default_theta["local"]) ) # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 4b7eda369f5d..59328a6a4f43 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -17,10 +17,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, 
RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class GlmConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class GlmConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`GlmModel`]. It is used to instantiate an Glm model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -141,8 +141,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index 7535b003982f..a6d0d2bc3d9c 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -17,10 +17,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class Glm4Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Glm4Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Glm4Model`]. It is used to instantiate an Glm4 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -141,8 +141,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py index cb2c762a8c92..573b7fc10f53 100644 --- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py +++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class Glm4MoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Glm4MoeConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Glm4MoeModel`]. It is used to instantiate a Glm4Moe model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -179,8 +179,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py index a5548c70133b..b4a8dc4dc82d 100644 --- a/src/transformers/models/glm4_moe/modular_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py @@ -20,7 +20,7 @@ from torch import nn from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging from ..cohere.modeling_cohere import CohereAttention from ..deepseek_v3.modeling_deepseek_v3 import ( @@ -39,7 +39,7 @@ logger = logging.get_logger(__name__) -class Glm4MoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Glm4MoeConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Glm4MoeModel`]. It is used to instantiate a Glm4Moe model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -193,8 +193,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/glm4v/configuration_glm4v.py b/src/transformers/models/glm4v/configuration_glm4v.py index 7239054465c9..bb5f6e574354 100644 --- a/src/transformers/models/glm4v/configuration_glm4v.py +++ b/src/transformers/models/glm4v/configuration_glm4v.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters class Glm4vVisionConfig(PreTrainedConfig): @@ -117,7 +117,7 @@ def __init__( self.attention_dropout = attention_dropout -class Glm4vTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Glm4vTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a GLM-4.1V model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -233,9 +233,12 @@ def __init__( self.use_cache = use_cache self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, ignore_keys={"mrope_section"}, **kwargs) - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__( + tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope"}, + **kwargs, + ) class Glm4vConfig(PreTrainedConfig): diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 500a09cfd878..aa1ae597aa24 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -31,7 +31,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -154,7 +154,7 @@ def __init__( self.attention_dropout = attention_dropout -class Glm4vTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Glm4vTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Glm4vModel`]. It is used to instantiate a GLM-4.1V model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -270,9 +270,8 @@ def __init__( self.use_cache = use_cache self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, ignore_keys={"mrope_section"}, **kwargs) - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__(tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs) class Glm4vConfig(PreTrainedConfig): diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index ff6028d0159f..8cfd0124277e 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters class Glm4vMoeVisionConfig(PreTrainedConfig): @@ -117,7 +117,7 @@ def __init__( self.attention_dropout = attention_dropout -class Glm4vMoeTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Glm4vMoeTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Glm4vMoeModel`]. It is used to instantiate a GLM-4.5V model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -267,8 +267,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, ignore_keys={"mrope_section"}, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index cab4758e3009..9a10a4e4d9b5 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -213,8 +213,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, ignore_keys={"mrope_section"}, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC # MoE arguments self.moe_intermediate_size = moe_intermediate_size @@ -227,7 +226,9 @@ def __init__( self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = norm_topk_prob self.router_aux_loss_coef = router_aux_loss_coef - PreTrainedConfig.__init__(self, tie_word_embeddings=tie_word_embeddings, **kwargs) + PreTrainedConfig.__init__( + self, tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs + ) class Glm4vMoeConfig(Glm4vConfig): diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index b004eb220eb5..8de9ac83a2b3 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ 
b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class GPTNeoXConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class GPTNeoXConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`GPTNeoXModel`]. It is used to instantiate an GPTNeoX model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -148,9 +148,6 @@ def __init__( self.attention_bias = attention_bias self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 0.25) - if self.hidden_size % self.num_attention_heads != 0: raise ValueError( "The hidden size is not divisible by the number of attention heads! Make sure to update them!" @@ -159,15 +156,17 @@ def __init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs ) - def convert_rope_params_to_dict(self, default_theta=10_000.0, **kwargs): + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or self.rope_parameters self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} # Standardize and validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", default_theta)) + # Model uses non-standard naming for rope params, overwrite! 
+ self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", self.default_theta)) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 0.25) self.standardize_rope_params() - self.validate_rope() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index 4af8c51112c7..93d0e925d1af 100644 --- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class GPTNeoXJapaneseConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class GPTNeoXJapaneseConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`GPTNeoXModelJapanese`]. It is used to instantiate a GPTNeoX model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -116,19 +116,19 @@ def __init__( self.hidden_dropout = hidden_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 1.0) super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - def convert_rope_params_to_dict(self, default_theta=10_000.0, **kwargs): + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or self.rope_parameters self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} # Standardize and validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", default_theta)) + # Model uses non-standard naming for rope params, overwrite! + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", self.default_theta)) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 1.0) self.standardize_rope_params() - self.validate_rope() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs diff --git a/src/transformers/models/gpt_oss/configuration_gpt_oss.py b/src/transformers/models/gpt_oss/configuration_gpt_oss.py index d68dc7f270c8..78beab335b04 100644 --- a/src/transformers/models/gpt_oss/configuration_gpt_oss.py +++ b/src/transformers/models/gpt_oss/configuration_gpt_oss.py @@ -17,10 +17,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class GptOssConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class GptOssConfig(PreTrainedConfig): r""" This will yield a configuration to that of the BERT 
[google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) architecture. @@ -28,6 +28,7 @@ class GptOssConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): """ model_type = "gpt_oss" + default_theta = 150000.0 base_model_pp_plan = { "embed_tokens": (["input_ids"], ["inputs_embeds"]), "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), @@ -110,7 +111,6 @@ def __init__( self.output_router_logits = output_router_logits self.use_cache = use_cache self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=150000, **kwargs) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/granite/configuration_granite.py b/src/transformers/models/granite/configuration_granite.py index 262c2e77dda3..c94c40169e4e 100644 --- a/src/transformers/models/granite/configuration_granite.py +++ b/src/transformers/models/granite/configuration_granite.py @@ -22,14 +22,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class GraniteConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class GraniteConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`GraniteModel`]. It is used to instantiate an Granite model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -176,7 +176,6 @@ def __init__( self.residual_multiplier = residual_multiplier self.attention_multiplier = attention_multiplier self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/granitemoe/configuration_granitemoe.py b/src/transformers/models/granitemoe/configuration_granitemoe.py index a2a69c28130b..55c619c99e72 100644 --- a/src/transformers/models/granitemoe/configuration_granitemoe.py +++ b/src/transformers/models/granitemoe/configuration_granitemoe.py @@ -22,14 +22,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class GraniteMoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class GraniteMoeConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`GraniteMoeModel`]. It is used to instantiate an GraniteMoe model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -173,7 +173,6 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py index ee8b07698b57..08320e9fb513 100644 --- a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py @@ -18,14 +18,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class GraniteMoeHybridConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class GraniteMoeHybridConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`GraniteMoeHybridConfig`]. It is used to instantiate an GraniteMoeHybrid model according to the specified arguments, defining the model architecture. 
@@ -199,7 +199,6 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.shared_intermediate_size = shared_intermediate_size self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) mamba_intermediate = mamba_expand * hidden_size diff --git a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py index cd5bd99d78bd..23f35a0f1989 100644 --- a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py @@ -22,14 +22,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class GraniteMoeSharedConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class GraniteMoeSharedConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`GraniteMoeSharedModel`]. It is used to instantiate an GraniteMoeShared model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -179,7 +179,6 @@ def __init__( # this model has rope embedding type, hardcoded for BC self.position_embedding_type = "rope" self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/helium/configuration_helium.py b/src/transformers/models/helium/configuration_helium.py index 380ec689a1a6..d633b23ccebd 100644 --- a/src/transformers/models/helium/configuration_helium.py +++ b/src/transformers/models/helium/configuration_helium.py @@ -17,10 +17,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class HeliumConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class HeliumConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`HeliumModel`]. It is used to instantiate an Helium model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -93,6 +93,7 @@ class HeliumConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "helium" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 100000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -148,7 +149,6 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=100000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py index 187c9013a5a6..f750e2302d39 100644 --- a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +++ b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class HunYuanDenseV1Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class HunYuanDenseV1Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`HunYuanDenseV1Config`]. It is used to instantiate an HunYuan model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -142,7 +142,6 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py index fd3b4b5d7944..ab6844c8d903 100644 --- a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +++ b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py @@ -17,14 +17,14 @@ from typing import Optional, Union from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class HunYuanMoEV1Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class HunYuanMoEV1Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`HunYuanMoEV1Model`]. It is used to instantiate an HunYuan model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -158,7 +158,6 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py index 47c71e708ad8..3f3c1f632061 100644 --- a/src/transformers/models/jetmoe/configuration_jetmoe.py +++ b/src/transformers/models/jetmoe/configuration_jetmoe.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class JetMoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class JetMoeConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`JetMoeModel`]. It is used to instantiate a JetMoe model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -148,7 +148,6 @@ def __init__( self.eos_token_id = eos_token_id self.rms_norm_eps = rms_norm_eps self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py index b722c1fb70f6..40205002803b 100644 --- a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -24,7 +24,7 @@ logger = logging.get_logger(__name__) -class KyutaiSpeechToTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class KyutaiSpeechToTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`KyutaiSpeechToTextForConditionalGeneration`]. 
It is used to instantiate a Kyutai Speech-to-Text model according to the specified arguments, defining the model @@ -184,7 +184,6 @@ def __init__( self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.sliding_window = sliding_window self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/lfm2/configuration_lfm2.py b/src/transformers/models/lfm2/configuration_lfm2.py index 711e8fdbd128..9b9129455d3a 100644 --- a/src/transformers/models/lfm2/configuration_lfm2.py +++ b/src/transformers/models/lfm2/configuration_lfm2.py @@ -14,10 +14,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class Lfm2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Lfm2Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Lfm2Model`]. It is used to instantiate a LFM2 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -100,6 +100,7 @@ class Lfm2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "lfm2" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 def __init__( self, @@ -155,7 +156,6 @@ def __init__( self.layer_types = ["full_attention" if i in full_attn_idxs else "conv" for i in range(num_hidden_layers)] self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=1000000, **kwargs) tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings) # to fit original config keys super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py b/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py index dd39648bc0a9..b6a1dcb1512a 100644 --- a/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py +++ b/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py @@ -14,10 +14,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class Lfm2MoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Lfm2MoeConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Lfm2MoeModel`]. It is used to instantiate a LFM2 Moe model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -103,6 +103,7 @@ class Lfm2MoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "lfm2_moe" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 def __init__( self, @@ -159,7 +160,6 @@ def __init__( self.layer_types = layer_types self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=1000000, **kwargs) tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings) # to fit original config keys super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index dfdc02f03d91..cc3db887fbb3 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class LlamaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class LlamaConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`LlamaModel`]. It is used to instantiate an LLaMA model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -172,7 +172,6 @@ def __init__( self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/llama4/configuration_llama4.py b/src/transformers/models/llama4/configuration_llama4.py index 1e7a0ba39f20..af6b57805a92 100644 --- a/src/transformers/models/llama4/configuration_llama4.py +++ b/src/transformers/models/llama4/configuration_llama4.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class Llama4VisionConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Llama4VisionConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Llama4VisionModel`]. It is used to instantiate a Llama4 vision model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -126,11 +126,11 @@ def __init__( self.vision_feature_select_strategy = vision_feature_select_strategy self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__(**kwargs) -class Llama4TextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Llama4TextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Llama4TextModel`]. It is used to instantiate a Llama4 text model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -213,6 +213,7 @@ class Llama4TextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "llama4_text" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -337,7 +338,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000, **kwargs) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/longcat_flash/configuration_longcat_flash.py b/src/transformers/models/longcat_flash/configuration_longcat_flash.py index 11324a287a7e..79f2e82daf30 100644 --- a/src/transformers/models/longcat_flash/configuration_longcat_flash.py +++ b/src/transformers/models/longcat_flash/configuration_longcat_flash.py @@ -18,10 +18,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class LongcatFlashConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class LongcatFlashConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`LongcatFlashModel`]. It is used to instantiate a LongCat Flash model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -122,6 +122,7 @@ class LongcatFlashConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "longcat_flash" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 10000000.0 base_model_tp_plan = { "layers.*.self_attn.*.q_b_proj": "colwise", "layers.*.self_attn.*.kv_b_proj": "colwise", @@ -211,10 +212,6 @@ def __init__( self.expert_ffn_hidden_size = expert_ffn_hidden_size self.routed_scaling_factor = routed_scaling_factor self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000000, **kwargs) - for key in ["beta_fast", "beta_slow", "factor"]: - if key in self.rope_parameters: - self.rope_parameters[key] = float(self.rope_parameters[key]) super().__init__( pad_token_id=pad_token_id, @@ -224,5 +221,21 @@ def __init__( **kwargs, ) + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: Optional[set] = None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta)) + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + + # Convert to float because RoPE fn expect a float. 
Models on the hub were saved as int + for key in ["beta_fast", "beta_slow", "factor"]: + if key in self.rope_parameters: + self.rope_parameters[key] = float(self.rope_parameters[key]) + return kwargs + __all__ = ["LongcatFlashConfig"] diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py index 986fda46900d..7869bcfafa37 100644 --- a/src/transformers/models/mimi/configuration_mimi.py +++ b/src/transformers/models/mimi/configuration_mimi.py @@ -20,14 +20,14 @@ import numpy as np from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class MimiConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class MimiConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of an [`MimiModel`]. It is used to instantiate a Mimi model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -222,7 +222,6 @@ def __init__( self.layer_scale_initial_scale = layer_scale_initial_scale self.attention_bias = attention_bias self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) # Handle backward compatibility for frame_rate: # If frame_rate is explicitly provided, use it (backward compatibility) diff --git a/src/transformers/models/minimax/configuration_minimax.py b/src/transformers/models/minimax/configuration_minimax.py index 10c896204f31..7f5e34d71e77 100644 --- a/src/transformers/models/minimax/configuration_minimax.py +++ b/src/transformers/models/minimax/configuration_minimax.py @@ -23,10 +23,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class MiniMaxConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class MiniMaxConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate an MiniMax model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -132,6 +132,7 @@ class MiniMaxConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "minimax" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -229,7 +230,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=1000000, **kwargs) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index f071f991d3ea..d9b428bbce86 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -28,7 +28,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging from ...utils.generic import OutputRecorder, check_model_inputs @@ -51,7 +51,7 @@ logger = logging.get_logger(__name__) -class MiniMaxConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class MiniMaxConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`MiniMaxModel`]. It is used to instantiate an MiniMax model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -157,6 +157,7 @@ class MiniMaxConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "minimax" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -254,7 +255,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=1000000, **kwargs) super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/ministral/configuration_ministral.py b/src/transformers/models/ministral/configuration_ministral.py index 1aef5af2b0e2..0afccfb429d9 100644 --- a/src/transformers/models/ministral/configuration_ministral.py +++ b/src/transformers/models/ministral/configuration_ministral.py @@ -7,10 +7,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class MinistralConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class MinistralConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`MinistralModel`]. It is used to instantiate an Ministral model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -157,7 +157,7 @@ def __init__( ] * num_hidden_layers self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/ministral/modular_ministral.py b/src/transformers/models/ministral/modular_ministral.py index 8c26d5dc6f84..9e3a185f2c25 100644 --- a/src/transformers/models/ministral/modular_ministral.py +++ b/src/transformers/models/ministral/modular_ministral.py @@ -158,7 +158,7 @@ def __init__( ] * num_hidden_layers self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + PreTrainedConfig.__init__( self, pad_token_id=pad_token_id, diff --git a/src/transformers/models/mistral/configuration_mistral.py b/src/transformers/models/mistral/configuration_mistral.py index 0c6cb6e854b4..1c6cb9c4e684 100644 --- a/src/transformers/models/mistral/configuration_mistral.py +++ b/src/transformers/models/mistral/configuration_mistral.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class MistralConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class MistralConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`MistralModel`]. It is used to instantiate an Mistral model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -167,7 +167,6 @@ def __init__( ) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index fda6269afdf0..18e58f5f6484 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class MixtralConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class MixtralConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`MixtralModel`]. It is used to instantiate an Mixtral model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -114,6 +114,7 @@ class MixtralConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "mixtral" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -187,7 +188,6 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.router_jitter_noise = router_jitter_noise self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=1000000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index fa4132cc6926..3760b0b228f9 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ b/src/transformers/models/mllama/configuration_mllama.py @@ -16,7 +16,6 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RotaryEmbeddingConfigMixin from ...utils import logging @@ -138,7 +137,7 @@ def max_aspect_ratio_id(self) -> int: return len(self.supported_aspect_ratios) -class MllamaTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class MllamaTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`MllamaTextModel`]. It is used to instantiate an Mllama text model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -208,6 +207,7 @@ class MllamaTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "mllama_text_model" base_config_key = "text_config" + default_theta = 500000.0 def __init__( self, @@ -248,7 +248,6 @@ def __init__( self.hidden_act = hidden_act self.max_position_embeddings = max_position_embeddings self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=500000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index de4dbf25da3b..e9064615575a 100644 --- a/src/transformers/models/modernbert/configuration_modernbert.py +++ b/src/transformers/models/modernbert/configuration_modernbert.py @@ -22,10 +22,10 @@ from typing import Literal, Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class ModernBertConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class ModernBertConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate an ModernBert model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -130,8 +130,8 @@ class ModernBertConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): ```""" model_type = "modernbert" - attribute_map = {"rope_theta": "global_rope_theta"} keys_to_ignore_at_inference = ["past_key_values"] + default_theta = {"global": 160_000.0, "local": 10_000.0} def __init__( self, @@ -217,8 +217,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta={"global": 160_000.0, "local": 10_000.0}, **kwargs) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -228,7 +226,7 @@ def __init__( **kwargs, ) - def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` @@ -242,10 +240,10 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): self.rope_parameters["full_attention"].update(rope_scaling) self.rope_parameters["sliding_attention"].update(rope_scaling) self.rope_parameters["full_attention"].setdefault( - "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + "rope_theta", kwargs.pop("global_rope_theta", self.default_theta["global"]) ) self.rope_parameters["sliding_attention"].setdefault( - "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + "rope_theta", kwargs.pop("local_rope_theta", self.default_theta["local"]) ) # Standardize and validate the correctness of rotary position embeddings parameters diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index fad3c675444c..49f6244cfb2f 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -35,7 +35,7 
@@ SequenceClassifierOutput, TokenClassifierOutput, ) -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, is_flash_attn_2_available, logging from ...utils.import_utils import is_triton_available @@ -53,7 +53,7 @@ logger = logging.get_logger(__name__) -class ModernBertConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class ModernBertConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`ModernBertModel`]. It is used to instantiate an ModernBert model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -158,8 +158,8 @@ class ModernBertConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): ```""" model_type = "modernbert" - attribute_map = {"rope_theta": "global_rope_theta"} keys_to_ignore_at_inference = ["past_key_values"] + default_theta = {"global": 160_000.0, "local": 10_000.0} def __init__( self, @@ -245,8 +245,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta={"global": 160_000.0, "local": 10_000.0}, **kwargs) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -256,7 +254,7 @@ def __init__( **kwargs, ) - def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. 
If we find `rope_parameters` @@ -270,15 +268,15 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): self.rope_parameters["full_attention"].update(rope_scaling) self.rope_parameters["sliding_attention"].update(rope_scaling) self.rope_parameters["full_attention"].setdefault( - "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + "rope_theta", kwargs.pop("global_rope_theta", self.default_theta["global"]) ) self.rope_parameters["sliding_attention"].setdefault( - "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + "rope_theta", kwargs.pop("local_rope_theta", self.default_theta["local"]) ) # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs def to_dict(self): diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py index 1b47fe02793d..7e35a8bd3c3b 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class ModernBertDecoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class ModernBertDecoderConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`ModernBertDecoderModel`]. It is used to instantiate a ModernBert decoder model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -120,8 +120,8 @@ class ModernBertDecoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): ```""" model_type = "modernbert-decoder" - attribute_map = {"rope_theta": "global_rope_theta"} keys_to_ignore_at_inference = ["past_key_values"] + default_theta = {"global": 160_000.0, "local": 10_000.0} def __init__( self, @@ -196,8 +196,6 @@ def __init__( # NOTE: sliding window numbers matches ModernBERT but is only half of it self.sliding_window = local_attention // 2 if local_attention else -1 self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta={"global": 160_000.0, "local": 10_000.0}, **kwargs) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -207,7 +205,7 @@ def __init__( **kwargs, ) - def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + def convert_rope_params_to_dict(self, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. 
If we find `rope_parameters` @@ -221,10 +219,10 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): self.rope_parameters["full_attention"].update(rope_scaling) self.rope_parameters["sliding_attention"].update(rope_scaling) self.rope_parameters["full_attention"].setdefault( - "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + "rope_theta", kwargs.pop("global_rope_theta", self.default_theta["global"]) ) self.rope_parameters["sliding_attention"].setdefault( - "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + "rope_theta", kwargs.pop("local_rope_theta", self.default_theta["local"]) ) # Standardize and validate the correctness of rotary position embeddings parameters diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index fdf612cc9436..cfff8ec564ad 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -28,7 +28,7 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -46,7 +46,7 @@ logger = logging.get_logger(__name__) -class ModernBertDecoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class ModernBertDecoderConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`ModernBertDecoderModel`]. 
It is used to instantiate a ModernBert decoder model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -141,8 +141,8 @@ class ModernBertDecoderConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): ```""" model_type = "modernbert-decoder" - attribute_map = {"rope_theta": "global_rope_theta"} keys_to_ignore_at_inference = ["past_key_values"] + default_theta = {"global": 160_000.0, "local": 10_000.0} def __init__( self, @@ -217,8 +217,6 @@ def __init__( # NOTE: sliding window numbers matches ModernBERT but is only half of it self.sliding_window = local_attention // 2 if local_attention else -1 self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta={"global": 160_000.0, "local": 10_000.0}, **kwargs) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -228,7 +226,7 @@ def __init__( **kwargs, ) - def convert_rope_params_to_dict(self, default_theta=None, **kwargs): + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. 
If we find `rope_parameters` @@ -242,15 +240,15 @@ def convert_rope_params_to_dict(self, default_theta=None, **kwargs): self.rope_parameters["full_attention"].update(rope_scaling) self.rope_parameters["sliding_attention"].update(rope_scaling) self.rope_parameters["full_attention"].setdefault( - "rope_theta", kwargs.pop("rope_theta", default_theta["global"]) + "rope_theta", kwargs.pop("global_rope_theta", self.default_theta["global"]) ) self.rope_parameters["sliding_attention"].setdefault( - "rope_theta", kwargs.pop("rope_local_base_freq", default_theta["local"]) + "rope_theta", kwargs.pop("local_rope_theta", self.default_theta["local"]) ) # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index cc29ec6e6680..ddc6b3c2ba8b 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ b/src/transformers/models/moonshine/configuration_moonshine.py @@ -21,10 +21,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class MoonshineConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class MoonshineConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`MoonshineModel`]. It is used to instantiate a Moonshine model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -175,8 +175,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.9) + kwargs.setdefault("partial_rotary_factor", 0.9) # assign default for BC super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 2260ac657d6d..717012fd1fc2 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -35,7 +35,7 @@ Seq2SeqLMOutput, Seq2SeqModelOutput, ) -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -47,7 +47,7 @@ logger = logging.get_logger(__name__) -class MoonshineConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class MoonshineConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`MoonshineModel`]. It is used to instantiate a Moonshine model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -198,8 +198,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.9) + kwargs.setdefault("partial_rotary_factor", 0.9) # assign default for BC super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index 6e0800d33b31..f17dd6dcc14b 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -149,7 +149,7 @@ def __init__( super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) -class MoshiConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class MoshiConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`MoshiModel`]. 
It is used to instantiate a Moshi model according to the specified arguments, defining the audio encoder, Moshi depth decoder and Moshi decoder @@ -283,7 +283,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.num_codebooks = num_codebooks self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) audio_encoder_config = kwargs.pop("audio_encoder_config", {}) audio_encoder_model_type = audio_encoder_config.pop("model_type", "mimi") diff --git a/src/transformers/models/nanochat/configuration_nanochat.py b/src/transformers/models/nanochat/configuration_nanochat.py index 998b08b31959..e690e26fe7f5 100644 --- a/src/transformers/models/nanochat/configuration_nanochat.py +++ b/src/transformers/models/nanochat/configuration_nanochat.py @@ -14,7 +14,7 @@ # limitations under the License. from ...configuration_utils import PretrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class NanoChatConfig(PretrainedConfig): @@ -144,6 +144,7 @@ def __init__( self.use_cache = use_cache self.final_logit_softcapping = final_logit_softcapping self.attention_bias = attention_bias + self.rope_parameters = rope_parameters super().__init__( bos_token_id=bos_token_id, @@ -153,12 +154,5 @@ def __init__( **kwargs, ) - # Validate the correctness of rotary position embeddings parameters - # Must be done after super().__init__() to avoid being overridden by kwargs - self.rope_parameters = rope_parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) - __all__ = ["NanoChatConfig"] diff --git a/src/transformers/models/nemotron/configuration_nemotron.py b/src/transformers/models/nemotron/configuration_nemotron.py index 5f58f0672ef7..083efe87beed 100644 --- a/src/transformers/models/nemotron/configuration_nemotron.py +++ 
b/src/transformers/models/nemotron/configuration_nemotron.py @@ -18,14 +18,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class NemotronConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class NemotronConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`NemotronModel`]. It is used to instantiate an Nemotron model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -143,8 +143,7 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/olmo/configuration_olmo.py b/src/transformers/models/olmo/configuration_olmo.py index 065fafddd54a..d72c6ee4d163 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -22,14 +22,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class OlmoConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class OlmoConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`OlmoModel`]. 
It is used to instantiate an OLMo model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -159,7 +159,6 @@ def __init__( self.attention_dropout = attention_dropout self.clip_qkv = clip_qkv self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/olmo2/configuration_olmo2.py b/src/transformers/models/olmo2/configuration_olmo2.py index eafce294cad0..d5a60ea02484 100644 --- a/src/transformers/models/olmo2/configuration_olmo2.py +++ b/src/transformers/models/olmo2/configuration_olmo2.py @@ -27,10 +27,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class Olmo2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Olmo2Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Olmo2Model`]. It is used to instantiate an OLMo2 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -159,7 +159,6 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/olmo3/configuration_olmo3.py b/src/transformers/models/olmo3/configuration_olmo3.py index 42d7873f5b43..5bd057c477a2 100644 --- a/src/transformers/models/olmo3/configuration_olmo3.py +++ b/src/transformers/models/olmo3/configuration_olmo3.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class Olmo3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Olmo3Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Olmo3Model`]. It is used to instantiate an OLMo3 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -170,7 +170,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/olmo3/modular_olmo3.py b/src/transformers/models/olmo3/modular_olmo3.py index c24bc09a0e75..488280bb7c55 100644 --- a/src/transformers/models/olmo3/modular_olmo3.py +++ b/src/transformers/models/olmo3/modular_olmo3.py @@ -25,7 +25,7 @@ from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ..gemma2.modeling_gemma2 import Gemma2RotaryEmbedding @@ -41,7 +41,7 @@ ) -class Olmo3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Olmo3Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Olmo3Model`]. It is used to instantiate an OLMo3 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -186,7 +186,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/olmoe/configuration_olmoe.py b/src/transformers/models/olmoe/configuration_olmoe.py index eb61e69bd69e..38316e05d364 100644 --- a/src/transformers/models/olmoe/configuration_olmoe.py +++ b/src/transformers/models/olmoe/configuration_olmoe.py @@ -14,10 +14,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class OlmoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class OlmoeConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`OlmoeModel`]. It is used to instantiate an OLMoE model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -159,7 +159,6 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.norm_topk_prob = norm_topk_prob self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py index 77e09a13a125..a11b1658811e 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class PersimmonConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class PersimmonConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`PersimmonModel`]. It is used to instantiate an Persimmon model according to the specified arguments, defining the model architecture. 
Instantiating a @@ -119,8 +119,7 @@ def __init__( self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index e073e1d619fa..5af2c3220c2f 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -18,14 +18,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class PhiConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class PhiConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`PhiModel`]. It is used to instantiate an Phi model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -161,8 +161,7 @@ def __init__( self.use_cache = use_cache self.qk_layernorm = qk_layernorm self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index 63aa62b7cfb6..bcdff4058426 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -18,14 +18,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class Phi3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Phi3Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Phi3Model`]. It is used to instantiate a Phi-3 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -164,7 +164,7 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + kwargs.setdefault("partial_rotary_factor", 1.0) # assign default for BC self.sliding_window = sliding_window super().__init__( @@ -184,8 +184,8 @@ def convert_rope_params_to_dict( # Standardize and validate the correctness of rotary position embeddings parameters self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) + self.rope_parameters.setdefault("partial_rotary_factor", kwargs["partial_rotary_factor"]) self.standardize_rope_params() - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 1.0) # For backward compatibility if previous version used "su" or "yarn" rope_parameters_type = self.rope_parameters.get("rope_type", None) diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index 49961cb8b773..c06858bea98a 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters class Phi4MultimodalVisionConfig(PreTrainedConfig): @@ -243,7 +243,7 @@ def __init__( self.nemo_final_size = length -class Phi4MultimodalConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Phi4MultimodalConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Phi4MultimodalModel`]. It is used to instantiate a Phi4Multimodal model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -405,7 +405,7 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + kwargs.setdefault("partial_rotary_factor", 1.0) # assign default for BC self.sliding_window = sliding_window super().__init__( @@ -425,8 +425,8 @@ def convert_rope_params_to_dict( # Standardize and validate the correctness of rotary position embeddings parameters self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) + self.rope_parameters.setdefault("partial_rotary_factor", kwargs["partial_rotary_factor"]) self.standardize_rope_params() - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 1.0) # For backward compatibility if previous version used "su" or "yarn" rope_parameters_type = self.rope_parameters.get("rope_type", None) diff --git a/src/transformers/models/phimoe/configuration_phimoe.py b/src/transformers/models/phimoe/configuration_phimoe.py index 57f40cd3b690..9c89b8036827 100644 --- a/src/transformers/models/phimoe/configuration_phimoe.py +++ b/src/transformers/models/phimoe/configuration_phimoe.py @@ -18,14 +18,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class PhimoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class PhimoeConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`PhimoeModel`]. It is used to instantiate a Phi-moe model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -110,6 +110,7 @@ class PhimoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "phimoe" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 def __init__( self, @@ -168,7 +169,6 @@ def __init__( self.router_jitter_noise = router_jitter_noise self.input_jitter_noise = input_jitter_noise self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=1000000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index f1de0f52e22d..89586615c4a2 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -16,14 +16,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class PixtralVisionConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class PixtralVisionConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`PixtralVisionModel`]. It is used to instantiate an Pixtral vision encoder according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -102,7 +102,7 @@ def __init__( self.head_dim = hidden_size // num_attention_heads self.initializer_range = initializer_range self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2/configuration_qwen2.py b/src/transformers/models/qwen2/configuration_qwen2.py index 893c402e00d2..ffb46ca932aa 100644 --- a/src/transformers/models/qwen2/configuration_qwen2.py +++ b/src/transformers/models/qwen2/configuration_qwen2.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class Qwen2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen2Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen2Model`]. It is used to instantiate a Qwen2 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -169,7 +169,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index 5cf841a78e40..2fb4c25e88e4 100644 --- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -211,7 +211,7 @@ def __init__( self.output_dim = output_dim -class Qwen2_5OmniTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen2_5OmniTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen2_5OmniThinkerForConditionalGeneration`]. It is used to instantiate an Qwen2.5-Omni-Thinker model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -291,6 +291,7 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "qwen2_5_omni_text" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 # Default tensor parallel plan for base model `Qwen25OmniText` base_model_tp_plan = { @@ -362,7 +363,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=1000000, ignore_keys={"mrope_section"}, **kwargs) super().__init__( tie_word_embeddings=tie_word_embeddings, **kwargs, @@ -489,7 +489,7 @@ def __init__( super().__init__(**kwargs) -class Qwen2_5OmniTalkerConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen2_5OmniTalkerConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen2_5OmniTalkerForConditionalGeneration`]. It is used to instantiate an Qwen2.5-Omni-Talker model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -608,6 +608,7 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): ```""" model_type = "qwen2_5_omni_talker" + default_theta = 1000000.0 attribute_map = { "image_token_id": "image_token_index", "video_token_id": "video_token_index", @@ -711,12 +712,11 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=1000000, ignore_keys={"mrope_section"}, **kwargs) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) -class Qwen2_5OmniDiTConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen2_5OmniDiTConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of the Qwen2_5OmniToken2WavDiT used in the Qwen2.5-Omni-Token2Wav model. 
It defines the architecture of the DiT model, which is used for generating mel-spectrograms from tokens. @@ -813,7 +813,7 @@ def __init__( self.enc_res2net_scale = enc_res2net_scale self.enc_se_channels = enc_se_channels self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index f7de647f8a37..44c71c5eb0fa 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -45,7 +45,7 @@ from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...generation import GenerationMixin from ...modeling_outputs import BaseModelOutput, ModelOutput -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import ( @@ -245,7 +245,7 @@ def __init__( del self.encoder_layerdrop -class Qwen2_5OmniTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen2_5OmniTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen2_5OmniThinkerForConditionalGeneration`]. It is used to instantiate an Qwen2.5-Omni-Thinker model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -325,6 +325,7 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "qwen2_5_omni_text" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 # Default tensor parallel plan for base model `Qwen25OmniText` base_model_tp_plan = { @@ -396,9 +397,9 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=1000000, ignore_keys={"mrope_section"}, **kwargs) super().__init__( tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope_section"}, **kwargs, ) @@ -523,7 +524,7 @@ def __init__( super().__init__(**kwargs) -class Qwen2_5OmniTalkerConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen2_5OmniTalkerConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen2_5OmniTalkerForConditionalGeneration`]. It is used to instantiate an Qwen2.5-Omni-Talker model according to the specified arguments, defining the model architecture.
Instantiating a configuration @@ -642,6 +643,7 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): ```""" model_type = "qwen2_5_omni_talker" + default_theta = 1000000.0 attribute_map = { "image_token_id": "image_token_index", "video_token_id": "video_token_index", @@ -745,12 +747,10 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=1000000, ignore_keys={"mrope_section"}, **kwargs) + super().__init__(tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope_section"}, **kwargs) - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) - -class Qwen2_5OmniDiTConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen2_5OmniDiTConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of the Qwen2_5OmniToken2WavDiT used in the Qwen2.5-Omni-Token2Wav model. It defines the architecture of the DiT model, which is used for generating mel-spectrograms from tokens.
@@ -847,7 +847,7 @@ def __init__( self.enc_res2net_scale = enc_res2net_scale self.enc_se_channels = enc_se_channels self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index 934e7b0ebaa4..e474080a711e 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -28,7 +28,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters class Qwen2_5_VLVisionConfig(PreTrainedConfig): @@ -71,7 +71,7 @@ def __init__( self.initializer_range = initializer_range -class Qwen2_5_VLTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen2_5_VLTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen2_5_VLTextModel`]. It is used to instantiate a Qwen2-VL model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -151,6 +151,7 @@ class Qwen2_5_VLTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "qwen2_5_vl_text" base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 # Default tensor parallel plan for base model `Qwen2_5_VL` base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -224,8 +225,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=1000000, ignore_keys={"mrope_section"}, **kwargs) - super().__init__( tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, diff --git a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py index d97ca8ffe59a..567bc0d66dd5 100644 --- a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class Qwen2MoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen2MoeConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen2MoeModel`]. It is used to instantiate a Qwen2MoE model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -207,7 +207,6 @@ def __init__( layer_type_validation(self.layer_types) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index a00aa3b887f1..b50ffdef6c5f 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -59,7 +59,7 @@ def __init__( self.initializer_range = initializer_range -class Qwen2VLTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen2VLTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen2VLTextModel`]. It is used to instantiate a Qwen2-VL model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -139,6 +139,7 @@ class Qwen2VLTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "qwen2_vl_text" base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 # Default tensor parallel plan for base model `Qwen2VL` base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -212,13 +213,12 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=1000000, ignore_keys={"mrope_section"}, **kwargs) - super().__init__( tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, + ignore_keys_at_rope_validation={"mrope_section"}, **kwargs, ) diff --git a/src/transformers/models/qwen3/configuration_qwen3.py b/src/transformers/models/qwen3/configuration_qwen3.py index a108700133a8..f3e3caf4061d 100644 --- a/src/transformers/models/qwen3/configuration_qwen3.py +++ b/src/transformers/models/qwen3/configuration_qwen3.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class Qwen3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen3Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen3Model`]. It is used to instantiate a Qwen3 model according to the specified arguments, defining the model architecture.
Instantiating a configuration @@ -177,7 +177,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py index 3ceb04c7199c..2d0be9fbff12 100644 --- a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class Qwen3MoeConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen3MoeConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen3MoeModel`]. It is used to instantiate a Qwen3MoE model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -180,7 +180,6 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) # MoE arguments self.decoder_sparse_step = decoder_sparse_step diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py b/src/transformers/models/qwen3_next/configuration_qwen3_next.py index 1456c18105cb..b6f8ea322905 100644 --- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py +++ b/src/transformers/models/qwen3_next/configuration_qwen3_next.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class Qwen3NextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen3NextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen3NextModel`]. It is used to instantiate a Qwen3-Next model according to the specified arguments, defining the model architecture. 
@@ -199,8 +199,7 @@ def __init__( self.attention_dropout = attention_dropout self.head_dim = head_dim self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.25) + kwargs.setdefault("partial_rotary_factor", 0.25) # assign default for BC self.layer_types = layer_types if self.layer_types is None: diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index ed29ec68420a..4015bf442bfc 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -171,7 +171,7 @@ def __init__( self.deepstack_visual_indexes = deepstack_visual_indexes -class Qwen3OmniMoeTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen3OmniMoeTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen3OmniMoeTextModel`]. It is used to instantiate a Qwen3OmniMoeText model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -446,7 +446,7 @@ def __init__( self.video_token_id = video_token_id -class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen3OmniMoeTalkerCodePredictorConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen3OmniMoeTalkerCodePredictorModel`]. 
It is used to instantiate a Qwen3OmniMoeTalkerCodePredictor model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -596,7 +596,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( tie_word_embeddings=tie_word_embeddings, @@ -605,7 +604,7 @@ def __init__( self.num_code_groups = num_code_groups -class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen3OmniMoeTalkerTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen3OmniMoeTalkerTextModel`]. It is used to instantiate a Qwen3OmniMoeTalkerText model according to the specified arguments, defining the model architecture. Instantiating a configuration @@ -759,7 +758,6 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -913,7 +911,7 @@ def __init__( super().__init__(**kwargs) -class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen3OmniMoeCode2WavConfig`]. It is used to instantiate a Qwen3-Omni code-to-waveform decoder, responsible for converting discrete audio codes into high-fidelity waveforms. 
@@ -1020,7 +1018,7 @@ def __init__( self.decoder_dim = decoder_dim self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__(**kwargs) @property diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index 4b5654af4bc2..d061aaf5e321 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -42,7 +42,7 @@ MoeCausalLMOutputWithPast, MoeModelOutputWithPast, ) -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import PreTrainedModel from ...processing_utils import ProcessorMixin, Unpack from ...tokenization_utils_base import TextInput @@ -156,7 +156,7 @@ class Qwen3OmniMoeVisionEncoderConfig(Qwen3VLMoeVisionConfig): pass -class Qwen3OmniMoeTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen3OmniMoeTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen3OmniMoeTextModel`]. It is used to instantiate a Qwen3OmniMoeText model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -246,6 +246,7 @@ class Qwen3OmniMoeTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "qwen3_omni_moe_text" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 # Default tensor parallel plan for base model `Qwen3OmniMoeText` base_model_tp_plan = { @@ -310,9 +311,6 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict( - default_theta=1000000, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}, **kwargs - ) # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -326,6 +324,7 @@ def __init__( super().__init__( tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope_section", "interleaved", "mrope_interleaved"}, **kwargs, ) @@ -672,7 +671,7 @@ def __init__( super().__init__(**kwargs) -class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen3OmniMoeCode2WavConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen3OmniMoeCode2WavConfig`]. It is used to instantiate a Qwen3-Omni code-to-waveform decoder, responsible for converting discrete audio codes into high-fidelity waveforms. 
@@ -779,7 +778,7 @@ def __init__( self.decoder_dim = decoder_dim self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) + super().__init__(**kwargs) @property diff --git a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py index 227e8454d03e..ef4791135e12 100644 --- a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters class Qwen3VLVisionConfig(PreTrainedConfig): @@ -62,7 +62,7 @@ def __init__( self.deepstack_visual_indexes = deepstack_visual_indexes -class Qwen3VLTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen3VLTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen3VLTextModel`]. It is used to instantiate a Qwen3-VL model according to the specified arguments, defining the model architecture. 
Instantiating a configuration diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index e14419657a2c..82b385c53744 100644 --- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -30,7 +30,7 @@ from ...masking_utils import create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin, dynamic_rope_update +from ...modeling_rope_utils import RopeParameters, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import ProcessingKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -103,7 +103,7 @@ def __init__( self.deepstack_visual_indexes = deepstack_visual_indexes -class Qwen3VLTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen3VLTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen3VLTextModel`]. It is used to instantiate a Qwen3-VL model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -171,6 +171,7 @@ class Qwen3VLTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "qwen3_vl_text" base_config_key = "text_config" + default_theta = 500000.0 def __init__( self, @@ -212,11 +213,12 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict( - default_theta=500000, ignore_keys={"mrope_section", "mrope_interleaved"}, **kwargs - ) - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__( + tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope_section", "mrope_interleaved"}, + **kwargs, + ) class Qwen3VLConfig(PreTrainedConfig): diff --git a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py index 617da3ad9886..075f357db772 100644 --- a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class Qwen3VLMoeTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen3VLMoeTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen3VLMoeTextModel`]. It is used to instantiate a Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. 
Instantiating a configuration diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py index 21f59b7d743b..1186b8433cf4 100644 --- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py @@ -23,7 +23,7 @@ from ...activations import ACT2FN from ...cache_utils import Cache from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -47,7 +47,7 @@ logger = logging.get_logger(__name__) -class Qwen3VLMoeTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Qwen3VLMoeTextConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Qwen3VLMoeTextModel`]. It is used to instantiate a Qwen3-VL-MOE model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -128,6 +128,7 @@ class Qwen3VLMoeTextConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "qwen3_vl_moe_text" base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 # Default tensor parallel plan for base model `Qwen3VLMoe` base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -189,9 +190,6 @@ def __init__( self.attention_dropout = attention_dropout self.head_dim = head_dim or hidden_size // num_attention_heads self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict( - default_theta=500000, ignore_keys={"mrope_section", "mrope_interleaved"}, **kwargs - ) # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -200,7 +198,11 @@ def __init__( self.num_experts = num_experts self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__( + tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope_section", "mrope_interleaved"}, + **kwargs, + ) class Qwen3VLMoeVisionConfig(Qwen3VLVisionConfig): diff --git a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py index b172b1897686..ce5c63210478 100644 --- a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class RecurrentGemmaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class RecurrentGemmaConfig(PreTrainedConfig): r""" This is the configuration class to 
store the configuration of a [`RecurrentGemmaModel`]. It is used to instantiate a RecurrentGemma model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -147,8 +147,7 @@ def __init__( self.w_init_variance_scale = w_init_variance_scale self.final_w_init_variance_scale = 2.0 / self.num_hidden_layers self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.5) + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/seed_oss/configuration_seed_oss.py b/src/transformers/models/seed_oss/configuration_seed_oss.py index 28e41e4c3f42..63a5c20c2858 100644 --- a/src/transformers/models/seed_oss/configuration_seed_oss.py +++ b/src/transformers/models/seed_oss/configuration_seed_oss.py @@ -16,10 +16,10 @@ from typing import Optional from transformers.configuration_utils import PreTrainedConfig -from transformers.modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from transformers.modeling_rope_utils import RopeParameters -class SeedOssConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class SeedOssConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`SeedOssModel`]. It is used to instantiate an SeedOss model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -171,7 +171,6 @@ def __init__( self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/smollm3/configuration_smollm3.py b/src/transformers/models/smollm3/configuration_smollm3.py index f6c699f04d63..03701376dc26 100644 --- a/src/transformers/models/smollm3/configuration_smollm3.py +++ b/src/transformers/models/smollm3/configuration_smollm3.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class SmolLM3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class SmolLM3Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`SmolLM3Model`]. It is used to instantiate a SmolLM3 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -108,6 +108,7 @@ class SmolLM3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "smollm3" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 2000000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -196,8 +197,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=2000000, **kwargs) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/smollm3/modular_smollm3.py b/src/transformers/models/smollm3/modular_smollm3.py index c68f2c1c15d4..c72abf8e1b96 100644 --- a/src/transformers/models/smollm3/modular_smollm3.py +++ b/src/transformers/models/smollm3/modular_smollm3.py @@ -21,7 +21,7 @@ from ...cache_utils import Cache from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...modeling_flash_attention_utils import FlashAttentionKwargs -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import logging @@ -42,7 +42,7 @@ logger = logging.get_logger(__name__) -class SmolLM3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class SmolLM3Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`SmolLM3Model`]. It is used to instantiate a SmolLM3 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -125,6 +125,7 @@ class SmolLM3Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): model_type = "smollm3" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 2000000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -213,8 +214,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=2000000, **kwargs) - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index 907c28c4b56b..4e06ef7c4edd 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class StableLmConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class StableLmConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`~StableLmModel`]. 
It is used to instantiate an StableLM model according to the specified arguments, defining the model @@ -146,8 +146,7 @@ def __init__( self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) - self.rope_parameters["partial_rotary_factor"] = kwargs.pop("partial_rotary_factor", 0.25) + kwargs.setdefault("partial_rotary_factor", 0.25) # assign default for BC super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/starcoder2/configuration_starcoder2.py b/src/transformers/models/starcoder2/configuration_starcoder2.py index 4d24650c1f54..d0fcfea44b42 100644 --- a/src/transformers/models/starcoder2/configuration_starcoder2.py +++ b/src/transformers/models/starcoder2/configuration_starcoder2.py @@ -17,14 +17,14 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters from ...utils import logging logger = logging.get_logger(__name__) -class Starcoder2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Starcoder2Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a Starcoder2 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -156,7 +156,6 @@ def __init__( self.residual_dropout = residual_dropout self.embedding_dropout = embedding_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/t5gemma/configuration_t5gemma.py b/src/transformers/models/t5gemma/configuration_t5gemma.py index 4a513c04341e..93883844a1eb 100644 --- a/src/transformers/models/t5gemma/configuration_t5gemma.py +++ b/src/transformers/models/t5gemma/configuration_t5gemma.py @@ -22,10 +22,10 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class T5GemmaModuleConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class T5GemmaModuleConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`T5GemmaModuleModel`]. It is used to instantiate an T5GemmaModule model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -177,7 +177,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/vaultgemma/configuration_vaultgemma.py b/src/transformers/models/vaultgemma/configuration_vaultgemma.py index 9b83891bc688..a4dc54e3f028 100644 --- a/src/transformers/models/vaultgemma/configuration_vaultgemma.py +++ b/src/transformers/models/vaultgemma/configuration_vaultgemma.py @@ -22,10 +22,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class VaultGemmaConfig(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class VaultGemmaConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`VaultGemmaModel`]. It is used to instantiate an VaultGemma model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -177,7 +177,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10_000.0, **kwargs) + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/zamba2/configuration_zamba2.py b/src/transformers/models/zamba2/configuration_zamba2.py index 9ae04a3f3a20..69cfd2404f0a 100644 --- a/src/transformers/models/zamba2/configuration_zamba2.py +++ b/src/transformers/models/zamba2/configuration_zamba2.py @@ -23,10 +23,10 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin +from ...modeling_rope_utils import RopeParameters -class Zamba2Config(PreTrainedConfig, RotaryEmbeddingConfigMixin): +class Zamba2Config(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`Zamba2Model`]. It is used to instantiate a Zamba2 model according to the specified arguments, defining the model architecture. 
Instantiating a configuration @@ -190,7 +190,6 @@ def __init__( self.use_mem_rope = use_mem_rope self.use_long_context = use_long_context self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict(default_theta=10000, **kwargs) self.mamba_d_state = mamba_d_state self.mamba_d_conv = mamba_d_conv From 0bb5402ba67853c02169a00288265686b32af937 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 28 Nov 2025 11:22:46 +0100 Subject: [PATCH 21/23] fix copies --- src/transformers/models/cwm/configuration_cwm.py | 2 +- src/transformers/models/gemma3/configuration_gemma3.py | 4 ++-- .../models/gemma3n/configuration_gemma3n.py | 7 +++---- src/transformers/models/glm4v/configuration_glm4v.py | 6 +----- .../models/glm4v_moe/configuration_glm4v_moe.py | 2 +- .../models/modernbert/configuration_modernbert.py | 4 ++-- .../configuration_modernbert_decoder.py | 4 ++-- .../models/qwen2_5_omni/configuration_qwen2_5_omni.py | 4 ++-- .../models/qwen2_5_vl/configuration_qwen2_5_vl.py | 1 + .../qwen3_omni_moe/configuration_qwen3_omni_moe.py | 5 ++--- .../models/qwen3_vl/configuration_qwen3_vl.py | 10 ++++++---- .../models/qwen3_vl_moe/configuration_qwen3_vl_moe.py | 10 ++++++---- 12 files changed, 29 insertions(+), 30 deletions(-) diff --git a/src/transformers/models/cwm/configuration_cwm.py b/src/transformers/models/cwm/configuration_cwm.py index b1790aa10696..6c482fd4fc65 100644 --- a/src/transformers/models/cwm/configuration_cwm.py +++ b/src/transformers/models/cwm/configuration_cwm.py @@ -91,7 +91,6 @@ class CwmConfig(PreTrainedConfig): model_type = "cwm" keys_to_ignore_at_inference = ["past_key_values"] - default_theta = 1_000_000.0 # Default tensor parallel plan for base model `CwmModel` base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -107,6 +106,7 @@ class CwmConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + default_theta = 1_000_000.0 def __init__( 
self, diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index 17c25e7f506a..884020640c9b 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -116,7 +116,6 @@ class Gemma3TextConfig(PreTrainedConfig): model_type = "gemma3_text" keys_to_ignore_at_inference = ["past_key_values"] - default_theta = {"global": 1_000_000.0, "local": 10_000.0} base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -131,6 +130,7 @@ class Gemma3TextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + default_theta = {"global": 1_000_000.0, "local": 10_000.0} def __init__( self, @@ -204,7 +204,7 @@ def __init__( **kwargs, ) - def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation, **kwargs): + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. 
If we find `rope_parameters` diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index 60c709ec55c7..b7415b21877d 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -143,7 +143,6 @@ class Gemma3nTextConfig(PreTrainedConfig): model_type = "gemma3n_text" keys_to_ignore_at_inference = ["past_key_values"] - default_theta = {"global": 1_000_000.0, "local": 10_000.0} base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -158,6 +157,7 @@ class Gemma3nTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + default_theta = {"global": 1_000_000.0, "local": 10_000.0} def __init__( self, @@ -250,7 +250,6 @@ def __init__( ) self.activation_sparsity_pattern = activation_sparsity_pattern self.rope_parameters = rope_parameters - super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -258,7 +257,7 @@ def __init__( **kwargs, ) - def convert_rope_params_to_dict(self, **kwargs): + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. 
If we find `rope_parameters` @@ -279,7 +278,7 @@ def convert_rope_params_to_dict(self, **kwargs): # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs diff --git a/src/transformers/models/glm4v/configuration_glm4v.py b/src/transformers/models/glm4v/configuration_glm4v.py index bb5f6e574354..35c29f07246d 100644 --- a/src/transformers/models/glm4v/configuration_glm4v.py +++ b/src/transformers/models/glm4v/configuration_glm4v.py @@ -234,11 +234,7 @@ def __init__( self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - super().__init__( - tie_word_embeddings=tie_word_embeddings, - ignore_keys_at_rope_validation={"mrope"}, - **kwargs, - ) + super().__init__(tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs) class Glm4vConfig(PreTrainedConfig): diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index 8cfd0124277e..20e4f3ad492c 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -280,7 +280,7 @@ def __init__( self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = norm_topk_prob self.router_aux_loss_coef = router_aux_loss_coef - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__(tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs) class Glm4vMoeConfig(PreTrainedConfig): diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index e9064615575a..80e6c19092c6 100644 --- a/src/transformers/models/modernbert/configuration_modernbert.py +++ 
b/src/transformers/models/modernbert/configuration_modernbert.py @@ -226,7 +226,7 @@ def __init__( **kwargs, ) - def convert_rope_params_to_dict(self, **kwargs): + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` @@ -248,7 +248,7 @@ def convert_rope_params_to_dict(self, **kwargs): # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs def to_dict(self): diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py index 7e35a8bd3c3b..aaca8cef86c0 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -205,7 +205,7 @@ def __init__( **kwargs, ) - def convert_rope_params_to_dict(self, **kwargs): + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. 
If we find `rope_parameters` @@ -227,7 +227,7 @@ def convert_rope_params_to_dict(self, **kwargs): # Standardize and validate the correctness of rotary position embeddings parameters self.standardize_rope_params() - self.validate_rope() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index 2fb4c25e88e4..6a23e0668083 100644 --- a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -365,6 +365,7 @@ def __init__( self.rope_parameters = rope_parameters super().__init__( tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope"}, **kwargs, ) @@ -712,8 +713,7 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) self.rope_parameters = rope_parameters - - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__(tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs) class Qwen2_5OmniDiTConfig(PreTrainedConfig): diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index e474080a711e..6aaecc20c40f 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -230,6 +230,7 @@ def __init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, + ignore_keys_at_rope_validation={"mrope"}, **kwargs, ) diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index 4015bf442bfc..43e6250c0fd4 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ 
b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -261,6 +261,7 @@ class Qwen3OmniMoeTextConfig(PreTrainedConfig): model_type = "qwen3_omni_moe_text" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 # Default tensor parallel plan for base model `Qwen3OmniMoeText` base_model_tp_plan = { @@ -325,9 +326,6 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict( - default_theta=1000000, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}, **kwargs - ) # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -341,6 +339,7 @@ def __init__( super().__init__( tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope_section", "interleaved", "mrope_interleaved"}, **kwargs, ) diff --git a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py index ef4791135e12..cf6f17364672 100644 --- a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py @@ -130,6 +130,7 @@ class Qwen3VLTextConfig(PreTrainedConfig): model_type = "qwen3_vl_text" base_config_key = "text_config" + default_theta = 500000.0 def __init__( self, @@ -171,11 +172,12 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict( - default_theta=500000, ignore_keys={"mrope_section", "mrope_interleaved"}, **kwargs - ) - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__( + tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope_section", "mrope_interleaved"}, + **kwargs, + ) class Qwen3VLConfig(PreTrainedConfig): diff --git 
a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py index 075f357db772..bdf9d32c57cc 100644 --- a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py @@ -106,6 +106,7 @@ class Qwen3VLMoeTextConfig(PreTrainedConfig): model_type = "qwen3_vl_moe_text" base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 # Default tensor parallel plan for base model `Qwen3VLMoe` base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -167,9 +168,6 @@ def __init__( self.attention_dropout = attention_dropout self.head_dim = head_dim or hidden_size // num_attention_heads self.rope_parameters = rope_parameters - kwargs = self.convert_rope_params_to_dict( - default_theta=500000, ignore_keys={"mrope_section", "mrope_interleaved"}, **kwargs - ) # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -178,7 +176,11 @@ def __init__( self.num_experts = num_experts self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__( + tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope_section", "mrope_interleaved"}, + **kwargs, + ) class Qwen3VLMoeVisionConfig(PreTrainedConfig): From 3e18fd38e8752bf387dba8e779f2e8fa5db53e61 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 28 Nov 2025 11:42:07 +0100 Subject: [PATCH 22/23] fix a few tests --- src/transformers/configuration_utils.py | 5 ++- src/transformers/modeling_rope_utils.py | 1 + .../qwen2_5_vl/configuration_qwen2_5_vl.py | 8 ++-- .../models/qwen2_vl/configuration_qwen2_vl.py | 8 ++-- tests/causal_lm_tester.py | 39 ++++++++++++++++--- 5 files changed, 44 insertions(+), 17 deletions(-) diff --git a/src/transformers/configuration_utils.py 
b/src/transformers/configuration_utils.py index f9e3d97a3c28..d93f5a6e5626 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -264,7 +264,10 @@ def __init__( # BC for rotary embeddings. We will pop out legacy keys from kwargs and rename to new format if hasattr(self, "rope_parameters"): - kwargs = self.convert_rope_params_to_dict(**kwargs) + ignore_keys_at_rope_validation = kwargs.pop("ignore_keys_at_rope_validation", None) + kwargs = self.convert_rope_params_to_dict( + ignore_keys_at_rope_validation=ignore_keys_at_rope_validation, **kwargs + ) # Attributes common for all models self.return_dict = return_dict diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index 01880039deaa..de27e5f8bd20 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -642,6 +642,7 @@ def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: Optional[s self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta)) if "partial_rotary_factor" in kwargs: self.rope_parameters.setdefault("partial_rotary_factor", kwargs["partial_rotary_factor"]) + ignore_keys_at_rope_validation = {"partial_rotary_factor"} self.standardize_rope_params() self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index 6aaecc20c40f..084b4d8c9ce6 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -234,19 +234,17 @@ def __init__( **kwargs, ) - def convert_rope_params_to_dict( - self, default_theta: int | float = 10_000.0, ignore_keys: Optional[set] = None, **kwargs - ): + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: Optional[set] = None, **kwargs): rope_scaling = 
kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or self.rope_parameters self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} # Standardize and validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta)) if self.rope_parameters.get("rope_type", self.rope_parameters.get("type")) == "mrope": self.rope_parameters["rope_type"] = "default" self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys) + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index b50ffdef6c5f..e4578375036f 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -222,19 +222,17 @@ def __init__( **kwargs, ) - def convert_rope_params_to_dict( - self, default_theta: int | float = 10_000.0, ignore_keys: Optional[set] = None, **kwargs - ): + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: Optional[set] = None, **kwargs): rope_scaling = kwargs.pop("rope_scaling", None) self.rope_parameters = rope_scaling or self.rope_parameters self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} # Standardize and validate the correctness of rotary position embeddings parameters - self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta)) if self.rope_parameters.get("rope_type", self.rope_parameters.get("type")) == "mrope": self.rope_parameters["rope_type"] = "default" self.standardize_rope_params() - self.validate_rope(ignore_keys=ignore_keys) + 
self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) return kwargs diff --git a/tests/causal_lm_tester.py b/tests/causal_lm_tester.py index 1890ff42083d..cc5095e69ce0 100644 --- a/tests/causal_lm_tester.py +++ b/tests/causal_lm_tester.py @@ -433,11 +433,14 @@ def test_model_rope_scaling_from_config(self, scaling_type): if not _config_supports_rope_scaling(config): self.skipTest("This model does not support RoPE scaling") + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) short_input = ids_tensor([1, 10], config.vocab_size) long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) set_seed(42) # Fixed seed at init time so the two models get the same random weights - _set_config_rope_params(config, {"rope_type": "default", "rope_theta": 10_000.0, "partial_rotary_factor": 1.0}) + _set_config_rope_params( + config, {"rope_type": "default", "rope_theta": 10_000.0, "partial_rotary_factor": partial_rotary_factor} + ) original_model = self.model_tester_class.base_model_class(config) original_model.to(torch_device) original_model.eval() @@ -446,7 +449,13 @@ def test_model_rope_scaling_from_config(self, scaling_type): set_seed(42) # Fixed seed at init time so the two models get the same random weights _set_config_rope_params( - config, {"rope_type": scaling_type, "factor": 10.0, "rope_theta": 10_000.0, "partial_rotary_factor": 1.0} + config, + { + "rope_type": scaling_type, + "factor": 10.0, + "rope_theta": 10_000.0, + "partial_rotary_factor": partial_rotary_factor, + }, ) scaled_model = self.model_tester_class.base_model_class(config) scaled_model.to(torch_device) @@ -487,6 +496,7 @@ def test_model_rope_scaling_frequencies(self): scaling_factor = 10 short_input_length = 10 + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) long_input_length = int(config.max_position_embeddings * 1.5) # Inputs @@ -499,7 +509,9 @@ def test_model_rope_scaling_frequencies(self): 
position_ids_long = position_ids_long.unsqueeze(0) # Sanity check original RoPE - _set_config_rope_params(config, {"rope_type": "default", "rope_theta": 10_000.0, "partial_rotary_factor": 1.0}) + _set_config_rope_params( + config, {"rope_type": "default", "rope_theta": 10_000.0, "partial_rotary_factor": partial_rotary_factor} + ) original_rope = rope_class(config=config).to(torch_device) original_cos_short, original_sin_short = original_rope(x, position_ids_short) original_cos_long, original_sin_long = original_rope(x, position_ids_long) @@ -510,7 +522,12 @@ def test_model_rope_scaling_frequencies(self): # New position "x" should match original position with index "x/scaling_factor" _set_config_rope_params( config, - {"rope_type": "linear", "factor": scaling_factor, "rope_theta": 10_000.0, "partial_rotary_factor": 1.0}, + { + "rope_type": "linear", + "factor": scaling_factor, + "rope_theta": 10_000.0, + "partial_rotary_factor": partial_rotary_factor, + }, ) linear_scaling_rope = rope_class(config=config).to(torch_device) linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) @@ -527,7 +544,12 @@ def test_model_rope_scaling_frequencies(self): # with scaling_factor (or that `inv_freq` decreases) _set_config_rope_params( config, - {"rope_type": "dynamic", "factor": scaling_factor, "rope_theta": 10_000.0, "partial_rotary_factor": 1.0}, + { + "rope_type": "dynamic", + "factor": scaling_factor, + "rope_theta": 10_000.0, + "partial_rotary_factor": partial_rotary_factor, + }, ) ntk_scaling_rope = rope_class(config=config).to(torch_device) ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) @@ -544,7 +566,12 @@ def test_model_rope_scaling_frequencies(self): # Scaling should be over the entire input _set_config_rope_params( config, - {"rope_type": "yarn", "factor": scaling_factor, "rope_theta": 10_000.0, "partial_rotary_factor": 1.0}, + { + "rope_type": "yarn", + "factor": scaling_factor, + "rope_theta": 10_000.0, + 
"partial_rotary_factor": partial_rotary_factor, + }, ) yarn_scaling_rope = rope_class(config=config).to(torch_device) yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) From 594523e9cbe24a3fc016bc1f308339e38d7705e9 Mon Sep 17 00:00:00 2001 From: raushan Date: Fri, 28 Nov 2025 11:53:21 +0100 Subject: [PATCH 23/23] update the migration guide --- MIGRATION_GUIDE_V5.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MIGRATION_GUIDE_V5.md b/MIGRATION_GUIDE_V5.md index 474aa478e2f3..913fbdf496f9 100644 --- a/MIGRATION_GUIDE_V5.md +++ b/MIGRATION_GUIDE_V5.md @@ -328,7 +328,7 @@ model_4bit = AutoModelForCausalLM.from_pretrained( - Methods to init a nested config such as `from_xxx_config` are deleted. Configs can be init from the `__init__` method in the same way. See [#41314](https://github.com/huggingface/transformers/pull/41314). - It is no longer possible to load a config class from a URL file. Configs must be loaded from either a local path or a repo on the Hub. See [#42383](https://github.com/huggingface/transformers/pull/42383). -- All parameters for configuring model's rotary embedding are now stored under `mode.rope_parameters`, including the `rope_theta` and `rope_type`. Model's `config.rope_parameters` is a simple dictionaty in most cases, and can also be a nested dict in special cases (i.e. Gemma3 and ModernBert) with different rope parameterization for each layer type. See [#39847](https://github.com/huggingface/transformers/pull/39847) +- All parameters for configuring model's rotary embedding are now stored under `config.rope_parameters`, including the `rope_theta` and `rope_type`. Model's `config.rope_parameters` is a simple dictionary in most cases, and can also be a nested dict in special cases (i.e. Gemma3 and ModernBert) with different rope parameterization for each layer type. Trying to get `config.rope_theta` will throw an attribute error from now on.
See [#39847](https://github.com/huggingface/transformers/pull/39847) and [#42255](https://github.com/huggingface/transformers/pull/42255) - Qwen-VL family configuration is in a nested format and trying to access keys directly will throw an error (e.g. `config.vocab_size`). Users are expected to access keys from their respective sub-configs (`config.text_config.vocab_size`). ## Processing