diff --git a/MIGRATION_GUIDE_V5.md b/MIGRATION_GUIDE_V5.md index 474aa478e2f3..913fbdf496f9 100644 --- a/MIGRATION_GUIDE_V5.md +++ b/MIGRATION_GUIDE_V5.md @@ -328,7 +328,7 @@ model_4bit = AutoModelForCausalLM.from_pretrained( - Methods to init a nested config such as `from_xxx_config` are deleted. Configs can be init from the `__init__` method in the same way. See [#41314](https://github.com/huggingface/transformers/pull/41314). - It is no longer possible to load a config class from a URL file. Configs must be loaded from either a local path or a repo on the Hub. See [#42383](https://github.com/huggingface/transformers/pull/42383). -- All parameters for configuring model's rotary embedding are now stored under `mode.rope_parameters`, including the `rope_theta` and `rope_type`. Model's `config.rope_parameters` is a simple dictionaty in most cases, and can also be a nested dict in special cases (i.e. Gemma3 and ModernBert) with different rope parameterization for each layer type. See [#39847](https://github.com/huggingface/transformers/pull/39847) +- All parameters for configuring model's rotary embedding are now stored under `config.rope_parameters`, including the `rope_theta` and `rope_type`. Model's `config.rope_parameters` is a simple dictionary in most cases, and can also be a nested dict in special cases (e.g. Gemma3 and ModernBert) with different rope parameterization for each layer type. Trying to get `config.rope_theta` will raise an `AttributeError` from now on. See [#39847](https://github.com/huggingface/transformers/pull/39847) and [#42255](https://github.com/huggingface/transformers/pull/42255) - Qwen-VL family configuration is in a nested format and trying to access keys directly will throw an error (e.g. `config.vocab_size`). Users are expected to access keys from their respective sub-configs (`config.text_config.vocab_size`). 
## Processing diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 53d6a8a900a8..d93f5a6e5626 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -26,6 +26,7 @@ from . import __version__ from .dynamic_module_utils import custom_object_save from .modeling_gguf_pytorch_utils import load_gguf_checkpoint +from .modeling_rope_utils import RotaryEmbeddingConfigMixin from .utils import ( CONFIG_NAME, PushToHubMixin, @@ -49,7 +50,7 @@ SpecificPreTrainedConfigType = TypeVar("SpecificPreTrainedConfigType", bound="PreTrainedConfig") -class PreTrainedConfig(PushToHubMixin): +class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin): # no-format r""" Base class for all configuration classes. Handles a few parameters common to all models' configurations as well as @@ -261,6 +262,13 @@ def __init__( dtype = getattr(torch, dtype) + # BC for rotary embeddings. We will pop out legacy keys from kwargs and rename to new format + if hasattr(self, "rope_parameters"): + ignore_keys_at_rope_validation = kwargs.pop("ignore_keys_at_rope_validation", None) + kwargs = self.convert_rope_params_to_dict( + ignore_keys_at_rope_validation=ignore_keys_at_rope_validation, **kwargs + ) + # Attributes common for all models self.return_dict = return_dict self.output_hidden_states = output_hidden_states diff --git a/src/transformers/modeling_rope_utils.py b/src/transformers/modeling_rope_utils.py index c6a66ba1c4b3..de27e5f8bd20 100644 --- a/src/transformers/modeling_rope_utils.py +++ b/src/transformers/modeling_rope_utils.py @@ -14,9 +14,8 @@ import math from functools import wraps -from typing import Optional, TypedDict +from typing import TYPE_CHECKING, Optional, TypedDict -from .configuration_utils import PreTrainedConfig from .utils import is_torch_available, logging @@ -26,56 +25,8 @@ if is_torch_available(): import torch - -def standardize_rope_params(config, rope_theta: float | dict[str, 
float] | None = None): - """ - Helper to standardize the config's rope params field by ensuring the params are defined for each - later type. For old model the fn will duplicate a single rope param in each layer type (backward compatibility) - """ - rope_parameters = getattr(config, "rope_parameters", None) - layer_types = getattr(config, "layer_types", None) - if rope_theta is None: - rope_theta = getattr(config, "rope_theta", None) - - # Case 1: one RoPE theat = one RoPE param per model without nesting - if not isinstance(rope_theta, dict): - if rope_parameters is None: - rope_parameters = {"rope_type": "default", "rope_theta": rope_theta} - else: - # BC: if there is a 'type' field, copy it it to 'rope_type'. - rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default")) - rope_theta = rope_parameters.get("rope_theta") or rope_theta - rope_parameters.update({"rope_theta": rope_theta, "rope_type": rope_type}) - config.rope_parameters = rope_parameters - - # Case 2: different RoPE for each layer as nested dict - else: - rope_parameters_per_layer_type = {} - for layer_type in layer_types: - if rope_parameters is None: - rope_parameters_per_layer_type[layer_type] = { - "rope_type": "default", - "rope_theta": rope_theta[layer_type], - } - else: - is_field_in_new_format = any(layer_type in rope_parameters for layer_type in layer_types) - if not is_field_in_new_format: - curr_rope_type = rope_parameters.get("rope_type", rope_parameters.get("type")) - rope_parameters_per_layer_type[layer_type] = { - **rope_parameters, - "rope_type": curr_rope_type, - "rope_theta": rope_theta[layer_type], - } - else: - curr_rope_type = rope_parameters[layer_type].get( - "rope_type", rope_parameters[layer_type].get("type") - ) - rope_parameters_per_layer_type[layer_type] = { - **rope_parameters[layer_type], - "rope_type": curr_rope_type, - "rope_theta": rope_theta[layer_type], - } - config.rope_parameters = rope_parameters_per_layer_type +if TYPE_CHECKING: + from 
.configuration_utils import PreTrainedConfig def dynamic_rope_update(rope_forward): @@ -176,7 +127,7 @@ def wrapper(self, x, position_ids, layer_type=None): def _compute_linear_scaling_rope_parameters( - config: Optional[PreTrainedConfig] = None, + config: Optional["PreTrainedConfig"] = None, device: Optional["torch.device"] = None, seq_len: Optional[int] = None, layer_type: Optional[str] = None, @@ -184,7 +135,7 @@ def _compute_linear_scaling_rope_parameters( """ Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev Args: - config ([`~transformers.PreTrainedConfig`]): + config ([`~transformers.PreTrainedConfig`]): The model configuration. This function assumes that the config will provide at least the following properties: @@ -208,13 +159,13 @@ def _compute_linear_scaling_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) + config.standardize_rope_params() rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters factor = rope_parameters_dict["factor"] # Gets the default RoPE parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) attention_factor = 1.0 # Unused in this type of RoPE @@ -230,7 +181,7 @@ def _compute_linear_scaling_rope_parameters( def _compute_dynamic_ntk_parameters( - config: Optional[PreTrainedConfig] = None, + config: Optional["PreTrainedConfig"] = None, device: Optional["torch.device"] = None, seq_len: Optional[int] = None, layer_type: Optional[str] = None, @@ -239,7 +190,7 @@ 
def _compute_dynamic_ntk_parameters( Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla Args: - config ([`~transformers.PreTrainedConfig`]): + config ([`~transformers.PreTrainedConfig`]): The model configuration. This function assumes that the config will provide at least the following properties: @@ -273,11 +224,11 @@ def _compute_dynamic_ntk_parameters( """ # TODO (joao): use the new `original_max_position_embeddings` from rope_parameters # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) + config.standardize_rope_params() rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) max_position_embeddings = config.max_position_embeddings @@ -302,7 +253,7 @@ def _compute_dynamic_ntk_parameters( def _compute_yarn_parameters( - config: PreTrainedConfig, + config: "PreTrainedConfig", device: "torch.device", seq_len: Optional[int] = None, layer_type: Optional[str] = None, @@ -312,7 +263,7 @@ def _compute_yarn_parameters( [original paper](https://huggingface.co/papers/2309.00071) Args: - config ([`~transformers.PreTrainedConfig`]): + config ([`~transformers.PreTrainedConfig`]): The model configuration. This function assumes that the config will provide at least the following properties: @@ -360,11 +311,11 @@ def _compute_yarn_parameters( post-processing scaling factor applied to the computed cos/sin. 
""" # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) + config.standardize_rope_params() rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) @@ -440,7 +391,7 @@ def linear_ramp_factor(min, max, dim): def _compute_longrope_parameters( - config: PreTrainedConfig, + config: "PreTrainedConfig", device: "torch.device", seq_len: Optional[int] = None, layer_type: Optional[str] = None, @@ -450,7 +401,7 @@ def _compute_longrope_parameters( [original implementation](https://github.com/microsoft/LongRoPE) Args: - config ([`~transformers.PreTrainedConfig`]): + config ([`~transformers.PreTrainedConfig`]): The model configuration. 
This function assumes that the config will provide at least the following properties: @@ -490,11 +441,11 @@ def _compute_longrope_parameters( """ # TODO (joao): use the new `original_max_position_embeddings` from rope_parameters # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) + config.standardize_rope_params() rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) dim = int(head_dim * partial_rotary_factor) @@ -530,7 +481,7 @@ def _compute_longrope_parameters( def _compute_llama3_parameters( - config: PreTrainedConfig, + config: "PreTrainedConfig", device: "torch.device", seq_len: Optional[int] = None, layer_type: Optional[str] = None, @@ -539,7 +490,7 @@ def _compute_llama3_parameters( Computes the inverse frequencies for llama 3.1. Args: - config ([`~transformers.PreTrainedConfig`]): + config ([`~transformers.PreTrainedConfig`]): The model configuration. This function assumes that the config will provide at least the following properties: @@ -574,12 +525,12 @@ def _compute_llama3_parameters( post-processing scaling factor applied to the computed cos/sin. 
""" # For backward compatibility standardize the `rope_parameters_dict` if it uses old format - standardize_rope_params(config) + config.standardize_rope_params() rope_parameters_dict = config.rope_parameters[layer_type] if layer_type is not None else config.rope_parameters # Gets the default RoPE parameters base = rope_parameters_dict["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = rope_parameters_dict.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) attention_factor = 1.0 # Unused in this type of RoPE @@ -620,272 +571,7 @@ def _compute_llama3_parameters( } -def _check_received_keys( - rope_type: str, - received_keys: set, - required_keys: set, - optional_keys: Optional[set] = None, - ignore_keys: Optional[set] = None, -): - """Compare the received keys in `config.rope_parameters` against the expected and optional keys""" - # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present - if "type" in received_keys: - received_keys -= {"type"} - required_keys.add("rope_type") - - # Some models need to store model-specific keys, and we don't want to throw warning at them - if ignore_keys is not None: - received_keys -= ignore_keys - - missing_keys = required_keys - received_keys - if missing_keys: - raise KeyError(f"Missing required keys in `rope_parameters` for 'rope_type'='{rope_type}': {missing_keys}") - - if optional_keys is not None: - unused_keys = received_keys - required_keys - optional_keys - else: - unused_keys = received_keys - required_keys - if unused_keys: - logger.warning(f"Unrecognized keys in `rope_parameters` for 'rope_type'='{rope_type}': {unused_keys}") - - -def _validate_default_rope_parameters( - rope_parameters: dict, config: Optional[PreTrainedConfig] = None, ignore_keys: Optional[set] = None -): - required_keys = 
{"rope_type", "rope_theta"} - received_keys = set(rope_parameters.keys()) - rope_type = rope_parameters["rope_type"] - _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) - - -def _validate_linear_scaling_rope_parameters( - rope_parameters: dict, config: Optional[PreTrainedConfig] = None, ignore_keys: Optional[set] = None -): - required_keys = {"rope_type", "factor", "rope_theta"} - received_keys = set(rope_parameters.keys()) - rope_type = rope_parameters["rope_type"] - _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) - - factor = rope_parameters["factor"] - if factor is None or not isinstance(factor, float) or factor < 1.0: - logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") - - -def _validate_dynamic_scaling_rope_parameters( - rope_parameters: dict, config: Optional[PreTrainedConfig] = None, ignore_keys: Optional[set] = None -): - # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` - optional_keys = {"original_max_position_embeddings"} - required_keys = {"rope_type", "factor"} - received_keys = set(rope_parameters.keys()) - rope_type = rope_parameters["rope_type"] - _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) - - factor = rope_parameters["factor"] - if factor is None or not isinstance(factor, float) or factor < 1.0: - logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") - - -def _validate_yarn_parameters( - rope_parameters: dict, config: Optional[PreTrainedConfig] = None, ignore_keys: Optional[set] = None -): - required_keys = {"rope_type", "factor", "rope_theta"} - optional_keys = { - "attention_factor", - "beta_fast", - "beta_slow", - "original_max_position_embeddings", - "mscale", - "mscale_all_dim", - } - received_keys = set(rope_parameters.keys()) - rope_type = rope_parameters["rope_type"] - _check_received_keys(rope_type, 
received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) - - factor = rope_parameters["factor"] - if factor is None or not isinstance(factor, float) or factor < 1.0: - logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") - - attention_factor = rope_parameters.get("attention_factor") - if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0): - logger.warning( - f"`rope_parameters`'s attention_factor field must be a float greater than 0, got {attention_factor}" - ) - beta_fast = rope_parameters.get("beta_fast") - if beta_fast is not None and not isinstance(beta_fast, float): - logger.warning(f"`rope_parameters`'s beta_fast field must be a float, got {beta_fast}") - beta_slow = rope_parameters.get("beta_slow") - if beta_slow is not None and not isinstance(beta_slow, float): - logger.warning(f"`rope_parameters`'s beta_slow field must be a float, got {beta_slow}") - - if (beta_fast or 32) < (beta_slow or 1): - logger.warning( - f"`rope_parameters`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} " - f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)" - ) - - # Models should set `config.rope_parameters["original_max_position_embeddings"]` to their original (pre-yarn) context - # length, with `config.max_position_embeddings` corresponding to their post-yarn context length. - # However, for BC purposes, we allow the former to be unset. - original_max_position_embeddings = config.rope_parameters.get("original_max_position_embeddings") - if original_max_position_embeddings is not None: - # Double-check: `factor` should be the ratio between the pre-yarn and post-yarn context lengths. 
- implicit_factor = config.max_position_embeddings / original_max_position_embeddings - if implicit_factor != factor: - logger.warning_once( - f"The explicitly set RoPE scaling factor (config.rope_parameters['factor'] = {factor}) does not match " - "the ratio implicitly set by other parameters (implicit factor = " - "post-yarn context length / pre-yarn context length = " - "config.max_position_embeddings / config.rope_parameters['original_max_position_embeddings'] = " - f"{implicit_factor}). Using the explicit factor ({factor}) in YaRN. This may cause unexpected " - "behaviour in model usage, please correct the 'max_position_embeddings' fields in the model config." - ) - # No `config.rope_parameters["original_max_position_embeddings"]`. Is `config.max_position_embeddings` the - # pre-yarn or the post-yarn context length? - # BC: we assume it is the pre-yarn context length. - else: - logger.warning_once( - "config.rope_parameters['original_max_position_embeddings'], the pre-yarn context length, is unset. We will " - "**assume** config.max_position_embeddings holds the pre-yarn context length. Some use cases may expect " - "config.max_position_embeddings to hold the post-yarn context length (pre-yarn context length * " - "factor) -- we recommend updating both fields for optimal downstream model usage." 
- ) - - -def _validate_longrope_parameters(rope_parameters: dict, config: PreTrainedConfig, ignore_keys: Optional[set] = None): - required_keys = {"rope_type", "short_factor", "long_factor", "rope_theta"} - # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` - optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"} - received_keys = set(rope_parameters.keys()) - rope_type = rope_parameters["rope_type"] - _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) - - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) - head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) - dim = int(head_dim * partial_rotary_factor) - - short_factor = rope_parameters.get("short_factor") - if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor): - logger.warning(f"`rope_parameters`'s short_factor field must be a list of numbers, got {short_factor}") - if len(short_factor) != dim // 2: - logger.warning(f"`rope_parameters`'s short_factor field must have length {dim // 2}, got {len(short_factor)}") - - long_factor = rope_parameters.get("long_factor") - if not isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor): - logger.warning(f"`rope_parameters`'s long_factor field must be a list of numbers, got {long_factor}") - if len(long_factor) != dim // 2: - logger.warning(f"`rope_parameters`'s long_factor field must have length {dim // 2}, got {len(long_factor)}") - - # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over - # `original_max_position_embeddings` to compute internal variables. 
The latter lives outside `rope_parameters` and is - # unique to longrope (= undesirable) - if hasattr(config, "original_max_position_embeddings"): - logger.warning_once( - "This model has set a `original_max_position_embeddings` field, to be used together with " - "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_parameters`" - "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, " - "as it is compatible with most model architectures." - ) - else: - factor = rope_parameters.get("factor") - if factor is None: - logger.warning("Missing required keys in `rope_parameters`: 'factor'") - elif not isinstance(factor, float) or factor < 1.0: - logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") - - attention_factor = rope_parameters.get("attention_factor") - if attention_factor is not None: - if not isinstance(attention_factor, float) or attention_factor < 0.0: - logger.warning( - f"`rope_parameters`'s attention_factor field must be a float greater than 0, got {attention_factor}" - ) - - -def _validate_llama3_parameters(rope_parameters: dict, config: PreTrainedConfig, ignore_keys: Optional[set] = None): - required_keys = { - "rope_type", - "factor", - "original_max_position_embeddings", - "low_freq_factor", - "high_freq_factor", - "rope_theta", - } - rope_type = rope_parameters["rope_type"] - received_keys = set(rope_parameters.keys()) - _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) - - factor = rope_parameters["factor"] - if factor is None or not isinstance(factor, float) or factor < 1.0: - logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") - - low_freq_factor = rope_parameters["low_freq_factor"] - high_freq_factor = rope_parameters["high_freq_factor"] - if low_freq_factor is None or not isinstance(low_freq_factor, float): - logger.warning(f"`rope_parameters`'s 
low_freq_factor field must be a float, got {low_freq_factor}") - if high_freq_factor is None or not isinstance(high_freq_factor, float): - logger.warning(f"`rope_parameters`'s high_freq_factor field must be a float, got {high_freq_factor}") - if high_freq_factor <= low_freq_factor: - logger.warning( - "`rope_parameters`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=" - f"{high_freq_factor} and low_freq_factor={low_freq_factor}" - ) - - original_max_position_embeddings = rope_parameters["original_max_position_embeddings"] - if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int): - logger.warning( - "`rope_parameters`'s original_max_position_embeddings field must be an integer, got " - f"{original_max_position_embeddings}" - ) - if original_max_position_embeddings >= config.max_position_embeddings: - logger.warning( - "`rope_parameters`'s original_max_position_embeddings field must be less than max_position_embeddings, got " - f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}" - ) - - -# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types. 
-ROPE_VALIDATION_FUNCTIONS = { - "default": _validate_default_rope_parameters, - "linear": _validate_linear_scaling_rope_parameters, - "dynamic": _validate_dynamic_scaling_rope_parameters, - "yarn": _validate_yarn_parameters, - "longrope": _validate_longrope_parameters, - "llama3": _validate_llama3_parameters, -} - - -def rope_config_validation(config: PreTrainedConfig, ignore_keys: Optional[set] = None): - """ - Validate the RoPE config arguments, given a `PreTrainedConfig` object - """ - rope_parameters_dict = getattr(config, "rope_parameters", None) # not a default parameter in `PreTrainedConfig` - if rope_parameters_dict is None: - return - - if getattr(config, "layer_types", None) is not None and all( - key in config.layer_types for key in rope_parameters_dict.keys() - ): - pass - else: - rope_parameters_dict = {"full_attention": rope_parameters_dict} - - for rope_parameters in rope_parameters_dict.values(): - rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default")) - validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type) - - rope_parameters["rope_type"] = rope_type - # BC: "rope_theta" was originally saved in config - rope_parameters["rope_theta"] = rope_parameters.get("rope_theta", getattr(config, "rope_theta", None)) - - if validation_fn is not None: - validation_fn(rope_parameters, config=config, ignore_keys=ignore_keys) - else: - logger.warning( - f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" - ) - - -class RopeParameters(TypedDict): +class RopeParameters(TypedDict, total=False): """ Args: rope_theta (`float`): @@ -893,6 +579,8 @@ class RopeParameters(TypedDict): rope_type (`str`, *optional*, defaults to "default"): The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', 'llama3'], with 'default' being the original RoPE implementation. 
+ partial_rotary_factor (`float`, *optional*): + The percentage of the query and key head embedding on which RoPE will be applied. factor (`float`, *optional*): Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In most scaling types, a `factor` of x will enable the model to handle sequences of length x * @@ -926,6 +614,7 @@ class RopeParameters(TypedDict): rope_theta: float rope_type: Optional[str] + partial_rotary_factor: Optional[float] factor: Optional[float] original_max_position_embeddings: Optional[int] attention_factor: Optional[float] @@ -935,3 +624,292 @@ class RopeParameters(TypedDict): long_factor: Optional[list[float]] low_freq_factor: Optional[float] high_freq_factor: Optional[float] + + +class RotaryEmbeddingConfigMixin: + """ + A Mixin containing the functionality to standardize and validate RoPE parameters. + """ + + default_theta = 10_000.0 + + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: Optional[set] = None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta)) + if "partial_rotary_factor" in kwargs: + self.rope_parameters.setdefault("partial_rotary_factor", kwargs["partial_rotary_factor"]) + ignore_keys_at_rope_validation = set(ignore_keys_at_rope_validation or ()) | {"partial_rotary_factor"} + + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + return kwargs + + def standardize_rope_params(self): + """ + Helper to standardize the config's rope params field by ensuring the params are defined for each + layer type. 
For old model the fn will duplicate a single rope param in each layer type (backward compatibility) + """ + # Move `rope_theta` and `partial_rotary_factor` to the params dict, if not there yet + rope_theta = getattr(self, "rope_theta", None) + partial_rotary_factor = getattr(self, "partial_rotary_factor", None) + rope_parameters = self.rope_parameters + + # Case 1: RoPE param keys do not intersect with possible `layer_types` -> one global dict + if getattr(self, "layer_types", None) is None or not set(rope_parameters.keys()).issubset(self.layer_types): + rope_parameters.setdefault("rope_type", rope_parameters.get("type", "default")) + rope_parameters.setdefault("rope_theta", rope_theta) + if partial_rotary_factor is not None: + rope_parameters["partial_rotary_factor"] = partial_rotary_factor + # Case 2: different RoPE for each layer -> several params as nested dict + else: + for layer_type in self.layer_types: + rope_parameters[layer_type].setdefault("rope_type", rope_parameters[layer_type].get("type", "default")) + rope_parameters[layer_type].setdefault("rope_theta", rope_theta) + if partial_rotary_factor is not None: + rope_parameters[layer_type]["partial_rotary_factor"] = partial_rotary_factor + + self.rope_parameters = rope_parameters + + def validate_rope(self: "PreTrainedConfig", ignore_keys: Optional[set] = None): + """ + Validate the RoPE config arguments, given a `PreTrainedConfig` object + """ + rope_parameters_dict = self.rope_parameters + if rope_parameters_dict is None: + return + + if getattr(self, "layer_types", None) is not None and set(rope_parameters_dict.keys()).issubset( + self.layer_types + ): + pass + else: + rope_parameters_dict = {"full_attention": rope_parameters_dict} + + for rope_parameters in rope_parameters_dict.values(): + rope_type = rope_parameters.get("rope_type", rope_parameters.get("type", "default")) + validation_fn = getattr(self, f"_validate_{rope_type}_rope_parameters", None) + rope_parameters["rope_type"] = rope_type + + if 
validation_fn is not None: + validation_fn(rope_parameters, ignore_keys=ignore_keys) + else: + logger.warning( + f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" + ) + + def _validate_default_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + required_keys = {"rope_type", "rope_theta"} + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] + self._check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) + + def _validate_linear_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + required_keys = {"rope_type", "factor", "rope_theta"} + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] + self._check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) + + factor = rope_parameters["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") + + def _validate_dynamic_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` + optional_keys = {"original_max_position_embeddings"} + required_keys = {"rope_type", "factor"} + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] + self._check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) + + factor = rope_parameters["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") + + def _validate_yarn_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + required_keys = {"rope_type", "factor", "rope_theta"} + optional_keys = { + "attention_factor", + "beta_fast", 
+ "beta_slow", + "original_max_position_embeddings", + "mscale", + "mscale_all_dim", + } + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] + self._check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) + + factor = rope_parameters["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") + + attention_factor = rope_parameters.get("attention_factor") + if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0): + logger.warning( + f"`rope_parameters`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + beta_fast = rope_parameters.get("beta_fast") + if beta_fast is not None and not isinstance(beta_fast, float): + logger.warning(f"`rope_parameters`'s beta_fast field must be a float, got {beta_fast}") + beta_slow = rope_parameters.get("beta_slow") + if beta_slow is not None and not isinstance(beta_slow, float): + logger.warning(f"`rope_parameters`'s beta_slow field must be a float, got {beta_slow}") + + if (beta_fast or 32) < (beta_slow or 1): + logger.warning( + f"`rope_parameters`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} " + f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)" + ) + + # Models should set `config.rope_parameters["original_max_position_embeddings"]` to their original (pre-yarn) context + # length, with `config.max_position_embeddings` corresponding to their post-yarn context length. + # However, for BC purposes, we allow the former to be unset. + original_max_position_embeddings = self.rope_parameters.get("original_max_position_embeddings") + if original_max_position_embeddings is not None: + # Double-check: `factor` should be the ratio between the pre-yarn and post-yarn context lengths. 
+ implicit_factor = self.max_position_embeddings / original_max_position_embeddings + if implicit_factor != factor: + logger.warning_once( + f"The explicitly set RoPE scaling factor (config.rope_parameters['factor'] = {factor}) does not match " + "the ratio implicitly set by other parameters (implicit factor = " + "post-yarn context length / pre-yarn context length = " + "config.max_position_embeddings / config.rope_parameters['original_max_position_embeddings'] = " + f"{implicit_factor}). Using the explicit factor ({factor}) in YaRN. This may cause unexpected " + "behaviour in model usage, please correct the 'max_position_embeddings' fields in the model config." + ) + # No `config.rope_parameters["original_max_position_embeddings"]`. Is `config.max_position_embeddings` the + # pre-yarn or the post-yarn context length? + # BC: we assume it is the pre-yarn context length. + else: + logger.warning_once( + "config.rope_parameters['original_max_position_embeddings'], the pre-yarn context length, is unset. We will " + "**assume** config.max_position_embeddings holds the pre-yarn context length. Some use cases may expect " + "config.max_position_embeddings to hold the post-yarn context length (pre-yarn context length * " + "factor) -- we recommend updating both fields for optimal downstream model usage." 
+ ) + + def _validate_longrope_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + required_keys = {"rope_type", "short_factor", "long_factor", "rope_theta"} + # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` + optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"} + received_keys = set(rope_parameters.keys()) + rope_type = rope_parameters["rope_type"] + self._check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) + + partial_rotary_factor = rope_parameters.get("partial_rotary_factor", 1.0) + head_dim = getattr(self, "head_dim", self.hidden_size // self.num_attention_heads) + dim = int(head_dim * partial_rotary_factor) + + short_factor = rope_parameters.get("short_factor") + if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor): + logger.warning(f"`rope_parameters`'s short_factor field must be a list of numbers, got {short_factor}") + if len(short_factor) != dim // 2: + logger.warning( + f"`rope_parameters`'s short_factor field must have length {dim // 2}, got {len(short_factor)}" + ) + + long_factor = rope_parameters.get("long_factor") + if not isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor): + logger.warning(f"`rope_parameters`'s long_factor field must be a list of numbers, got {long_factor}") + if len(long_factor) != dim // 2: + logger.warning( + f"`rope_parameters`'s long_factor field must have length {dim // 2}, got {len(long_factor)}" + ) + + # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over + # `original_max_position_embeddings` to compute internal variables. 
The latter lives outside `rope_parameters` and is + # unique to longrope (= undesirable) + if hasattr(self, "original_max_position_embeddings"): + logger.warning_once( + "This model has set a `original_max_position_embeddings` field, to be used together with " + "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_parameters`" + "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, " + "as it is compatible with most model architectures." + ) + else: + factor = rope_parameters.get("factor") + if factor is None: + logger.warning("Missing required keys in `rope_parameters`: 'factor'") + elif not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") + + attention_factor = rope_parameters.get("attention_factor") + if attention_factor is not None: + if not isinstance(attention_factor, float) or attention_factor < 0.0: + logger.warning( + f"`rope_parameters`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + + def _validate_llama3_rope_parameters(self, rope_parameters: dict, ignore_keys: Optional[set] = None): + required_keys = { + "rope_type", + "factor", + "original_max_position_embeddings", + "low_freq_factor", + "high_freq_factor", + "rope_theta", + } + rope_type = rope_parameters["rope_type"] + received_keys = set(rope_parameters.keys()) + self._check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) + + factor = rope_parameters["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_parameters`'s factor field must be a float >= 1, got {factor}") + + low_freq_factor = rope_parameters["low_freq_factor"] + high_freq_factor = rope_parameters["high_freq_factor"] + if low_freq_factor is None or not isinstance(low_freq_factor, float): + logger.warning(f"`rope_parameters`'s low_freq_factor 
field must be a float, got {low_freq_factor}") + if high_freq_factor is None or not isinstance(high_freq_factor, float): + logger.warning(f"`rope_parameters`'s high_freq_factor field must be a float, got {high_freq_factor}") + if high_freq_factor <= low_freq_factor: + logger.warning( + "`rope_parameters`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=" + f"{high_freq_factor} and low_freq_factor={low_freq_factor}" + ) + + original_max_position_embeddings = rope_parameters["original_max_position_embeddings"] + if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int): + logger.warning( + "`rope_parameters`'s original_max_position_embeddings field must be an integer, got " + f"{original_max_position_embeddings}" + ) + if original_max_position_embeddings >= self.max_position_embeddings: + logger.warning( + "`rope_parameters`'s original_max_position_embeddings field must be less than max_position_embeddings, got " + f"{original_max_position_embeddings} and max_position_embeddings={self.max_position_embeddings}" + ) + + @staticmethod + def _check_received_keys( + rope_type: str, + received_keys: set, + required_keys: set, + optional_keys: Optional[set] = None, + ignore_keys: Optional[set] = None, + ): + """Compare the received keys in `config.rope_parameters` against the expected and optional keys""" + # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present + if "type" in received_keys: + received_keys -= {"type"} + required_keys.add("rope_type") + + # Some models need to store model-specific keys, and we don't want to throw warning at them + if ignore_keys is not None: + received_keys -= ignore_keys + + missing_keys = required_keys - received_keys + if missing_keys: + raise KeyError(f"Missing required keys in `rope_parameters` for 'rope_type'='{rope_type}': {missing_keys}") + + if optional_keys is not None: + unused_keys = received_keys - required_keys - 
optional_keys + else: + unused_keys = received_keys - required_keys + if unused_keys: + logger.warning(f"Unrecognized keys in `rope_parameters` for 'rope_type'='{rope_type}': {unused_keys}") diff --git a/src/transformers/models/apertus/configuration_apertus.py b/src/transformers/models/apertus/configuration_apertus.py index 6588095e8521..5f25d3a5d094 100644 --- a/src/transformers/models/apertus/configuration_apertus.py +++ b/src/transformers/models/apertus/configuration_apertus.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class ApertusConfig(PreTrainedConfig): @@ -99,6 +99,7 @@ class ApertusConfig(PreTrainedConfig): model_type = "apertus" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 12000000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k "layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k @@ -160,14 +161,7 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 12000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/apertus/modular_apertus.py b/src/transformers/models/apertus/modular_apertus.py index a60daa2f8194..13408fa23919 100644 --- 
a/src/transformers/models/apertus/modular_apertus.py +++ b/src/transformers/models/apertus/modular_apertus.py @@ -20,11 +20,11 @@ from torch import nn from ...cache_utils import Cache -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...configuration_utils import PreTrainedConfig +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging -from ..llama.configuration_llama import LlamaConfig from ..llama.modeling_llama import ( LlamaAttention, LlamaDecoderLayer, @@ -43,7 +43,7 @@ logger = logging.get_logger(__name__) -class ApertusConfig(LlamaConfig): +class ApertusConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`ApertusModel`]. It is used to instantiate a Apertus model according to the specified arguments, defining the model architecture. Instantiating a configuration with the @@ -116,6 +116,8 @@ class ApertusConfig(LlamaConfig): ```""" model_type = "apertus" + keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 12000000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k "layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k @@ -124,6 +126,11 @@ class ApertusConfig(LlamaConfig): "layers.*.mlp.up_proj": "colwise", "layers.*.mlp.down_proj": "rowwise", } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } def __init__( self, @@ -154,35 +161,33 @@ def __init__( attention_dropout: Optional[float] = 0.0, **kwargs, ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + 
self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.rope_parameters = rope_parameters + super().__init__( - vocab_size=vocab_size, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - hidden_act=hidden_act, - max_position_embeddings=max_position_embeddings, - initializer_range=initializer_range, - rms_norm_eps=rms_norm_eps, - use_cache=use_cache, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, - rope_parameters=rope_parameters, - attention_bias=attention_bias, - attention_dropout=attention_dropout, **kwargs, ) - del self.pretraining_tp - del self.mlp_bias - del self.head_dim - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 12000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) class ApertusMLP(NemotronMLP): diff --git a/src/transformers/models/arcee/configuration_arcee.py b/src/transformers/models/arcee/configuration_arcee.py index b8a2015bbf7a..50d120f3d7cb 100644 --- a/src/transformers/models/arcee/configuration_arcee.py +++ b/src/transformers/models/arcee/configuration_arcee.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from 
...modeling_rope_utils import RopeParameters class ArceeConfig(PreTrainedConfig): @@ -163,14 +163,7 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py index f1d8e4193c2e..268a9307c741 100644 --- a/src/transformers/models/aria/configuration_aria.py +++ b/src/transformers/models/aria/configuration_aria.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ..auto import CONFIG_MAPPING, AutoConfig @@ -168,14 +168,7 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( 
pad_token_id=pad_token_id, diff --git a/src/transformers/models/bamba/configuration_bamba.py b/src/transformers/models/bamba/configuration_bamba.py index c27c6d1a1e6e..fe03c6ad0f66 100644 --- a/src/transformers/models/bamba/configuration_bamba.py +++ b/src/transformers/models/bamba/configuration_bamba.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -171,16 +171,6 @@ def __init__( self.num_logits_to_keep = num_logits_to_keep self.attn_layer_indices = attn_layer_indices - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - self.partial_rotary_factor = 0.5 - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) - mamba_intermediate = mamba_expand * hidden_size if mamba_intermediate % mamba_n_heads != 0: @@ -203,6 +193,8 @@ def __init__( self.mamba_conv_bias = mamba_conv_bias self.mamba_proj_bias = mamba_proj_bias self.z_loss_coefficient = z_loss_coefficient + self.rope_parameters = rope_parameters + kwargs["partial_rotary_factor"] = 0.5 # hardcode for BC super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/bitnet/configuration_bitnet.py b/src/transformers/models/bitnet/configuration_bitnet.py index 5a88939ff0b2..27d77785722c 100644 --- a/src/transformers/models/bitnet/configuration_bitnet.py +++ b/src/transformers/models/bitnet/configuration_bitnet.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, 
standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -97,6 +97,7 @@ class BitNetConfig(PreTrainedConfig): model_type = "bitnet" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 def __init__( self, @@ -138,14 +139,7 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/blt/configuration_blt.py b/src/transformers/models/blt/configuration_blt.py index 73603a361d39..326176af5e9a 100644 --- a/src/transformers/models/blt/configuration_blt.py +++ b/src/transformers/models/blt/configuration_blt.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -30,6 +30,7 @@ class BltLocalEncoderConfig(PreTrainedConfig): """ model_type = "blt_local_encoder" + default_theta = 500000.0 def __init__( self, @@ -65,14 +66,7 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.hidden_act = hidden_act self.initializer_range = initializer_range - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings 
parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -85,6 +79,7 @@ class BltLocalDecoderConfig(PreTrainedConfig): """ model_type = "blt_local_decoder" + default_theta = 500000.0 def __init__( self, @@ -120,14 +115,7 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.hidden_act = hidden_act self.initializer_range = initializer_range - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -140,6 +128,7 @@ class BltGlobalTransformerConfig(PreTrainedConfig): """ model_type = "blt_global_transformer" + default_theta = 500000.0 def __init__( self, @@ -167,13 +156,7 @@ def __init__( self.max_position_embeddings = max_position_embeddings self.hidden_act = hidden_act self.initializer_range = initializer_range - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters = rope_parameters # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", 
None) @@ -247,13 +230,7 @@ def __init__( self.hidden_act = "silu" # Blt uses silu activation self.intermediate_size = intermediate_size or int(8 * self.hidden_size / 3) self.initializer_range = initializer_range - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters = rope_parameters # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) @@ -329,6 +306,7 @@ class BltConfig(PreTrainedConfig): model_type = "blt" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 sub_configs = { "patcher_config": BltPatcherConfig, "encoder_config": BltLocalEncoderConfig, @@ -375,13 +353,6 @@ def __init__( self.realtime_patching = kwargs.get("realtime_patching", True) self.patching_threshold_add = kwargs.get("patching_threshold_add") self.monotonicity = kwargs.get("monotonicity", False) - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) # Cross attention configurations self.cross_attn_k = cross_attn_k @@ -434,6 +405,8 @@ def __init__( encoder_cross_output_size if encoder_cross_output_size != self.global_config.hidden_size else None ) + self.rope_parameters = rope_parameters + # Remove tie_word_embeddings from kwargs to avoid duplicate parameter error kwargs.pop("tie_word_embeddings", None) super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git 
a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index 4e0c24c52c59..d22cecb87ef1 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -230,13 +230,7 @@ def __init__( self.attention_dropout = attention_dropout self.model_parallel_size = model_parallel_size self.swin_norm = swin_norm - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters = rope_parameters if vq_config is None: vq_config = {} diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py index 1ef5b72a8ad4..6b9e40e5e143 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -106,6 +106,7 @@ class CohereConfig(PreTrainedConfig): model_type = "cohere" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -165,13 +166,7 @@ def __init__( self.attention_bias 
= attention_bias self.attention_dropout = attention_dropout self.use_qk_norm = use_qk_norm - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cohere2/configuration_cohere2.py b/src/transformers/models/cohere2/configuration_cohere2.py index 835143add517..7aadaaff94ce 100644 --- a/src/transformers/models/cohere2/configuration_cohere2.py +++ b/src/transformers/models/cohere2/configuration_cohere2.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Cohere2Config(PreTrainedConfig): @@ -166,20 +166,10 @@ def __init__( self.attention_dropout = attention_dropout self.sliding_window = sliding_window self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + # Need to specify head_dim in the config so it can be used in the attention forward functions self.head_dim = hidden_size // num_attention_heads - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 4) @@ -192,10 +182,15 @@ def __init__( ] 
layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) __all__ = ["Cohere2Config"] diff --git a/src/transformers/models/cohere2/modular_cohere2.py b/src/transformers/models/cohere2/modular_cohere2.py index 9e8bc6b564e4..e4b83c535dd1 100644 --- a/src/transformers/models/cohere2/modular_cohere2.py +++ b/src/transformers/models/cohere2/modular_cohere2.py @@ -27,8 +27,6 @@ from ...modeling_rope_utils import ( RopeParameters, dynamic_rope_update, - rope_config_validation, - standardize_rope_params, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack @@ -190,20 +188,10 @@ def __init__( self.attention_dropout = attention_dropout self.sliding_window = sliding_window self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + # Need to specify head_dim in the config so it can be used in the attention forward functions self.head_dim = hidden_size // num_attention_heads - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - # BC -> the pattern used to be a simple int, and it's still present in configs on the Hub self._sliding_window_pattern = kwargs.get("sliding_window_pattern", 4) @@ -216,10 +204,15 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - 
rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class Cohere2RotaryEmbedding(CohereRotaryEmbedding): diff --git a/src/transformers/models/csm/configuration_csm.py b/src/transformers/models/csm/configuration_csm.py index 7b077ae63a52..d673444bb1c2 100644 --- a/src/transformers/models/csm/configuration_csm.py +++ b/src/transformers/models/csm/configuration_csm.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -103,6 +103,7 @@ class CsmDepthDecoderConfig(PreTrainedConfig): model_type = "csm_depth_decoder_model" base_config_key = "depth_decoder_config" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 def __init__( self, @@ -132,13 +133,6 @@ def __init__( if kwargs.pop("tie_word_embeddings", False): raise ValueError("`tie_word_embeddings=True` is not supported for CsmDepthDecoderConfig") - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=False, - **kwargs, - ) self.num_codebooks = num_codebooks self.vocab_size = vocab_size self.backbone_hidden_size = backbone_hidden_size @@ -161,14 +155,15 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - 
self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=False, + **kwargs, + ) class CsmConfig(PreTrainedConfig): @@ -264,6 +259,7 @@ class CsmConfig(PreTrainedConfig): model_type = "csm" base_config_key = "csm_config" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 sub_configs = { "codec_config": AutoConfig, "depth_decoder_config": CsmDepthDecoderConfig, @@ -348,14 +344,7 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cwm/configuration_cwm.py b/src/transformers/models/cwm/configuration_cwm.py index 4374e37f462b..6c482fd4fc65 100644 --- a/src/transformers/models/cwm/configuration_cwm.py +++ b/src/transformers/models/cwm/configuration_cwm.py @@ -22,7 +22,6 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import rope_config_validation, standardize_rope_params class CwmConfig(PreTrainedConfig): @@ -107,6 +106,7 @@ class CwmConfig(PreTrainedConfig): 
"layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + default_theta = 1_000_000.0 def __init__( self, @@ -177,14 +177,7 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1_000_000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cwm/modular_cwm.py b/src/transformers/models/cwm/modular_cwm.py index 00e63efb0f0e..a2830174ddb0 100644 --- a/src/transformers/models/cwm/modular_cwm.py +++ b/src/transformers/models/cwm/modular_cwm.py @@ -21,7 +21,6 @@ from ...configuration_utils import layer_type_validation from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import standardize_rope_params from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging from ..llama.configuration_llama import LlamaConfig @@ -103,6 +102,7 @@ class CwmConfig(LlamaConfig): """ model_type = "cwm" + default_theta = 1_000_000.0 def __init__( self, @@ -182,10 +182,6 @@ def __init__( # CWM models don't use attention bias, remove it from config del self.attention_bias - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1_000_000.0) - standardize_rope_params(self, rope_theta=rope_theta) - class CwmRotaryEmbedding(Qwen2RotaryEmbedding): pass diff --git 
a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index 82182c49bd3f..3399c8618f5c 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -17,7 +17,7 @@ from typing import Any, Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -221,13 +221,7 @@ def __init__( if tie_word_embeddings: raise ValueError("tie_word_embeddings is not supported for DBRX models.") - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - standardize_rope_params(self, rope_theta=10000.0) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py index 042781485449..2b0ee668ae69 100644 --- a/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py +++ b/src/transformers/models/deepseek_v2/configuration_deepseek_v2.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class DeepseekV2Config(PreTrainedConfig): @@ -214,14 +214,7 @@ def __init__( self.mlp_bias = mlp_bias self.head_dim = qk_rope_head_dim - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling 
or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py index df511f576cb1..abf68f399b8b 100644 --- a/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py +++ b/src/transformers/models/deepseek_v3/configuration_deepseek_v3.py @@ -19,7 +19,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {} @@ -225,19 +225,7 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - - for key in ["beta_fast", "beta_slow", "factor"]: - if key in self.rope_parameters: - self.rope_parameters[key] = float(self.rope_parameters[key]) - - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, @@ -247,5 +235,21 @@ def __init__( **kwargs, ) + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: Optional[set] = None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = 
self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta)) + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + + # Convert to float because RoPE fn expects a float. Models on the hub were saved as int + for key in ["beta_fast", "beta_slow", "factor"]: + if key in self.rope_parameters: + self.rope_parameters[key] = float(self.rope_parameters[key]) + return kwargs + __all__ = ["DeepseekV3Config"] diff --git a/src/transformers/models/dia/configuration_dia.py b/src/transformers/models/dia/configuration_dia.py index dadda0ce3724..7927d299ca8b 100644 --- a/src/transformers/models/dia/configuration_dia.py +++ b/src/transformers/models/dia/configuration_dia.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -92,14 +92,8 @@ def __init__( self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act self.initializer_range = initializer_range - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + super().__init__(**kwargs) @@ -198,14 +192,8 @@ def __init__( self.num_channels = num_channels self.initializer_range = initializer_range self.use_cache = use_cache - # Try to set `rope_scaling` if available, otherwise use `rope_parameters`
- rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) diff --git a/src/transformers/models/diffllama/configuration_diffllama.py b/src/transformers/models/diffllama/configuration_diffllama.py index 20468eaaa8f7..5f9a731f1bd1 100644 --- a/src/transformers/models/diffllama/configuration_diffllama.py +++ b/src/transformers/models/diffllama/configuration_diffllama.py @@ -20,7 +20,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class DiffLlamaConfig(PreTrainedConfig): @@ -145,14 +145,7 @@ def __init__( self.attention_dropout = attention_dropout self.lambda_std_dev = lambda_std_dev self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/doge/configuration_doge.py b/src/transformers/models/doge/configuration_doge.py index 27e7b8404225..aa139d084ddf 100644 --- a/src/transformers/models/doge/configuration_doge.py +++ b/src/transformers/models/doge/configuration_doge.py @@ -23,7 +23,7 
@@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class DogeConfig(PreTrainedConfig): @@ -189,14 +189,7 @@ def __init__( self.norm_topk_prob = norm_topk_prob self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters # for backward compatibility if num_key_value_heads is None: diff --git a/src/transformers/models/doge/modular_doge.py b/src/transformers/models/doge/modular_doge.py index eacea60cf442..8d53ef5ec681 100644 --- a/src/transformers/models/doge/modular_doge.py +++ b/src/transformers/models/doge/modular_doge.py @@ -31,7 +31,7 @@ from ...integrations.flex_attention import compile_friendly_flex_attention from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import AttentionInterface, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, is_torch_flex_attn_available, logging @@ -218,14 +218,7 @@ def __init__( self.norm_topk_prob = norm_topk_prob self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - 
rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters # for backward compatibility if num_key_value_heads is None: diff --git a/src/transformers/models/dots1/configuration_dots1.py b/src/transformers/models/dots1/configuration_dots1.py index 779d6fbfe454..57ded54fcc75 100644 --- a/src/transformers/models/dots1/configuration_dots1.py +++ b/src/transformers/models/dots1/configuration_dots1.py @@ -15,7 +15,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -192,10 +192,6 @@ def __init__( self.sliding_window = sliding_window self.max_window_layers = max_window_layers - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - self.layer_types = layer_types if self.layer_types is None: self.layer_types = [ @@ -206,10 +202,7 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/efficientloftr/configuration_efficientloftr.py b/src/transformers/models/efficientloftr/configuration_efficientloftr.py index 0d151b7850ab..19c20a9a5fbf 100644 --- 
a/src/transformers/models/efficientloftr/configuration_efficientloftr.py +++ b/src/transformers/models/efficientloftr/configuration_efficientloftr.py @@ -14,7 +14,6 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation, standardize_rope_params class EfficientLoFTRConfig(PreTrainedConfig): @@ -67,10 +66,7 @@ class EfficientLoFTRConfig(PreTrainedConfig): fine_kernel_size (`int`, *optional*, defaults to 8): Kernel size used for the fine feature matching batch_norm_eps (`float`, *optional*, defaults to 1e-05): - The epsilon used by the batch normalization layers. - partial_rotary_factor (`float`, *optional*, defaults to 4.0): - Dim factor for the RoPE embeddings, in EfficientLoFTR, frequencies should be generated for - the whole hidden_size, so this factor is used to compensate. + The epsilon used by the batch normalization layers. rope_parameters (`RopeParameters`, *optional*): Dictionary containing the configuration parameters for the RoPE embeddings.
The dictionary should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE @@ -121,7 +117,6 @@ def __init__( coarse_matching_border_removal: int = 2, fine_kernel_size: int = 8, batch_norm_eps: float = 1e-5, - partial_rotary_factor: float = 4.0, rope_parameters: Optional[dict] = None, fine_matching_slice_dim: int = 8, fine_matching_regress_temperature: float = 10.0, @@ -176,16 +171,9 @@ def __init__( self.fine_matching_regress_temperature = fine_matching_regress_temperature self.num_key_value_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.initializer_range = initializer_range - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 4.0) # assign default for BC super().__init__(**kwargs) diff --git a/src/transformers/models/efficientloftr/modeling_efficientloftr.py b/src/transformers/models/efficientloftr/modeling_efficientloftr.py index bcf0da0da2af..cbe1d5e14f26 100644 --- a/src/transformers/models/efficientloftr/modeling_efficientloftr.py +++ b/src/transformers/models/efficientloftr/modeling_efficientloftr.py @@ -126,7 +126,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/emu3/configuration_emu3.py b/src/transformers/models/emu3/configuration_emu3.py index c372b3ea156b..546803916db4 100644 --- a/src/transformers/models/emu3/configuration_emu3.py +++ b/src/transformers/models/emu3/configuration_emu3.py @@ -17,7 +17,7 @@ from typing import Optional, Union from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Emu3VQVAEConfig(PreTrainedConfig): @@ -188,6 +188,7 @@ class Emu3TextConfig(PreTrainedConfig): model_type = "emu3_text_model" base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 def __init__( self, @@ -226,14 +227,7 @@ def __init__( self.attention_bias = attention_bias self.initializer_range = initializer_range self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/ernie4_5/configuration_ernie4_5.py b/src/transformers/models/ernie4_5/configuration_ernie4_5.py index 6df44230d68e..13a8d4d94fe7 100644 --- 
a/src/transformers/models/ernie4_5/configuration_ernie4_5.py +++ b/src/transformers/models/ernie4_5/configuration_ernie4_5.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Ernie4_5Config(PreTrainedConfig): @@ -92,6 +92,7 @@ class Ernie4_5Config(PreTrainedConfig): model_type = "ernie4_5" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 # Default tensor parallel plan for base model `Ernie4_5Model` base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -148,14 +149,7 @@ def __init__( self.use_cache = use_cache self.use_bias = use_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py index 93fd87f365dc..bf3b8403d782 100644 --- a/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +++ b/src/transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -115,6 +115,7 @@ 
class Ernie4_5_MoeConfig(PreTrainedConfig): model_type = "ernie4_5_moe" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = {"num_experts": "moe_num_experts", "num_experts_per_tok": "moe_k"} + default_theta = 500000.0 # Default tensor parallel plan for base model `Ernie4_5_MoE` base_model_tp_plan = { @@ -181,14 +182,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.use_bias = use_bias - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) # MoE arguments self.moe_intermediate_size = moe_intermediate_size @@ -201,6 +194,7 @@ def __init__( self.moe_norm_min = moe_norm_min self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/evolla/configuration_evolla.py b/src/transformers/models/evolla/configuration_evolla.py index 4dab03fb9314..91981b3aaeb0 100644 --- a/src/transformers/models/evolla/configuration_evolla.py +++ b/src/transformers/models/evolla/configuration_evolla.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -190,6 +190,7 @@ class EvollaConfig(PreTrainedConfig): model_type = "EvollaModel" sub_configs = {"protein_encoder_config": SaProtConfig} + default_theta = 500000.0 def __init__( self, @@ -249,14 +250,7 @@ def __init__( self.resampler_heads = resampler_heads self.resampler_num_latents = 
resampler_num_latents self.resampler_ff_mult = resampler_ff_mult - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters # Subconfig if protein_encoder_config is None: diff --git a/src/transformers/models/exaone4/configuration_exaone4.py b/src/transformers/models/exaone4/configuration_exaone4.py index 04a632c36f0c..fbe7de454282 100644 --- a/src/transformers/models/exaone4/configuration_exaone4.py +++ b/src/transformers/models/exaone4/configuration_exaone4.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Exaone4Config(PreTrainedConfig): @@ -164,9 +164,6 @@ def __init__( self.attention_dropout = attention_dropout self.sliding_window = sliding_window self.sliding_window_pattern = sliding_window_pattern - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.sliding_window is None: @@ -182,10 +179,7 @@ def __init__( self.cache_implementation = "hybrid" layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( bos_token_id=bos_token_id, 
eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/exaone4/modular_exaone4.py b/src/transformers/models/exaone4/modular_exaone4.py index d6004db0d28c..8c054f693f7a 100644 --- a/src/transformers/models/exaone4/modular_exaone4.py +++ b/src/transformers/models/exaone4/modular_exaone4.py @@ -30,7 +30,7 @@ BaseModelOutputWithPast, CausalLMOutputWithPast, ) -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import ( @@ -197,9 +197,6 @@ def __init__( self.attention_dropout = attention_dropout self.sliding_window = sliding_window self.sliding_window_pattern = sliding_window_pattern - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.sliding_window is None: @@ -215,10 +212,7 @@ def __init__( self.cache_implementation = "hybrid" layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py index 6515cae36273..d8d0266c3bee 100644 --- a/src/transformers/models/falcon/configuration_falcon.py +++ b/src/transformers/models/falcon/configuration_falcon.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from 
...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -162,14 +162,7 @@ def __init__( else: self.ffn_hidden_size = ffn_hidden_size - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/falcon_h1/configuration_falcon_h1.py b/src/transformers/models/falcon_h1/configuration_falcon_h1.py index 6ba590f15025..23d5ca53d4cc 100644 --- a/src/transformers/models/falcon_h1/configuration_falcon_h1.py +++ b/src/transformers/models/falcon_h1/configuration_falcon_h1.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -197,15 +197,6 @@ def __init__( self.use_cache = use_cache self.num_logits_to_keep = num_logits_to_keep - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) - self.projectors_bias = projectors_bias mamba_intermediate = mamba_expand * hidden_size if mamba_d_ssm is None else mamba_d_ssm @@ -271,6 +262,8 
@@ def __init__( else: self.ssm_out_multiplier = 1.0 + self.rope_parameters = rope_parameters + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/flex_olmo/configuration_flex_olmo.py b/src/transformers/models/flex_olmo/configuration_flex_olmo.py index 124cdd2543df..a61148f95ee7 100644 --- a/src/transformers/models/flex_olmo/configuration_flex_olmo.py +++ b/src/transformers/models/flex_olmo/configuration_flex_olmo.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class FlexOlmoConfig(PreTrainedConfig): @@ -110,6 +110,7 @@ class FlexOlmoConfig(PreTrainedConfig): model_type = "flex_olmo" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = {"num_local_experts": "num_experts"} + default_theta = 500000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k "layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k @@ -175,14 +176,7 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.norm_topk_prob = norm_topk_prob - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/flex_olmo/modular_flex_olmo.py b/src/transformers/models/flex_olmo/modular_flex_olmo.py index 
8daa8dc1e759..e3b24c5d02fe 100644 --- a/src/transformers/models/flex_olmo/modular_flex_olmo.py +++ b/src/transformers/models/flex_olmo/modular_flex_olmo.py @@ -19,15 +19,15 @@ from torch import nn from ...cache_utils import Cache, DynamicCache +from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring from ...utils.generic import OutputRecorder, check_model_inputs from ..mixtral.modeling_mixtral import MixtralModel, MixtralPreTrainedModel from ..olmo2.modeling_olmo2 import Olmo2Attention, Olmo2RMSNorm, Olmo2RotaryEmbedding -from ..olmoe.configuration_olmoe import OlmoeConfig from ..olmoe.modeling_olmoe import ( OlmoeDecoderLayer, OlmoeForCausalLM, @@ -36,7 +36,7 @@ ) -class FlexOlmoConfig(OlmoeConfig): +class FlexOlmoConfig(PreTrainedConfig): r""" This is the configuration class to store the configuration of a [`FlexOlmoModel`]. It is used to instantiate an FlexOlmo model according to the specified arguments, defining the model architecture. 
Instantiating a configuration with the @@ -120,6 +120,8 @@ class FlexOlmoConfig(OlmoeConfig): model_type = "flex_olmo" keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_local_experts": "num_experts"} + default_theta = 500000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k "layers.*.self_attn.k_proj": "colwise_rep", # we need to replicate here due to the added norm on q and k @@ -162,26 +164,32 @@ def __init__( norm_topk_prob: Optional[bool] = False, **kwargs, ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.num_experts_per_tok = num_experts_per_tok + self.num_experts = num_experts + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + self.norm_topk_prob = norm_topk_prob + self.rope_parameters = rope_parameters + super().__init__( - vocab_size=vocab_size, - max_position_embeddings=max_position_embeddings, - hidden_size=hidden_size, - intermediate_size=intermediate_size, - num_hidden_layers=num_hidden_layers, - num_attention_heads=num_attention_heads, - num_key_value_heads=num_key_value_heads, - hidden_act=hidden_act, - initializer_range=initializer_range, - rms_norm_eps=rms_norm_eps, - use_cache=use_cache, - rope_parameters=rope_parameters, - attention_bias=attention_bias, - attention_dropout=attention_dropout, - 
num_experts_per_tok=num_experts_per_tok, - num_experts=num_experts, - output_router_logits=output_router_logits, - router_aux_loss_coef=router_aux_loss_coef, - norm_topk_prob=norm_topk_prob, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, @@ -189,13 +197,6 @@ def __init__( **kwargs, ) - del self.clip_qkv - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) - # FlexOlmo RMS norm reuses Olmo2 RMS norm, which handles low precision slightly differently than the original Olmoe. class FlexOlmoRMSNorm(Olmo2RMSNorm): diff --git a/src/transformers/models/fuyu/configuration_fuyu.py b/src/transformers/models/fuyu/configuration_fuyu.py index 36c3beb1cc98..dbe828e01fbe 100644 --- a/src/transformers/models/fuyu/configuration_fuyu.py +++ b/src/transformers/models/fuyu/configuration_fuyu.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging from ..auto import CONFIG_MAPPING, AutoConfig @@ -77,9 +77,6 @@ class FuyuConfig(PreTrainedConfig): The dropout ratio after applying the MLP to the hidden states. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio after computing the attention scores. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - Percentage of the query and keys which will have rotary embedding. - pad_token_id (`int`, *optional*): The id of the *padding* token. 
bos_token_id (`int`, *optional*, defaults to 1): @@ -101,6 +98,7 @@ class FuyuConfig(PreTrainedConfig): model_type = "fuyu" sub_configs = {"text_config": AutoConfig} keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 25000.0 def __init__( self, @@ -122,7 +120,6 @@ def __init__( qk_layernorm: Optional[bool] = True, hidden_dropout: Optional[float] = 0.0, attention_dropout: Optional[float] = 0.0, - partial_rotary_factor: Optional[float] = 0.5, pad_token_id: Optional[int] = None, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, @@ -146,7 +143,6 @@ def __init__( "qk_layernorm": qk_layernorm, "hidden_dropout": hidden_dropout, "attention_dropout": attention_dropout, - "partial_rotary_factor": partial_rotary_factor, "pad_token_id": pad_token_id, "bos_token_id": bos_token_id, "eos_token_id": eos_token_id, @@ -172,16 +168,9 @@ def __init__( self.qk_layernorm = qk_layernorm self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout - self.partial_rotary_factor = partial_rotary_factor self.image_token_id = image_token_id - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 25000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index bc1a2497c708..df0a94c014f1 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import 
PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class GemmaConfig(PreTrainedConfig): @@ -152,14 +152,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.use_bidirectional_attention = use_bidirectional_attention - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma/modular_gemma.py b/src/transformers/models/gemma/modular_gemma.py index cc1ae2991a68..e305ba56a561 100644 --- a/src/transformers/models/gemma/modular_gemma.py +++ b/src/transformers/models/gemma/modular_gemma.py @@ -23,7 +23,7 @@ from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -177,14 +177,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.use_bidirectional_attention = use_bidirectional_attention - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings 
parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/gemma2/configuration_gemma2.py b/src/transformers/models/gemma2/configuration_gemma2.py index 23097820fbec..a26cffab7bca 100644 --- a/src/transformers/models/gemma2/configuration_gemma2.py +++ b/src/transformers/models/gemma2/configuration_gemma2.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Gemma2Config(PreTrainedConfig): @@ -153,13 +153,6 @@ def __init__( use_bidirectional_attention: Optional[bool] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -180,9 +173,6 @@ def __init__( self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types self.use_bidirectional_attention = use_bidirectional_attention - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if self.layer_types is None: self.layer_types = [ @@ -190,10 +180,15 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + + super().__init__( + 
pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) __all__ = ["Gemma2Config"] diff --git a/src/transformers/models/gemma2/modular_gemma2.py b/src/transformers/models/gemma2/modular_gemma2.py index fd08997fece9..b991f2d2fc65 100644 --- a/src/transformers/models/gemma2/modular_gemma2.py +++ b/src/transformers/models/gemma2/modular_gemma2.py @@ -30,8 +30,6 @@ ROPE_INIT_FUNCTIONS, RopeParameters, dynamic_rope_update, - rope_config_validation, - standardize_rope_params, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack @@ -182,13 +180,6 @@ def __init__( use_bidirectional_attention: Optional[bool] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -209,9 +200,6 @@ def __init__( self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types self.use_bidirectional_attention = use_bidirectional_attention - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if self.layer_types is None: self.layer_types = [ @@ -219,10 +207,15 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class 
Gemma2RMSNorm(GemmaRMSNorm): diff --git a/src/transformers/models/gemma3/configuration_gemma3.py b/src/transformers/models/gemma3/configuration_gemma3.py index eedca6a49624..884020640c9b 100644 --- a/src/transformers/models/gemma3/configuration_gemma3.py +++ b/src/transformers/models/gemma3/configuration_gemma3.py @@ -22,7 +22,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging from ..siglip import SiglipVisionConfig @@ -130,6 +130,7 @@ class Gemma3TextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + default_theta = {"global": 1_000_000.0, "local": 10_000.0} def __init__( self, @@ -160,13 +161,6 @@ def __init__( use_bidirectional_attention: Optional[bool] = False, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -187,16 +181,6 @@ def __init__( self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - if rope_parameters is None: - rope_parameters = {"sliding_attention": {"rope_type": "default"}, "full_attention": rope_scaling} - elif "full_attention" in rope_parameters: - rope_parameters["full_attention"].update(rope_scaling) - else: - rope_parameters.update(rope_scaling) - - self.rope_parameters = rope_parameters self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: 
self.sliding_window = (self.sliding_window // 2) + 1 # due to fa we set exclusive bounds @@ -211,13 +195,38 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1_000_000.0) - rope_local_base_freq = getattr(self, "rope_local_base_freq", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} + self.rope_parameters = rope_parameters + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("rope_theta", self.default_theta["global"]) ) - rope_config_validation(self) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("rope_local_base_freq", self.default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + return kwargs class Gemma3Config(PreTrainedConfig): diff --git a/src/transformers/models/gemma3/modular_gemma3.py b/src/transformers/models/gemma3/modular_gemma3.py index 31f25550df03..2d489d77f5c7 100644 --- a/src/transformers/models/gemma3/modular_gemma3.py +++ b/src/transformers/models/gemma3/modular_gemma3.py @@ -30,8 +30,6 @@ ROPE_INIT_FUNCTIONS, RopeParameters, dynamic_rope_update, - rope_config_validation, - standardize_rope_params, ) from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack @@ -146,6 +144,7 @@ class Gemma3TextConfig(Gemma2Config, PreTrainedConfig): """ model_type = "gemma3_text" + default_theta = {"global": 1_000_000.0, "local": 10_000.0} def __init__( self, @@ -176,13 +175,6 @@ def __init__( use_bidirectional_attention: Optional[bool] = False, **kwargs, ): - PreTrainedConfig.__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -203,16 +195,6 @@ def 
__init__( self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - if (rope_scaling := kwargs.pop("rope_scaling", None)) is not None: - if rope_parameters is None: - rope_parameters = {"sliding_attention": {"rope_type": "default"}, "full_attention": rope_scaling} - elif "full_attention" in rope_parameters: - rope_parameters["full_attention"].update(rope_scaling) - else: - rope_parameters.update(rope_scaling) - - self.rope_parameters = rope_parameters self.use_bidirectional_attention = use_bidirectional_attention if use_bidirectional_attention: self.sliding_window = (self.sliding_window // 2) + 1 # due to fa we set exclusive bounds @@ -227,13 +209,38 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1_000_000.0) - rope_local_base_freq = getattr(self, "rope_local_base_freq", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} + self.rope_parameters = rope_parameters + PreTrainedConfig.__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("rope_theta", self.default_theta["global"]) ) - rope_config_validation(self) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("rope_local_base_freq", self.default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + return kwargs class Gemma3Config(PreTrainedConfig): diff --git a/src/transformers/models/gemma3n/configuration_gemma3n.py b/src/transformers/models/gemma3n/configuration_gemma3n.py index e518eb5737be..b7415b21877d 100644 --- a/src/transformers/models/gemma3n/configuration_gemma3n.py +++ b/src/transformers/models/gemma3n/configuration_gemma3n.py @@ -23,7 +23,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import is_timm_available, logging, requires_backends @@ -157,6 +157,7 @@ class Gemma3nTextConfig(PreTrainedConfig): "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), "norm": (["hidden_states"], ["hidden_states"]), } + default_theta = {"global": 1_000_000.0, "local": 10_000.0} def __init__( self, @@ -192,13 +193,6 @@ def __init__( activation_sparsity_pattern: Optional[Union[float, Sequence[float]]] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - 
bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs, - ) - if isinstance(intermediate_size, Sequence) and (intsize_len := len(intermediate_size)) != num_hidden_layers: raise ValueError( "intermediate_size must have an explicit intermediate size for every layer or one for all layers. " @@ -225,9 +219,6 @@ def __init__( self.sliding_window = sliding_window self.final_logit_softcapping = final_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if layer_types is None: self.layer_types = [ @@ -238,14 +229,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - rope_local_base_freq = kwargs.get("rope_local_base_freq", 100000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) - self.hidden_size_per_layer_input = hidden_size_per_layer_input self.num_kv_shared_layers = num_kv_shared_layers @@ -266,6 +249,37 @@ def __init__( f"Expected {num_hidden_layers} values but got {len_asp}." ) self.activation_sparsity_pattern = activation_sparsity_pattern + self.rope_parameters = rope_parameters + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("rope_theta", self.default_theta["global"]) + ) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("rope_local_base_freq", self.default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + return kwargs class Gemma3nAudioConfig(PreTrainedConfig): diff --git a/src/transformers/models/gemma3n/modular_gemma3n.py b/src/transformers/models/gemma3n/modular_gemma3n.py index cc79bf00cdc5..a97ef8bceeeb 100644 --- a/src/transformers/models/gemma3n/modular_gemma3n.py +++ b/src/transformers/models/gemma3n/modular_gemma3n.py @@ -28,7 +28,7 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -168,6 +168,7 @@ class Gemma3nTextConfig(Gemma2Config, PreTrainedConfig): """ model_type = "gemma3n_text" + default_theta = {"global": 1_000_000.0, "local": 10_000.0} def __init__( self, @@ -203,13 +204,6 @@ def __init__( activation_sparsity_pattern: Optional[Union[float, 
Sequence[float]]] = None, **kwargs, ): - PreTrainedConfig.__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs, - ) - if isinstance(intermediate_size, Sequence) and (intsize_len := len(intermediate_size)) != num_hidden_layers: raise ValueError( "intermediate_size must have an explicit intermediate size for every layer or one for all layers. " @@ -236,9 +230,6 @@ def __init__( self.sliding_window = sliding_window self.final_logit_softcapping = final_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if layer_types is None: self.layer_types = [ @@ -249,14 +240,6 @@ def __init__( layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - rope_local_base_freq = kwargs.get("rope_local_base_freq", 100000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) - self.hidden_size_per_layer_input = hidden_size_per_layer_input self.num_kv_shared_layers = num_kv_shared_layers @@ -277,6 +260,37 @@ def __init__( f"Expected {num_hidden_layers} values but got {len_asp}." ) self.activation_sparsity_pattern = activation_sparsity_pattern + self.rope_parameters = rope_parameters + PreTrainedConfig.__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("rope_theta", self.default_theta["global"]) + ) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("rope_local_base_freq", self.default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + return kwargs class Gemma3nAudioConfig(PreTrainedConfig): diff --git a/src/transformers/models/glm/configuration_glm.py b/src/transformers/models/glm/configuration_glm.py index 93dfad8f125e..59328a6a4f43 100644 --- a/src/transformers/models/glm/configuration_glm.py +++ b/src/transformers/models/glm/configuration_glm.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class GlmConfig(PreTrainedConfig): @@ -48,7 +48,6 @@ class GlmConfig(PreTrainedConfig): by meanpooling all the original heads within that group. For more details, check out [this paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `num_attention_heads`. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position. head_dim (`int`, *optional*, defaults to 128): The attention head dimension. 
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): @@ -112,7 +111,6 @@ def __init__( num_hidden_layers: Optional[int] = 40, num_attention_heads: Optional[int] = 32, num_key_value_heads: Optional[int] = 2, - partial_rotary_factor: Optional[float] = 0.5, head_dim: Optional[int] = 128, hidden_act: Optional[str] = "silu", attention_dropout: Optional[float] = 0.0, @@ -134,7 +132,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.head_dim = head_dim self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -143,14 +140,8 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/glm/modeling_glm.py b/src/transformers/models/glm/modeling_glm.py index 8a508e2de54c..8a3ce70a8a5e 100644 --- a/src/transformers/models/glm/modeling_glm.py +++ b/src/transformers/models/glm/modeling_glm.py @@ -101,7 +101,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm/modular_glm.py b/src/transformers/models/glm/modular_glm.py index 059cb296c972..97b47a9b8f7f 100644 --- a/src/transformers/models/glm/modular_glm.py +++ b/src/transformers/models/glm/modular_glm.py @@ -60,7 +60,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4/configuration_glm4.py b/src/transformers/models/glm4/configuration_glm4.py index b13bc1350aa9..a6d0d2bc3d9c 100644 --- a/src/transformers/models/glm4/configuration_glm4.py +++ b/src/transformers/models/glm4/configuration_glm4.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Glm4Config(PreTrainedConfig): @@ -48,8 +48,6 @@ class Glm4Config(PreTrainedConfig): by meanpooling all the original heads within that group. For more details, check out [this paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `num_attention_heads`. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - The factor of the partial rotary position. 
head_dim (`int`, *optional*, defaults to 128): The attention head dimension. hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): @@ -113,7 +111,6 @@ def __init__( num_hidden_layers: Optional[int] = 40, num_attention_heads: Optional[int] = 32, num_key_value_heads: Optional[int] = 2, - partial_rotary_factor: Optional[float] = 0.5, head_dim: Optional[int] = 128, hidden_act: Optional[str] = "silu", attention_dropout: Optional[float] = 0.0, @@ -135,7 +132,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.head_dim = head_dim self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -144,14 +140,8 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/glm4/modeling_glm4.py b/src/transformers/models/glm4/modeling_glm4.py index c982c36f9aab..625e5d4f2bf9 100644 --- a/src/transformers/models/glm4/modeling_glm4.py +++ b/src/transformers/models/glm4/modeling_glm4.py @@ -305,7 +305,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4_moe/configuration_glm4_moe.py b/src/transformers/models/glm4_moe/configuration_glm4_moe.py index 7b5b67a273e5..573b7fc10f53 100644 --- a/src/transformers/models/glm4_moe/configuration_glm4_moe.py +++ b/src/transformers/models/glm4_moe/configuration_glm4_moe.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Glm4MoeConfig(PreTrainedConfig): @@ -47,8 +47,6 @@ class Glm4MoeConfig(PreTrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 96): Number of attention heads for each attention layer in the Transformer encoder. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - The factor of the partial rotary position. num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if @@ -144,7 +142,6 @@ def __init__( intermediate_size: Optional[int] = 10944, num_hidden_layers: Optional[int] = 46, num_attention_heads: Optional[int] = 96, - partial_rotary_factor: Optional[float] = 0.5, num_key_value_heads: Optional[int] = 8, hidden_act: Optional[str] = "silu", max_position_embeddings: Optional[int] = 131072, @@ -173,7 +170,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -182,14 +178,8 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/glm4_moe/modeling_glm4_moe.py b/src/transformers/models/glm4_moe/modeling_glm4_moe.py index 84e6dd3bd77d..e987e3e9e424 100644 --- a/src/transformers/models/glm4_moe/modeling_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modeling_glm4_moe.py @@ -82,7 +82,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4_moe/modular_glm4_moe.py b/src/transformers/models/glm4_moe/modular_glm4_moe.py index a4e9475a7f19..b4a8dc4dc82d 100644 --- a/src/transformers/models/glm4_moe/modular_glm4_moe.py +++ b/src/transformers/models/glm4_moe/modular_glm4_moe.py @@ -20,7 +20,7 @@ from torch import nn from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging from ..cohere.modeling_cohere import CohereAttention from ..deepseek_v3.modeling_deepseek_v3 import ( @@ -61,8 +61,6 @@ class Glm4MoeConfig(PreTrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 96): Number of attention heads for each attention layer in the Transformer encoder. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - The factor of the partial rotary position. num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if @@ -158,7 +156,6 @@ def __init__( intermediate_size: Optional[int] = 10944, num_hidden_layers: Optional[int] = 46, num_attention_heads: Optional[int] = 96, - partial_rotary_factor: Optional[float] = 0.5, num_key_value_heads: Optional[int] = 8, hidden_act: Optional[str] = "silu", max_position_embeddings: Optional[int] = 131072, @@ -187,7 +184,6 @@ def __init__( self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -196,14 +192,8 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC # MoE arguments self.moe_intermediate_size = moe_intermediate_size diff --git a/src/transformers/models/glm4v/configuration_glm4v.py b/src/transformers/models/glm4v/configuration_glm4v.py index 56ac02ebbfa9..35c29f07246d 100644 --- a/src/transformers/models/glm4v/configuration_glm4v.py +++ b/src/transformers/models/glm4v/configuration_glm4v.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class 
Glm4vVisionConfig(PreTrainedConfig): @@ -232,16 +232,9 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) - - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__(tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs) class Glm4vConfig(PreTrainedConfig): diff --git a/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py b/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py index fb57f66a9ae0..dd9f2fba17d3 100644 --- a/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py +++ b/src/transformers/models/glm4v/convert_glm4v_mgt_weights_to_hf.py @@ -702,9 +702,13 @@ def offset_layer(x, offset=llm_layer_offset): "dtype": text_config.get("torch_dtype", "bfloat16"), "use_cache": text_config.get("use_cache", True), "vocab_size": text_config.get("vocab_size", 151552), - "partial_rotary_factor": 0.5, "tie_word_embeddings": False, - "rope_parameters": {"rope_type": "default", "rope_theta": 10000.0, "mrope_section": [8, 12, 12]}, + "rope_parameters": { + "rope_type": "default", + "rope_theta": 10000.0, + "mrope_section": [8, 12, 12], + "partial_rotary_factor": 0.5, + }, } hf_config["text_config"] = txt_config diff --git a/src/transformers/models/glm4v/modeling_glm4v.py b/src/transformers/models/glm4v/modeling_glm4v.py index c843774c5242..9f9a58b4c530 100644 --- a/src/transformers/models/glm4v/modeling_glm4v.py +++ 
b/src/transformers/models/glm4v/modeling_glm4v.py @@ -425,7 +425,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4v/modular_glm4v.py b/src/transformers/models/glm4v/modular_glm4v.py index 5db661e318ff..aa1ae597aa24 100644 --- a/src/transformers/models/glm4v/modular_glm4v.py +++ b/src/transformers/models/glm4v/modular_glm4v.py @@ -31,7 +31,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -269,16 +269,9 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) - - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + 
super().__init__(tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs) class Glm4vConfig(PreTrainedConfig): diff --git a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py index dbb85b40f367..20e4f3ad492c 100644 --- a/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/configuration_glm4v_moe.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Glm4vMoeVisionConfig(PreTrainedConfig): @@ -139,7 +139,6 @@ class Glm4vMoeTextConfig(PreTrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 96): Number of attention heads for each attention layer in the Transformer encoder. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position. num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. 
If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if @@ -231,7 +230,6 @@ def __init__( intermediate_size: Optional[int] = 10944, num_hidden_layers: Optional[int] = 46, num_attention_heads: Optional[int] = 96, - partial_rotary_factor: Optional[float] = 0.5, num_key_value_heads: Optional[int] = 8, hidden_act: Optional[str] = "silu", max_position_embeddings: Optional[int] = 65536, @@ -254,14 +252,12 @@ def __init__( router_aux_loss_coef: Optional[float] = 0.0001, **kwargs, ): - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -270,14 +266,8 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC # MoE arguments self.moe_intermediate_size = moe_intermediate_size @@ -290,6 +280,7 @@ def __init__( self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = norm_topk_prob self.router_aux_loss_coef = router_aux_loss_coef + super().__init__(tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs) 
class Glm4vMoeConfig(PreTrainedConfig): diff --git a/src/transformers/models/glm4v_moe/convert_glm4v_moe_mgt_weights_to_hf.py b/src/transformers/models/glm4v_moe/convert_glm4v_moe_mgt_weights_to_hf.py index d8b08716b6c4..54a9564b69c5 100644 --- a/src/transformers/models/glm4v_moe/convert_glm4v_moe_mgt_weights_to_hf.py +++ b/src/transformers/models/glm4v_moe/convert_glm4v_moe_mgt_weights_to_hf.py @@ -707,7 +707,12 @@ def offset_layer(x, offset=llm_layer_offset): "n_shared_experts": text_config.get("n_shared_experts", 1), "norm_topk_prob": text_config.get("norm_topk_prob", True), "num_experts_per_tok": text_config.get("num_experts_per_tok", 8), - "rope_parameters": {"rope_type": "default", "rope_theta": 10000.0, "mrope_section": [8, 12, 12]}, + "rope_parameters": { + "rope_type": "default", + "rope_theta": 10000.0, + "mrope_section": [8, 12, 12], + "partial_rotary_factor": 0.5, + }, } hf_config["text_config"] = txt_config diff --git a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py index fbb167c762be..4a8f77d38f03 100644 --- a/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modeling_glm4v_moe.py @@ -129,7 +129,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py index c94ad0a9a8f6..9a10a4e4d9b5 100644 --- a/src/transformers/models/glm4v_moe/modular_glm4v_moe.py +++ b/src/transformers/models/glm4v_moe/modular_glm4v_moe.py @@ -23,7 +23,7 @@ from ...masking_utils import create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, RotaryEmbeddingConfigMixin from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, logging @@ -66,7 +66,7 @@ class Glm4vMoeRMSNorm(Glm4MoeRMSNorm): pass -class Glm4vMoeTextConfig(Glm4MoeConfig): +class Glm4vMoeTextConfig(Glm4MoeConfig, RotaryEmbeddingConfigMixin): r""" This is the configuration class to store the configuration of a [`Glm4vMoeModel`]. It is used to instantiate a GLM-4.5V model according to the specified arguments, defining the model architecture. Instantiating a @@ -88,7 +88,6 @@ class Glm4vMoeTextConfig(Glm4MoeConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 96): Number of attention heads for each attention layer in the Transformer encoder. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): The factor of the partial rotary position. 
num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if @@ -177,7 +176,6 @@ def __init__( intermediate_size: Optional[int] = 10944, num_hidden_layers: Optional[int] = 46, num_attention_heads: Optional[int] = 96, - partial_rotary_factor: Optional[float] = 0.5, num_key_value_heads: Optional[int] = 8, hidden_act: Optional[str] = "silu", max_position_embeddings: Optional[int] = 65536, @@ -200,14 +198,12 @@ def __init__( router_aux_loss_coef: Optional[float] = 0.0001, **kwargs, ): - PreTrainedConfig.__init__(self, tie_word_embeddings=tie_word_embeddings, **kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.partial_rotary_factor = partial_rotary_factor self.num_key_value_heads = num_key_value_heads self.hidden_act = hidden_act @@ -216,14 +212,8 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC # MoE arguments self.moe_intermediate_size = moe_intermediate_size @@ -236,6 +226,9 @@ def __init__( self.first_k_dense_replace = first_k_dense_replace self.norm_topk_prob = 
norm_topk_prob self.router_aux_loss_coef = router_aux_loss_coef + PreTrainedConfig.__init__( + self, tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs + ) class Glm4vMoeConfig(Glm4vConfig): @@ -376,7 +369,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/gpt_neox/configuration_gpt_neox.py b/src/transformers/models/gpt_neox/configuration_gpt_neox.py index 90a1d92bbb60..8de9ac83a2b3 100644 --- a/src/transformers/models/gpt_neox/configuration_gpt_neox.py +++ b/src/transformers/models/gpt_neox/configuration_gpt_neox.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -50,8 +50,6 @@ class GPTNeoXConfig(PreTrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported. - rotary_pct (`float`, *optional*, defaults to 0.25): - percentage of hidden dimensions to allocate to rotary embeddings attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio probability of the attention score. hidden_dropout (`float`, *optional*, defaults to 0.0): @@ -59,8 +57,7 @@ class GPTNeoXConfig(PreTrainedConfig): hidden states. 
classifier_dropout (`float`, *optional*, defaults to 0.1): Argument used when doing token classification, used in the model [`GPTNeoXForTokenClassification`]. - - The dropout ratio for the hidden layer. + The dropout ratio for the classifier head. max_position_embeddings (`int`, *optional*, defaults to 2048): The maximum sequence length that this model might ever be used with. Typically set this to something large just in case (e.g., 512 or 1024 or 2048). @@ -119,7 +116,6 @@ def __init__( num_attention_heads: Optional[int] = 64, intermediate_size: Optional[int] = 24576, hidden_act: Optional[str] = "gelu", - rotary_pct: Optional[float] = 0.25, attention_dropout: Optional[float] = 0.0, hidden_dropout: Optional[float] = 0.0, classifier_dropout: Optional[float] = 0.1, @@ -135,7 +131,6 @@ def __init__( attention_bias: Optional[bool] = True, **kwargs, ): - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -143,32 +138,36 @@ def __init__( self.num_attention_heads = num_attention_heads self.intermediate_size = intermediate_size self.hidden_act = hidden_act - self.rotary_pct = rotary_pct - self.partial_rotary_factor = rotary_pct self.attention_dropout = attention_dropout self.hidden_dropout = hidden_dropout self.classifier_dropout = classifier_dropout self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.tie_word_embeddings = tie_word_embeddings self.use_parallel_residual = use_parallel_residual - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters self.attention_bias = attention_bias - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters =
rope_scaling or rope_parameters + self.rope_parameters = rope_parameters - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rotary_emb_base", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) if self.hidden_size % self.num_attention_heads != 0: raise ValueError( "The hidden size is not divisible by the number of attention heads! Make sure to update them!" ) + super().__init__( + bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs + ) + + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + # Model uses non-standard naming for rope params, overwrite! + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", self.default_theta)) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 0.25) + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + return kwargs __all__ = ["GPTNeoXConfig"] diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py index fc7d6fd40a80..56af644f21d9 100755 --- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -88,7 +88,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) @@ -194,7 +194,8 @@ def __init__(self, config, layer_idx=None): self.config = config self.head_size = config.hidden_size // config.num_attention_heads self.attention_dropout = config.attention_dropout - self.rotary_ndims = int(self.head_size * config.rotary_pct) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) + self.rotary_ndims = int(self.head_size * partial_rotary_factor) self.scaling = self.head_size**-0.5 self.is_causal = True self.layer_idx = layer_idx diff --git a/src/transformers/models/gpt_neox/modular_gpt_neox.py b/src/transformers/models/gpt_neox/modular_gpt_neox.py index c267753db350..f31575b052d2 100644 --- a/src/transformers/models/gpt_neox/modular_gpt_neox.py +++ b/src/transformers/models/gpt_neox/modular_gpt_neox.py @@ -62,7 +62,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) @@ -146,7 +146,8 @@ def __init__(self, config, layer_idx=None): self.config = config self.head_size = config.hidden_size // config.num_attention_heads self.attention_dropout = config.attention_dropout - self.rotary_ndims = int(self.head_size * config.rotary_pct) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) + self.rotary_ndims = int(self.head_size * partial_rotary_factor) self.scaling = self.head_size**-0.5 self.is_causal = True self.layer_idx = layer_idx diff --git a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py index 1f1336099ac6..93d0e925d1af 100644 --- a/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -49,8 +49,6 @@ class GPTNeoXJapaneseConfig(PreTrainedConfig): intermediate_multiple_size. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. - rotary_pct (`float`, *optional*, defaults to 1.00): - percentage of hidden dimensions to allocate to rotary embeddings max_position_embeddings (`int`, *optional*, defaults to 2048): The maximum sequence length that this model might ever be used with. 
initializer_range (`float`, *optional*, defaults to 0.02): @@ -93,7 +91,6 @@ def __init__( num_attention_heads: Optional[int] = 32, intermediate_multiple_size: Optional[int] = 4, hidden_act: Optional[str] = "gelu", - rotary_pct: Optional[float] = 1.00, max_position_embeddings: Optional[int] = 2048, initializer_range: Optional[float] = 0.02, layer_norm_eps: Optional[int] = 1e-5, @@ -105,7 +102,6 @@ def __init__( hidden_dropout: Optional[float] = 0.0, **kwargs, ): - super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -113,24 +109,27 @@ def __init__( self.num_attention_heads = num_attention_heads self.intermediate_multiple_size = intermediate_multiple_size self.hidden_act = hidden_act - self.rotary_pct = rotary_pct - self.partial_rotary_factor = rotary_pct self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters self.attention_dropout = attention_dropout self.hidden_dropout = hidden_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rotary_emb_base", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters 
+ self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + # Model uses non-standard naming for rope params, overwrite! + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rotary_emb_base", self.default_theta)) + self.rope_parameters["partial_rotary_factor"] = kwargs.pop("rotary_pct", 1.0) + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + return kwargs __all__ = ["GPTNeoXJapaneseConfig"] diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py index f723defcd088..f2112083d388 100755 --- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py @@ -174,7 +174,8 @@ def __init__(self, config, use_bias=False, layer_idx=None): ) self.layer_idx = layer_idx - self.rotary_ndims = int(self.head_size * config.rotary_pct) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) + self.rotary_ndims = int(self.head_size * partial_rotary_factor) self.attention_dropout = nn.Dropout(config.attention_dropout) self.norm_factor = math.sqrt(self.head_size) diff --git a/src/transformers/models/gpt_oss/configuration_gpt_oss.py b/src/transformers/models/gpt_oss/configuration_gpt_oss.py index d7e714079e39..78beab335b04 100644 --- a/src/transformers/models/gpt_oss/configuration_gpt_oss.py +++ b/src/transformers/models/gpt_oss/configuration_gpt_oss.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class GptOssConfig(PreTrainedConfig): @@ -28,6 +28,7 @@ class 
GptOssConfig(PreTrainedConfig): """ model_type = "gpt_oss" + default_theta = 150000.0 base_model_pp_plan = { "embed_tokens": (["input_ids"], ["inputs_embeds"]), "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), @@ -109,14 +110,7 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.output_router_logits = output_router_logits self.use_cache = use_cache - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 150000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/granite/configuration_granite.py b/src/transformers/models/granite/configuration_granite.py index bfed37bd1ce4..c94c40169e4e 100644 --- a/src/transformers/models/granite/configuration_granite.py +++ b/src/transformers/models/granite/configuration_granite.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -175,14 +175,7 @@ def __init__( self.logits_scaling = logits_scaling self.residual_multiplier = residual_multiplier self.attention_multiplier = attention_multiplier - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - 
rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, @@ -192,7 +185,5 @@ def __init__( **kwargs, ) - rope_config_validation(self) - __all__ = ["GraniteConfig"] diff --git a/src/transformers/models/granitemoe/configuration_granitemoe.py b/src/transformers/models/granitemoe/configuration_granitemoe.py index 684fc2bb223f..55c619c99e72 100644 --- a/src/transformers/models/granitemoe/configuration_granitemoe.py +++ b/src/transformers/models/granitemoe/configuration_granitemoe.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -159,15 +159,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) - self.attention_bias = attention_bias self.attention_dropout = attention_dropout @@ -181,6 +172,8 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef + self.rope_parameters = rope_parameters + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -189,7 +182,5 @@ def __init__( **kwargs, ) - rope_config_validation(self) - __all__ = ["GraniteMoeConfig"] diff --git a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py index d947b2045b35..08320e9fb513 100644 --- 
a/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/configuration_granitemoehybrid.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -198,14 +198,7 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.shared_intermediate_size = shared_intermediate_size - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters mamba_intermediate = mamba_expand * hidden_size diff --git a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py index 00ad7c5ec9a6..23f35a0f1989 100644 --- a/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/configuration_granitemoeshared.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -162,17 +162,6 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - # this model has rope embedding type, hardcoded for BC - self.position_embedding_type = "rope" - # Try to set 
`rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) - self.attention_bias = attention_bias self.attention_dropout = attention_dropout @@ -187,6 +176,10 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.shared_intermediate_size = shared_intermediate_size + # this model has rope embedding type, hardcoded for BC + self.position_embedding_type = "rope" + self.rope_parameters = rope_parameters + super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, @@ -195,7 +188,5 @@ def __init__( **kwargs, ) - rope_config_validation(self) - __all__ = ["GraniteMoeSharedConfig"] diff --git a/src/transformers/models/helium/configuration_helium.py b/src/transformers/models/helium/configuration_helium.py index 40334cfd7967..d633b23ccebd 100644 --- a/src/transformers/models/helium/configuration_helium.py +++ b/src/transformers/models/helium/configuration_helium.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class HeliumConfig(PreTrainedConfig): @@ -93,6 +93,7 @@ class HeliumConfig(PreTrainedConfig): model_type = "helium" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 100000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -147,14 +148,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = 
kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 100000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py index f4828d9112dc..f750e2302d39 100644 --- a/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +++ b/src/transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -141,14 +141,7 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) # TODO needs model-specific validation? + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, @@ -158,35 +151,5 @@ def __init__( **kwargs, ) - def _rope_parameters_validation(self): - """ - Validate the `rope_parameters` configuration. 
- """ - if self.rope_parameters is None: - return - - if not isinstance(self.rope_parameters, dict) or len(self.rope_parameters) != 2: - raise ValueError( - "`rope_parameters` must be a dictionary with with two fields, `type` and `factor` or `type` and `alpha`, " - f"got {self.rope_parameters}" - ) - rope_parameters_type = self.rope_parameters.get("type", None) - rope_parameters_factor = self.rope_parameters.get("factor", None) - rope_parameters_alpha = self.rope_parameters.get("alpha", None) - if rope_parameters_type is None or rope_parameters_type not in ["linear", "dynamic"]: - raise ValueError( - f"`rope_parameters`'s type field must be one of ['linear', 'dynamic'], got {rope_parameters_type}" - ) - if rope_parameters_factor is None and rope_parameters_alpha is None: - raise ValueError("`rope_parameters`'s factor or alpha field must be have one, got both of none") - if rope_parameters_factor is not None: - if not isinstance(rope_parameters_factor, float) or rope_parameters_factor <= 1.0: - raise ValueError( - f"`rope_parameters`'s factor field must be a float > 1.0, got {rope_parameters_factor}" - ) - if rope_parameters_alpha is not None: - if not isinstance(rope_parameters_alpha, float) or rope_parameters_alpha <= 1.0: - raise ValueError(f"`rope_parameters`'s alpha field must be a float > 1.0, got {rope_parameters_alpha}") - __all__ = ["HunYuanDenseV1Config"] diff --git a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py index 332c86b82a63..ab6844c8d903 100644 --- a/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +++ b/src/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py @@ -17,7 +17,7 @@ from typing import Optional, Union from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from 
...utils import logging @@ -157,14 +157,7 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/jetmoe/configuration_jetmoe.py b/src/transformers/models/jetmoe/configuration_jetmoe.py index 757174241b93..3f3c1f632061 100644 --- a/src/transformers/models/jetmoe/configuration_jetmoe.py +++ b/src/transformers/models/jetmoe/configuration_jetmoe.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -147,14 +147,7 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id self.rms_norm_eps = rms_norm_eps - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py 
b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py index efd4074c36fc..40205002803b 100644 --- a/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +++ b/src/transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -183,14 +183,7 @@ def __init__( self.attention_dropout = attention_dropout self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.sliding_window = sliding_window - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs diff --git a/src/transformers/models/lfm2/configuration_lfm2.py b/src/transformers/models/lfm2/configuration_lfm2.py index 3dcff8b260c6..9b9129455d3a 100644 --- a/src/transformers/models/lfm2/configuration_lfm2.py +++ b/src/transformers/models/lfm2/configuration_lfm2.py @@ -14,7 +14,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Lfm2Config(PreTrainedConfig): @@ -100,6 +100,7 @@ class 
Lfm2Config(PreTrainedConfig): model_type = "lfm2" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 def __init__( self, @@ -148,20 +149,13 @@ def __init__( self.block_multiple_of = block_multiple_of self.block_ffn_dim_multiplier = block_ffn_dim_multiplier self.block_auto_adjust_ff_dim = block_auto_adjust_ff_dim - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: full_attn_idxs = full_attn_idxs if full_attn_idxs is not None else list(range(num_hidden_layers)) self.layer_types = ["full_attention" if i in full_attn_idxs else "conv" for i in range(num_hidden_layers)] - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("theta", kwargs.get("rope_theta", 1000000.0)) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) - + self.rope_parameters = rope_parameters tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings) # to fit original config keys super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py b/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py index cea82636927b..b6a1dcb1512a 100644 --- a/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py +++ b/src/transformers/models/lfm2_moe/configuration_lfm2_moe.py @@ -14,7 +14,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Lfm2MoeConfig(PreTrainedConfig): @@ -103,6 +103,7 @@ class Lfm2MoeConfig(PreTrainedConfig): model_type = "lfm2_moe" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 def __init__( self, @@ -136,9 +137,6 @@ 
def __init__( self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters self.max_position_embeddings = max_position_embeddings self.use_cache = use_cache self.norm_eps = norm_eps @@ -161,11 +159,7 @@ def __init__( self.norm_topk_prob = norm_topk_prob self.layer_types = layer_types - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("theta", kwargs.get("rope_theta", 1000000.0)) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) - + self.rope_parameters = rope_parameters tie_word_embeddings = kwargs.get("tie_embedding", tie_word_embeddings) # to fit original config keys super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/llama/configuration_llama.py b/src/transformers/models/llama/configuration_llama.py index de6695e14833..cc3db887fbb3 100644 --- a/src/transformers/models/llama/configuration_llama.py +++ b/src/transformers/models/llama/configuration_llama.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class LlamaConfig(PreTrainedConfig): @@ -171,14 +171,7 @@ def __init__( self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = 
kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/llama4/configuration_llama4.py b/src/transformers/models/llama4/configuration_llama4.py index 46e0804cfda7..af6b57805a92 100644 --- a/src/transformers/models/llama4/configuration_llama4.py +++ b/src/transformers/models/llama4/configuration_llama4.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -124,14 +124,9 @@ def __init__( self.projector_dropout = projector_dropout self.attention_dropout = attention_dropout self.vision_feature_select_strategy = vision_feature_select_strategy - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + + self.rope_parameters = rope_parameters + super().__init__(**kwargs) @@ -218,6 +213,7 @@ class Llama4TextConfig(PreTrainedConfig): model_type = "llama4_text" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -286,13 +282,6 @@ def __init__( attn_scale=0.1, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.attn_temperature_tuning = attn_temperature_tuning 
self.attn_scale = attn_scale self.floor_scale = floor_scale @@ -316,10 +305,6 @@ def __init__( self.attention_dropout = attention_dropout self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads self.use_qk_norm = use_qk_norm - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - self.num_experts_per_tok = num_experts_per_tok self.num_local_experts = num_local_experts @@ -352,10 +337,14 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class Llama4Config(PreTrainedConfig): diff --git a/src/transformers/models/longcat_flash/configuration_longcat_flash.py b/src/transformers/models/longcat_flash/configuration_longcat_flash.py index e99c2b8265c2..79f2e82daf30 100644 --- a/src/transformers/models/longcat_flash/configuration_longcat_flash.py +++ b/src/transformers/models/longcat_flash/configuration_longcat_flash.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class LongcatFlashConfig(PreTrainedConfig): @@ -122,6 +122,7 @@ class LongcatFlashConfig(PreTrainedConfig): model_type = "longcat_flash" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 10000000.0 base_model_tp_plan = { "layers.*.self_attn.*.q_b_proj": "colwise", 
"layers.*.self_attn.*.kv_b_proj": "colwise", @@ -210,19 +211,7 @@ def __init__( self.zero_expert_num = zero_expert_num self.expert_ffn_hidden_size = expert_ffn_hidden_size self.routed_scaling_factor = routed_scaling_factor - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - - for key in ["beta_fast", "beta_slow", "factor"]: - if key in self.rope_parameters: - self.rope_parameters[key] = float(self.rope_parameters[key]) - - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, @@ -232,5 +221,21 @@ def __init__( **kwargs, ) + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: Optional[set] = None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta)) + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + + # Convert to float because RoPE fn expect a float. 
Models on the hub were saved as int + for key in ["beta_fast", "beta_slow", "factor"]: + if key in self.rope_parameters: + self.rope_parameters[key] = float(self.rope_parameters[key]) + return kwargs + __all__ = ["LongcatFlashConfig"] diff --git a/src/transformers/models/mimi/configuration_mimi.py b/src/transformers/models/mimi/configuration_mimi.py index 372b753ef124..7869bcfafa37 100644 --- a/src/transformers/models/mimi/configuration_mimi.py +++ b/src/transformers/models/mimi/configuration_mimi.py @@ -20,7 +20,7 @@ import numpy as np from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -221,14 +221,7 @@ def __init__( self.head_dim = head_dim or hidden_size // num_attention_heads self.layer_scale_initial_scale = layer_scale_initial_scale self.attention_bias = attention_bias - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters # Handle backward compatibility for frame_rate: # If frame_rate is explicitly provided, use it (backward compatibility) diff --git a/src/transformers/models/minimax/configuration_minimax.py b/src/transformers/models/minimax/configuration_minimax.py index 8661cdd56724..7f5e34d71e77 100644 --- a/src/transformers/models/minimax/configuration_minimax.py +++ b/src/transformers/models/minimax/configuration_minimax.py @@ -23,7 +23,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, 
rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class MiniMaxConfig(PreTrainedConfig): @@ -132,6 +132,7 @@ class MiniMaxConfig(PreTrainedConfig): model_type = "minimax" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -221,9 +222,6 @@ def __init__( self.linear_attn_beta_factor = linear_attn_beta_factor self.mlp_alpha_factor = mlp_alpha_factor self.mlp_beta_factor = mlp_beta_factor - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if self.layer_types is None: self.layer_types = [ @@ -231,10 +229,7 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/minimax/modular_minimax.py b/src/transformers/models/minimax/modular_minimax.py index 51468cfc7148..d9b428bbce86 100644 --- a/src/transformers/models/minimax/modular_minimax.py +++ b/src/transformers/models/minimax/modular_minimax.py @@ -28,7 +28,7 @@ from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import MoeModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging from ...utils.generic import OutputRecorder, check_model_inputs 
@@ -157,6 +157,7 @@ class MiniMaxConfig(PreTrainedConfig): model_type = "minimax" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -246,9 +247,6 @@ def __init__( self.linear_attn_beta_factor = linear_attn_beta_factor self.mlp_alpha_factor = mlp_alpha_factor self.mlp_beta_factor = mlp_beta_factor - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if self.layer_types is None: self.layer_types = [ @@ -256,10 +254,7 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, bos_token_id=bos_token_id, diff --git a/src/transformers/models/ministral/configuration_ministral.py b/src/transformers/models/ministral/configuration_ministral.py index 03bdbdb83b84..0afccfb429d9 100644 --- a/src/transformers/models/ministral/configuration_ministral.py +++ b/src/transformers/models/ministral/configuration_ministral.py @@ -7,7 +7,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class MinistralConfig(PreTrainedConfig): @@ -130,13 +130,6 @@ def __init__( layer_types: Optional[list[str]] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size 
self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -157,19 +150,21 @@ def __init__( self.use_cache = use_cache self.attention_dropout = attention_dropout self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if self.layer_types is None: self.layer_types = [ "sliding_attention" if self.sliding_window is not None else "full_attention" ] * num_hidden_layers - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) __all__ = ["MinistralConfig"] diff --git a/src/transformers/models/ministral/modular_ministral.py b/src/transformers/models/ministral/modular_ministral.py index 26d659d29b8d..9e3a185f2c25 100644 --- a/src/transformers/models/ministral/modular_ministral.py +++ b/src/transformers/models/ministral/modular_ministral.py @@ -7,7 +7,7 @@ from ...configuration_utils import PreTrainedConfig from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring from ...utils.generic import check_model_inputs @@ -131,14 +131,6 @@ def __init__( layer_types: Optional[list[str]] = None, **kwargs, ): - PreTrainedConfig.__init__( - self, - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - 
eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -159,19 +151,22 @@ def __init__( self.use_cache = use_cache self.attention_dropout = attention_dropout self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if self.layer_types is None: self.layer_types = [ "sliding_attention" if self.sliding_window is not None else "full_attention" ] * num_hidden_layers - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + + PreTrainedConfig.__init__( + self, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class MinistralMLP(Qwen2MLP): diff --git a/src/transformers/models/mistral/configuration_mistral.py b/src/transformers/models/mistral/configuration_mistral.py index b618ae245c5e..1c6cb9c4e684 100644 --- a/src/transformers/models/mistral/configuration_mistral.py +++ b/src/transformers/models/mistral/configuration_mistral.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -166,14 +166,7 @@ def __init__( "Detected Mistral model with layer_types. Consider using AutoModel or Ministral classes instead to enable alternating attention compatibility." 
) - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py index e54db8aa0ecd..18e58f5f6484 100644 --- a/src/transformers/models/mixtral/configuration_mixtral.py +++ b/src/transformers/models/mixtral/configuration_mixtral.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -114,6 +114,7 @@ class MixtralConfig(PreTrainedConfig): model_type = "mixtral" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", "layers.*.self_attn.k_proj": "colwise", @@ -186,14 +187,7 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.router_jitter_noise = router_jitter_noise - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git 
a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index a7fb988d6d47..3760b0b228f9 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ b/src/transformers/models/mllama/configuration_mllama.py @@ -16,7 +16,6 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import rope_config_validation, standardize_rope_params from ...utils import logging @@ -208,6 +207,7 @@ class MllamaTextConfig(PreTrainedConfig): model_type = "mllama_text_model" base_config_key = "text_config" + default_theta = 500000.0 def __init__( self, @@ -247,14 +247,7 @@ def __init__( self.dropout = dropout self.hidden_act = hidden_act self.max_position_embeddings = max_position_embeddings - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 500000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/modernbert/configuration_modernbert.py b/src/transformers/models/modernbert/configuration_modernbert.py index 774ded93f0e3..80e6c19092c6 100644 --- a/src/transformers/models/modernbert/configuration_modernbert.py +++ b/src/transformers/models/modernbert/configuration_modernbert.py @@ -22,7 +22,7 @@ from typing import Literal, Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class ModernBertConfig(PreTrainedConfig): @@ -130,8 +130,8 @@ class ModernBertConfig(PreTrainedConfig): ```""" 
model_type = "modernbert" - attribute_map = {"rope_theta": "global_rope_theta"} keys_to_ignore_at_inference = ["past_key_values"] + default_theta = {"global": 160_000.0, "local": 10_000.0} def __init__( self, @@ -171,14 +171,6 @@ def __init__( repad_logits_with_grad: Optional[bool] = False, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - cls_token_id=cls_token_id, - sep_token_id=sep_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -206,9 +198,6 @@ def __init__( self.sparse_pred_ignore_index = sparse_pred_ignore_index self.reference_compile = reference_compile self.repad_logits_with_grad = repad_logits_with_grad - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if self.classifier_pooling not in ["cls", "mean"]: raise ValueError( @@ -227,13 +216,40 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "global_rope_theta", 160_000.0) - rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} + self.rope_parameters = rope_parameters + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + cls_token_id=cls_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. 
If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["sliding_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("global_rope_theta", self.default_theta["global"]) ) - rope_config_validation(self) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("local_rope_theta", self.default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + return kwargs def to_dict(self): output = super().to_dict() diff --git a/src/transformers/models/modernbert/modular_modernbert.py b/src/transformers/models/modernbert/modular_modernbert.py index 8fc375671583..49f6244cfb2f 100644 --- a/src/transformers/models/modernbert/modular_modernbert.py +++ b/src/transformers/models/modernbert/modular_modernbert.py @@ -35,7 +35,7 @@ SequenceClassifierOutput, TokenClassifierOutput, ) -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import PreTrainedModel from ...utils import auto_docstring, is_flash_attn_2_available, logging from ...utils.import_utils import is_triton_available @@ -158,8 +158,8 @@ class ModernBertConfig(PreTrainedConfig): ```""" model_type = "modernbert" - attribute_map = {"rope_theta": "global_rope_theta"} keys_to_ignore_at_inference = ["past_key_values"] + default_theta = {"global": 160_000.0, 
"local": 10_000.0} def __init__( self, @@ -199,14 +199,6 @@ def __init__( repad_logits_with_grad: Optional[bool] = False, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - cls_token_id=cls_token_id, - sep_token_id=sep_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -234,9 +226,6 @@ def __init__( self.sparse_pred_ignore_index = sparse_pred_ignore_index self.reference_compile = reference_compile self.repad_logits_with_grad = repad_logits_with_grad - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if self.classifier_pooling not in ["cls", "mean"]: raise ValueError( @@ -255,13 +244,40 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "global_rope_theta", 160_000.0) - rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} + self.rope_parameters = rope_parameters + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + cls_token_id=cls_token_id, + sep_token_id=sep_token_id, + **kwargs, ) - rope_config_validation(self) + + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["sliding_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("global_rope_theta", self.default_theta["global"]) + ) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("local_rope_theta", self.default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + return kwargs def to_dict(self): output = super().to_dict() diff --git a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py index 90d2305c7441..aaca8cef86c0 100644 --- a/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/configuration_modernbert_decoder.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class ModernBertDecoderConfig(PreTrainedConfig): @@ -120,8 +120,8 @@ class ModernBertDecoderConfig(PreTrainedConfig): ```""" model_type = "modernbert-decoder" - attribute_map = {"rope_theta": "global_rope_theta"} keys_to_ignore_at_inference = ["past_key_values"] + default_theta = {"global": 160_000.0, "local": 10_000.0} def __init__( self, @@ -157,14 +157,6 @@ def __init__( rope_parameters: Optional[RopeParameters | 
dict[str, RopeParameters]] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - cls_token_id=cls_token_id, - sep_token_id=sep_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -187,9 +179,6 @@ def __init__( self.classifier_activation = classifier_activation self.use_cache = use_cache self.global_attn_every_n_layers = global_attn_every_n_layers - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters # for consistency with ModernBert self.reference_compile = False @@ -204,16 +193,42 @@ def __init__( else: self.layer_types.append("full_attention") - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "global_rope_theta", 160_000.0) - rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) - # NOTE: sliding window numbers matches ModernBERT but is only half of it self.sliding_window = local_attention // 2 if local_attention else -1 + self.rope_parameters = rope_parameters + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + cls_token_id=cls_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + + # Try to set `rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. 
New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["sliding_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("global_rope_theta", self.default_theta["global"]) + ) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("local_rope_theta", self.default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + return kwargs __all__ = ["ModernBertDecoderConfig"] diff --git a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py index b39cb440c3e9..cfff8ec564ad 100644 --- a/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py +++ b/src/transformers/models/modernbert_decoder/modular_modernbert_decoder.py @@ -28,7 +28,7 @@ from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_layers import GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -141,8 +141,8 @@ class ModernBertDecoderConfig(PreTrainedConfig): ```""" model_type = "modernbert-decoder" 
- attribute_map = {"rope_theta": "global_rope_theta"} keys_to_ignore_at_inference = ["past_key_values"] + default_theta = {"global": 160_000.0, "local": 10_000.0} def __init__( self, @@ -178,14 +178,6 @@ def __init__( rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - cls_token_id=cls_token_id, - sep_token_id=sep_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -208,9 +200,6 @@ def __init__( self.classifier_activation = classifier_activation self.use_cache = use_cache self.global_attn_every_n_layers = global_attn_every_n_layers - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters # for consistency with ModernBert self.reference_compile = False @@ -225,16 +214,42 @@ def __init__( else: self.layer_types.append("full_attention") - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "global_rope_theta", 160_000.0) - rope_local_base_freq = getattr(self, "local_rope_theta", 10000.0) - standardize_rope_params( - self, rope_theta={"full_attention": rope_theta, "sliding_attention": rope_local_base_freq} - ) - rope_config_validation(self) - # NOTE: sliding window numbers matches ModernBERT but is only half of it self.sliding_window = local_attention // 2 if local_attention else -1 + self.rope_parameters = rope_parameters + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + cls_token_id=cls_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation=None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + + # Try to set 
`rope_scaling` if available, otherwise use `rope_parameters`. If we find `rope_parameters` + # as arg in the inputs, we can safely assume that it is in the new format. New naming used -> new format + default_rope_params = { + "sliding_attention": {"rope_type": "default"}, + "full_attention": {"rope_type": "default"}, + } + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else default_rope_params + if rope_scaling is not None: + self.rope_parameters["full_attention"].update(rope_scaling) + self.rope_parameters["sliding_attention"].update(rope_scaling) + self.rope_parameters["full_attention"].setdefault( + "rope_theta", kwargs.pop("global_rope_theta", self.default_theta["global"]) + ) + self.rope_parameters["sliding_attention"].setdefault( + "rope_theta", kwargs.pop("local_rope_theta", self.default_theta["local"]) + ) + + # Standardize and validate the correctness of rotary position embeddings parameters + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + return kwargs class ModernBertDecoderEmbeddings(ModernBertEmbeddings): diff --git a/src/transformers/models/moonshine/configuration_moonshine.py b/src/transformers/models/moonshine/configuration_moonshine.py index 7fa72bc2cc03..ddc6b3c2ba8b 100644 --- a/src/transformers/models/moonshine/configuration_moonshine.py +++ b/src/transformers/models/moonshine/configuration_moonshine.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class MoonshineConfig(PreTrainedConfig): @@ -87,8 +87,6 @@ class MoonshineConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. 
The dictionary should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 0.9): - Percentage of the query and keys which will have rotary embedding. is_encoder_decoder (`bool`, *optional*, defaults to `True`): Whether the model is used as an encoder/decoder or not. attention_bias (`bool`, *optional*, defaults to `False`): @@ -142,7 +140,6 @@ def __init__( decoder_start_token_id: Optional[int] = 1, use_cache: Optional[bool] = True, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.9, is_encoder_decoder: Optional[bool] = True, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, @@ -174,18 +171,12 @@ def __init__( self.initializer_range = initializer_range self.decoder_start_token_id = decoder_start_token_id self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.is_encoder_decoder = is_encoder_decoder self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.9) # assign default for BC + super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, diff --git a/src/transformers/models/moonshine/modeling_moonshine.py b/src/transformers/models/moonshine/modeling_moonshine.py index 373e1db4a217..2bd7f032d4a4 100644 --- a/src/transformers/models/moonshine/modeling_moonshine.py +++ 
b/src/transformers/models/moonshine/modeling_moonshine.py @@ -118,7 +118,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/moonshine/modular_moonshine.py b/src/transformers/models/moonshine/modular_moonshine.py index 5afbdb2db363..717012fd1fc2 100644 --- a/src/transformers/models/moonshine/modular_moonshine.py +++ b/src/transformers/models/moonshine/modular_moonshine.py @@ -35,7 +35,7 @@ Seq2SeqLMOutput, Seq2SeqModelOutput, ) -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging @@ -110,8 +110,6 @@ class MoonshineConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 0.9): - Percentage of the query and keys which will have rotary embedding. is_encoder_decoder (`bool`, *optional*, defaults to `True`): Whether the model is used as an encoder/decoder or not. 
attention_bias (`bool`, *optional*, defaults to `False`): @@ -165,7 +163,6 @@ def __init__( decoder_start_token_id: Optional[int] = 1, use_cache: Optional[bool] = True, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.9, is_encoder_decoder: Optional[bool] = True, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, @@ -197,18 +194,12 @@ def __init__( self.initializer_range = initializer_range self.decoder_start_token_id = decoder_start_token_id self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.is_encoder_decoder = is_encoder_decoder self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.9) # assign default for BC + super().__init__( bos_token_id=bos_token_id, eos_token_id=eos_token_id, diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index a054f40e2e77..f17dd6dcc14b 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging from ..auto.configuration_auto import AutoConfig @@ -282,14 +282,7 @@ def __init__( self.ffn_dim = ffn_dim 
self.rms_norm_eps = rms_norm_eps self.num_codebooks = num_codebooks - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters audio_encoder_config = kwargs.pop("audio_encoder_config", {}) audio_encoder_model_type = audio_encoder_config.pop("model_type", "mimi") diff --git a/src/transformers/models/nanochat/configuration_nanochat.py b/src/transformers/models/nanochat/configuration_nanochat.py index 998b08b31959..e690e26fe7f5 100644 --- a/src/transformers/models/nanochat/configuration_nanochat.py +++ b/src/transformers/models/nanochat/configuration_nanochat.py @@ -14,7 +14,7 @@ # limitations under the License. from ...configuration_utils import PretrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class NanoChatConfig(PretrainedConfig): @@ -144,6 +144,7 @@ def __init__( self.use_cache = use_cache self.final_logit_softcapping = final_logit_softcapping self.attention_bias = attention_bias + self.rope_parameters = rope_parameters super().__init__( bos_token_id=bos_token_id, @@ -153,12 +154,5 @@ def __init__( **kwargs, ) - # Validate the correctness of rotary position embeddings parameters - # Must be done after super().__init__() to avoid being overridden by kwargs - self.rope_parameters = rope_parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) - __all__ = ["NanoChatConfig"] diff --git a/src/transformers/models/nemotron/configuration_nemotron.py 
b/src/transformers/models/nemotron/configuration_nemotron.py index cf49fabaf134..083efe87beed 100644 --- a/src/transformers/models/nemotron/configuration_nemotron.py +++ b/src/transformers/models/nemotron/configuration_nemotron.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -80,7 +80,6 @@ class NemotronConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): Percentage of the query and keys which will have rotary embedding. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. 
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -123,7 +122,6 @@ def __init__( eos_token_id: Optional[int] = 3, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.5, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, mlp_bias: Optional[bool] = False, @@ -141,18 +139,11 @@ def __init__( self.initializer_range = initializer_range self.norm_eps = norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.mlp_bias = mlp_bias - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/nemotron/modeling_nemotron.py b/src/transformers/models/nemotron/modeling_nemotron.py index af1d14ee2da0..2b9b19a52c1b 100644 --- a/src/transformers/models/nemotron/modeling_nemotron.py +++ b/src/transformers/models/nemotron/modeling_nemotron.py @@ -132,7 +132,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) @@ -250,7 +250,7 @@ def __init__(self, config: NemotronConfig, layer_idx: Optional[int] = None): self.num_key_value_groups = self.num_heads // self.num_key_value_heads self.max_position_embeddings = config.max_position_embeddings - self.partial_rotary_factor = config.partial_rotary_factor + self.partial_rotary_factor = config.rope_parameters["partial_rotary_factor"] self.is_causal = True self.rotary_emb = NemotronRotaryEmbedding(config=config) diff --git a/src/transformers/models/olmo/configuration_olmo.py b/src/transformers/models/olmo/configuration_olmo.py index 31627115bee2..d72c6ee4d163 100644 --- a/src/transformers/models/olmo/configuration_olmo.py +++ b/src/transformers/models/olmo/configuration_olmo.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -158,14 +158,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.clip_qkv = clip_qkv - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git 
a/src/transformers/models/olmo2/configuration_olmo2.py b/src/transformers/models/olmo2/configuration_olmo2.py index 6e35ca5729a9..d5a60ea02484 100644 --- a/src/transformers/models/olmo2/configuration_olmo2.py +++ b/src/transformers/models/olmo2/configuration_olmo2.py @@ -27,7 +27,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Olmo2Config(PreTrainedConfig): @@ -158,14 +158,7 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/olmo3/configuration_olmo3.py b/src/transformers/models/olmo3/configuration_olmo3.py index f7e4d1198acd..5bd057c477a2 100644 --- a/src/transformers/models/olmo3/configuration_olmo3.py +++ b/src/transformers/models/olmo3/configuration_olmo3.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Olmo3Config(PreTrainedConfig): @@ -143,13 +143,6 @@ def __init__( layer_types: Optional[list[str]] = None, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, 
- **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -168,10 +161,6 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rms_norm_eps = rms_norm_eps - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - self.sliding_window = sliding_window self.layer_types = layer_types if self.layer_types is None: @@ -180,10 +169,15 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) __all__ = ["Olmo3Config"] diff --git a/src/transformers/models/olmo3/modular_olmo3.py b/src/transformers/models/olmo3/modular_olmo3.py index 265ef3bae967..488280bb7c55 100644 --- a/src/transformers/models/olmo3/modular_olmo3.py +++ b/src/transformers/models/olmo3/modular_olmo3.py @@ -25,7 +25,7 @@ from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ..gemma2.modeling_gemma2 import Gemma2RotaryEmbedding @@ -159,13 +159,6 @@ def __init__( layer_types: Optional[list[str]] = None, **kwargs, ): - 
super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -184,10 +177,6 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.rms_norm_eps = rms_norm_eps - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - self.sliding_window = sliding_window self.layer_types = layer_types if self.layer_types is None: @@ -196,10 +185,15 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class Olmo3RMSNorm(Olmo2RMSNorm): diff --git a/src/transformers/models/olmoe/configuration_olmoe.py b/src/transformers/models/olmoe/configuration_olmoe.py index 786146224b05..38316e05d364 100644 --- a/src/transformers/models/olmoe/configuration_olmoe.py +++ b/src/transformers/models/olmoe/configuration_olmoe.py @@ -14,7 +14,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class OlmoeConfig(PreTrainedConfig): @@ -158,14 +158,7 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.norm_topk_prob = norm_topk_prob - 
# Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/persimmon/configuration_persimmon.py b/src/transformers/models/persimmon/configuration_persimmon.py index f913b5c137e4..a11b1658811e 100644 --- a/src/transformers/models/persimmon/configuration_persimmon.py +++ b/src/transformers/models/persimmon/configuration_persimmon.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -70,8 +70,6 @@ class PersimmonConfig(PreTrainedConfig): The dropout ratio after applying the MLP to the hidden states. attention_dropout (`float`, *optional*, default to 0.0): The dropout ratio after computing the attention scores. - partial_rotary_factor (`float`, *optional*, default to 0.5): - Percentage of the query and keys which will have rotary embedding. 
Example: @@ -102,7 +100,6 @@ def __init__( qk_layernorm: Optional[bool] = True, hidden_dropout: Optional[float] = 0.0, attention_dropout: Optional[float] = 0.0, - partial_rotary_factor: Optional[float] = 0.5, pad_token_id: Optional[int] = None, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, @@ -121,15 +118,8 @@ def __init__( self.qk_layernorm = qk_layernorm self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout - self.partial_rotary_factor = partial_rotary_factor - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 25000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py index 4b09a2dd75bf..a7054a2bd989 100644 --- a/src/transformers/models/persimmon/modeling_persimmon.py +++ b/src/transformers/models/persimmon/modeling_persimmon.py @@ -99,7 +99,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) @@ -219,7 +219,7 @@ def __init__(self, config: PersimmonConfig, layer_idx: Optional[int] = None): self.num_heads = config.num_attention_heads self.head_dim = self.hidden_size // self.num_heads - self.rotary_ndims = int(self.head_dim * config.partial_rotary_factor) + self.rotary_ndims = int(self.head_dim * config.rope_parameters["partial_rotary_factor"]) self.is_causal = True if (self.head_dim * self.num_heads) != self.hidden_size: diff --git a/src/transformers/models/phi/configuration_phi.py b/src/transformers/models/phi/configuration_phi.py index 066e31c6a7d8..5af2c3220c2f 100644 --- a/src/transformers/models/phi/configuration_phi.py +++ b/src/transformers/models/phi/configuration_phi.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -79,8 +79,6 @@ class PhiConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - Percentage of the query and keys which will have rotary embedding. qk_layernorm (`bool`, *optional*, defaults to `False`): Whether or not to normalize the Queries and Keys after projecting the hidden states. 
bos_token_id (`int`, *optional*, defaults to 1): @@ -138,7 +136,6 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.5, qk_layernorm: Optional[bool] = False, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 2, @@ -162,16 +159,9 @@ def __init__( self.initializer_range = initializer_range self.layer_norm_eps = layer_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.qk_layernorm = qk_layernorm - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py index 4a1530b78564..196b66df0e3a 100644 --- a/src/transformers/models/phi/modeling_phi.py +++ b/src/transformers/models/phi/modeling_phi.py @@ -70,7 +70,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) @@ -186,7 +186,7 @@ def __init__(self, config: PhiConfig, layer_idx: int): self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True) self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True) self.dense = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=True) - self.rotary_ndims = int(self.head_dim * config.partial_rotary_factor) + self.rotary_ndims = int(self.head_dim * config.rope_parameters["partial_rotary_factor"]) self.qk_layernorm = config.qk_layernorm if self.qk_layernorm: self.q_layernorm = nn.LayerNorm( diff --git a/src/transformers/models/phi/modular_phi.py b/src/transformers/models/phi/modular_phi.py index 3ecc9ba9d4f7..75e52d934097 100644 --- a/src/transformers/models/phi/modular_phi.py +++ b/src/transformers/models/phi/modular_phi.py @@ -54,7 +54,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) @@ -75,7 +75,7 @@ def __init__(self, config: PhiConfig, layer_idx: int): self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=True) self.dense = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=True) del self.o_proj - self.rotary_ndims = int(self.head_dim * config.partial_rotary_factor) + self.rotary_ndims = int(self.head_dim * config.rope_parameters["partial_rotary_factor"]) self.qk_layernorm = config.qk_layernorm if self.qk_layernorm: self.q_layernorm = nn.LayerNorm( diff --git a/src/transformers/models/phi3/configuration_phi3.py b/src/transformers/models/phi3/configuration_phi3.py index 04dbdb889ce1..bcdff4058426 100644 --- a/src/transformers/models/phi3/configuration_phi3.py +++ b/src/transformers/models/phi3/configuration_phi3.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -81,8 +81,6 @@ class Phi3Config(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 1.0): - Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0. bos_token_id (`int`, *optional*, defaults to 1): The id of the "beginning-of-sequence" token. 
eos_token_id (`int`, *optional*, defaults to 32000): @@ -140,7 +138,6 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 1.0, bos_token_id: Optional[int] = 1, eos_token_id: Optional[int] = 32000, pad_token_id: Optional[int] = 32000, @@ -166,17 +163,8 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) - self._rope_parameters_adjustment() - self._rope_parameters_validation() + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 1.0) # assign default for BC self.sliding_window = sliding_window super().__init__( @@ -187,26 +175,40 @@ def __init__( **kwargs, ) - def _rope_parameters_adjustment(self): - """ - Adjust the `type` of the `rope_parameters` configuration for backward compatibility. 
- """ - rope_parameters_type = self.rope_parameters.get("rope_type", None) + def convert_rope_params_to_dict( + self, default_theta: int | float = 10_000.0, ignore_keys: Optional[set] = None, **kwargs + ): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) + self.rope_parameters.setdefault("partial_rotary_factor", kwargs["partial_rotary_factor"]) + self.standardize_rope_params() # For backward compatibility if previous version used "su" or "yarn" + rope_parameters_type = self.rope_parameters.get("rope_type", None) if rope_parameters_type is not None and rope_parameters_type in ["su", "yarn"]: self.rope_parameters["rope_type"] = "longrope" + self.validate_rope(ignore_keys=ignore_keys) + return kwargs - def _rope_parameters_validation(self): + def validate_rope(self, ignore_keys: Optional[set] = None): """ Validate the `rope_parameters` configuration. 
""" + super().validate_rope(ignore_keys=ignore_keys) + + # Run Phi3 specific validation if not isinstance(self.rope_parameters, dict): raise ValueError(f"`rope_parameters` must be a dictionary but got {self.rope_parameters}") rope_parameters_type = self.rope_parameters.get("rope_type", None) rope_parameters_short_factor = self.rope_parameters.get("short_factor", None) rope_parameters_long_factor = self.rope_parameters.get("long_factor", None) - rotary_ndims = int(self.hidden_size // self.num_attention_heads * self.partial_rotary_factor) + rotary_ndims = int( + self.hidden_size // self.num_attention_heads * self.rope_parameters["partial_rotary_factor"] + ) if rope_parameters_type not in ["default", "longrope"]: raise ValueError(f"`rope_parameters`'s type field must be one of ['longrope'], got {rope_parameters_type}") diff --git a/src/transformers/models/phi3/modeling_phi3.py b/src/transformers/models/phi3/modeling_phi3.py index 29b3d2847ed1..3f98bb1b0042 100644 --- a/src/transformers/models/phi3/modeling_phi3.py +++ b/src/transformers/models/phi3/modeling_phi3.py @@ -104,7 +104,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py index 1a918d9d7f0e..c06858bea98a 100644 --- a/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/configuration_phi4_multimodal.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Phi4MultimodalVisionConfig(PreTrainedConfig): @@ -296,8 +296,6 @@ class Phi4MultimodalConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to `1.0`): - Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0. bos_token_id (`int`, *optional*, defaults to 199999): The id of the "beginning-of-sequence" token. 
eos_token_id (`int` or `list[int]`, *optional*, defaults to `[199999, 200020]`): @@ -367,7 +365,6 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[int] = 1, bos_token_id: Optional[int] = 199999, eos_token_id: Optional[list[int]] = [199999, 200020], pad_token_id: Optional[int] = 199999, @@ -407,17 +404,8 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) - self._rope_parameters_adjustment() - self._rope_parameters_validation() + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 1.0) # assign default for BC self.sliding_window = sliding_window super().__init__( @@ -428,26 +416,40 @@ def __init__( **kwargs, ) - def _rope_parameters_adjustment(self): - """ - Adjust the `type` of the `rope_parameters` configuration for backward compatibility. 
- """ - rope_parameters_type = self.rope_parameters.get("rope_type", None) + def convert_rope_params_to_dict( + self, default_theta: int | float = 10_000.0, ignore_keys: Optional[set] = None, **kwargs + ): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", default_theta)) + self.rope_parameters.setdefault("partial_rotary_factor", kwargs["partial_rotary_factor"]) + self.standardize_rope_params() # For backward compatibility if previous version used "su" or "yarn" + rope_parameters_type = self.rope_parameters.get("rope_type", None) if rope_parameters_type is not None and rope_parameters_type in ["su", "yarn"]: self.rope_parameters["rope_type"] = "longrope" + self.validate_rope(ignore_keys=ignore_keys) + return kwargs - def _rope_parameters_validation(self): + def validate_rope(self, ignore_keys: Optional[set] = None): """ Validate the `rope_parameters` configuration. 
""" + super().validate_rope(ignore_keys=ignore_keys) + + # Run Phi4Multimodal specific validation if not isinstance(self.rope_parameters, dict): raise ValueError(f"`rope_parameters` must be a dictionary but got {self.rope_parameters}") rope_parameters_type = self.rope_parameters.get("rope_type", None) rope_parameters_short_factor = self.rope_parameters.get("short_factor", None) rope_parameters_long_factor = self.rope_parameters.get("long_factor", None) - rotary_ndims = int(self.hidden_size // self.num_attention_heads * self.partial_rotary_factor) + rotary_ndims = int( + self.hidden_size // self.num_attention_heads * self.rope_parameters["partial_rotary_factor"] + ) if rope_parameters_type not in ["default", "longrope"]: raise ValueError(f"`rope_parameters`'s type field must be one of ['longrope'], got {rope_parameters_type}") diff --git a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py index 4d4bdab8ce44..6aab73a09b19 100644 --- a/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py @@ -1481,7 +1481,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py index 728a5244468a..5db53409cb76 100644 --- a/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/modular_phi4_multimodal.py @@ -333,8 +333,6 @@ class Phi4MultimodalConfig(Phi3Config): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to `1.0`): - Percentage of the query and keys which will have rotary embedding. Must be between 0.0 and 1.0. bos_token_id (`int`, *optional*, defaults to 199999): The id of the "beginning-of-sequence" token. 
eos_token_id (`int` or `list[int]`, *optional*, defaults to `[199999, 200020]`): @@ -390,7 +388,6 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[int] = 1, bos_token_id: Optional[int] = 199999, eos_token_id: Optional[list[int]] = [199999, 200020], pad_token_id: Optional[int] = 199999, @@ -429,7 +426,6 @@ def __init__( use_cache=use_cache, tie_word_embeddings=tie_word_embeddings, rope_parameters=rope_parameters, - partial_rotary_factor=partial_rotary_factor, bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, diff --git a/src/transformers/models/phimoe/configuration_phimoe.py b/src/transformers/models/phimoe/configuration_phimoe.py index c7b626186285..9c89b8036827 100644 --- a/src/transformers/models/phimoe/configuration_phimoe.py +++ b/src/transformers/models/phimoe/configuration_phimoe.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -110,6 +110,7 @@ class PhimoeConfig(PreTrainedConfig): model_type = "phimoe" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 def __init__( self, @@ -167,14 +168,23 @@ def __init__( self.router_aux_loss_coef = router_aux_loss_coef self.router_jitter_noise = router_jitter_noise self.input_jitter_noise = input_jitter_noise - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, 
rope_theta=rope_theta) + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + def validate_rope(self, ignore_keys=None): + """ + Validate the `rope_parameters` configuration. + """ + super().validate_rope(ignore_keys=ignore_keys) + + # Run model-specific rope validation if self.rope_parameters["rope_type"] != "default": if "original_max_position_embeddings" in self.rope_parameters: self.original_max_position_embeddings = self.rope_parameters["original_max_position_embeddings"] @@ -189,15 +199,5 @@ def __init__( f"`rope_parameters`'s long_mscale field must be a number, got {rope_parameters_long_mscale}" ) - rope_config_validation(self) - - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) - __all__ = ["PhimoeConfig"] diff --git a/src/transformers/models/pixtral/configuration_pixtral.py b/src/transformers/models/pixtral/configuration_pixtral.py index 62c179b20edc..89586615c4a2 100644 --- a/src/transformers/models/pixtral/configuration_pixtral.py +++ b/src/transformers/models/pixtral/configuration_pixtral.py @@ -16,7 +16,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -90,8 +90,6 @@ def __init__( initializer_range: Optional[float] = 0.02, **kwargs, ): - super().__init__(**kwargs) - self.hidden_size = hidden_size self.intermediate_size = intermediate_size self.num_hidden_layers = num_hidden_layers @@ -103,14 +101,9 @@ def __init__( self.hidden_act = hidden_act self.head_dim = hidden_size // num_attention_heads self.initializer_range = initializer_range - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - 
rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + + super().__init__(**kwargs) __all__ = ["PixtralVisionConfig"] diff --git a/src/transformers/models/qwen2/configuration_qwen2.py b/src/transformers/models/qwen2/configuration_qwen2.py index d088d22dee8e..ffb46ca932aa 100644 --- a/src/transformers/models/qwen2/configuration_qwen2.py +++ b/src/transformers/models/qwen2/configuration_qwen2.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -157,9 +157,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: @@ -171,10 +168,7 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py index 00bcdcab1668..6a23e0668083 100644 --- 
a/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -291,6 +291,7 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig): model_type = "qwen2_5_omni_text" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 # Default tensor parallel plan for base model `Qwen25OmniText` base_model_tp_plan = { @@ -330,10 +331,6 @@ def __init__( attention_dropout: Optional[float] = 0.0, **kwargs, ): - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -354,9 +351,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: @@ -368,10 +362,12 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters = rope_parameters + super().__init__( + tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope"}, + **kwargs, + ) class Qwen2_5OmniThinkerConfig(PreTrainedConfig): @@ -613,6 +609,7 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig): 
```""" model_type = "qwen2_5_omni_talker" + default_theta = 1000000.0 attribute_map = { "image_token_id": "image_token_index", "video_token_id": "video_token_index", @@ -697,9 +694,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters self.position_id_per_seconds = position_id_per_seconds # zf self.seconds_per_chunk = seconds_per_chunk # zf self.audio_start_token_id = audio_start_token_id # zf @@ -718,12 +712,8 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) - - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + self.rope_parameters = rope_parameters + super().__init__(tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs) class Qwen2_5OmniDiTConfig(PreTrainedConfig): @@ -822,14 +812,8 @@ def __init__( self.enc_attention_channels = enc_attention_channels self.enc_res2net_scale = enc_res2net_scale self.enc_se_channels = enc_se_channels - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py index 
553213af7e77..44c71c5eb0fa 100644 --- a/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +++ b/src/transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py @@ -45,7 +45,7 @@ from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...generation import GenerationMixin from ...modeling_outputs import BaseModelOutput, ModelOutput -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import ( @@ -325,6 +325,7 @@ class Qwen2_5OmniTextConfig(PreTrainedConfig): model_type = "qwen2_5_omni_text" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 # Default tensor parallel plan for base model `Qwen25OmniText` base_model_tp_plan = { @@ -364,10 +365,6 @@ def __init__( attention_dropout: Optional[float] = 0.0, **kwargs, ): - super().__init__( - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -388,9 +385,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: @@ -402,10 +396,12 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section"}) + self.rope_parameters = rope_parameters + super().__init__( + 
tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope"}, + **kwargs, + ) class Qwen2_5OmniThinkerConfig(PreTrainedConfig): @@ -647,6 +643,7 @@ class Qwen2_5OmniTalkerConfig(PreTrainedConfig): ```""" model_type = "qwen2_5_omni_talker" + default_theta = 1000000.0 attribute_map = { "image_token_id": "image_token_index", "video_token_id": "video_token_index", @@ -731,9 +728,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters self.position_id_per_seconds = position_id_per_seconds # zf self.seconds_per_chunk = seconds_per_chunk # zf self.audio_start_token_id = audio_start_token_id # zf @@ -752,12 +746,8 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) - - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + self.rope_parameters = rope_parameters + super().__init__(tie_word_embeddings=tie_word_embeddings, ignore_keys_at_rope_validation={"mrope"}, **kwargs) class Qwen2_5OmniDiTConfig(PreTrainedConfig): @@ -856,14 +846,8 @@ def __init__( self.enc_attention_channels = enc_attention_channels self.enc_res2net_scale = enc_res2net_scale self.enc_se_channels = enc_se_channels - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - 
rope_config_validation(self) + self.rope_parameters = rope_parameters + super().__init__(**kwargs) diff --git a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py index b0ab2188488c..084b4d8c9ce6 100644 --- a/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py @@ -28,7 +28,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Qwen2_5_VLVisionConfig(PreTrainedConfig): @@ -151,6 +151,7 @@ class Qwen2_5_VLTextConfig(PreTrainedConfig): model_type = "qwen2_5_vl_text" base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 # Default tensor parallel plan for base model `Qwen2_5_VL` base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -212,9 +213,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: @@ -226,21 +224,29 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - if self.rope_parameters["rope_type"] == "mrope": - self.rope_parameters["rope_type"] = "default" - rope_config_validation(self, ignore_keys={"mrope_section"}) - + self.rope_parameters = rope_parameters super().__init__( tie_word_embeddings=tie_word_embeddings, 
bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, + ignore_keys_at_rope_validation={"mrope"}, **kwargs, ) + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: Optional[set] = None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta)) + if self.rope_parameters.get("rope_type", self.rope_parameters.get("type")) == "mrope": + self.rope_parameters["rope_type"] = "default" + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + return kwargs + class Qwen2_5_VLConfig(PreTrainedConfig): r""" diff --git a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py index 271cf3588f17..567bc0d66dd5 100644 --- a/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/configuration_qwen2_moe.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -185,9 +185,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -209,10 +206,7 @@ def __init__( ] layer_type_validation(self.layer_types) - 
# Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index e63caaf75271..e4578375036f 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -18,7 +18,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -139,6 +139,7 @@ class Qwen2VLTextConfig(PreTrainedConfig): model_type = "qwen2_vl_text" base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 # Default tensor parallel plan for base model `Qwen2VL` base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -200,9 +201,6 @@ def __init__( self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: @@ -214,21 +212,29 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - if self.rope_parameters["rope_type"] == "mrope": - self.rope_parameters["rope_type"] = "default" - 
rope_config_validation(self, ignore_keys={"mrope_section"}) - + self.rope_parameters = rope_parameters super().__init__( tie_word_embeddings=tie_word_embeddings, bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, + ignore_keys_at_rope_validation={"mrope"}, **kwargs, ) + def convert_rope_params_to_dict(self, ignore_keys_at_rope_validation: Optional[set] = None, **kwargs): + rope_scaling = kwargs.pop("rope_scaling", None) + self.rope_parameters = rope_scaling or self.rope_parameters + self.rope_parameters = self.rope_parameters if self.rope_parameters is not None else {} + + # Standardize and validate the correctness of rotary position embeddings parameters + self.rope_parameters.setdefault("rope_theta", kwargs.pop("rope_theta", self.default_theta)) + if self.rope_parameters.get("rope_type", self.rope_parameters.get("type")) == "mrope": + self.rope_parameters["rope_type"] = "default" + self.standardize_rope_params() + self.validate_rope(ignore_keys=ignore_keys_at_rope_validation) + return kwargs + class Qwen2VLConfig(PreTrainedConfig): r""" diff --git a/src/transformers/models/qwen3/configuration_qwen3.py b/src/transformers/models/qwen3/configuration_qwen3.py index 3b4ef21bd386..f3e3caf4061d 100644 --- a/src/transformers/models/qwen3/configuration_qwen3.py +++ b/src/transformers/models/qwen3/configuration_qwen3.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -165,9 +165,6 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters 
self.layer_types = layer_types if self.layer_types is None: @@ -179,10 +176,7 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( tie_word_embeddings=tie_word_embeddings, diff --git a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py index 90ae6b1c92f6..2d0be9fbff12 100644 --- a/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py +++ b/src/transformers/models/qwen3_moe/configuration_qwen3_moe.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -179,14 +179,7 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters # MoE arguments self.decoder_sparse_step = decoder_sparse_step diff --git a/src/transformers/models/qwen3_next/configuration_qwen3_next.py b/src/transformers/models/qwen3_next/configuration_qwen3_next.py index 4a18f6ed4603..b6f8ea322905 100644 --- a/src/transformers/models/qwen3_next/configuration_qwen3_next.py +++ 
b/src/transformers/models/qwen3_next/configuration_qwen3_next.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -71,8 +71,6 @@ class Qwen3NextConfig(PreTrainedConfig): Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE with longer `max_position_embeddings`. - partial_rotary_factor (`float`, *optional*, defaults to 0.25): - Percentage of the query and keys which will have rotary embedding. attention_bias (`bool`, *optional*, defaults to `False`): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): @@ -166,7 +164,6 @@ def __init__( use_cache: Optional[bool] = True, tie_word_embeddings: Optional[bool] = False, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, - partial_rotary_factor: Optional[float] = 0.25, attention_bias: Optional[bool] = False, attention_dropout: Optional[float] = 0.0, head_dim: Optional[int] = 256, @@ -187,7 +184,6 @@ def __init__( layer_types: Optional[list[str]] = None, **kwargs, ): - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -199,13 +195,11 @@ def __init__( self.initializer_range = initializer_range self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.head_dim = head_dim - # Try to set `rope_scaling` if available, otherwise use 
`rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.25) # assign default for BC self.layer_types = layer_types if self.layer_types is None: @@ -216,11 +210,6 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) - # linear attention part self.linear_conv_kernel_dim = linear_conv_kernel_dim self.linear_key_head_dim = linear_key_head_dim @@ -238,6 +227,7 @@ def __init__( self.output_router_logits = output_router_logits self.router_aux_loss_coef = router_aux_loss_coef self.mlp_only_layers = mlp_only_layers + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) __all__ = ["Qwen3NextConfig"] diff --git a/src/transformers/models/qwen3_next/modeling_qwen3_next.py b/src/transformers/models/qwen3_next/modeling_qwen3_next.py index 362c8fab007f..d0bf37e64de2 100644 --- a/src/transformers/models/qwen3_next/modeling_qwen3_next.py +++ b/src/transformers/models/qwen3_next/modeling_qwen3_next.py @@ -213,7 +213,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/qwen3_next/modular_qwen3_next.py b/src/transformers/models/qwen3_next/modular_qwen3_next.py index 7deedb9c868b..d4f9d017d2d3 100644 --- a/src/transformers/models/qwen3_next/modular_qwen3_next.py +++ b/src/transformers/models/qwen3_next/modular_qwen3_next.py @@ -203,7 +203,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py index 6d49a021e3de..43e6250c0fd4 100644 --- a/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -261,6 +261,7 @@ class Qwen3OmniMoeTextConfig(PreTrainedConfig): model_type = "qwen3_omni_moe_text" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 # Default tensor parallel plan 
for base model `Qwen3OmniMoeText` base_model_tp_plan = { @@ -324,14 +325,7 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -345,9 +339,9 @@ def __init__( super().__init__( tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope_section", "interleaved", "mrope_interleaved"}, **kwargs, ) - rope_config_validation(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) class Qwen3OmniMoeThinkerConfig(PreTrainedConfig): @@ -589,9 +583,6 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters self.layer_types = layer_types if self.layer_types is None: @@ -603,10 +594,7 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( tie_word_embeddings=tie_word_embeddings, @@ -768,14 +756,7 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` 
if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -1018,7 +999,6 @@ def __init__( attention_dropout=0.0, **kwargs, ): - super().__init__(**kwargs) self.codebook_size = codebook_size self.hidden_size = hidden_size self.max_position_embeddings = max_position_embeddings @@ -1036,15 +1016,9 @@ def __init__( self.upsampling_ratios = upsampling_ratios self.decoder_dim = decoder_dim self.attention_dropout = attention_dropout + self.rope_parameters = rope_parameters - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + super().__init__(**kwargs) @property def layer_types(self): diff --git a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py index cd9f94681b9d..d061aaf5e321 100644 --- a/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +++ b/src/transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py @@ -42,7 +42,7 @@ MoeCausalLMOutputWithPast, MoeModelOutputWithPast, ) -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import PreTrainedModel from ...processing_utils import ProcessorMixin, Unpack from 
...tokenization_utils_base import TextInput @@ -156,7 +156,117 @@ class Qwen3OmniMoeVisionEncoderConfig(Qwen3VLMoeVisionConfig): pass -class Qwen3OmniMoeTextConfig(Qwen3MoeConfig): +class Qwen3OmniMoeTextConfig(PreTrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Qwen3OmniMoeTextModel`]. It is used to instantiate a + Qwen3OmniMoeText model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of [Qwen/Qwen3-15B-A2B](https://huggingface.co/Qwen/Qwen3-15B-A2B). + + Configuration objects inherit from [`PreTrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PreTrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 151936): + Vocabulary size of the Qwen3OmniMoeText model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`Qwen3OmniMoeTextModel`] + hidden_size (`int`, *optional*, defaults to 2048): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 4): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. 
When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details, check out [this + paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `32`. + + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 32768): + The maximum sequence length that this model might ever be used with. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. + rope_parameters (`RopeParameters`, *optional*): + Dictionary containing the configuration parameters for the RoPE embeddings. The dictionary should contain + a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE + with longer `max_position_embeddings`. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + use_sliding_window (`bool`, *optional*, defaults to `False`): + Whether to use sliding window attention. + sliding_window (`int`, *optional*, defaults to 4096): + Sliding window attention (SWA) window size. If not specified, will default to `4096`. 
+ attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + decoder_sparse_step (`int`, *optional*, defaults to 1): + The frequency of the MoE layer. + moe_intermediate_size (`int`, *optional*, defaults to 768): + Intermediate size of the routed expert. + num_experts_per_tok (`int`, *optional*, defaults to 8): + Number of selected experts. + num_experts (`int`, *optional*, defaults to 128): + Number of routed experts. + norm_topk_prob (`bool`, *optional*, defaults to `False`): + Whether to normalize the topk probabilities. + output_router_logits (`bool`, *optional*, defaults to `False`): + Whether or not the router logits should be returned by the model. Enabling this will also + allow the model to output the auxiliary loss, including load balancing loss and router z-loss. + router_aux_loss_coef (`float`, *optional*, defaults to 0.001): + The aux loss factor for the total loss. + mlp_only_layers (`list[int]`, *optional*, defaults to `[]`): + Indicate which layers use Qwen3OmniMoeTextMLP rather than Qwen3OmniMoeTextSparseMoeBlock + The list contains layer index, from 0 to num_layers-1 if we have num_layers layers + If `mlp_only_layers` is empty, `decoder_sparse_step` is used to determine the sparsity. 
+ + ```python + >>> from transformers import Qwen3OmniMoeTextModel, Qwen3OmniMoeTextConfig + + >>> # Initializing a Qwen3OmniMoeText style configuration + >>> configuration = Qwen3OmniMoeTextConfig() + + >>> # Initializing a model from the Qwen3-15B-A2B" style configuration + >>> model = Qwen3OmniMoeTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "qwen3_omni_moe_text" + keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 1000000.0 + + # Default tensor parallel plan for base model `Qwen3OmniMoeText` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.experts.gate_up_proj": "local_rowwise", + "layers.*.mlp.experts.down_proj": "local_rowwise", + "layers.*.mlp.experts": "gather", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + def __init__( self, vocab_size: Optional[int] = 3584, @@ -185,41 +295,38 @@ def __init__( mlp_only_layers: Optional[list[int]] = None, **kwargs, ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.sliding_window = sliding_window + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + 
self.rope_parameters = rope_parameters + + # MoE arguments + self.decoder_sparse_step = decoder_sparse_step + self.moe_intermediate_size = moe_intermediate_size + self.num_experts_per_tok = num_experts_per_tok + self.num_experts = num_experts + self.norm_topk_prob = norm_topk_prob + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers + super().__init__( - vocab_size, - hidden_size, - intermediate_size, - num_hidden_layers, - num_attention_heads, - num_key_value_heads, - hidden_act, - max_position_embeddings, - initializer_range, - rms_norm_eps, - use_cache, - tie_word_embeddings, - rope_parameters, - attention_bias, - False, - sliding_window, - attention_dropout, - decoder_sparse_step, - moe_intermediate_size, - num_experts_per_tok, - num_experts, - norm_topk_prob, - output_router_logits, - router_aux_loss_coef, - mlp_only_layers, + tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope_section", "interleaved", "mrope_interleaved"}, **kwargs, ) - del self.use_sliding_window - self.sliding_window = sliding_window - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 1000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "interleaved", "mrope_interleaved"}) class Qwen3OmniMoeThinkerConfig(Qwen2_5OmniThinkerConfig): @@ -653,7 +760,6 @@ def __init__( attention_dropout=0.0, **kwargs, ): - super().__init__(**kwargs) self.codebook_size = codebook_size self.hidden_size = hidden_size self.max_position_embeddings = max_position_embeddings @@ -671,15 +777,9 @@ def __init__( self.upsampling_ratios = upsampling_ratios self.decoder_dim = decoder_dim self.attention_dropout = attention_dropout + self.rope_parameters = rope_parameters - # Try to set `rope_scaling` if available, otherwise use 
`rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + super().__init__(**kwargs) @property def layer_types(self): diff --git a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py index d81129a171e9..cf6f17364672 100644 --- a/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/configuration_qwen3_vl.py @@ -21,7 +21,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Qwen3VLVisionConfig(PreTrainedConfig): @@ -130,6 +130,7 @@ class Qwen3VLTextConfig(PreTrainedConfig): model_type = "qwen3_vl_text" base_config_key = "text_config" + default_theta = 500000.0 def __init__( self, @@ -170,16 +171,13 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 5000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) - - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__( + tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope_section", "mrope_interleaved"}, + 
**kwargs, + ) class Qwen3VLConfig(PreTrainedConfig): diff --git a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py index 60253ce21551..82b385c53744 100644 --- a/src/transformers/models/qwen3_vl/modular_qwen3_vl.py +++ b/src/transformers/models/qwen3_vl/modular_qwen3_vl.py @@ -30,7 +30,7 @@ from ...masking_utils import create_causal_mask from ...modeling_flash_attention_utils import FlashAttentionKwargs from ...modeling_outputs import BaseModelOutputWithPast -from ...modeling_rope_utils import RopeParameters, dynamic_rope_update, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import ProcessingKwargs, Unpack from ...tokenization_utils_base import PreTokenizedInput, TextInput @@ -171,6 +171,7 @@ class Qwen3VLTextConfig(PreTrainedConfig): model_type = "qwen3_vl_text" base_config_key = "text_config" + default_theta = 500000.0 def __init__( self, @@ -211,16 +212,13 @@ def __init__( self.use_cache = use_cache self.attention_bias = attention_bias self.attention_dropout = attention_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters + self.rope_parameters = rope_parameters - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 5000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) - - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__( + tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope_section", "mrope_interleaved"}, + **kwargs, + ) class Qwen3VLConfig(PreTrainedConfig): diff --git 
a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py index 08d330019088..bdf9d32c57cc 100644 --- a/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Qwen3VLMoeTextConfig(PreTrainedConfig): @@ -106,6 +106,7 @@ class Qwen3VLMoeTextConfig(PreTrainedConfig): model_type = "qwen3_vl_moe_text" base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 # Default tensor parallel plan for base model `Qwen3VLMoe` base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -166,14 +167,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.head_dim = head_dim or hidden_size // num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 5000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + self.rope_parameters = rope_parameters # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -182,7 +176,11 @@ def __init__( self.num_experts = num_experts self.mlp_only_layers = [] if mlp_only_layers is None else mlp_only_layers - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__( + tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope_section", 
"mrope_interleaved"}, + **kwargs, + ) class Qwen3VLMoeVisionConfig(PreTrainedConfig): diff --git a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py index 15e99f440baa..1186b8433cf4 100644 --- a/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +++ b/src/transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py @@ -23,7 +23,7 @@ from ...activations import ACT2FN from ...cache_utils import Cache from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -128,6 +128,7 @@ class Qwen3VLMoeTextConfig(PreTrainedConfig): model_type = "qwen3_vl_moe_text" base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 500000.0 # Default tensor parallel plan for base model `Qwen3VLMoe` base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -188,14 +189,7 @@ def __init__( self.attention_bias = attention_bias self.attention_dropout = attention_dropout self.head_dim = head_dim or hidden_size // num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 5000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self, ignore_keys={"mrope_section", "mrope_interleaved"}) + self.rope_parameters = rope_parameters # MoE arguments self.decoder_sparse_step = decoder_sparse_step @@ -204,7 +198,11 @@ def __init__( self.num_experts = num_experts self.mlp_only_layers = [] if mlp_only_layers is None else 
mlp_only_layers - super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + super().__init__( + tie_word_embeddings=tie_word_embeddings, + ignore_keys_at_rope_validation={"mrope_section", "mrope_interleaved"}, + **kwargs, + ) class Qwen3VLMoeVisionConfig(Qwen3VLVisionConfig): diff --git a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py index e9d94e68f840..ce5c63210478 100644 --- a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -75,8 +75,6 @@ class RecurrentGemmaConfig(PreTrainedConfig): Beginning of stream token id. hidden_activation (``str` or `function``, *optional*, defaults to `"gelu_pytorch_tanh"`): The hidden activation used in the recurrent block as well as the MLP layer of the decoder layers. - partial_rotary_factor (`float`, *optional*, defaults to 0.5): - The partial rotary factor used in the initialization of the rotary embeddings. rope_parameters (`RopeParameters`, *optional*): Dictionary containing the configuration parameters for the RoPE embeddings. 
The dictionary should contain a value for `rope_theta` and optionally parameters used for scaling in case you want to use RoPE @@ -119,7 +117,6 @@ def __init__( eos_token_id: Optional[int] = 1, bos_token_id: Optional[int] = 2, hidden_activation: Optional[str] = "gelu_pytorch_tanh", - partial_rotary_factor: Optional[float] = 0.5, rope_parameters: Optional[RopeParameters | dict[str, RopeParameters]] = None, block_types: Optional[list[str]] = ("recurrent", "recurrent", "attention"), attention_dropout: Optional[float] = 0.0, @@ -139,7 +136,6 @@ def __init__( self.logits_soft_cap = logits_soft_cap self.rms_norm_eps = rms_norm_eps self.use_cache = use_cache - self.partial_rotary_factor = partial_rotary_factor self.block_types = list(block_types) self.hidden_activation = hidden_activation self.head_dim = self.hidden_size // self.num_attention_heads @@ -150,14 +146,8 @@ def __init__( self.attention_bias = attention_bias self.w_init_variance_scale = w_init_variance_scale self.final_w_init_variance_scale = 2.0 / self.num_hidden_layers - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.5) # assign default for BC super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py index dc1a3d4951e2..8f3061d495a0 100644 --- a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py @@ -102,7 +102,7 @@ def compute_default_rope_parameters( 
post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). """ base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) diff --git a/src/transformers/models/seed_oss/configuration_seed_oss.py b/src/transformers/models/seed_oss/configuration_seed_oss.py index 48eceb39225c..63a5c20c2858 100644 --- a/src/transformers/models/seed_oss/configuration_seed_oss.py +++ b/src/transformers/models/seed_oss/configuration_seed_oss.py @@ -16,7 +16,7 @@ from typing import Optional from transformers.configuration_utils import PreTrainedConfig -from transformers.modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from transformers.modeling_rope_utils import RopeParameters class SeedOssConfig(PreTrainedConfig): @@ -170,14 +170,7 @@ def __init__( self.residual_dropout = residual_dropout self.mlp_bias = mlp_bias self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/smollm3/configuration_smollm3.py b/src/transformers/models/smollm3/configuration_smollm3.py index 1f8523e5a108..03701376dc26 100644 --- a/src/transformers/models/smollm3/configuration_smollm3.py +++ 
b/src/transformers/models/smollm3/configuration_smollm3.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class SmolLM3Config(PreTrainedConfig): @@ -108,6 +108,7 @@ class SmolLM3Config(PreTrainedConfig): model_type = "smollm3" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 2000000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -151,12 +152,6 @@ def __init__( mlp_bias: Optional[bool] = False, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.mlp_bias = mlp_bias @@ -201,10 +196,13 @@ def __init__( self.layer_types = layer_types layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 2000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) __all__ = ["SmolLM3Config"] diff --git a/src/transformers/models/smollm3/modular_smollm3.py b/src/transformers/models/smollm3/modular_smollm3.py index 995c15f10de0..c72abf8e1b96 100644 --- a/src/transformers/models/smollm3/modular_smollm3.py +++ b/src/transformers/models/smollm3/modular_smollm3.py @@ -21,7 +21,7 @@ from ...cache_utils import Cache from ...configuration_utils import PreTrainedConfig, layer_type_validation from ...modeling_flash_attention_utils import FlashAttentionKwargs -from ...modeling_rope_utils import RopeParameters, rope_config_validation, 
standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack from ...utils import logging @@ -125,6 +125,7 @@ class SmolLM3Config(PreTrainedConfig): model_type = "smollm3" keys_to_ignore_at_inference = ["past_key_values"] + default_theta = 2000000.0 base_model_tp_plan = { "layers.*.self_attn.q_proj": "colwise", @@ -168,12 +169,6 @@ def __init__( mlp_bias: Optional[bool] = False, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.mlp_bias = mlp_bias @@ -218,10 +213,13 @@ def __init__( self.layer_types = layer_types layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = getattr(self, "rope_theta", 2000000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) class SmolLM3RotaryEmbedding(Qwen2RotaryEmbedding): diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index c0a5bb2a481a..4e06ef7c4edd 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -86,8 +86,6 @@ class StableLmConfig(PreTrainedConfig): The dropout ratio after applying the MLP to the hidden states. 
attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - partial_rotary_factor (`float`, *optional*, defaults to 0.25): - Percentage of the query and keys which will have rotary embedding. bos_token_id (int, *optional*, defaults to 0): The id of the `BOS` token in the vocabulary. eos_token_id (int, *optional*, defaults to 0): @@ -125,7 +123,6 @@ def __init__( use_parallel_residual: Optional[bool] = False, hidden_dropout: Optional[float] = 0.0, attention_dropout: Optional[float] = 0.0, - partial_rotary_factor: Optional[float] = 0.25, bos_token_id: Optional[int] = 0, eos_token_id: Optional[int] = 0, **kwargs, @@ -148,15 +145,8 @@ def __init__( self.use_parallel_residual = use_parallel_residual self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout - self.partial_rotary_factor = partial_rotary_factor - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + kwargs.setdefault("partial_rotary_factor", 0.25) # assign default for BC super().__init__( bos_token_id=bos_token_id, diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 3b091726fab4..27a9f6b47ce1 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -98,7 +98,7 @@ def compute_default_rope_parameters( post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). 
""" base = config.rope_parameters["rope_theta"] - partial_rotary_factor = getattr(config, "partial_rotary_factor", 1.0) + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads dim = int(head_dim * partial_rotary_factor) @@ -254,7 +254,7 @@ def __init__(self, config: StableLmConfig, layer_idx: Optional[int] = None): self.num_key_value_heads = config.num_key_value_heads self.num_key_value_groups = self.num_heads // self.num_key_value_heads - self.rotary_ndims = int(self.head_dim * config.partial_rotary_factor) + self.rotary_ndims = int(self.head_dim * config.rope_parameters["partial_rotary_factor"]) self.is_causal = True self.scaling = self.head_dim**-0.5 diff --git a/src/transformers/models/starcoder2/configuration_starcoder2.py b/src/transformers/models/starcoder2/configuration_starcoder2.py index 89168ea6b3e6..d0fcfea44b42 100644 --- a/src/transformers/models/starcoder2/configuration_starcoder2.py +++ b/src/transformers/models/starcoder2/configuration_starcoder2.py @@ -17,7 +17,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters from ...utils import logging @@ -155,14 +155,7 @@ def __init__( self.attention_dropout = attention_dropout self.residual_dropout = residual_dropout self.embedding_dropout = embedding_dropout - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters super().__init__( 
bos_token_id=bos_token_id, diff --git a/src/transformers/models/t5gemma/configuration_t5gemma.py b/src/transformers/models/t5gemma/configuration_t5gemma.py index f7191f26f314..93883844a1eb 100644 --- a/src/transformers/models/t5gemma/configuration_t5gemma.py +++ b/src/transformers/models/t5gemma/configuration_t5gemma.py @@ -22,7 +22,7 @@ from typing import Any, Optional, Union from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class T5GemmaModuleConfig(PreTrainedConfig): @@ -150,13 +150,6 @@ def __init__( attn_logit_softcapping: Optional[float] = 50.0, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -176,9 +169,6 @@ def __init__( self.final_logit_softcapping = final_logit_softcapping self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if self.layer_types is None: self.layer_types = [ @@ -186,10 +176,15 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) class T5GemmaConfig(PreTrainedConfig): diff --git 
a/src/transformers/models/vaultgemma/configuration_vaultgemma.py b/src/transformers/models/vaultgemma/configuration_vaultgemma.py index 7cb562566877..a4dc54e3f028 100644 --- a/src/transformers/models/vaultgemma/configuration_vaultgemma.py +++ b/src/transformers/models/vaultgemma/configuration_vaultgemma.py @@ -22,7 +22,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig, layer_type_validation -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class VaultGemmaConfig(PreTrainedConfig): @@ -150,13 +150,6 @@ def __init__( attn_logit_softcapping: Optional[float] = 50.0, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - tie_word_embeddings=tie_word_embeddings, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -176,9 +169,6 @@ def __init__( self.final_logit_softcapping = final_logit_softcapping self.attn_logit_softcapping = attn_logit_softcapping self.layer_types = layer_types - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters if self.layer_types is None: self.layer_types = [ @@ -186,10 +176,15 @@ def __init__( ] layer_type_validation(self.layer_types, self.num_hidden_layers) - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) __all__ = ["VaultGemmaConfig"] diff --git 
a/src/transformers/models/zamba2/configuration_zamba2.py b/src/transformers/models/zamba2/configuration_zamba2.py index 0901ccf57607..69cfd2404f0a 100644 --- a/src/transformers/models/zamba2/configuration_zamba2.py +++ b/src/transformers/models/zamba2/configuration_zamba2.py @@ -23,7 +23,7 @@ from typing import Optional from ...configuration_utils import PreTrainedConfig -from ...modeling_rope_utils import RopeParameters, rope_config_validation, standardize_rope_params +from ...modeling_rope_utils import RopeParameters class Zamba2Config(PreTrainedConfig): @@ -173,12 +173,6 @@ def __init__( use_long_context: Optional[bool] = False, **kwargs, ): - super().__init__( - pad_token_id=pad_token_id, - bos_token_id=bos_token_id, - eos_token_id=eos_token_id, - **kwargs, - ) self.vocab_size = vocab_size self.max_position_embeddings = max_position_embeddings self.hidden_size = hidden_size @@ -195,14 +189,7 @@ def __init__( self.attention_dropout = attention_dropout self.use_mem_rope = use_mem_rope self.use_long_context = use_long_context - # Try to set `rope_scaling` if available, otherwise use `rope_parameters` - rope_scaling = kwargs.pop("rope_scaling", None) - self.rope_parameters = rope_scaling or rope_parameters - - # Validate the correctness of rotary position embeddings parameters - rope_theta = kwargs.get("rope_theta", 10000.0) - standardize_rope_params(self, rope_theta=rope_theta) - rope_config_validation(self) + self.rope_parameters = rope_parameters self.mamba_d_state = mamba_d_state self.mamba_d_conv = mamba_d_conv @@ -246,6 +233,12 @@ def __init__( self.num_logits_to_keep = num_logits_to_keep self.hybrid_layer_ids = [index for index, type in enumerate(self.layers_block_type) if type == "hybrid"] self.use_mem_eff_path = use_mem_eff_path + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) __all__ = ["Zamba2Config"] diff --git a/tests/causal_lm_tester.py b/tests/causal_lm_tester.py index 
6dbacb399576..cc5095e69ce0 100644 --- a/tests/causal_lm_tester.py +++ b/tests/causal_lm_tester.py @@ -433,11 +433,14 @@ def test_model_rope_scaling_from_config(self, scaling_type): if not _config_supports_rope_scaling(config): self.skipTest("This model does not support RoPE scaling") + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) short_input = ids_tensor([1, 10], config.vocab_size) long_input = ids_tensor([1, int(config.max_position_embeddings * 1.5)], config.vocab_size) set_seed(42) # Fixed seed at init time so the two models get the same random weights - _set_config_rope_params(config, {"rope_type": "default", "rope_theta": 10_000.0}) + _set_config_rope_params( + config, {"rope_type": "default", "rope_theta": 10_000.0, "partial_rotary_factor": partial_rotary_factor} + ) original_model = self.model_tester_class.base_model_class(config) original_model.to(torch_device) original_model.eval() @@ -445,7 +448,15 @@ def test_model_rope_scaling_from_config(self, scaling_type): original_long_output = original_model(long_input).last_hidden_state set_seed(42) # Fixed seed at init time so the two models get the same random weights - _set_config_rope_params(config, {"rope_type": scaling_type, "factor": 10.0, "rope_theta": 10_000.0}) + _set_config_rope_params( + config, + { + "rope_type": scaling_type, + "factor": 10.0, + "rope_theta": 10_000.0, + "partial_rotary_factor": partial_rotary_factor, + }, + ) scaled_model = self.model_tester_class.base_model_class(config) scaled_model.to(torch_device) scaled_model.eval() @@ -485,6 +496,7 @@ def test_model_rope_scaling_frequencies(self): scaling_factor = 10 short_input_length = 10 + partial_rotary_factor = config.rope_parameters.get("partial_rotary_factor", 1.0) long_input_length = int(config.max_position_embeddings * 1.5) # Inputs @@ -497,7 +509,9 @@ def test_model_rope_scaling_frequencies(self): position_ids_long = position_ids_long.unsqueeze(0) # Sanity check original RoPE - 
_set_config_rope_params(config, {"rope_type": "default", "rope_theta": 10_000.0}) + _set_config_rope_params( + config, {"rope_type": "default", "rope_theta": 10_000.0, "partial_rotary_factor": partial_rotary_factor} + ) original_rope = rope_class(config=config).to(torch_device) original_cos_short, original_sin_short = original_rope(x, position_ids_short) original_cos_long, original_sin_long = original_rope(x, position_ids_long) @@ -506,7 +520,15 @@ def test_model_rope_scaling_frequencies(self): # Sanity check linear RoPE scaling # New position "x" should match original position with index "x/scaling_factor" - _set_config_rope_params(config, {"rope_type": "linear", "factor": scaling_factor, "rope_theta": 10_000.0}) + _set_config_rope_params( + config, + { + "rope_type": "linear", + "factor": scaling_factor, + "rope_theta": 10_000.0, + "partial_rotary_factor": partial_rotary_factor, + }, + ) linear_scaling_rope = rope_class(config=config).to(torch_device) linear_cos_short, linear_sin_short = linear_scaling_rope(x, position_ids_short) linear_cos_long, linear_sin_long = linear_scaling_rope(x, position_ids_long) @@ -520,7 +542,15 @@ def test_model_rope_scaling_frequencies(self): # Sanity check Dynamic NTK RoPE scaling # Scaling should only be observed after a long input is fed. 
We can observe that the frequencies increase # with scaling_factor (or that `inv_freq` decreases) - _set_config_rope_params(config, {"rope_type": "dynamic", "factor": scaling_factor, "rope_theta": 10_000.0}) + _set_config_rope_params( + config, + { + "rope_type": "dynamic", + "factor": scaling_factor, + "rope_theta": 10_000.0, + "partial_rotary_factor": partial_rotary_factor, + }, + ) ntk_scaling_rope = rope_class(config=config).to(torch_device) ntk_cos_short, ntk_sin_short = ntk_scaling_rope(x, position_ids_short) ntk_cos_long, ntk_sin_long = ntk_scaling_rope(x, position_ids_long) @@ -534,7 +564,15 @@ def test_model_rope_scaling_frequencies(self): # Sanity check Yarn RoPE scaling # Scaling should be over the entire input - _set_config_rope_params(config, {"rope_type": "yarn", "factor": scaling_factor, "rope_theta": 10_000.0}) + _set_config_rope_params( + config, + { + "rope_type": "yarn", + "factor": scaling_factor, + "rope_theta": 10_000.0, + "partial_rotary_factor": partial_rotary_factor, + }, + ) yarn_scaling_rope = rope_class(config=config).to(torch_device) yarn_cos_short, yarn_sin_short = yarn_scaling_rope(x, position_ids_short) yarn_cos_long, yarn_sin_long = yarn_scaling_rope(x, position_ids_long) diff --git a/tests/utils/test_modeling_rope_utils.py b/tests/utils/test_modeling_rope_utils.py index 3cf063ac116e..730c4967368e 100644 --- a/tests/utils/test_modeling_rope_utils.py +++ b/tests/utils/test_modeling_rope_utils.py @@ -24,7 +24,6 @@ import torch from transformers import ROPE_INIT_FUNCTIONS - from transformers.modeling_rope_utils import rope_config_validation from transformers.models.llama.modeling_llama import LlamaRotaryEmbedding @@ -35,13 +34,13 @@ def test_rope_validation(self): all_rope_types = ROPE_INIT_FUNCTIONS.keys() # The base config is always valid (default RoPE) - rope_config_validation(config) + config.validate_rope() # If we explicitly set the other RoPE types, then validation should fail for rope_type in all_rope_types: 
config.rope_parameters = {"rope_type": rope_type, "rope_theta": 10000.0} with self.assertRaises(KeyError): - rope_config_validation(config) + config.validate_rope() # Parameters are exclusive to their own RoPE type, and should raise an exception if incorrectly passed valid_param_mapping = { @@ -60,31 +59,31 @@ def test_rope_validation(self): continue else: with self.assertRaises(KeyError): - rope_config_validation(config) + config.validate_rope() # Any other parameters passed to RoPE will raise a warning that a particular key is not used # But sometimes we can have model-specific RoPE kwargs and bypass warning with `ignore_keys` model_specific_kwarg = "mrope_sections" # e,g in Qwen2-VL config.rope_parameters = {"rope_type": "default", "rope_theta": 10000.0, model_specific_kwarg: True} - rope_config_validation(config, ignore_keys={model_specific_kwarg}) + config.validate_rope(ignore_keys={model_specific_kwarg}) with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: - rope_config_validation(config) + config.validate_rope() self.assertEqual(len(logs.output), 1) self.assertIn(model_specific_kwarg, logs.output[0]) # We can indicate Different RoPE params for each attention type # We can also have only one RoPE params defined for all layer, we don't raise an error # because it is not required to have separate RoPE per layer type - config.layer_types = ["global_attn", "local_attn"] + config.layer_types = ["full_attention", "sliding_attention"] config.rope_parameters = { - "global_attn": {"rope_type": "default", "rope_theta": 10000}, - "local_attn": {"rope_type": "linear", "rope_theta": 10000, "factor": 2.0}, + "full_attention": {"rope_type": "default", "rope_theta": 10000}, + "sliding_attention": {"rope_type": "linear", "rope_theta": 10000, "factor": 2.0}, } - rope_config_validation(config) + config.validate_rope() - config.rope_parameters = config.rope_parameters["local_attn"] - rope_config_validation(config) + config.rope_parameters = 
config.rope_parameters["full_attention"] + config.validate_rope() def test_yarn_original_original_max_position_embeddings_validation(self): """Tests that models with no/bad `original_max_position_embeddings` raise a warning""" @@ -100,7 +99,7 @@ def test_yarn_original_original_max_position_embeddings_validation(self): config.rope_parameters = rope_config with self.assertRaises(AssertionError): # confirm that no warnings are thrown with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: - rope_config_validation(config) + config.validate_rope() # bad rope config, no `original_max_position_embeddings` -> warning rope_config = { @@ -110,7 +109,7 @@ def test_yarn_original_original_max_position_embeddings_validation(self): } config.rope_parameters = rope_config with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: - rope_config_validation(config) + config.validate_rope() self.assertEqual(len(logs.output), 1) self.assertIn("is unset", logs.output[0]) @@ -123,7 +122,7 @@ def test_yarn_original_original_max_position_embeddings_validation(self): } config.rope_parameters = rope_config with self.assertLogs("transformers.modeling_rope_utils", level="WARNING") as logs: - rope_config_validation(config) + config.validate_rope() self.assertEqual(len(logs.output), 1) self.assertIn("implicit factor", logs.output[0]) @@ -373,7 +372,7 @@ def test_longrope_rope_numerically(self): } self.assertEqual(config.rope_parameters.get("attention_factor"), None) # Verify that "TypeError: '<' not supported between instances of 'NoneType' and 'int'" is not raised. - rope_config_validation(config) + config.validate_rope() # Check 2: seq_len == 0 -> short factor is applied to the default frequencies config.rope_parameters = {